diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp
index 5fb23824..a239998b 100644
--- a/src/chai/ArrayManager.hpp
+++ b/src/chai/ArrayManager.hpp
@@ -7,122 +7,10 @@
 #ifndef CHAI_ArrayManager_HPP
 #define CHAI_ArrayManager_HPP
 
-#include "chai/config.hpp"
-#include "chai/ChaiMacros.hpp"
-#include "chai/ExecutionSpaces.hpp"
-#include "chai/PointerRecord.hpp"
-#include "chai/Types.hpp"
-
-#if defined(CHAI_ENABLE_RAJA_PLUGIN)
-#include "chai/pluginLinker.hpp"
-#endif
-
-#include <cstddef>
-
-#include "umpire/Allocator.hpp"
-#include "umpire/util/MemoryMap.hpp"
-
-#if defined(CHAI_ENABLE_CUDA)
-#include <cuda_runtime_api.h>
-#endif
-#if defined(CHAI_ENABLE_HIP)
-#include "hip/hip_runtime_api.h"
-#endif
+#include "chai/ChaiManager.hpp"
 
 namespace chai
 {
-// CHAI_GPU_ERROR_CHECK macro
-#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
-
-#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING
-
-#ifdef CHAI_ENABLE_CUDA
-inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true)
-{
-  if (code != cudaSuccess) {
-    fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line);
-    if (abort) {
-      exit(code);
-    }
-  }
-}
-#elif defined(CHAI_ENABLE_HIP)
-inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abort=true)
-{
-  if (code != hipSuccess) {
-    fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", hipGetErrorString(code), file, line);
-    if (abort) {
-      exit(code);
-    }
-  }
-}
-#endif
-
-
-#define CHAI_GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); }
-#else // CHAI_ENABLE_GPU_ERROR_CHECKING
-#define CHAI_GPU_ERROR_CHECK(code) code
-#endif // CHAI_ENABLE_GPU_ERROR_CHECKING
-
-#endif
-
-// wrapper for hip/cuda synchronize
-inline void synchronize() {
-#if defined (CHAI_ENABLE_HIP) && !defined(__HIP_DEVICE_COMPILE__)
-  CHAI_GPU_ERROR_CHECK(hipDeviceSynchronize());
-#elif defined (CHAI_ENABLE_CUDA) && !defined(__CUDA_ARCH__)
-  CHAI_GPU_ERROR_CHECK(cudaDeviceSynchronize());
-#endif
-}
-
-#if defined(CHAI_GPUCC) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
-
-// wrapper for hip/cuda free
-CHAI_HOST inline void gpuFree(void* buffer) {
-#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
-  free(buffer);
-#elif defined (CHAI_ENABLE_HIP)
-  CHAI_GPU_ERROR_CHECK(hipFree(buffer));
-#elif defined (CHAI_ENABLE_CUDA)
-  CHAI_GPU_ERROR_CHECK(cudaFree(buffer));
-#endif
-}
-
-// wrapper for hip/cuda malloc
-CHAI_HOST inline void gpuMalloc(void** devPtr, size_t size) {
-#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
-  *devPtr = (void*)malloc(size);
-#elif defined (CHAI_ENABLE_HIP)
-  CHAI_GPU_ERROR_CHECK(hipMalloc(devPtr, size));
-#elif defined (CHAI_ENABLE_CUDA)
-  CHAI_GPU_ERROR_CHECK(cudaMalloc(devPtr, size));
-#endif
-}
-
-// wrapper for hip/cuda managed malloc
-CHAI_HOST inline void gpuMallocManaged(void** devPtr, size_t size) {
-#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
-  *devPtr = (void*)malloc(size);
-#elif defined (CHAI_ENABLE_HIP)
-  CHAI_GPU_ERROR_CHECK(hipMallocManaged(devPtr, size));
-#elif defined (CHAI_ENABLE_CUDA)
-  CHAI_GPU_ERROR_CHECK(cudaMallocManaged(devPtr, size));
-#endif
-}
-
-// wrapper for hip/cuda mem copy
-CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMemcpyKind kind) {
-#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
-  memcpy(dst, src, count);
-#elif defined (CHAI_ENABLE_HIP)
-  CHAI_GPU_ERROR_CHECK(hipMemcpy(dst, src, count, kind));
-#elif defined (CHAI_ENABLE_CUDA)
-  CHAI_GPU_ERROR_CHECK(cudaMemcpy(dst, src, count, kind));
-#endif
-}
-
-#endif //#if defined(CHAI_GPUCC)
-
 /*!
  * \brief Singleton that manages caching and movement of ManagedArray objects.
  *
diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt
index faab5481..c8397ae7 100644
--- a/src/chai/CMakeLists.txt
+++ b/src/chai/CMakeLists.txt
@@ -15,6 +15,7 @@ set (chai_headers
   ArrayManager.hpp
   ArrayManager.inl
   ChaiMacros.hpp
+  ChaiManager.hpp
   ExecutionSpaces.hpp
   ManagedArray.hpp
   ManagedArray.inl
@@ -29,6 +30,7 @@ if(CHAI_DISABLE_RM)
 endif ()
 
 set (chai_sources
+  SharedPtrManager.cpp
   ArrayManager.cpp)
 
 set (chai_depends
diff --git a/src/chai/ChaiMacros.hpp b/src/chai/ChaiMacros.hpp
index 5cf0daa8..574ae1d1 100644
--- a/src/chai/ChaiMacros.hpp
+++ b/src/chai/ChaiMacros.hpp
@@ -31,6 +31,11 @@
 #define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyDefault cudaMemcpyDefault
 
+#define gpuSuccess cudaSuccess
+#define gpuError_t cudaError_t
+#define gpuGetErrorString cudaGetErrorString
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+
 // NOTE: Cannot have if defined(__HIPCC__) in the condition below, since __HIPCC__ comes from the included header hip_runtime below.
 #elif defined(CHAI_ENABLE_HIP)
 
@@ -48,6 +53,11 @@
 #define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyDefault hipMemcpyDefault
 
+#define gpuSuccess hipSuccess
+#define gpuError_t hipError_t
+#define gpuGetErrorString hipGetErrorString
+#define gpuDeviceSynchronize hipDeviceSynchronize
+
 #else
 
 #define CHAI_HOST
diff --git a/src/chai/ChaiManager.hpp b/src/chai/ChaiManager.hpp
new file mode 100644
index 00000000..2675604f
--- /dev/null
+++ b/src/chai/ChaiManager.hpp
@@ -0,0 +1,28 @@
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
+// project contributors. See the CHAI LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////
+#ifndef CHAI_ChaiManager_HPP
+#define CHAI_ChaiManager_HPP
+
+#include "chai/ChaiMacros.hpp"
+#include "chai/ExecutionSpaces.hpp"
+#include "chai/Types.hpp"
+
+#include "chai/PointerRecord.hpp"
+
+#if defined(CHAI_ENABLE_RAJA_PLUGIN)
+#include "chai/pluginLinker.hpp"
+#endif
+
+#include <cstddef>
+
+#include "umpire/Allocator.hpp"
+#include "umpire/util/MemoryMap.hpp"
+
+
+#include "chai/util/DeviceHelpers.hpp"
+
+#endif // CHAI_ChaiManager_HPP
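The new gpuSuccess/gpuError_t/gpuGetErrorString/gpuDeviceSynchronize aliases mean portability helpers can now be written once instead of per backend. A minimal sketch of the kind of call site this enables (checked_sync is a hypothetical helper, not part of this patch):

    #include <cstdio>
    #include <cstdlib>

    #include "chai/ChaiMacros.hpp"

    // Synchronize the device and abort with context on failure. The same source
    // compiles for CUDA and HIP builds because gpuError_t, gpuSuccess,
    // gpuGetErrorString, and gpuDeviceSynchronize alias the backend symbols.
    inline void checked_sync(const char* where)
    {
    #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
      gpuError_t code = gpuDeviceSynchronize();
      if (code != gpuSuccess) {
        std::fprintf(stderr, "[%s] %s\n", where, gpuGetErrorString(code));
        std::exit(code);
      }
    #else
      (void) where;  // host-only build: nothing to synchronize
    #endif
    }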
diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp
index 97be9a2a..2cd84e76 100644
--- a/src/chai/ManagedArray.hpp
+++ b/src/chai/ManagedArray.hpp
@@ -373,14 +373,14 @@ class ManagedArray : public CHAICopyable
   // shenanigan reasons need to be defined here.
 #if !defined(CHAI_DISABLE_RM)
   // if T is a CHAICopyable, then it is important to initialize all the
-  // ManagedArrays to nullptr at allocation, since it is extremely easy to
+  // elements with default constructors, since it is extremely easy to
   // trigger a moveInnerImpl, which expects inner values to be initialized.
   template <bool B = std::is_base_of<CHAICopyable, T>::value,
             typename std::enable_if<B, int>::type = 0>
   CHAI_HOST bool initInner(size_t start = 0)
   {
     for (size_t i = start; i < m_size/sizeof(T); ++i) {
-      m_active_base_pointer[i] = nullptr;
+      new (&m_active_base_pointer[i]) T();
     }
     return true;
   }
@@ -392,6 +392,26 @@ class ManagedArray : public CHAICopyable
   {
     return false;
   }
+
+  // if T is a CHAICopyable, then it is important to destroy all the inner
+  // CHAICopyable containers, which expect their values to be initialized,
+  // before the outer allocation is released.
+  template <bool B = std::is_base_of<CHAICopyable, T>::value,
+            typename std::enable_if<B, int>::type = 0>
+  CHAI_HOST bool freeInner(size_t start = 0)
+  {
+    for (size_t i = start; i < m_size/sizeof(T); ++i) {
+      m_active_base_pointer[i].~T();
+    }
+    return true;
+  }
+
+  // Do not deep destroy if T is not a CHAICopyable.
+  template <bool B = std::is_base_of<CHAICopyable, T>::value,
+            typename std::enable_if<!B, int>::type = 0>
+  CHAI_HOST bool freeInner(size_t = 0)
+  {
+    return false;
+  }
 #endif
 protected:
   /*!
diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl
index 6e46248f..c6c93b9c 100644
--- a/src/chai/ManagedArray.inl
+++ b/src/chai/ManagedArray.inl
@@ -249,12 +249,14 @@ CHAI_HOST void ManagedArray<T>::reallocate(size_t elems)
   // trigger a moveInnerImpl, which expects inner values to be initialized.
   if (initInner(old_size/sizeof(T))) {
     // if we are active on the GPU, we need to send any newly initialized inner members to the device
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
     if (m_pointer_record->m_last_space == GPU && old_size < m_size) {
        umpire::ResourceManager & umpire_rm = umpire::ResourceManager::getInstance();
        void *src = (void *)(((char *)(m_pointer_record->m_pointers[CPU])) + old_size);
        void *dst = (void *)(((char *)(m_pointer_record->m_pointers[GPU])) + old_size);
        umpire_rm.copy(dst,src,m_size-old_size);
     }
+#endif
   }
 
   CHAI_LOG(Debug, "m_active_ptr reallocated at address: " << m_active_pointer);
@@ -276,6 +278,8 @@ CHAI_HOST void ManagedArray<T>::free(ExecutionSpace space)
   if (m_pointer_record == &ArrayManager::s_null_record) {
      m_pointer_record = m_resource_manager->makeManaged((void *)m_active_base_pointer,m_size,space,true);
   }
+  freeInner();
+
   m_resource_manager->free(m_pointer_record, space);
   m_active_pointer = nullptr;
   m_active_base_pointer = nullptr;
@@ -302,6 +306,9 @@ CHAI_HOST void ManagedArray<T>::reset()
 template <typename T>
 CHAI_INLINE
 CHAI_HOST_DEVICE size_t ManagedArray<T>::size() const
 {
+#if !defined(CHAI_DEVICE_COMPILE)
+  if (!m_is_slice) m_size = m_pointer_record->m_size;
+#endif
   return m_size/sizeof(T);
 }
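The initInner/freeInner pair above is manual lifetime management over a raw allocation: elements are default-constructed with placement new, and must be explicitly destroyed before the allocation is released. A self-contained sketch of that pattern in plain C++ (Inner is a stand-in for a nested ManagedArray):

    #include <cstddef>
    #include <cstdlib>
    #include <new>

    struct Inner { int* data = nullptr; };  // stand-in for a nested ManagedArray

    int main()
    {
      const std::size_t n = 8;
      void* raw = std::malloc(n * sizeof(Inner));
      Inner* elems = static_cast<Inner*>(raw);

      // initInner analogue: default-construct every element in place so later
      // deep-copy/move machinery sees initialized objects, not garbage bits.
      for (std::size_t i = 0; i < n; ++i) {
        new (&elems[i]) Inner();
      }

      // ... use elems ...

      // freeInner analogue: run destructors before the allocation disappears.
      for (std::size_t i = 0; i < n; ++i) {
        elems[i].~Inner();
      }
      std::free(raw);
      return 0;
    }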
diff --git a/src/chai/ManagedSharedPtr.hpp b/src/chai/ManagedSharedPtr.hpp
new file mode 100644
index 00000000..1c93335b
--- /dev/null
+++ b/src/chai/ManagedSharedPtr.hpp
@@ -0,0 +1,325 @@
+#ifndef CHAI_MANAGED_SHARED_PTR
+#define CHAI_MANAGED_SHARED_PTR
+
+#include <type_traits>
+
+#include "chai/config.hpp"
+
+#include "chai/ArrayManager.hpp"
+#include "chai/ChaiMacros.hpp"
+#include "chai/ExecutionSpaces.hpp"
+#include "chai/ManagedArray.hpp"
+#include "chai/SharedPtrCounter.hpp"
+#include "chai/managed_ptr.hpp"
+
+namespace chai {
+
+
+struct CHAIPoly {};
+
+// Type traits for SFINAE
+template <typename Yp, typename Tp>
+struct msp_is_constructible : std::is_convertible<Yp*, Tp*>::type {};
+
+template <typename Yp, typename Tp>
+struct msp_compatible_with : std::false_type {};
+
+template <typename Yp, typename Tp>
+struct msp_compatible_with<Yp*, Tp*> : std::is_convertible<Yp*, Tp*>::type {};
+
+template <typename Tp>
+struct is_CHAICopyable : std::is_base_of<CHAICopyable, Tp>::type {};
+
+template <typename Tp>
+struct is_CHAIPoly : std::is_base_of<CHAIPoly, Tp>::type {};
+
+
+template <typename Tp>
+class ManagedSharedPtr : public CHAICopyable {
+
+public:
+  using element_type = Tp; //typename std::remove_extent<Tp>::type;
+
+private:
+  template <typename Yp>
+  using SafeConv = typename std::enable_if<
+    msp_is_constructible<Yp, Tp>::value
+  >::type;
+
+  template <typename Yp, typename Res = void>
+  using Compatible = typename std::enable_if<
+    msp_compatible_with<Yp*, Tp*>::value,
+    Res
+  >::type;
+
+  template <typename Yp>
+  using Assignable = Compatible<Yp, ManagedSharedPtr&>;
+
+public:
+
+  /*
+   * Constructors
+   */
+  CHAI_HOST_DEVICE
+  constexpr ManagedSharedPtr() noexcept : m_record_count() {}
+
+  //// *Default* Ctor with convertible type Yp -> Tp
+  template <typename Yp, typename Deleter, typename = SafeConv<Yp>>
+  ManagedSharedPtr(std::initializer_list<Yp*>&& ptrs,
+                   std::initializer_list<chai::ExecutionSpace>&& spaces,
+                   Deleter d)
+    : m_record_count(Yp{},
+                     std::forward<std::initializer_list<Yp*>>(ptrs),
+                     std::forward<std::initializer_list<chai::ExecutionSpace>>(spaces),
+                     std::move(d))
+    , m_active_pointer(m_record_count.m_get_pointer<element_type>(chai::CPU))
+    , m_resource_manager(SharedPtrManager::getInstance())
+  {}
+
+  /*
+   * Copy Constructors
+   */
+  CHAI_HOST_DEVICE
+  ManagedSharedPtr(ManagedSharedPtr const& rhs) noexcept
+    : m_record_count(rhs.m_record_count)
+    , m_active_pointer(rhs.m_active_pointer)
+    , m_resource_manager(rhs.m_resource_manager)
+  {
+#if !defined(CHAI_DEVICE_COMPILE)
+    if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace());
+#endif
+  }
+
+  template <typename Yp, typename = Compatible<Yp>>
+  CHAI_HOST_DEVICE
+  ManagedSharedPtr(ManagedSharedPtr<Yp> const& rhs) noexcept
+    : m_record_count(rhs.m_record_count)
+    , m_active_pointer(rhs.m_active_pointer)
+    , m_resource_manager(rhs.m_resource_manager)
+  {
+#if !defined(CHAI_DEVICE_COMPILE)
+    if (m_active_pointer) move(ArrayManager::getInstance()->getExecutionSpace());
+    //if (m_active_pointer) move(m_resource_manager->getExecutionSpace());
+#endif
+  }
+
+  CHAI_HOST_DEVICE ManagedSharedPtr& operator=(ManagedSharedPtr const& rhs) {
+    m_record_count = rhs.m_record_count;
+    m_active_pointer = rhs.m_active_pointer;
+    m_resource_manager = rhs.m_resource_manager;
+
+    return *this;
+  }
+
+  CHAI_HOST void swap(ManagedSharedPtr& rhs) noexcept {
+    std::swap(m_active_pointer, rhs.m_active_pointer);
+    std::swap(m_resource_manager, rhs.m_resource_manager);
+    m_record_count.m_swap(rhs.m_record_count);
+  }
+
+  CHAI_HOST void reset() noexcept {
+    ManagedSharedPtr().swap(*this);
+  }
+
+  CHAI_HOST_DEVICE void shallowCopy(ManagedSharedPtr const& rhs) {
+    m_active_pointer = rhs.m_active_pointer;
+    m_resource_manager = rhs.m_resource_manager;
+  }
+
+
+  /*
+   * Accessors
+   */
+  CHAI_HOST_DEVICE
+  const element_type* cget(ExecutionSpace space = chai::CPU) const noexcept {
+    CHAI_UNUSED_VAR(space);
+#if !defined(CHAI_DEVICE_COMPILE)
+    if (m_active_pointer) {
+      move(space, false);
+    }
+#endif
+    return m_active_pointer;
+  }
+
+  CHAI_HOST_DEVICE
+  element_type* get(ExecutionSpace space = chai::CPU) const noexcept {
+    CHAI_UNUSED_VAR(space);
+#if !defined(CHAI_DEVICE_COMPILE)
+    if (m_active_pointer) {
+      move(space);
+    }
+#endif
+    return m_active_pointer;
+  }
+
+  CHAI_HOST_DEVICE
+  element_type& operator*() const noexcept { assert(get() != nullptr); return *get(); }
+
+  CHAI_HOST_DEVICE
+  element_type* operator->() const noexcept { assert(get() != nullptr); return get(); }
+
+private:
+
+  //CHAI_HOST_DEVICE
+  //element_type* m_get() const noexcept { return static_cast<const ManagedSharedPtr<Tp>*>(this)->get(); }
+
+public:
+  long use_count() const noexcept { return m_record_count.m_get_use_count(); }
+
+  CHAI_INLINE
+  CHAI_HOST void registerTouch(ExecutionSpace space) {
+    m_resource_manager->registerTouch(m_record_count.m_get_record(), space);
+  }
+
+  CHAI_HOST
+  void move(ExecutionSpace space,
+            bool registerTouch = (!std::is_const<Tp>::value || is_CHAICopyable<Tp>::value)) const {
+    ExecutionSpace prev_space = m_record_count.m_get_record()->m_last_space;
+
+    if (prev_space != GPU && space == GPU) {
+      /// Move nested ManagedArrays first, so they are working with a valid m_active_pointer for the host,
+      //  and so the meta data associated with them is updated before we move the outer object down.
+      moveInnerImpl();
+    }
+
+    m_active_pointer = static_cast<element_type*>(m_resource_manager->move(
+        (void *)m_active_pointer, m_record_count.m_get_record(), space, is_CHAIPoly<Tp>::value));
+
+    if (registerTouch) {
+      m_resource_manager->registerTouch(m_record_count.m_get_record(), space);
+    }
+
+    if (space != GPU && prev_space == GPU) {
+      /// Move nested ManagedArrays after the move, so they are working with a valid m_active_pointer for the host,
+      //  and so the meta data associated with them is updated with live GPU data
+      moveInnerImpl();
+    }
+  }
+
+  /*
+   * Private Members
+   */
+private:
+  template <typename Tp1>
+  friend class ManagedSharedPtr;
+
+  msp_record_count m_record_count;
+  mutable element_type* m_active_pointer = nullptr;
+
+  mutable SharedPtrManager* m_resource_manager = nullptr;
+
+  template <typename T = Tp,
+            bool B = is_CHAICopyable<T>::value,
+  //template <typename T = Tp, bool B = is_CHAIPoly<T>::value,
+            typename std::enable_if<B, int>::type = 0>
+  CHAI_HOST
+  void
+  moveInnerImpl() const
+  {
+    m_record_count.moveInnerImpl();
+  }
+
+  template <typename T = Tp,
+            bool B = is_CHAICopyable<T>::value,
+  //template <typename T = Tp, bool B = is_CHAIPoly<T>::value,
+            typename std::enable_if<!B, int>::type = 0>
+  CHAI_HOST
+  void
+  moveInnerImpl() const
+  {
+  }
+
+};
+
+namespace detail {
+
+#if defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP)
+namespace impl {
+
+template <typename T, typename Deleter>
+__global__ void msp_dispose_on_device(T* gpuPointer, Deleter d)
+{
+  d(gpuPointer);
+}
+
+template <typename T, typename... Args>
+__global__ void msp_make_on_device(T* gpuPointer, Args&&... args)
+{
+  new (gpuPointer) T(std::forward<Args>(args)...);
+}
+
+} // namespace impl
+
+template <typename Tp, typename... Args>
+CHAI_INLINE
+CHAI_HOST Tp* msp_make_on_device(Args&&... args) {
+  Tp* gpu_ptr = nullptr;
+  chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance();
+
+  auto gpu_allocator = sptr_manager->getAllocator(chai::GPU);
+  gpu_ptr = static_cast<Tp*>( gpu_allocator.allocate(1*sizeof(Tp)) );
+
+  impl::msp_make_on_device<<<1,1>>>(gpu_ptr, std::forward<Args>(args)...);
+
+  return gpu_ptr;
+}
+#endif // defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP)
+
+template <typename Tp, typename... Args>
+CHAI_INLINE
+CHAI_HOST Tp* msp_make_on_host(Args&&... args) {
+  chai::SharedPtrManager* sptr_manager = chai::SharedPtrManager::getInstance();
+
+  auto cpu_allocator = sptr_manager->getAllocator(chai::CPU);
+
+  Tp* cpu_ptr = static_cast<Tp*>( cpu_allocator.allocate(1*sizeof(Tp)) );
+
+  new (cpu_ptr) Tp{std::forward<Args>(args)...};
+
+  return cpu_ptr;
+}
+
+} // namespace detail
+
+template <typename Tp, typename... Args>
+CHAI_INLINE
+CHAI_HOST
+ManagedSharedPtr<Tp> make_shared(Args&&... args) {
+  using Tp_non_const = std::remove_const_t<Tp>;
+
+  Tp* cpu_pointer = detail::msp_make_on_host<Tp_non_const>(std::forward<Args>(args)...);
+
+#if defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP)
+
+  Tp* gpu_pointer = detail::msp_make_on_device<Tp_non_const>();
+#if defined(CHAI_ENABLE_CUDA)
+  cudaDeviceSynchronize();
+#endif
+#if defined(CHAI_ENABLE_HIP)
+  CHAI_UNUSED_VAR(hipDeviceSynchronize());
+#endif
+
+  auto result = ManagedSharedPtr<Tp>({cpu_pointer, gpu_pointer}, {CPU, GPU},
+      [] CHAI_HOST_DEVICE (Tp* p){ p->~Tp(); }
+  );
+
+  result.registerTouch(chai::CPU);
+
+  if (!is_CHAICopyable<Tp>::value) {
+    result.move(chai::GPU, false);
+    result.move(chai::CPU, false);
+  }
+
+#else // defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP)
+
+  auto result = ManagedSharedPtr<Tp>({cpu_pointer}, {CPU},
+      [] CHAI_HOST_DEVICE (Tp* p){ p->~Tp(); }
+  );
+
+#endif // defined(CHAI_ENABLE_CUDA) or defined(CHAI_ENABLE_HIP)
+
+  return result;
+}
+
+} // namespace chai
+
+
+#endif // CHAI_MANAGED_SHARED_PTR
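ManagedSharedPtr is intended to give std::shared_ptr-style semantics with CHAI's space-aware data motion: make_shared constructs one object per execution space, and the copy constructor calls move() so device captures see coherent state. A usage sketch, assuming a CUDA or HIP build and a forall helper like the one in src/util/forall.hpp (Base, Scale, and apply_scale are illustrative, not part of the patch):

    #include "chai/ManagedArray.hpp"
    #include "chai/ManagedSharedPtr.hpp"

    struct Base : chai::CHAIPoly {
      CHAI_HOST_DEVICE virtual double eval(double x) const = 0;
      CHAI_HOST_DEVICE virtual ~Base() {}
    };

    struct Scale : Base {
      double factor = 2.0;
      CHAI_HOST_DEVICE double eval(double x) const override { return factor * x; }
    };

    void apply_scale(chai::ManagedArray<double> in, chai::ManagedArray<double> out, int n)
    {
      // One Scale is constructed on the host and one on the device; the CHAIPoly
      // tag makes the manager skip the vtable pointer when copying object state.
      chai::ManagedSharedPtr<Base> f = chai::make_shared<Scale>();

      forall(gpu(), 0, n, [=] __device__ (int i) {
        out[i] = f->eval(in[i]);  // virtual dispatch through the device copy
      });
    }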
diff --git a/src/chai/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp
index b41a452e..20eb6aa2 100644
--- a/src/chai/RajaExecutionSpacePlugin.cpp
+++ b/src/chai/RajaExecutionSpacePlugin.cpp
@@ -64,7 +64,7 @@ PluginStrategy::PluginStrategy() = default;
 #endif
 
 // Register plugin with RAJA
-RAJA::util::PluginRegistry::add<chai::RajaExecutionSpacePlugin> P(
+static RAJA::util::PluginRegistry::add<chai::RajaExecutionSpacePlugin> P(
   "RajaExecutionSpacePlugin",
   "Plugin to set CHAI execution space based on RAJA execution platform");
diff --git a/src/chai/SharedPointerRecord.hpp b/src/chai/SharedPointerRecord.hpp
new file mode 100644
index 00000000..890578ed
--- /dev/null
+++ b/src/chai/SharedPointerRecord.hpp
@@ -0,0 +1,57 @@
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
+// project contributors. See the CHAI LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////
+#ifndef CHAI_SharedPointerRecord_HPP
+#define CHAI_SharedPointerRecord_HPP
+
+#include "chai/ExecutionSpaces.hpp"
+#include "chai/SharedPtrManager.hpp"
+#include "chai/Types.hpp"
+
+#include <cstddef>
+#include <functional>
+
+namespace chai
+{
+
+/*!
+ * \brief Struct holding details about each pointer.
+ */
+//template <typename T>
+struct msp_pointer_record {
+
+  // Using NUM_EXECUTION_SPACES for the time being, this will help with logical
+  // control since ExecutionSpaces are already defined.
+  // Only CPU and GPU spaces will be used.
+  // If other spaces are enabled they will not be used by ManagedSharedPtr.
+  void* m_pointers[NUM_EXECUTION_SPACES];
+  bool m_touched[NUM_EXECUTION_SPACES];
+  bool m_owned[NUM_EXECUTION_SPACES];
+
+  ExecutionSpace m_last_space;
+  //UserCallback m_user_callback;
+
+  int m_allocators[NUM_EXECUTION_SPACES];
+
+  //msp_pointer_record(void* host_p = nullptr, void* device_p = nullptr) :
+  msp_pointer_record() :
+    m_last_space(CPU) {
+      for (int space = 0; space < NUM_EXECUTION_SPACES; ++space ) {
+        m_pointers[space] = nullptr;
+        m_touched[space] = false;
+        m_owned[space] = true;
+        m_allocators[space] = 0;
+      }
+    }
+
+};
+
+
+
+
+
+} // end of namespace chai
+#endif // CHAI_SharedPointerRecord_HPP
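A default-constructed record starts in a known-clean state; the sketch below just exercises the invariants the constructor establishes (host-only, illustrative):

    #include <cassert>

    #include "chai/SharedPointerRecord.hpp"

    void record_defaults()
    {
      chai::msp_pointer_record rec;
      assert(rec.m_last_space == chai::CPU);     // CPU is the initial coherent space
      for (int s = 0; s < chai::NUM_EXECUTION_SPACES; ++s) {
        assert(rec.m_pointers[s] == nullptr);    // no per-space allocations yet
        assert(!rec.m_touched[s]);               // nothing is dirty
        assert(rec.m_owned[s]);                  // CHAI owns allocations by default
        assert(rec.m_allocators[s] == 0);
      }
    }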
diff --git a/src/chai/SharedPtrCounter.hpp b/src/chai/SharedPtrCounter.hpp
new file mode 100644
index 00000000..abff134d
--- /dev/null
+++ b/src/chai/SharedPtrCounter.hpp
@@ -0,0 +1,229 @@
+
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
+// project contributors. See the CHAI LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////
+#ifndef CHAI_SharedPointerCounter_HPP
+#define CHAI_SharedPointerCounter_HPP
+
+#include <cstddef>
+#include <type_traits>
+
+#include "chai/ChaiMacros.hpp"
+#include "chai/ExecutionSpaces.hpp"
+#include "chai/SharedPtrManager.hpp"
+
+namespace chai
+{
+
+class msp_counted_base {
+public:
+  msp_counted_base() noexcept : m_use_count(1) {}
+
+  virtual ~msp_counted_base() noexcept {}
+
+  virtual void m_dispose() noexcept = 0;
+  virtual void m_destroy() noexcept { delete this; }
+
+  virtual void moveInnerImpl() const = 0;
+
+  void m_add_ref_copy() noexcept { ++m_use_count; }
+
+  void m_release() noexcept {
+    if (--m_use_count == 0) {
+      m_dispose();
+      m_destroy();
+    }
+  }
+
+  long m_get_use_count() const noexcept { return m_use_count; }
+
+  virtual msp_pointer_record* m_get_record() noexcept = 0;
+
+private:
+  msp_counted_base(msp_counted_base const&) = delete;
+  msp_counted_base& operator=(msp_counted_base const&) = delete;
+
+  long m_use_count = 0;
+};
+
+template <typename Ptr>
+class msp_counted_ptr final : public msp_counted_base {
+public:
+  msp_counted_ptr(Ptr h_p, Ptr d_p) noexcept
+    : m_record(SharedPtrManager::getInstance()->makeSharedPtrRecord(h_p, d_p, sizeof(std::remove_pointer_t<Ptr>), true))
+  {}
+
+  virtual void m_dispose() noexcept { delete (Ptr)m_record->m_pointers[chai::CPU]; } // TODO : Other Exec spaces...
+  virtual void m_destroy() noexcept { delete this; }
+
+  virtual void moveInnerImpl() const {
+    using T = std::remove_pointer_t<Ptr>;
+    Ptr host_ptr = (Ptr) m_record->m_pointers[CPU];
+    // trigger the copy constructor
+    std::cout << "Trigger Inner Copy Ctor @ " << host_ptr << std::endl;
+    T inner = T(*host_ptr);
+    // ensure the inner type gets the state of the result of the copy
+    host_ptr->operator=(inner);
+  }
+
+  msp_counted_ptr(msp_counted_ptr const&) = delete;
+  msp_counted_ptr& operator=(msp_counted_ptr const&) = delete;
+
+  msp_pointer_record* m_get_record() noexcept { return m_record; }
+private:
+  msp_pointer_record* m_record;
+};
+
+#include <iostream>
+
+#if defined(CHAI_GPUCC)
+namespace impl {
+
+template <typename T, typename Deleter>
+__global__ void msp_dispose_on_device(T* gpuPointer, Deleter d)
+{
+  d(gpuPointer);
+}
+
+} // namespace impl
+#endif
+
+template <typename Ptr, typename Deleter>
+class msp_counted_deleter final : public msp_counted_base {
+
+  class impl {
+  public:
+    template <typename Ptrs, typename Spaces>
+    impl(Ptrs&& ptrs, Spaces&& spaces, Deleter d)
+      : m_record(SharedPtrManager::getInstance()->
+          makeSharedPtrRecord(std::forward<Ptrs>(ptrs), std::forward<Spaces>(spaces),
+                              sizeof(std::remove_pointer_t<Ptr>), true))
+      , m_deleter(std::move(d))
+    {}
+    ~impl() { if (m_record) delete m_record; }
+
+    Deleter& m_del() noexcept { return m_deleter; }
+    msp_pointer_record* m_record;
+    Deleter m_deleter;
+  };
+
+public:
+  template <typename Ptrs, typename Spaces>
+  msp_counted_deleter(Ptrs&& ptrs, Spaces&& spaces, Deleter d) noexcept
+    : m_impl(std::forward<Ptrs>(ptrs), std::forward<Spaces>(spaces), std::move(d))
+  {}
+
+  virtual void m_dispose() noexcept {
+
+    for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+      Ptr ptr = (Ptr)m_impl.m_record->m_pointers[space];
+      if (ptr) {
+        if (space == chai::CPU) m_impl.m_del()(ptr);
+#if defined(CHAI_GPUCC)
+        if (space == chai::GPU) ::chai::impl::msp_dispose_on_device<<<1,1>>>(ptr, m_impl.m_del());
+#endif
+        SharedPtrManager::getInstance()->free(m_impl.m_record, ExecutionSpace(space));
+      }
+    }
+  }
+
+  virtual void m_destroy() noexcept { this->~msp_counted_deleter(); }
+
+  virtual void moveInnerImpl() const {
+
+    using T_non_const = std::remove_const_t<std::remove_pointer_t<Ptr>>;
+    T_non_const* host_ptr = const_cast<T_non_const*>((Ptr)m_impl.m_record->m_pointers[CPU]);
+
+    // trigger the copy constructor
+    std::cout << "Trigger Inner Copy Ctor @ " << host_ptr << std::endl;
+    T_non_const inner = T_non_const(*host_ptr);
+
+    // ensure the inner type gets the state of the result of the copy
+    //err_func(host_ptr);
+    host_ptr->operator=(inner);
+  }
+
+  msp_counted_deleter(msp_counted_deleter const&) = delete;
+  msp_counted_deleter& operator=(msp_counted_deleter const&) = delete;
+
+  msp_pointer_record* m_get_record() noexcept { return m_impl.m_record; }
+private:
+  impl m_impl;
+};
+
+
+class msp_record_count {
+public:
+  CHAI_HOST_DEVICE
+  constexpr msp_record_count() noexcept : m_pi(0) {}
+
+  template <typename T, typename Ptrs, typename Spaces, typename Deleter>
+  explicit msp_record_count(T, Ptrs&& ptrs, Spaces&& spaces, Deleter d)
+    : m_pi( new msp_counted_deleter<T*, Deleter>(
+          std::forward<Ptrs>(ptrs)
+        , std::forward<Spaces>(spaces)
+        , std::move(d)) ) {}
+
+  CHAI_HOST_DEVICE
+  ~msp_record_count() noexcept
+  {
+#if !defined(CHAI_DEVICE_COMPILE)
+    if (m_pi) {
+      m_pi->m_release();
+    }
+#endif // !defined(CHAI_DEVICE_COMPILE)
+  }
+
+  CHAI_HOST_DEVICE
+  msp_record_count(msp_record_count const& rhs) noexcept : m_pi(rhs.m_pi)
+  {
+#if !defined(CHAI_DEVICE_COMPILE)
+    if (m_pi) m_pi->m_add_ref_copy();
+#endif // !defined(CHAI_DEVICE_COMPILE)
+  }
+
+  CHAI_HOST_DEVICE
+  msp_record_count& operator=(msp_record_count const& rhs) noexcept {
+    CHAI_UNUSED_VAR(rhs);
+#if !defined(CHAI_DEVICE_COMPILE)
+    msp_counted_base* temp = rhs.m_pi;
+    if (temp != m_pi)
+    {
+      if (temp) temp->m_add_ref_copy();
+      if (m_pi) m_pi->m_release();
+      m_pi = temp;
+    }
+#endif // !defined(CHAI_DEVICE_COMPILE)
+    return *this;
+  }
+
+  void m_swap(msp_record_count& rhs) noexcept {
+    msp_counted_base* temp = rhs.m_pi;
+    rhs.m_pi = m_pi;
+    m_pi = temp;
+  }
+
+  long m_get_use_count() const noexcept
+  { return m_pi ? m_pi->m_get_use_count() : 0; }
+
+  friend inline bool
+  operator==(msp_record_count const& a, msp_record_count const& b) noexcept
+  { return a.m_pi == b.m_pi; }
+
+  msp_pointer_record* m_get_record() const noexcept { return m_pi->m_get_record(); }
+
+  template <typename Ptr>
+  Ptr* m_get_pointer(chai::ExecutionSpace space) noexcept { return static_cast<Ptr*>(m_get_record()->m_pointers[space]); }
+
+  void moveInnerImpl() const { m_pi->moveInnerImpl(); }
+
+  mutable msp_counted_base* m_pi = nullptr;
+
+};
+
+
+
+} // end of namespace chai
+#endif // CHAI_SharedPointerCounter_HPP
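Copying a ManagedSharedPtr copies its msp_record_count, which increments the shared count via m_add_ref_copy(); destroying the last owner triggers m_dispose()/m_destroy(). A host-side sketch of the expected counting behavior (illustrative; assumes make_shared succeeded):

    #include <cassert>

    #include "chai/ManagedSharedPtr.hpp"

    void use_count_demo()
    {
      auto p = chai::make_shared<int>(42);
      assert(p.use_count() == 1);
      {
        auto q = p;                  // msp_record_count copy: m_add_ref_copy()
        assert(p.use_count() == 2);
        assert(q.use_count() == 2);  // both handles share one control block
      }                              // q destroyed: m_release() drops count to 1
      assert(p.use_count() == 1);
    }                                // last owner gone: deleter runs in every space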
diff --git a/src/chai/SharedPtrManager.cpp b/src/chai/SharedPtrManager.cpp
new file mode 100644
index 00000000..11c1400b
--- /dev/null
+++ b/src/chai/SharedPtrManager.cpp
@@ -0,0 +1,612 @@
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
+// project contributors. See the CHAI LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////
+#include "chai/SharedPtrManager.hpp"
+#include <vector>
+
+#include "chai/ExecutionSpaces.hpp"
+#include "chai/config.hpp"
+
+#if defined(CHAI_ENABLE_CUDA)
+#if !defined(CHAI_THIN_GPU_ALLOCATE)
+#include "cuda_runtime_api.h"
+#endif
+#endif
+
+#include "umpire/ResourceManager.hpp"
+
+namespace chai
+{
+thread_local ExecutionSpace SharedPtrManager::m_current_execution_space;
+thread_local bool SharedPtrManager::m_synced_since_last_kernel = false;
+
+msp_pointer_record SharedPtrManager::s_null_record = msp_pointer_record();
+
+SharedPtrManager* SharedPtrManager::getInstance()
+{
+  static SharedPtrManager s_resource_manager_instance;
+  return &s_resource_manager_instance;
+}
+
+SharedPtrManager::SharedPtrManager() :
+    m_pointer_map{},
+    m_allocators{},
+    m_resource_manager{umpire::ResourceManager::getInstance()}
+    //,m_callbacks_active{true}
+{
+  m_pointer_map.clear();
+  m_current_execution_space = NONE;
+  m_default_allocation_space = CPU;
+
+  m_allocators[CPU] =
+      new umpire::Allocator(m_resource_manager.getAllocator("HOST"));
+
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  m_allocators[GPU] =
+      new umpire::Allocator(m_resource_manager.getAllocator("HOST"));
+#else
+  m_allocators[GPU] =
+      new umpire::Allocator(m_resource_manager.getAllocator("DEVICE"));
+#endif
+#endif
+
+#if defined(CHAI_ENABLE_UM)
+  m_allocators[UM] =
+      new umpire::Allocator(m_resource_manager.getAllocator("UM"));
+#endif
+
+#if defined(CHAI_ENABLE_PINNED)
+#if (defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)) && !defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  m_allocators[PINNED] =
+      new umpire::Allocator(m_resource_manager.getAllocator("PINNED"));
+#else
+  m_allocators[PINNED] =
+      new umpire::Allocator(m_resource_manager.getAllocator("HOST"));
+#endif
+#endif
+}
+
+void SharedPtrManager::registerPointer(
+    msp_pointer_record* record,
+    ExecutionSpace space,
+    bool owned)
+{
+  std::lock_guard<std::mutex> lock(m_mutex);
+  auto pointer = record->m_pointers[space];
+
+  // if we are registering a new pointer record for a pointer where there is already
+  // a pointer record, we assume the old record was somehow abandoned by the host
+  // application and trigger an ACTION_FOUND_ABANDONED callback
+  auto found_pointer_record_pair = m_pointer_map.find(pointer);
+  if (found_pointer_record_pair != m_pointer_map.end()) {
+     msp_pointer_record ** found_pointer_record_addr = found_pointer_record_pair->second;
+     if (found_pointer_record_addr != nullptr) {
+
+        msp_pointer_record *foundRecord = *found_pointer_record_addr;
+        // if it's actually the same pointer record, then we're OK. If it's a different
+        // one, delete the old one.
+        if (foundRecord != record) {
+           CHAI_LOG(Warning, "SharedPtrManager::registerPointer found a record for " <<
+                    pointer << " already there. Deleting abandoned pointer record.");
+
+           //callback(foundRecord, ACTION_FOUND_ABANDONED, space);
+
+           for (int fspace = CPU; fspace < NUM_EXECUTION_SPACES; ++fspace) {
+              foundRecord->m_pointers[fspace] = nullptr;
+           }
+
+           delete foundRecord;
+        }
+     }
+  }
+
+  CHAI_LOG(Debug, "Registering " << pointer << " in space " << space);
+
+  m_pointer_map.insert(pointer, record);
+
+  for (int i = 0; i < NUM_EXECUTION_SPACES; i++) {
+    if (!record->m_pointers[i]) record->m_owned[i] = true;
+  }
+  record->m_owned[space] = owned;
+
+  if (pointer) {
+     // if umpire already knows about this pointer, we want to make sure its records and ours
+     // are consistent
+     if (m_resource_manager.hasAllocator(pointer)) {
+        // umpire::util::AllocationRecord *allocation_record = const_cast<umpire::util::AllocationRecord *>(m_resource_manager.findAllocationRecord(pointer));
+        // //allocation_record->size = record->m_size;
+     }
+     // register with umpire if it's not there so that umpire can perform data migrations
+     else {
+        umpire::util::AllocationRecord new_allocation_record;
+        new_allocation_record.ptr = pointer;
+        //new_allocation_record.size = record->m_size;
+        new_allocation_record.strategy = m_resource_manager.getAllocator(record->m_allocators[space]).getAllocationStrategy();
+
+        m_resource_manager.registerAllocation(pointer, new_allocation_record);
+     }
+  }
+}
+
+void SharedPtrManager::deregisterPointer(msp_pointer_record* record, bool deregisterFromUmpire)
+{
+  std::lock_guard<std::mutex> lock(m_mutex);
+  for (int i = 0; i < NUM_EXECUTION_SPACES; i++) {
+    void * pointer = record->m_pointers[i];
+    if (pointer) {
+      if (deregisterFromUmpire) {
+        m_resource_manager.deregisterAllocation(pointer);
+      }
+      CHAI_LOG(Debug, "De-registering " << pointer);
+      m_pointer_map.erase(pointer);
+    }
+  }
+  if (record != &s_null_record) {
+    delete record;
+  }
+}
+
+//void * SharedPtrManager::frontOfAllocation(void * pointer) {
+//  if (pointer) {
+//    if (m_resource_manager.hasAllocator(pointer)) {
+//      auto allocation_record = m_resource_manager.findAllocationRecord(pointer);
+//      if (allocation_record) {
+//        return allocation_record->ptr;
+//      }
+//    }
+//  }
+//  return nullptr;
+//}
+
+void SharedPtrManager::setExecutionSpace(ExecutionSpace space)
+{
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  if (isGPUSimMode()) {
+    space = chai::GPU;
+  }
+#endif
+
+  CHAI_LOG(Debug, "Setting execution space to " << space);
+
+  if (chai::GPU == space) {
+    m_synced_since_last_kernel = false;
+  }
+
+#if defined(CHAI_THIN_GPU_ALLOCATE)
+  if (chai::CPU == space) {
+    syncIfNeeded();
+  }
+#endif
+
+  m_current_execution_space = space;
+}
+
+void* SharedPtrManager::move(void* pointer,
+                             msp_pointer_record* pointer_record,
+                             ExecutionSpace space, bool poly)
+{
+  // Check for default arg (NONE)
+  if (space == NONE) {
+    space = m_current_execution_space;
+  }
+
+  if (space == NONE) {
+    return pointer;
+  }
+
+  move(pointer_record, space, poly);
+
+  return pointer_record->m_pointers[space];
+}
+
+ExecutionSpace SharedPtrManager::getExecutionSpace()
+{
+  return m_current_execution_space;
+}
+
+void SharedPtrManager::registerTouch(msp_pointer_record* pointer_record)
+{
+  registerTouch(pointer_record, m_current_execution_space);
+}
+
+void SharedPtrManager::registerTouch(msp_pointer_record* pointer_record,
+                                     ExecutionSpace space)
+{
+  if (pointer_record && pointer_record != &s_null_record) {
+
+    if (space != NONE) {
+      CHAI_LOG(Debug, pointer_record->m_pointers[space] << " touched in space " << space);
+      pointer_record->m_touched[space] = true;
+      pointer_record->m_last_space = space;
+    }
+  }
+}
+
+
+void SharedPtrManager::resetTouch(msp_pointer_record* pointer_record)
+{
+  if (pointer_record && pointer_record != &s_null_record) {
+    for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+      pointer_record->m_touched[space] = false;
+    }
+  }
+}
+
+
+/* Not all GPU platform runtimes (notably HIP) will give you asynchronous copies to the device by default, so we leverage
+ * umpire's API for asynchronous copies using camp resources in this method, based off of the CHAI destination space
+ * */
+static void copy(void * dst_pointer, void * src_pointer, umpire::ResourceManager & manager, ExecutionSpace dst_space, ExecutionSpace src_space, bool poly = false) {
+
+#ifdef CHAI_ENABLE_CUDA
+  camp::resources::Resource device_resource(camp::resources::Cuda::get_default());
+#elif defined(CHAI_ENABLE_HIP)
+  camp::resources::Resource device_resource(camp::resources::Hip::get_default());
+#else
+  camp::resources::Resource device_resource(camp::resources::Host::get_default());
+#endif
+
+  camp::resources::Resource host_resource(camp::resources::Host::get_default());
+  if (dst_space == GPU || src_space == GPU) {
+    // Do the copy using the device resource
+
+    if (poly) {
+      // Polymorphic objects: skip the vtable pointer at the front of the
+      // object, since each space sets its own vtable via in-place construction.
+      std::size_t vtable_size = sizeof(void*);
+      void* poly_src_ptr = ((char*)src_pointer + vtable_size);
+      void* poly_dst_ptr = ((char*)dst_pointer + vtable_size);
+      manager.copy(poly_dst_ptr, poly_src_ptr, device_resource);
+    } else {
+      manager.copy(dst_pointer, src_pointer, device_resource);
+    }
+
+  } else {
+    // Do the copy using the host resource
+    manager.copy(dst_pointer, src_pointer, host_resource);
+  }
+  // Ensure device to host copies are synchronous
+  if (dst_space == CPU && src_space == GPU) {
+    device_resource.wait();
+  }
+}
+
+void SharedPtrManager::move(msp_pointer_record* record, ExecutionSpace space, bool poly)
+{
+  if (space == NONE) {
+    return;
+  }
+
+  //callback(record, ACTION_CAPTURED, space);
+
+  if (space == record->m_last_space) {
+    return;
+  }
+
+  ExecutionSpace prev_space = record->m_last_space;
+
+  void* src_pointer = record->m_pointers[prev_space];
+  void* dst_pointer = record->m_pointers[space];
+
+  if ( (!record->m_touched[record->m_last_space]) || (!src_pointer) ) {
+    return;
+  } else if (dst_pointer != src_pointer) {
+    // Exclude the copy if src and dst are the same (can happen for PINNED memory)
+    chai::copy(dst_pointer, src_pointer, m_resource_manager, space, prev_space, poly);
+
+    //callback(record, ACTION_MOVE, space);
+  }
+
+  resetTouch(record);
+}
+
+void SharedPtrManager::allocate(
+    msp_pointer_record* pointer_record,
+    ExecutionSpace space)
+{
+  //auto size = pointer_record->m_size;
+  auto alloc = m_resource_manager.getAllocator(pointer_record->m_allocators[space]);
+
+  pointer_record->m_pointers[space] = alloc.allocate(1);
+  //callback(pointer_record, ACTION_ALLOC, space);
+  registerPointer(pointer_record, space);
+
+  CHAI_LOG(Debug, "Allocated array at: " << pointer_record->m_pointers[space]);
+}
+
+void SharedPtrManager::free(msp_pointer_record* pointer_record, ExecutionSpace spaceToFree)
+{
+  if (!pointer_record) return;
+
+  for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+    if (space == spaceToFree || spaceToFree == NONE) {
+      if (pointer_record->m_pointers[space]) {
+        void* space_ptr = pointer_record->m_pointers[space];
+        if (pointer_record->m_owned[space]) {
+#if defined(CHAI_ENABLE_UM)
+          if (space_ptr == pointer_record->m_pointers[UM]) {
+            //callback(pointer_record,
+            //         ACTION_FREE,
+            //         ExecutionSpace(UM));
+
+            auto alloc = m_resource_manager.getAllocator(pointer_record->m_allocators[UM]);
+            alloc.deallocate(space_ptr);
+
+            for (int space_t = CPU; space_t < NUM_EXECUTION_SPACES; ++space_t) {
+              if (space_ptr == pointer_record->m_pointers[space_t]) {
+                pointer_record->m_pointers[space_t] = nullptr;
+              }
+            }
+          } else
+#endif
+#if defined(CHAI_ENABLE_PINNED)
+          if (space_ptr == pointer_record->m_pointers[PINNED]) {
+            //callback(pointer_record,
+            //         ACTION_FREE,
+            //         ExecutionSpace(PINNED));
+
+            auto alloc = m_resource_manager.getAllocator(
+                pointer_record->m_allocators[PINNED]);
+            alloc.deallocate(space_ptr);
+
+            for (int space_t = CPU; space_t < NUM_EXECUTION_SPACES; ++space_t) {
+              if (space_ptr == pointer_record->m_pointers[space_t]) {
+                pointer_record->m_pointers[space_t] = nullptr;
+              }
+            }
+          } else
+#endif
+          {
+            //callback(pointer_record,
+            //         ACTION_FREE,
+            //         ExecutionSpace(space));
+
+            auto alloc = m_resource_manager.getAllocator(
+                pointer_record->m_allocators[space]);
+            alloc.deallocate(space_ptr);
+
+            pointer_record->m_pointers[space] = nullptr;
+          }
+        }
+        else
+        {
+          m_resource_manager.deregisterAllocation(space_ptr);
+        }
+        {
+          CHAI_LOG(Debug, "DeRegistering " << space_ptr);
+          std::lock_guard<std::mutex> lock(m_mutex);
+          m_pointer_map.erase(space_ptr);
+        }
+      }
+    }
+  }
+
+  if (pointer_record != &s_null_record && spaceToFree == NONE) {
+    delete pointer_record;
+  }
+}
+
+
+void SharedPtrManager::setDefaultAllocationSpace(ExecutionSpace space)
+{
+  m_default_allocation_space = space;
+}
+
+ExecutionSpace SharedPtrManager::getDefaultAllocationSpace()
+{
+  return m_default_allocation_space;
+}
+
+
+//void SharedPtrManager::setUserCallback(void* pointer, UserCallback const& f)
+//{
+//  // TODO ??
+//  auto pointer_record = getPointerRecord(pointer);
+//  pointer_record->m_user_callback = f;
+//}
+//
+//void SharedPtrManager::setGlobalUserCallback(UserCallback const& f)
+//{
+//  m_user_callback = f;
+//}
+
+msp_pointer_record* SharedPtrManager::getPointerRecord(void* pointer)
+{
+  std::lock_guard<std::mutex> lock(m_mutex);
+  auto record = m_pointer_map.find(pointer);
+  return record->second ? *record->second : &s_null_record;
+}
+
+// TODO: Need a better way of dealing with non-cuda builds here...
+msp_pointer_record* SharedPtrManager::makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer,
+                                                          size_t size,
+                                                          bool owned)
+{
+  void* pointer = const_cast<void*>(c_pointer);
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  void* d_pointer = const_cast<void*>(c_d_pointer);
+#endif
+
+  if (pointer == nullptr) {
+    return &s_null_record;
+  }
+
+  m_resource_manager.registerAllocation(
+      pointer,
+      {pointer, size, m_allocators[chai::CPU]->getAllocationStrategy()});
+
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  m_resource_manager.registerAllocation(
+      d_pointer,
+      {d_pointer, size, m_allocators[chai::GPU]->getAllocationStrategy()});
+#endif
+
+  auto pointer_record = getPointerRecord(pointer);
+
+  if (pointer_record == &s_null_record) {
+    if (pointer) {
+      pointer_record = new msp_pointer_record();
+    } else {
+      return pointer_record;
+    }
+  }
+  else {
+    CHAI_LOG(Warning, "SharedPtrManager::makeSharedPtrRecord found abandoned pointer record!!!");
+    //callback(pointer_record, ACTION_FOUND_ABANDONED, space);
+  }
+
+  pointer_record->m_pointers[chai::CPU] = pointer;
+  pointer_record->m_owned[chai::CPU] = owned;
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  pointer_record->m_pointers[chai::GPU] = d_pointer;
+  pointer_record->m_owned[chai::GPU] = owned;
+#endif
+  //pointer_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {};
+
+  for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+    pointer_record->m_allocators[space] = getAllocatorId(ExecutionSpace(space));
+  }
+
+  if (pointer) {
+    registerPointer(pointer_record, chai::CPU, owned);
+#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+    registerPointer(pointer_record, chai::GPU, owned);
+#endif
+  }
+
+  return pointer_record;
+}
+
+msp_pointer_record* SharedPtrManager::deepCopyRecord(msp_pointer_record const* record)
+{
+  msp_pointer_record* new_record = new msp_pointer_record{};
+  //new_record->m_user_callback = [] (const msp_pointer_record*, Action, ExecutionSpace) {};
+
+  const ExecutionSpace last_space = record->m_last_space;
+  new_record->m_last_space = last_space;
+  for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+    new_record->m_allocators[space] = record->m_allocators[space];
+  }
+
+  allocate(new_record, last_space);
+
+  for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) {
+    new_record->m_owned[space] = true;
+    new_record->m_touched[space] = false;
+  }
+
+  new_record->m_touched[last_space] = true;
+
+  void* dst_pointer = new_record->m_pointers[last_space];
+  void* src_pointer = record->m_pointers[last_space];
+
+  chai::copy(dst_pointer, src_pointer, m_resource_manager, last_space, last_space);
+
+  return new_record;
+}
+
+std::unordered_map<void*, const msp_pointer_record*>
+SharedPtrManager::getPointerMap() const
+{
+  std::lock_guard<std::mutex> lock(m_mutex);
+  std::unordered_map<void*, const msp_pointer_record*> mapCopy;
+
+  for (const auto& entry : m_pointer_map) {
+    mapCopy[entry.first] = *entry.second;
+  }
+
+  return mapCopy;
+}
+
+size_t SharedPtrManager::getTotalNumSharedPtrs() const { return m_pointer_map.size(); }
+
+// TODO: Investigate counting memory allocated in each execution space if
+// possible
+//size_t SharedPtrManager::getTotalSize() const
+//{
+//  std::lock_guard<std::mutex> lock(m_mutex);
+//  size_t total = 0;
+//
+//  for (const auto& entry : m_pointer_map) {
+//    total += (*entry.second)->m_size;
+//  }
+//
+//  return total;
+//}
+
+//void SharedPtrManager::reportLeaks() const
+//{
+//  std::lock_guard<std::mutex> lock(m_mutex);
+//  for (const auto& entry : m_pointer_map) {
+//    const void* pointer = entry.first;
+//    const msp_pointer_record* record = *entry.second;
+//
+//    for (int s = CPU; s < NUM_EXECUTION_SPACES; ++s) {
+//      if (pointer == record->m_pointers[s]) {
+//        callback(record, ACTION_LEAKED, ExecutionSpace(s));
+//      }
+//    }
+//  }
+//}
+
+int
+SharedPtrManager::getAllocatorId(ExecutionSpace space) const
+{
+  return m_allocators[space]->getId();
+}
+
+void SharedPtrManager::evict(ExecutionSpace space, ExecutionSpace destinationSpace) {
+  // Check arguments
+  if (space == NONE) {
+    // Nothing to be done
+    return;
+  }
+
+  if (destinationSpace == NONE) {
+    // If the destination space is NONE, evicting invalidates all data and
+    // leaves us in a bad state (if the last touch was in the eviction space).
+    CHAI_LOG(Warning, "evict does nothing with destinationSpace == NONE!");
+    return;
+  }
+
+  if (space == destinationSpace) {
+    // It doesn't make sense to evict to the same space, so do nothing
+    CHAI_LOG(Warning, "evict does nothing with space == destinationSpace!");
+    return;
+  }
+
+  // Now move and evict
+  std::vector<msp_pointer_record*> pointersToEvict;
+  {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    for (const auto& entry : m_pointer_map) {
+      // Get the pointer record
+      auto record = *entry.second;
+
+      // Move the data and register the touches
+      move(record, destinationSpace);
+      registerTouch(record, destinationSpace);
+
+      // If the destinationSpace is ever allowed to be NONE, then we will need to
+      // update the touch in the eviction space and make sure the last space is not
+      // the eviction space.
+
+      // Mark record for eviction later in this routine
+      pointersToEvict.push_back(record);
+    }
+  }
+
+  // This must be done in a second pass because free erases from m_pointer_map,
+  // which would invalidate the iterator in the above loop
+  for (const auto& entry : pointersToEvict) {
+    free(entry, space);
+  }
+}
+
+
+} // end of namespace chai
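Taken together, move(), registerTouch(), and resetTouch() implement the same last-touch coherence protocol that ArrayManager uses for arrays: data is copied only when the destination space differs from the last touched space and the source was actually touched. A condensed host-side walk-through of the protocol via the public API (illustrative; ptr and rec are assumed to come from makeSharedPtrRecord):

    chai::SharedPtrManager* rm = chai::SharedPtrManager::getInstance();

    rm->setExecutionSpace(chai::GPU);           // entering a GPU kernel launch
    void* dev = rm->move(ptr, rec, chai::GPU);  // CPU was touched: copy CPU -> GPU
    rm->registerTouch(rec);                     // kernel writes: GPU is now m_last_space

    rm->setExecutionSpace(chai::CPU);           // back on the host
    void* host = rm->move(ptr, rec, chai::CPU); // GPU touch forces the copy back;
                                                // move() finishes with resetTouch(rec)
    rm->setExecutionSpace(chai::NONE);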
diff --git a/src/chai/SharedPtrManager.hpp b/src/chai/SharedPtrManager.hpp
new file mode 100644
index 00000000..71339784
--- /dev/null
+++ b/src/chai/SharedPtrManager.hpp
@@ -0,0 +1,439 @@
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
+// project contributors. See the CHAI LICENSE file for details.
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//////////////////////////////////////////////////////////////////////////////
+#ifndef CHAI_SharedPtrManager_HPP
+#define CHAI_SharedPtrManager_HPP
+
+#include "chai/SharedPointerRecord.hpp"
+#include "chai/ChaiManager.hpp"
+
+namespace chai
+{
+
+/*!
+ * \brief Singleton that manages caching and movement of ManagedSharedPtr objects.
+ *
+ * The SharedPtrManager class coordinates the allocation and movement of
+ * ManagedSharedPtr objects. These objects are cached, and data is only copied
+ * between ExecutionSpaces when necessary. This functionality is typically
+ * hidden behind a programming model layer, such as RAJA, or the example
+ * included in util/forall.hpp
+ *
+ * The SharedPtrManager is a singleton, so must always be accessed through the
+ * static getInstance method. Here is an example using the SharedPtrManager:
+ *
+ * \code
+ * const chai::SharedPtrManager* rm = chai::SharedPtrManager::getInstance();
+ * rm->setExecutionSpace(chai::CPU);
+ * // Do something with ManagedSharedPtr on the CPU... but they must be copied!
+ * rm->setExecutionSpace(chai::NONE);
+ * \endcode
+ *
+ * SharedPtrManager differs from ArrayManager in that it does not support
+ * reallocation or callbacks (at this time).
+ */
+class SharedPtrManager
+{
+public:
+  template <typename T>
+  using T_non_const = typename std::remove_const<T>::type;
+
+  using PointerMap = umpire::util::MemoryMap<msp_pointer_record*>;
+
+  CHAISHAREDDLL_API static msp_pointer_record s_null_record;
+
+  /*!
+   * \brief Get the singleton instance.
+   *
+   * \return Pointer to the SharedPtrManager instance.
+   *
+   */
+  CHAISHAREDDLL_API
+  static SharedPtrManager* getInstance();
+
+  /*!
+   * \brief Set the current execution space.
+   *
+   * \param space The space to set as current.
+   */
+  CHAISHAREDDLL_API void setExecutionSpace(ExecutionSpace space);
+
+  /*!
+   * \brief Get the current execution space.
+   *
+   * \return The current execution space.
+   */
+  CHAISHAREDDLL_API ExecutionSpace getExecutionSpace();
+
+  /*!
+   * \brief Move data in pointer to the current execution space.
+   *
+   * \param pointer Pointer to data in any execution space.
+   * \return Pointer to data in the current execution space.
+   */
+  CHAISHAREDDLL_API void* move(void* pointer,
+                               msp_pointer_record* pointer_record,
+                               ExecutionSpace = NONE, bool = false);
+
+  /*!
+   * \brief Register a touch of the pointer in the current execution space.
+   *
+   * \param pointer Raw pointer to register a touch of.
+   */
+  CHAISHAREDDLL_API void registerTouch(msp_pointer_record* pointer_record);
+
+  /*!
+   * \brief Register a touch of the pointer in the given execution space.
+   *
+   * The pointer doesn't need to exist in the space being touched.
+   *
+   * \param pointer Raw pointer to register a touch of.
+   * \param space Space to register touch.
+   */
+  CHAISHAREDDLL_API void registerTouch(msp_pointer_record* pointer_record, ExecutionSpace space);
+
+  /*!
+   * \brief Make a new allocation of the data described by the msp_pointer_record in
+   * the given space.
+   *
+   * \param pointer_record
+   * \param space Space in which to make the allocation.
+   */
+  CHAISHAREDDLL_API void allocate(msp_pointer_record* pointer_record, ExecutionSpace space = CPU);
+
+  /*!
+   * \brief Reallocate data.
+   *
+   * Data is reallocated in all spaces this pointer is associated with.
+   *
+   * \param ptr Pointer to address to reallocate
+   * \param elems The number of elements to allocate.
+   * \tparam T The type of data to allocate.
+   *
+   * \return Pointer to the allocated memory.
+   */
+//  template <typename T>
+//  void* reallocate(void* pointer,
+//                   size_t elems,
+//                   msp_pointer_record* record);
+
+  /*!
+   * \brief Set the default space for new ManagedArray allocations.
+   *
+   * ManagedArrays allocated without an explicit ExecutionSpace argument will
+   * be allocated in space after this routine is called.
+   *
+   * \param space New space for default allocations.
+   */
+  CHAISHAREDDLL_API void setDefaultAllocationSpace(ExecutionSpace space);
+
+  /*!
+   * \brief Get the currently set default allocation space.
+   *
+   * See also setDefaultAllocationSpace.
+   *
+   * \return Current default space for allocations.
+   */
+  CHAISHAREDDLL_API ExecutionSpace getDefaultAllocationSpace();
+
+  /*!
+   * \brief Free allocation(s) associated with the given msp_pointer_record.
+   *        Default (space == NONE) will free all allocations and delete
+   *        the pointer record.
+   */
+  CHAISHAREDDLL_API void free(msp_pointer_record* pointer, ExecutionSpace space = NONE);
+
+#if defined(CHAI_ENABLE_PICK)
+  template <typename T>
+  T_non_const<T> pick(T* src_ptr, size_t index);
+
+  template <typename T>
+  void set(T* dst_ptr, size_t index, const T& val);
+#endif
+
+  /*!
+   * \brief Get the size of the given pointer.
+   *
+   * \param pointer Pointer to find the size of.
+   * \return Size of pointer.
+   */
+  //CHAISHAREDDLL_API size_t getSize(void* pointer);
+
+  template <typename Ptr>
+  msp_pointer_record* makeSharedPtrRecord(std::initializer_list<Ptr*> pointers,
+                                          std::initializer_list<chai::ExecutionSpace> spaces,
+                                          size_t size,
+                                          bool owned);
+
+  CHAISHAREDDLL_API msp_pointer_record* makeSharedPtrRecord(void const* c_pointer, void const* c_d_pointer,
+                                                            size_t size,
+                                                            //ExecutionSpace space,
+                                                            bool owned);
+
+  /*!
+   * \brief Assign a user-defined callback triggered upon memory operations.
+   *        This callback applies to a single ManagedArray.
+   */
+  //CHAISHAREDDLL_API void setUserCallback(void* pointer, UserCallback const& f);
+
+  /*!
+   * \brief Assign a user-defined callback triggered upon memory operations.
+   *        This callback applies to all ManagedArrays.
+   */
+  //CHAISHAREDDLL_API void setGlobalUserCallback(UserCallback const& f);
+
+  /*!
+   * \brief Set touched to false in all spaces for the given msp_pointer_record.
+   *
+   * \param pointer_record msp_pointer_record to reset.
+   */
+  CHAISHAREDDLL_API void resetTouch(msp_pointer_record* pointer_record);
+
+  /*!
+   * \brief Find the msp_pointer_record corresponding to the raw pointer.
+   *
+   * \param pointer Raw pointer to find the msp_pointer_record for.
+   *
+   * \return msp_pointer_record containing the raw pointer, or an empty
+   *         msp_pointer_record if none found.
+   */
+  CHAISHAREDDLL_API msp_pointer_record* getPointerRecord(void* pointer);
+
+  /*!
+   * \brief Create a copy of the given msp_pointer_record with a new allocation
+   *        in the active space.
+   *
+   * \param record The msp_pointer_record to copy.
+   *
+   * \return A copy of the given msp_pointer_record, must be free'd with delete.
+   */
+  CHAISHAREDDLL_API msp_pointer_record* deepCopyRecord(msp_pointer_record const* record);
+
+  /*!
+   * \brief Create a copy of the pointer map.
+   *
+   * \return A copy of the pointer map. Can be used to find memory leaks.
+   */
+  CHAISHAREDDLL_API std::unordered_map<void*, const msp_pointer_record*> getPointerMap() const;
+
+  /*!
+   * \brief Get the total number of shared pointers registered with the manager.
+   *
+   * \return The total number of shared pointers registered with the manager.
+   */
+  CHAISHAREDDLL_API size_t getTotalNumSharedPtrs() const;
+
+  /*!
+   * \brief Get the total amount of memory allocated.
+   *
+   * \return The total amount of memory allocated.
+   */
+  //CHAISHAREDDLL_API size_t getTotalSize() const;
+
+  /*!
+   * \brief Calls callbacks of pointers still in the map with ACTION_LEAKED.
+   */
+  //CHAISHAREDDLL_API void reportLeaks() const;
+
+  /*!
+   * \brief Get the allocator ID
+   *
+   * \return The allocator ID.
+   */
+  CHAISHAREDDLL_API int getAllocatorId(ExecutionSpace space) const;
+
+  /*!
+   * \brief Wraps our resource manager's copy.
+   */
+  CHAISHAREDDLL_API void copy(void * dst, void * src, size_t size);
+
+  /*!
+   * \brief Registering an allocation with the SharedPtrManager
+   *
+   * \param record msp_pointer_record of this allocation.
+   * \param space Space in which the pointer was allocated.
+   * \param owned Should the allocation be free'd by CHAI?
+   */
+  CHAISHAREDDLL_API void registerPointer(msp_pointer_record* record,
+                                         ExecutionSpace space,
+                                         bool owned = true);
+
+  /*!
+   * \brief Deregister a msp_pointer_record from the SharedPtrManager.
+   *
+   * \param record msp_pointer_record of allocation to deregister.
+   * \param deregisterFromUmpire If true, deregister from umpire as well.
+   */
+  CHAISHAREDDLL_API void deregisterPointer(msp_pointer_record* record, bool deregisterFromUmpire = false);
+
+  /*!
+   * \brief Returns the front of the allocation associated with this pointer, nullptr if allocation not found.
+   *
+   * \param pointer Pointer to address that we want the front of the allocation for.
+   */
+  //CHAISHAREDDLL_API void * frontOfAllocation(void * pointer);
+
+  /*!
+   * \brief Set the allocator for an execution space.
+   *
+   * \param space Execution space to set the default allocator for.
+   * \param allocator The allocator to use for this space. Will be copied into chai.
+   */
+  void setAllocator(ExecutionSpace space, umpire::Allocator &allocator);
+
+  /*!
+   * \brief Get the allocator for an execution space.
+   *
+   * \param space Execution space of the allocator to get.
+   *
+   * \return The allocator for the given space.
+   */
+  umpire::Allocator getAllocator(ExecutionSpace space);
+
+  /*!
+   * \brief Turn callbacks on.
+   */
+  //void enableCallbacks() { m_callbacks_active = true; }
+
+  /*!
+   * \brief Turn callbacks off.
+   */
+  //void disableCallbacks() { m_callbacks_active = false; }
+
+  /*!
+   * \brief Synchronize the device if there hasn't been a synchronize since the last kernel.
+   */
+  CHAISHAREDDLL_API bool syncIfNeeded();
+
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  /*!
+   * \brief Turn the GPU simulation mode on or off.
+   */
+  void setGPUSimMode(bool gpuSimMode) { m_gpu_sim_mode = gpuSimMode; }
+
+  /*!
+   * \brief Return true if GPU simulation mode is on, false otherwise.
+   */
+  bool isGPUSimMode() { return m_gpu_sim_mode; }
+#endif
+
+  /*!
+   * \brief Evicts the data in the given space.
+   *
+   * \param space Execution space to evict.
+   * \param destinationSpace The execution space to move the data to.
+   *                         Must not equal space or NONE.
+   */
+  CHAISHAREDDLL_API void evict(ExecutionSpace space, ExecutionSpace destinationSpace);
+
+
+protected:
+  /*!
+   * \brief Construct a new SharedPtrManager.
+   *
+   * The constructor is a protected member, ensuring that it can
+   * only be called by the singleton getInstance method.
+   */
+  SharedPtrManager();
+
+
+
+private:
+
+
+  /*!
+   * \brief Move data in msp_pointer_record to the corresponding ExecutionSpace.
+   *
+   * \param record
+   * \param space
+   */
+  void move(msp_pointer_record* record, ExecutionSpace space, bool = false);
+
+  /*!
+   * \brief Execute a user callback if callbacks are active
+   *
+   * \param record The pointer record containing the callback
+   * \param action The event that occurred
+   * \param space The space in which the event occurred
+   * \param size The number of bytes in the array associated with this pointer record
+   */
+//  inline void callback(const msp_pointer_record* record,
+//                       Action action,
+//                       ExecutionSpace space) const {
+//    if (m_callbacks_active) {
+//      // Callback for this ManagedArray only
+//      if (record && record->m_user_callback) {
+//        record->m_user_callback(record, action, space);
+//      }
+//
+//      // Callback for all ManagedArrays
+//      if (m_user_callback) {
+//        m_user_callback(record, action, space);
+//      }
+//    }
+//  }
+
+  /*!
+   * Current execution space.
+   */
+  static thread_local ExecutionSpace m_current_execution_space;
+
+  /**
+   * Default space for new allocations.
+   */
+  ExecutionSpace m_default_allocation_space;
+
+  /*!
+   * Map of active ManagedSharedPtr pointers to their corresponding msp_pointer_record.
+   */
+  PointerMap m_pointer_map;
+
+  /*!
+   * \brief Array of umpire::Allocators, indexed by ExecutionSpace.
+   */
+  umpire::Allocator* m_allocators[NUM_EXECUTION_SPACES];
+
+  /*!
+   * \brief The umpire resource manager.
+   */
+  umpire::ResourceManager& m_resource_manager;
+
+  /*!
+   * \brief Used for thread-safe operations.
+   */
+  mutable std::mutex m_mutex;
+
+  /*!
+   * \brief A callback triggered upon memory operations on all ManagedArrays.
+   */
+  //UserCallback m_user_callback;
+
+  /*!
+   * \brief Controls whether or not callbacks are called.
+   */
+  //bool m_callbacks_active;
+
+  /*!
+   * Whether or not a synchronize has been performed since the launch of the last
+   * GPU context
+   */
+  static thread_local bool m_synced_since_last_kernel;
+
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  /*!
+   * Used by the RAJA plugin to determine whether the execution space should be
+   * CPU or GPU.
+   */
+  bool m_gpu_sim_mode = false;
+#endif
+};
+
+} // end of namespace chai
+
+#include "chai/SharedPtrManager.inl"
+
+#endif // CHAI_SharedPtrManager_HPP
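Beyond per-pointer motion, the manager supports bulk eviction, for example under device memory pressure: data is first moved to destinationSpace and then freed in the evicted space. A one-call sketch:

    // Copy any GPU-resident shared-pointer data back to the CPU, then free the
    // GPU allocations (records stay valid and can repopulate the GPU later).
    chai::SharedPtrManager::getInstance()->evict(chai::GPU, chai::CPU);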
+
+
+#if defined(CHAI_ENABLE_PICK)
+template<typename T>
+CHAI_INLINE
+typename SharedPtrManager::T_non_const<T> SharedPtrManager::pick(T* src_ptr, size_t index)
+{
+  T_non_const<T> val;
+  m_resource_manager.registerAllocation(const_cast<T_non_const<T>*>(&val), umpire::util::AllocationRecord{const_cast<T_non_const<T>*>(&val), sizeof(T), m_resource_manager.getAllocator("HOST").getAllocationStrategy()});
+  m_resource_manager.copy(const_cast<T_non_const<T>*>(&val), const_cast<T_non_const<T>*>(src_ptr+index), sizeof(T));
+  m_resource_manager.deregisterAllocation(&val);
+  return val;
+}
+
+template<typename T>
+CHAI_INLINE
+void SharedPtrManager::set(T* dst_ptr, size_t index, const T& val)
+{
+  m_resource_manager.registerAllocation(const_cast<T_non_const<T>*>(&val), umpire::util::AllocationRecord{const_cast<T_non_const<T>*>(&val), sizeof(T), m_resource_manager.getAllocator("HOST").getAllocationStrategy()});
+  m_resource_manager.copy(const_cast<T_non_const<T>*>(dst_ptr+index), const_cast<T_non_const<T>*>(&val), sizeof(T));
+  m_resource_manager.deregisterAllocation(const_cast<T_non_const<T>*>(&val));
+}
+#endif
+
+CHAI_INLINE
+void SharedPtrManager::copy(void * dst, void * src, size_t size) {
+  m_resource_manager.copy(dst, src, size);
+}
+
+CHAI_INLINE
+umpire::Allocator SharedPtrManager::getAllocator(ExecutionSpace space) {
+  return *m_allocators[space];
+}
+
+CHAI_INLINE
+void SharedPtrManager::setAllocator(ExecutionSpace space, umpire::Allocator &allocator) {
+  *m_allocators[space] = allocator;
+}
+
+CHAI_INLINE
+bool SharedPtrManager::syncIfNeeded() {
+  if (!m_synced_since_last_kernel) {
+    synchronize();
+    m_synced_since_last_kernel = true;
+    return true;
+  }
+  return false;
+}
+} // end of namespace chai
+
+#endif // CHAI_SharedPtrManager_INL
diff --git a/src/chai/util/DeviceHelpers.hpp b/src/chai/util/DeviceHelpers.hpp
new file mode 100644
index 00000000..6bbbe0d1
--- /dev/null
+++ b/src/chai/util/DeviceHelpers.hpp
@@ -0,0 +1,88 @@
+#ifndef CHAI_DEVICE_HELPERS_HPP
+#define CHAI_DEVICE_HELPERS_HPP
+
+#include "chai/config.hpp"
+#include "chai/ChaiMacros.hpp"
+
+namespace chai
+{
+// CHAI_GPU_ERROR_CHECK macro
+#ifdef CHAI_ENABLE_DEVICE
+
+#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING
+
+inline void gpuErrorCheck(gpuError_t code, const char *file, int line, bool abort=true)
+{
+  if (code != gpuSuccess) {
+    fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", gpuGetErrorString(code), file, line);
+    if (abort) {
+      exit(code);
+    }
+  }
+}
+
+#define CHAI_GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); }
+#else // CHAI_ENABLE_GPU_ERROR_CHECKING
+#define CHAI_GPU_ERROR_CHECK(code) code
+#endif // CHAI_ENABLE_GPU_ERROR_CHECKING
+
+#endif
+
+// wrapper for hip/cuda synchronize
+inline void synchronize() {
+#if defined(CHAI_ENABLE_DEVICE) && !defined(CHAI_DEVICE_COMPILE)
+  CHAI_GPU_ERROR_CHECK(gpuDeviceSynchronize());
+#endif
+}
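+
+// Hedged usage sketch: any gpu* runtime call can be wrapped so failures are
+// reported with file and line when CHAI_ENABLE_GPU_ERROR_CHECKING is on,
+// and compile to the bare call otherwise. gpuDeviceSynchronize comes from
+// ChaiMacros.hpp (cudaDeviceSynchronize / hipDeviceSynchronize).
+//
+//   CHAI_GPU_ERROR_CHECK(gpuDeviceSynchronize());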
+
+#if defined(CHAI_GPUCC) || defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+
+// wrapper for hip/cuda free
+CHAI_HOST inline void gpuFree(void* buffer) {
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  free(buffer);
+#elif defined (CHAI_ENABLE_HIP)
+  CHAI_GPU_ERROR_CHECK(hipFree(buffer));
+#elif defined (CHAI_ENABLE_CUDA)
+  CHAI_GPU_ERROR_CHECK(cudaFree(buffer));
+#endif
+}
+
+// wrapper for hip/cuda malloc
+CHAI_HOST inline void gpuMalloc(void** devPtr, size_t size) {
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  *devPtr = (void*)malloc(size);
+#elif defined (CHAI_ENABLE_HIP)
+  CHAI_GPU_ERROR_CHECK(hipMalloc(devPtr, size));
+#elif defined (CHAI_ENABLE_CUDA)
+  CHAI_GPU_ERROR_CHECK(cudaMalloc(devPtr, size));
+#endif
+}
+
+// wrapper for hip/cuda managed malloc
+CHAI_HOST inline void gpuMallocManaged(void** devPtr, size_t size) {
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  *devPtr = (void*)malloc(size);
+#elif defined (CHAI_ENABLE_HIP)
+  CHAI_GPU_ERROR_CHECK(hipMallocManaged(devPtr, size));
+#elif defined (CHAI_ENABLE_CUDA)
+  CHAI_GPU_ERROR_CHECK(cudaMallocManaged(devPtr, size));
+#endif
+}
+
+// wrapper for hip/cuda mem copy
+CHAI_HOST inline void gpuMemcpy(void* dst, const void* src, size_t count, gpuMemcpyKind kind) {
+#if defined(CHAI_ENABLE_GPU_SIMULATION_MODE)
+  memcpy(dst, src, count);
+#elif defined (CHAI_ENABLE_HIP)
+  CHAI_GPU_ERROR_CHECK(hipMemcpy(dst, src, count, kind));
+#elif defined (CHAI_ENABLE_CUDA)
+  CHAI_GPU_ERROR_CHECK(cudaMemcpy(dst, src, count, kind));
+#endif
+}
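+
+// A hedged round-trip sketch of the wrappers above (host-side, illustrative
+// only; "host_buf" and "bytes" are hypothetical). gpuMemcpyDefault comes
+// from ChaiMacros.hpp:
+//
+//   void* dev_buf = nullptr;
+//   chai::gpuMalloc(&dev_buf, bytes);
+//   chai::gpuMemcpy(dev_buf, host_buf, bytes, gpuMemcpyDefault);
+//   chai::gpuMemcpy(host_buf, dev_buf, bytes, gpuMemcpyDefault);
+//   chai::gpuFree(dev_buf);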
+
+#endif //#if defined(CHAI_GPUCC)
+
+} // namespace chai
+
+#endif // CHAI_DEVICE_HELPERS_HPP
diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt
index ca5dbb88..9f34aa7a 100644
--- a/tests/integration/CMakeLists.txt
+++ b/tests/integration/CMakeLists.txt
@@ -12,6 +12,7 @@ blt_list_append(TO chai_integration_test_depends ELEMENTS blt::hip IF ${CHAI_ENABLE_HIP})
 blt_list_append(TO chai_integration_test_depends ELEMENTS openmp IF ${CHAI_ENABLE_OPENMP})
 
 # ManagedArray tests
+if (CHAI_ENABLE_CUDA OR CHAI_ENABLE_HIP)
 blt_add_executable(
   NAME managed_array_tests
   SOURCES managed_array_tests.cpp
@@ -25,6 +26,20 @@ blt_add_test(
   NAME managed_array_test
   COMMAND managed_array_tests)
 
+blt_add_executable(
+  NAME managed_shared_ptr_tests
+  SOURCES managed_shared_ptr_tests.cpp
+  DEPENDS_ON ${chai_integration_test_depends})
+
+target_include_directories(
+  managed_shared_ptr_tests
+  PUBLIC ${PROJECT_BINARY_DIR}/include)
+
+blt_add_test(
+  NAME managed_shared_ptr_test
+  COMMAND managed_shared_ptr_tests)
+endif()
+
 if (CHAI_ENABLE_MANAGED_PTR)
 blt_add_executable(
   NAME managed_ptr_tests
diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp
index 71aea7d5..49b7cf9e 100644
--- a/tests/integration/managed_array_tests.cpp
+++ b/tests/integration/managed_array_tests.cpp
@@ -4,6 +4,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 //////////////////////////////////////////////////////////////////////////////
+#include "chai/config.hpp"
+
+#include "../src/util/forall.hpp"
+
+#include "chai/ManagedArray.hpp"
+
+#include "umpire/ResourceManager.hpp"
+
 #include "gtest/gtest.h"
 
 #define GPU_TEST(X, Y) \
   static void gpu_test_##X##Y(); \
   TEST(X, Y) { gpu_test_##X##Y(); } \
   static void gpu_test_##X##Y()
@@ -28,16 +36,6 @@
 #define assert_empty_map(IGNORED) ASSERT_EQ(chai::ArrayManager::getInstance()->getPointerMap().size(),0)
 #endif
 
-
-#include "chai/config.hpp"
-
-#include "../src/util/forall.hpp"
-
-#include "chai/ManagedArray.hpp"
-
-#include "umpire/ResourceManager.hpp"
-
-
 struct my_point {
   double x;
   double y;
@@ -593,7 +591,48 @@ TEST(ManagedArray, ReallocateCPU)
   assert_empty_map(true);
 }
 
+TEST(ManagedArray, ReallocateCopyCPU)
+{
+  chai::ManagedArray<int> array(10);
+  chai::ManagedArray<int> array_copy = array;
+  ASSERT_EQ(array.size(), 10u);
+  ASSERT_EQ(array_copy.size(), 10u);
+
+  forall(sequential(), 0, 10, [=](int i) {
+    array[i] = i;
+    ASSERT_EQ(&array[i], &array_copy[i]);
+  });
+
+  array.reallocate(20);
+
+  // This will be incorrect: a call to move, data, or copy needs to
+  // be executed to update the internal active pointer of the copied
+  // object before operator[] can be used after a reallocation.
+  ASSERT_NE(&array[0], &array_copy[0]);
+
+  // This would work, but for the sake of the test we check that
+  // operator[] is correct after lambda capture.
+  //ASSERT_EQ(array.data(), array_copy.data());
+
+  ASSERT_EQ(array.size(), 20u);
+  ASSERT_EQ(array_copy.size(), 20u);
+
+  forall(sequential(), 0, 20, [=](int i) {
+    ASSERT_EQ(&array[i], &array_copy[i]);
+    if (i < 10) {
+      ASSERT_EQ(array[i], i);
+    } else {
+      array_copy[i] = i;
+      ASSERT_EQ(array[i], i);
+    }
+  });
+
+  array_copy.free();
+  assert_empty_map(true);
+}
+
 #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP)
+
 GPU_TEST(ManagedArray, ReallocateGPU)
 {
   chai::ManagedArray<int> array(10);
@@ -604,12 +643,52 @@ GPU_TEST(ManagedArray, ReallocateGPU)
   array.reallocate(20);
   ASSERT_EQ(array.size(), 20u);
 
-  forall(sequential(), 0, 20, [=](int i) {
+  forall(gpu(), 0, 20, [=]__device__(int i) {
     if (i < 10) {
-      ASSERT_EQ(array[i], i);
+      device_assert(array[i] == i);
     } else {
       array[i] = i;
-      ASSERT_EQ(array[i], i);
+      device_assert(array[i] == i);
     }
   });
+
+  array.free();
+  assert_empty_map(true);
+}
+
+GPU_TEST(ManagedArray, ReallocateCopyGPU)
+{
+  chai::ManagedArray<int> array(10);
+  auto array_copy = array;
+  ASSERT_EQ(array.size(), 10u);
+  ASSERT_EQ(array_copy.size(), 10u);
+
+  forall(gpu(), 0, 10, [=] __device__(int i) {
+    array[i] = i;
+    device_assert(array.data()[i] == array_copy.data()[i]);
+  });
+
+  array.reallocate(20);
+  ASSERT_EQ(array.size(), 20u);
+
+  // This will be incorrect: a call to move, data, or copy needs to
+  // be executed to update the internal active pointer of the copied
+  // object before operator[] can be used after a reallocation.
+  ASSERT_NE(&array[0], &array_copy[0]);
+
+  // This would work, but for the sake of the test we check that
+  // operator[] is correct after lambda capture.
+  //ASSERT_EQ(array.data(), array_copy.data());
+
+
+  forall(gpu(), 0, 20, [=]__device__(int i) {
+    device_assert(array.size() == array_copy.size());
+    device_assert(array.data()[i] == array_copy.data()[i]);
+    if (i < 10) {
+      device_assert(array[i] == i);
+    } else {
+      array[i] = i;
+      device_assert(array[i] == i);
     }
   });
diff --git a/tests/integration/managed_shared_ptr_tests.cpp b/tests/integration/managed_shared_ptr_tests.cpp
new file mode 100644
index 00000000..1b068ea4
--- /dev/null
+++ b/tests/integration/managed_shared_ptr_tests.cpp
@@ -0,0 +1,361 @@
+//////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC and CHAI
+// project contributors. See the CHAI LICENSE file for details.
+// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// +#include +#include "camp/defines.hpp" +#include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" +#include "chai/ManagedSharedPtr.hpp" +#include "chai/SharedPtrManager.hpp" +#include "gtest/gtest.h" +#include "umpire/ResourceManager.hpp" + +#define GPU_TEST(X, Y) \ + static void gpu_test_##X##Y(); \ + TEST(X, Y) { gpu_test_##X##Y(); } \ + static void gpu_test_##X##Y() + +#include "chai/config.hpp" +#include "chai/ArrayManager.hpp" +#include "chai/ManagedArray.hpp" +#include "chai/managed_ptr.hpp" +#include "chai/ManagedSharedPtr.hpp" + +#include "../src/util/forall.hpp" + +// Standard library headers +#include + +#if defined(CHAI_ENABLE_CUDA) +inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((cuda##code), __FILE__, __LINE__); } +#elif defined(CHAI_ENABLE_HIP) +inline void gpuErrorCheck(hipError_t code, const char *file, int line, bool abort=true) +{ + if (code != hipSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", hipGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((hip##code), __FILE__, __LINE__); } +#endif + + +#ifdef CHAI_DISABLE_RM +#define assert_empty_array_map(IGNORED) +#define assert_empty_sptr_map(IGNORED) +#else +#define assert_empty_array_map(IGNORED) ASSERT_EQ(chai::ArrayManager::getInstance()->getPointerMap().size(),0) +#define assert_empty_sptr_map(IGNORED) ASSERT_EQ(chai::SharedPtrManager::getInstance()->getPointerMap().size(),0) +#endif + + +class C : chai::CHAIPoly +{ +public: + CHAI_HOST_DEVICE C(void) { printf("++ C has been constructed\n"); } + CHAI_HOST_DEVICE virtual ~C(void) { printf("-- C has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) const = 0; +}; + +class D final : public C +{ +public: + unsigned long long content_D; + CHAI_HOST_DEVICE D(void) : content_D(0xDDDDDDDDDDDDDDDDull) { printf("++ D has been constructed\n"); } + CHAI_HOST_DEVICE ~D(void) { printf("-- D has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) const { printf("%llX\n", content_D); } +}; + + +class A : chai::CHAIPoly +{ +public: + unsigned long long content_A; + D d; + CHAI_HOST_DEVICE A(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); } + CHAI_HOST_DEVICE virtual ~A(void) { printf("-- A has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) const = 0; + CHAI_HOST_DEVICE virtual void d_function(void) const = 0; + CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0; +}; + +class A2 +{ +public: + CHAI_HOST_DEVICE A2(void) { printf("++ A2 has been constructed\n"); } + CHAI_HOST_DEVICE ~A2(void) { printf("-- A2 has been destructed\n"); } +}; + +class B final : public A, public A2 +{ +public: + unsigned long long content_B; + CHAI_HOST_DEVICE B(void) : content_B(0xBBBBBBBBBBBBBBBBull) { printf("++ B has been constructed\n"); } + CHAI_HOST_DEVICE ~B(void) { printf("-- B has been destructed\n"); } + CHAI_HOST_DEVICE virtual void function(void) const override { printf("%llX\n", content_B); } + CHAI_HOST_DEVICE virtual void d_function(void) const override { d.function(); } + CHAI_HOST_DEVICE virtual void set_content(unsigned long 
long val) override { content_B = val; content_A = val; }
+};
+
+
+class AAbsMem : public chai::CHAICopyable, public chai::CHAIPoly
+{
+public:
+  //chai::ManagedSharedPtr base_member;
+  chai::ManagedSharedPtr<C> base_member;
+  unsigned long long content_A;
+
+  CHAI_HOST_DEVICE AAbsMem(void) : content_A(0xAAAAAAAAAAAAAAAAull) { printf("++ A has been constructed\n"); }
+
+  //template::value>::type >
+  template<typename Derived>
+  CHAI_HOST AAbsMem(Derived const& base_val)
+    //: base_member(chai::make_shared(base_val))
+    : base_member(chai::make_shared<Derived>(base_val))
+    , content_A(0xAAAAAAAAAAAAAAAAull)
+  { printf("++ A has been constructed\n"); }
+
+  CHAI_HOST_DEVICE virtual ~AAbsMem(void) { printf("-- A has been destructed\n"); }
+  CHAI_HOST_DEVICE virtual void function(void) const = 0;
+  CHAI_HOST_DEVICE virtual void d_function(void) const = 0;
+  CHAI_HOST_DEVICE virtual void set_content(unsigned long long) = 0;
+};
+
+class BAbsMem final : public AAbsMem
+{
+public:
+  unsigned long long content_B;
+
+  CHAI_HOST_DEVICE BAbsMem() : AAbsMem()
+  {
+    printf("++ B has been constructed\n");
+  }
+
+  template<typename Derived>
+  CHAI_HOST BAbsMem(Derived const& base_val)
+    : AAbsMem(base_val)
+    , content_B(0xBBBBBBBBBBBBBBBBull)
+  {
+    printf("++ B has been constructed\n");
+  }
+
+  CHAI_HOST_DEVICE ~BAbsMem(void) { printf("-- B has been destructed\n"); }
+  CHAI_HOST_DEVICE virtual void function(void) const override { printf("%llX\n", content_B); }
+  CHAI_HOST_DEVICE virtual void d_function(void) const override { base_member->function(); }
+  CHAI_HOST_DEVICE virtual void set_content(unsigned long long val) override { content_B = val; content_A = val; }
+};
+
+class NV
+{
+public:
+  unsigned long long content_NV;
+  CHAI_HOST_DEVICE NV(void) : content_NV(0xFFFFFFFFFFFFFFFFull) { printf("++ NV has been constructed\n"); }
+  CHAI_HOST_DEVICE ~NV(void) { printf("-- NV has been destructed\n"); }
+  CHAI_HOST_DEVICE void function(void) const { printf("%llX\n", content_NV); }
+};
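+
+// The tests below exercise nested ownership: constructing a BAbsMem from a D
+// goes through AAbsMem's converting constructor, which wraps the D in the
+// base_member ManagedSharedPtr, so d_function() dispatches through a
+// ManagedSharedPtr held inside another ManagedSharedPtr.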
+
+GPU_TEST(managed_shared_ptr, shared_ptr_absmem)
+{
+  {
+    using DerivedT = BAbsMem;
+    using BaseT = AAbsMem;
+
+    D d;
+    chai::ManagedSharedPtr<BaseT> sptr = chai::make_shared<DerivedT>(d);
+
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+    chai::ManagedSharedPtr<BaseT> sptr2 = sptr;
+    sptr2->function();
+    sptr2->d_function();
+
+    std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      sptr2->function();
+      sptr2->d_function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+    std::cout << "CPU CALL...\n";
+    forall(sequential(), 0, 1, [=] (int) {
+      printf("CPU Body\n");
+      sptr->set_content(0xFFFFFFFFFFFFFFFFull);
+      sptr2->function();
+      sptr2->d_function();
+    });
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      sptr2->function();
+      sptr2->d_function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+  }
+  std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+  assert_empty_sptr_map();
+}
+
+GPU_TEST(managed_shared_ptr, shared_ptr_const)
+{
+  {
+    using DerivedT = B;
+    using BaseT = A;
+
+    std::cout << "size of (DerivedT) : " << sizeof(DerivedT) << std::endl;
+    std::cout << "size of (BaseT) : " << sizeof(BaseT) << std::endl;
+
+    chai::ManagedSharedPtr<BaseT> sptr = chai::make_shared<DerivedT>();
+
+    chai::ManagedSharedPtr<const BaseT> sptr2 = sptr;
+
+    std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      sptr2->function();
+      sptr2->d_function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+    std::cout << "CPU CALL...\n";
+    forall(sequential(), 0, 1, [=] (int) {
+      printf("CPU Body\n");
+      sptr->set_content(0xFFFFFFFFFFFFFFFFull);
+      sptr2->function();
+      sptr2->d_function();
+    });
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      sptr2->function();
+      sptr2->d_function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+  }
+  assert_empty_sptr_map();
+  std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+}
+
+GPU_TEST(managed_shared_ptr, shared_ptr_nv)
+{
+  {
+    using DerivedT = NV;
+
+    chai::ManagedSharedPtr<DerivedT> sptr = chai::make_shared<DerivedT>();
+
+    chai::ManagedSharedPtr<DerivedT> sptr2 = sptr;
+
+    std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      sptr2->function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+    std::cout << "CPU CALL...\n";
+    forall(sequential(), 0, 1, [=] (int) {
+      printf("CPU Body\n");
+      sptr2->function();
+    });
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      sptr2->function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+  }
+  assert_empty_sptr_map();
+  std::cout << "Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+}
+
+
+GPU_TEST(managed_shared_ptr, shared_arr_shared_ptr_absmem)
+{
+  {
+    using DerivedT = BAbsMem;
+    using BaseT = AAbsMem;
+
+    using ElemT = chai::ManagedSharedPtr<BaseT>;
+    using Container = chai::ManagedArray<ElemT>;
+
+    std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+    std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl;
+
+    Container arr(1);
+    D d;
+    arr[0] = chai::make_shared<DerivedT>(d);
+    arr.registerTouch(chai::CPU);
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      arr[0]->function();
+      arr[0]->d_function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+    std::cout << "CPU CALL...\n";
+    forall(sequential(), 0, 1, [=] (int) {
+      printf("CPU Body\n");
+      arr[0]->set_content(0xFFFFFFFFFFFFFFFFull);
+      arr[0]->function();
+      arr[0]->d_function();
+    });
+
+    std::cout << "GPU CALL...\n";
+    forall(gpu(), 0, 1, [=] __device__ (int) {
+      printf("GPU Body\n");
+      arr[0]->function();
+      arr[0]->d_function();
+    });
+    GPU_ERROR_CHECK( PeekAtLastError() );
+    GPU_ERROR_CHECK( DeviceSynchronize() );
+
+    std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+    std::cout << "Arr Map Sz : " << chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl;
+
+    std::cout << "arr.free()\n";
+    arr.free();
+    std::cout << "End of scope\n";
+    assert_empty_array_map();
+
+  }
+  std::cout << "Sptr Map Sz : " << chai::SharedPtrManager::getInstance()->getPointerMap().size() << std::endl;
+  std::cout << "Arr Map Sz : " << 
chai::ArrayManager::getInstance()->getPointerMap().size() << std::endl; + assert_empty_sptr_map(); +} +
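+
+// A distilled sketch of the pattern exercised above (illustrative only,
+// not an additional test): construct on the host, then use the same
+// ManagedSharedPtr from device and host kernels.
+//
+//   chai::ManagedSharedPtr<A> sptr = chai::make_shared<B>();
+//   forall(gpu(), 0, 1, [=] __device__ (int) { sptr->function(); });
+//   forall(sequential(), 0, 1, [=] (int) { sptr->function(); });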