diff --git a/CMakeLists.txt b/CMakeLists.txt index ef340ea4f..d489c0474 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ else() message(STATUS "ADIOS2 is disabled") endif() +set(OMP_TGT OFF CACHE BOOL + "Enable OpenMP target offloading.") + set(CMAKE_Fortran_MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/include) add_subdirectory(src) add_subdirectory(tests) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 91d2bccec..d00264169 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,6 @@ set(SRC solver.f90 tdsops.f90 time_integrator.f90 - ordering.f90 mesh.f90 mesh_content.f90 field.f90 @@ -45,6 +44,9 @@ set(CUDASRC set(BACKENDSRC backend/omp/backend.f90 ) +set(OMPTGTSRC # For OMP offloading + backend/omp/target/allocator.f90 + ) set(2DECOMPFFTSRC backend/omp/poisson_fft.f90 decomp/decomp_2decompfft.f90 @@ -80,6 +82,11 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR list(APPEND BACKENDSRC backend/cuda/backend.f90) endif() +if(OMP_TGT) + list(APPEND SRC ${OMPTGTSRC}) + list(APPEND BACKENDSRC backend/omp/target/backend.f90) +endif() + # Decide whether 2decomp&fft is supported by the build find_package(decomp2d) @@ -103,8 +110,8 @@ target_include_directories(x3d2_backends INTERFACE ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(x3d2_backends PRIVATE x3d2) add_executable(xcompact xcompact.f90) -target_link_libraries(xcompact PRIVATE x3d2) target_link_libraries(xcompact PRIVATE x3d2_backends) +target_link_libraries(xcompact PRIVATE x3d2) # if CUDA compiler if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR @@ -122,11 +129,16 @@ elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU" OR set(CMAKE_Fortran_FLAGS "-cpp -std=f2018") set(CMAKE_Fortran_FLAGS_DEBUG "-g -Og -Wall -Wpedantic -Werror -Wimplicit-interface -Wimplicit-procedure -Wno-unused-dummy-argument") set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -ffast-math") + if (OMP_TGT) + # A bit of a hack - hardcoded for MI300A + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fopenmp 
--offload-arch=gfx942") + endif() elseif(${CMAKE_Fortran_COMPILER_ID} STREQUAL "Cray") set(CMAKE_Fortran_FLAGS "-eF -M878") # -M878 suppresses WARNING multiple module includes (not useful) set(CMAKE_Fortran_FLAGS_DEBUG "-G2 -O0") set(CMAKE_Fortran_FLAGS_RELEASE "-O3") target_link_options(x3d2 INTERFACE -h omp) + target_link_options(x3d2_backends INTERFACE -h omp) endif() if(WITH_2DECOMPFFT) diff --git a/src/allocator.f90 b/src/allocator.f90 index 698beda79..0b380541c 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -149,6 +149,7 @@ function get_block(self, dir, data_loc) result(handle) ! Apply bounds remapping based on requested direction call handle%set_shape(dims) + end function get_block subroutine release_block(self, handle) diff --git a/src/backend/omp/backend.f90 b/src/backend/omp/backend.f90 index 370108a5a..a2c2fab80 100644 --- a/src/backend/omp/backend.f90 +++ b/src/backend/omp/backend.f90 @@ -71,7 +71,7 @@ function init(mesh, allocator) result(backend) call backend%base_init() select type (allocator) - type is (allocator_t) + class is (allocator_t) ! class level access to the allocator backend%allocator => allocator end select diff --git a/src/backend/omp/target/allocator.f90 b/src/backend/omp/target/allocator.f90 new file mode 100644 index 000000000..f8f66cd45 --- /dev/null +++ b/src/backend/omp/target/allocator.f90 @@ -0,0 +1,179 @@ +!!! backends/omp/target/allocator.f90 +!! +!! 
Implements an allocator specialised to OMP target offloading + +module m_omptgt_allocator + + use iso_c_binding, only: c_ptr, c_f_pointer, c_sizeof + use omp_lib, only: omp_target_alloc, omp_target_free, omp_get_default_device + + use m_common, only: dp + + use m_allocator, only: allocator_t + use m_mesh, only: mesh_t + use m_field, only: field_t + + implicit none + + private + public :: omptgt_allocator_t + public :: omptgt_field_t + + type, extends(allocator_t) :: omptgt_allocator_t + contains + procedure :: create_block => create_block_omptgt + end type omptgt_allocator_t + + interface omptgt_allocator_t + module procedure omptgt_allocator_init + end interface omptgt_allocator_t + + type, extends(field_t) :: omptgt_field_t + ! A device-resident field + integer, private :: dev_id + type(c_ptr), private :: dev_ptr + real(dp), pointer, private :: p_data_tgt(:) => null() + real(dp), pointer, contiguous :: data_tgt(:, :, :) => null() + contains + procedure :: destroy => omptgt_field_destroy + procedure :: fill => fill_omptgt + procedure :: get_shape => get_shape_omptgt + procedure :: set_shape => set_shape_omptgt + end type omptgt_field_t + + interface omptgt_field_t + module procedure omptgt_field_init + end interface omptgt_field_t + +contains + + ! Constructor for the OMP target offload allocator + type(omptgt_allocator_t) function omptgt_allocator_init(dims, sz) result(a) + integer, intent(in) :: dims(3) + integer, intent(in) :: sz + + a%allocator_t = allocator_t(dims, sz) + end function omptgt_allocator_init + + ! 
Allocates a device-resident block + function create_block_omptgt(self, next) result(ptr) + class(omptgt_allocator_t), intent(inout) :: self + class(field_t), pointer, intent(in) :: next + type(omptgt_field_t), pointer :: newblock_tgt + class(field_t), pointer :: ptr + + self%next_id = self%next_id + 1 + allocate(newblock_tgt) + newblock_tgt = omptgt_field_t(self%ngrid, next, id=self%next_id) + ptr => newblock_tgt + + end function create_block_omptgt + + ! Constructs a device-resident field + type(omptgt_field_t) function omptgt_field_init(ngrid, next, id) result(f) + integer, intent(in) :: ngrid + class(field_t), pointer, intent(in) :: next + integer, intent(in) :: id + + f%refcount = 0 + f%next => next + f%id = id + + f%dev_id = omp_get_default_device() + f%dev_ptr = omp_target_alloc(ngrid * c_sizeof(0.0_dp), f%dev_id) + call c_f_pointer(f%dev_ptr, f%p_data_tgt, shape=[ngrid]) + + end function omptgt_field_init + + subroutine omptgt_field_destroy(self) + class(omptgt_field_t) :: self + + nullify(self%data_tgt) + nullify(self%p_data_tgt) + call omp_target_free(self%dev_ptr, self%dev_id) + end subroutine + + ! Deallocates device-resident memory before deallocating the base type + subroutine destroy(self) + class(omptgt_allocator_t) :: self + + class(field_t), pointer :: ptr + + ptr => self%first + do + if (.not. 
associated(ptr)) then + exit + end if + + select type(ptr) + type is(omptgt_field_t) + call ptr%destroy() + end select + + ptr => ptr%next + end do + + call self%allocator_t%destroy() + end subroutine + + subroutine fill_omptgt(self, c) + class(omptgt_field_t) :: self + real(dp), intent(in) :: c + + !call fill_omptgt_(self%p_data_tgt, c, size(self%p_data_tgt)) + call fill_omptgt_3d_(self%data_tgt, c) + + end subroutine fill_omptgt + + subroutine fill_omptgt_(p_data_tgt, c, n) + real(dp), dimension(:), intent(inout) :: p_data_tgt + real(dp), intent(in) :: c + integer, intent(in) :: n + + integer :: i + + !$omp target teams distribute parallel do has_device_addr(p_data_tgt) + do i = 1, n + p_data_tgt(i) = c + end do + !$omp end target teams distribute parallel do + + end subroutine + + subroutine fill_omptgt_3d_(data_tgt, c) + real(dp), dimension(:, :, :), intent(inout) :: data_tgt + real(dp), intent(in) :: c + + integer, dimension(3) :: n + integer :: i, j, k + + n = shape(data_tgt) + + !$omp target teams distribute parallel do collapse(3) has_device_addr(data_tgt) + do k = 1, n(3) + do j = 1, n(2) + do i = 1, n(1) + data_tgt(i, j, k) = c + end do + end do + end do + !$omp end target teams distribute parallel do + end subroutine + + function get_shape_omptgt(self) result(dims) + class(omptgt_field_t) :: self + integer :: dims(3) + + dims = shape(self%data_tgt) + end function + + subroutine set_shape_omptgt(self, dims) + class(omptgt_field_t) :: self + integer, intent(in) :: dims(3) + + call c_f_pointer(self%dev_ptr, self%data_tgt, shape=dims) + + end subroutine + +end module m_omptgt_allocator + diff --git a/src/backend/omp/target/backend.f90 b/src/backend/omp/target/backend.f90 new file mode 100644 index 000000000..15452dd80 --- /dev/null +++ b/src/backend/omp/target/backend.f90 @@ -0,0 +1,310 @@ +!!! src/backend/omp/target/backend.f90 +!! +!! OpenMP target offload backend implementation. +!! +!! 
Note this extends the CPU (host) OpenMP backend with the intention of being able to use fallback implementations where necessary. + +module m_omptgt_backend + + use m_common, only: dp, DIR_C, get_dirs_from_rdr + + use m_allocator, only: allocator_t + use m_mesh, only: mesh_t + use m_field, only: field_t + use m_ordering, only: get_index_reordering + + use m_omp_common, only: SZ + use m_omp_backend, only: omp_backend_t + + use m_omptgt_allocator, only: omptgt_field_t + + implicit none + + type, extends(omp_backend_t) :: omptgt_backend_t + contains + procedure :: copy_f_to_data => copy_f_to_data_omptgt + procedure :: copy_data_to_f => copy_data_to_f_omptgt + procedure :: reorder => reorder_omptgt + procedure :: vecadd => vecadd_omptgt + procedure :: veccopy => veccopy_omptgt + end type + + interface omptgt_backend_t + module procedure omptgt_backend_init + end interface + + private + public :: omptgt_backend_t + +contains + + type(omptgt_backend_t) function omptgt_backend_init(mesh, allocator) result(backend) + + type(mesh_t), target, intent(inout) :: mesh + class(allocator_t), target, intent(inout) :: allocator + + backend%omp_backend_t = omp_backend_t(mesh, allocator) + end function + + subroutine veccopy_omptgt(self, dst, src) + + class(omptgt_backend_t) :: self + class(field_t), intent(inout) :: dst + class(field_t), intent(in) :: src + + if (src%dir /= dst%dir) then + error stop "Called vector copy with incompatible fields" + end if + + select type(dst) + type is (omptgt_field_t) + select type(src) + type is (omptgt_field_t) + call veccopy_offload_(dst%data_tgt, src%data_tgt) + class default + error stop "Called omptgt vector copy with unsupported source vector" + end select + class default + error stop "Called omptgt vector copy with unsupported destination vector" + end select + end subroutine + + subroutine veccopy_offload_(dst, src) + + real(dp), dimension(:, :, :), intent(inout) :: dst + real(dp), dimension(:, :, :), intent(in) :: src + + integer, 
dimension(3) :: n + integer :: i, j, k + + n = shape(dst) + + !$omp target teams distribute parallel do collapse(3) has_device_addr(dst, src) + do k = 1, n(3) + do j = 1, n(2) + do i = 1, n(1) + dst(i, j, k) = src(i, j, k) + end do + end do + end do + !$omp end target teams distribute parallel do + + end subroutine + + subroutine vecadd_omptgt(self, a, x, b, y) + + class(omptgt_backend_t) :: self + real(dp), intent(in) :: a + class(field_t), intent(in) :: x + real(dp), intent(in) :: b + class(field_t), intent(inout) :: y + + if (x%dir /= y%dir) then + error stop "Called vector add with incompatible fields" + end if + + select type(x) + type is(omptgt_field_t) + select type(y) + type is (omptgt_field_t) + call vecadd_offload(self, a, x, b, y) + class default + error stop "Device/host fallback not yet implemented" + end select + class default + call self%omp_backend_t%vecadd(a, x, b, y) + end select + + end subroutine + + subroutine vecadd_offload(self, a, x, b, y) + + class(omptgt_backend_t) :: self + real(dp), intent(in) :: a + type(omptgt_field_t), intent(in) :: x + real(dp), intent(in) :: b + type(omptgt_field_t), intent(inout) :: y + + integer, dimension(3) :: dims + + dims = self%allocator%get_padded_dims(x%dir) + + call vecadd_offload_(dims, a, x%data_tgt, b, y%data_tgt) + + end subroutine + + subroutine vecadd_offload_(dims, a, x, b, y) + integer, dimension(3), intent(in) :: dims + real(dp), intent(in) :: a + real(dp), dimension(:, :, :), intent(in) :: x + real(dp), intent(in) :: b + real(dp), dimension(:, :, :), intent(inout) :: y + + integer :: i, j, k + + !$omp target teams distribute parallel do collapse(3) has_device_addr(x, y) + do k = 1, dims(3) + do j = 1, dims(2) + do i = 1, dims(1) + y(i, j, k) = a * x(i, j, k) + b * y(i, j, k) + end do + end do + end do + !$omp end target teams distribute parallel do + end subroutine + + subroutine copy_data_to_f_omptgt(self, f, data) + class(omptgt_backend_t), intent(inout) :: self + class(field_t), intent(inout) 
:: f + real(dp), dimension(:, :, :), intent(in) :: data + + integer, dimension(3) :: dims + + dims = self%allocator%get_padded_dims(f%dir) + + ! XXX: This could be improved following cuda/backend.f90:resolve_field_t() + select type(f) + type is(omptgt_field_t) + call copy_data_to_f_omptgt_(f%data_tgt, data, dims) + class default + error stop "Unsupported" + end select + + end subroutine copy_data_to_f_omptgt + + subroutine copy_data_to_f_omptgt_(f_arr, d, dims) + real(dp), dimension(:, :, :), intent(inout) :: f_arr + real(dp), dimension(:, :, :), intent(in) :: d + integer, dimension(3), intent(in) :: dims + + integer :: i, j, k + + ! XXX: This could be improved following cuda/backend.f90:resolve_field_t() + !$omp target teams loop collapse(3) map(to:d) has_device_addr(f_arr) + do k = 1, dims(3) + do j = 1, dims(2) + do i = 1, dims(1) + f_arr(i, j, k) = d(i, j, k) + end do + end do + end do + !$omp end target teams loop + + end subroutine + + subroutine copy_f_to_data_omptgt(self, data, f) + class(omptgt_backend_t), intent(inout) :: self + real(dp), dimension(:, :, :), intent(out) :: data + class(field_t), intent(in) :: f + + integer, dimension(3) :: dims + + dims = self%allocator%get_padded_dims(f%dir) + + select type(f) + type is(omptgt_field_t) + call copy_f_to_data_omptgt_(data, f%data_tgt, dims) + class default + error stop "Unsupported" + end select + + end subroutine copy_f_to_data_omptgt + + subroutine copy_f_to_data_omptgt_(data, f_arr, dims) + real(dp), dimension(:, :, :), intent(out) :: data + real(dp), dimension(:, :, :), intent(in) :: f_arr + integer, dimension(3), intent(in) :: dims + + integer :: i, j, k + + !$omp target teams distribute parallel do collapse(3) map(from:data) has_device_addr(f_arr) + do k = 1, dims(3) + do j = 1, dims(2) + do i = 1, dims(1) + data(i, j, k) = f_arr(i, j, k) + end do + end do + end do + !$omp end target teams distribute parallel do + + end subroutine + + subroutine reorder_omptgt(self, u_, u, direction) + 
class(omptgt_backend_t) :: self + class(field_t), intent(inout) :: u_ + class(field_t), intent(in) :: u + integer, intent(in) :: direction + integer, dimension(3) :: dims, cart_padded + integer :: dir_from, dir_to + + dims = self%allocator%get_padded_dims(u%dir) + cart_padded = self%allocator%get_padded_dims(DIR_C) + call get_dirs_from_rdr(dir_from, dir_to, direction) + + ! XXX: This could be improved following cuda/backend.f90:resolve_field_t() + select type(u_) + type is(omptgt_field_t) + select type(u) + type is(omptgt_field_t) + call reorder_omptgt_dd(u_%data_tgt, u%data_tgt, dims, dir_from, dir_to, cart_padded) + class default + call reorder_omptgt_dh(u_%data_tgt, u%data, dims, dir_from, dir_to, cart_padded) + end select + class default + error stop "Unsupported" + end select + + ! reorder keeps the data_loc the same + call u_%set_data_loc(u%data_loc) + + end subroutine reorder_omptgt + + subroutine reorder_omptgt_dd(u_, u, dims, dir_from, dir_to, cart_padded) + real(dp), dimension(:, :, :), pointer :: u_ + real(dp), dimension(:, :, :), pointer, intent(in) :: u + integer, dimension(3), intent(in) :: dims + integer, intent(in) :: dir_from, dir_to + integer, dimension(3), intent(in) :: cart_padded + + integer :: i, j, k + integer :: out_i, out_j, out_k + + !$omp target teams distribute parallel do private(out_i, out_j, out_k) collapse(3) has_device_addr(u_, u) + do k = 1, dims(3) + do j = 1, dims(2) + do i = 1, dims(1) + call get_index_reordering(out_i, out_j, out_k, i, j, k, & + dir_from, dir_to, SZ, cart_padded) + u_(out_i, out_j, out_k) = u(i, j, k) + end do + end do + end do + !$omp end target teams distribute parallel do + + end subroutine + + subroutine reorder_omptgt_dh(u_, u, dims, dir_from, dir_to, cart_padded) + real(dp), dimension(:, :, :), pointer :: u_ + real(dp), dimension(:, :, :), pointer, intent(in) :: u + integer, dimension(3), intent(in) :: dims + integer, intent(in) :: dir_from, dir_to + integer, dimension(3), intent(in) :: cart_padded + + 
integer :: i, j, k + integer :: out_i, out_j, out_k + + !$omp target teams distribute parallel do private(out_i, out_j, out_k) collapse(3) map(to:u) has_device_addr(u_) + do k = 1, dims(3) + do j = 1, dims(2) + do i = 1, dims(1) + call get_index_reordering(out_i, out_j, out_k, i, j, k, & + dir_from, dir_to, SZ, cart_padded) + u_(out_i, out_j, out_k) = u(i, j, k) + end do + end do + end do + !$omp end target teams distribute parallel do + + end subroutine + +end module diff --git a/src/io/checkpoint_manager.f90 b/src/io/checkpoint_manager.f90 index 9ae0751c9..0d85a14cc 100644 --- a/src/io/checkpoint_manager.f90 +++ b/src/io/checkpoint_manager.f90 @@ -1,20 +1,20 @@ module m_checkpoint_manager -! @brief Manages the creation and restoration of simulation checkpoints -!! for restart capabilities. -!! -!! @details This module is responsible for periodically saving the full, unstrided -!! simulation state to a file. This allows a simulation to be stopped and resumed -!! from the exact state it was in. -!! -!! Key features include: -!! - Reading all checkpoint settings from a configuration file -!! - Periodically writing the full-resolution simulation state -!! - Handling the full logic for restarting a simulation from -!! a specified checkpoint file. -!! - A safe-write strategy that writes to a temporary file first, -!! then atomically renames it to the final filename to -!! prevent corrupted checkpoints. -!! - Optional cleanup of old checkpoint files to conserve disk space. + ! @brief Manages the creation and restoration of simulation checkpoints + !! for restart capabilities. + !! + !! @details This module is responsible for periodically saving the full, unstrided + !! simulation state to a file. This allows a simulation to be stopped and resumed + !! from the exact state it was in. + !! + !! Key features include: + !! - Reading all checkpoint settings from a configuration file + !! - Periodically writing the full-resolution simulation state + !! 
- Handling the full logic for restarting a simulation from + !! a specified checkpoint file. + !! - A safe-write strategy that writes to a temporary file first, + !! then atomically renames it to the final filename to + !! prevent corrupted checkpoints. + !! - Optional cleanup of old checkpoint files to conserve disk space. use mpi, only: MPI_COMM_WORLD, MPI_Comm_rank, MPI_Abort use m_common, only: dp, i8, DIR_C, get_argument use m_field, only: field_t diff --git a/src/ordering.f90 b/src/ordering.f90 index 19be0a583..06520db85 100644 --- a/src/ordering.f90 +++ b/src/ordering.f90 @@ -12,6 +12,7 @@ module m_ordering pure subroutine get_index_ijk(i, j, k, dir_i, dir_j, dir_k, dir, & SZ, nx_padded, ny_padded, nz_padded) + !$omp declare target !! Get cartesian index from application storage directional one integer, intent(out) :: i, j, k ! cartesian indices integer, intent(in) :: dir_i, dir_j, dir_k ! application storage indices @@ -41,6 +42,7 @@ end subroutine get_index_ijk pure subroutine get_index_dir(dir_i, dir_j, dir_k, i, j, k, dir, & SZ, nx_padded, ny_padded, nz_padded) + !$omp declare target !! Get application storage directional index from cartesian index integer, intent(out) :: dir_i, dir_j, dir_k ! application storage indices integer, intent(in) :: i, j, k ! cartesian indices @@ -71,6 +73,7 @@ end subroutine get_index_dir pure subroutine get_index_reordering( & out_i, out_j, out_k, in_i, in_j, in_k, dir_from, dir_to, sz, cart_padded & ) + !$omp declare target !! Converts indices in between any two DIR_? integer, intent(out) :: out_i, out_j, out_k ! output indices integer, intent(in) :: in_i, in_j, in_k ! 
input indices diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bbd291cbd..e288f265d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -25,9 +25,18 @@ function(define_test testfile np backend) endif() find_package(OpenMP REQUIRED) target_link_libraries(${test_name} PRIVATE OpenMP::OpenMP_Fortran) + + if(${backend} STREQUAL omp_tgt) + # Note this is somewhat of a hack - hardcoded to build against MI300A + target_compile_options(${test_name} PRIVATE "-fopenmp") + target_compile_options(${test_name} PRIVATE "--offload-arch=gfx942") + target_compile_options(${test_name} PRIVATE "-DOMP_TGT") + target_link_options(${test_name} PRIVATE "-fopenmp") + target_link_options(${test_name} PRIVATE "--offload-arch=gfx942") + endif() endif() - target_link_libraries(${test_name} PRIVATE x3d2) target_link_libraries(${test_name} PRIVATE x3d2_backends) + target_link_libraries(${test_name} PRIVATE x3d2) if(WITH_2DECOMPFFT) find_package(decomp2d REQUIRED) @@ -78,6 +87,12 @@ endif() define_test(test_setget_field.f90 1 omp) +if(${OMP_TGT}) + define_test(test_vecadd.f90 1 omp_tgt) + define_test(test_time_integrator.f90 1 omp_tgt) + define_test(test_setget_field.f90 1 omp_tgt) +endif() + if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_Fortran_COMPILER_ID} STREQUAL "NVHPC") define_test(test_reordering.f90 1 cuda) diff --git a/tests/omptgt/test_omptgt_allocator.f90 b/tests/omptgt/test_omptgt_allocator.f90 new file mode 100644 index 000000000..0beab10a4 --- /dev/null +++ b/tests/omptgt/test_omptgt_allocator.f90 @@ -0,0 +1,95 @@ +program test_allocator_omptgt + use iso_fortran_env, only: stderr => error_unit + + use m_allocator, only: allocator_t, field_t + use m_common, only: dp, pi, DIR_X + use m_mesh, only: mesh_t + + use m_omptgt_allocator, only: omptgt_allocator_t + + implicit none + + logical :: allpass + integer, dimension(3) :: dims, nproc_dir + real(dp) :: L_global(3) + character(len=20) :: BC_x(2), BC_y(2), BC_z(2) + class(allocator_t), allocatable 
:: allocator + class(mesh_t), allocatable :: mesh + class(field_t), pointer :: ptr1, ptr2, ptr3 + integer, allocatable :: l(:) + integer :: ierr + + call MPI_Init(ierr) + + dims = [8, 8, 8] + nproc_dir = [1, 1, 1] + L_global = [2*pi, 2*pi, 2*pi] + + BC_x = ['periodic', 'periodic'] + BC_y = ['periodic', 'periodic'] + BC_z = ['periodic', 'periodic'] + + mesh = mesh_t(dims, nproc_dir, L_global, BC_x, BC_y, BC_z) + + allocator = omptgt_allocator_t(mesh, 8) + + allpass = .true. + + ! Get the list of ids for free blocks. By default there are none + ! and returned list is [0]. + l = allocator%get_block_ids() + if (size(l) /= 1 .or. l(1) /= 0) then + allpass = .false. + write (stderr, '(a)') 'Free list is initialised empty... failed' + else + write (stderr, '(a)') 'Free list is initialised empty... passed' + end if + + ! Request two blocks and release them in reverse order. List should + ! contain two free blocks. (1 -> 2) + ptr1 => allocator%get_block(DIR_X) + ptr2 => allocator%get_block(DIR_X) + call allocator%release_block(ptr2) + call allocator%release_block(ptr1) + + if (.not. all(allocator%get_block_ids() == [1, 2])) then + allpass = .false. + write (stderr, '(a)') 'Blocks are released correctly... failed' + else + write (stderr, '(a)') 'Blocks are released correctly... passed' + end if + + !! Destroy the free list and check that the list is empty again. + call allocator%destroy() + l = allocator%get_block_ids() + if (size(l) /= 1 .or. l(1) /= 0 .or. allocator%next_id /= 0) then + allpass = .false. + write (stderr, '(a)') 'Free list is correctly destroyed... failed' + else + write (stderr, '(a)') 'Free list is correctly destroyed... passed' + end if + + ! Request a block from a list of three. This should grab the first + ! block on top of the pile and reduce the free list to two blocks. 
+ ptr1 => allocator%get_block(DIR_X) + ptr2 => allocator%get_block(DIR_X) + ptr3 => allocator%get_block(DIR_X) + call allocator%release_block(ptr3) + call allocator%release_block(ptr2) + call allocator%release_block(ptr1) + ptr1 => allocator%get_block(DIR_X) + + if (.not. all(allocator%get_block_ids() == [2, 3])) then + allpass = .false. + write (stderr, '(a)') 'Block is correctly allocated... failed' + else + write (stderr, '(a)') 'Block is correctly allocated... passed' + end if + + ! TODO: Check that data is resident on device + ! TODO: Check that data is freed from device + + call allocator%destroy() + + call MPI_Finalize(ierr) +end program test_allocator_omptgt diff --git a/tests/test_setget_field.f90 b/tests/test_setget_field.f90 index c8ccdc599..3e6af9ab3 100644 --- a/tests/test_setget_field.f90 +++ b/tests/test_setget_field.f90 @@ -10,7 +10,12 @@ program test_setget_field use m_cuda_backend, only: cuda_backend_t use m_cuda_common, only: SZ #else +#ifndef OMP_TGT use m_omp_backend, only: omp_backend_t +#else + use m_omptgt_backend, only: omptgt_backend_t + use m_omptgt_allocator, only: omptgt_allocator_t +#endif use m_omp_common, only: SZ #endif use m_mesh, only: mesh_t @@ -23,8 +28,13 @@ program test_setget_field type(cuda_allocator_t), target :: cuda_allocator type(cuda_backend_t), target :: cuda_backend #else +#ifndef OMP_TGT type(allocator_t), target :: omp_allocator type(omp_backend_t), target :: omp_backend +#else + type(omptgt_allocator_t), target :: omptgt_allocator + type(omptgt_backend_t), target :: omptgt_backend +#endif #endif type(mesh_t) :: mesh @@ -32,41 +42,75 @@ program test_setget_field real(dp), dimension(:, :, :), allocatable :: arr integer, dimension(3) :: shape_c + integer :: irank integer :: ierr call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, irank, ierr) + + print *, "Initialised MPI" mesh = mesh_t([16, 32, 48], [1, 1, 1], [1.0_dp, 1.0_dp, 1.0_dp], & ["periodic", "periodic"], & ["periodic", "periodic"], & ["periodic", 
"periodic"]) + print *, "Initialised mesh" + #ifdef CUDA cuda_allocator = cuda_allocator_t(mesh%get_dims(VERT), SZ) allocator => cuda_allocator #else +#ifndef OMP_TGT omp_allocator = allocator_t(mesh%get_dims(VERT), SZ) allocator => omp_allocator omp_backend = omp_backend_t(mesh, allocator) backend => omp_backend +#else + omptgt_allocator = omptgt_allocator_t(mesh%get_dims(VERT), SZ) + allocator => omptgt_allocator + + omptgt_backend = omptgt_backend_t(mesh, allocator) + backend => omptgt_backend +#endif #endif + print *, "Initialised backend" + fld => backend%allocator%get_block(DIR_X, VERT) fld_c => backend%allocator%get_block(DIR_C, VERT) shape_c = fld_c%get_shape() + print *, shape_c allocate (arr(shape_c(1), shape_c(2), shape_c(3))) arr = 1.0_dp + + print *, "Initialised data" + call backend%set_field_data(fld, arr) + print *, "Set field data" + if (fld%data_loc /= VERT) then error stop "Field location was changed by set_field_data" end if + arr = 0.0_dp + call backend%get_field_data(arr, fld) + if (any(arr /= 1.0_dp)) then + error stop "Getting/setting field data failed" + end if + + print *, "Get field data" + deallocate (arr) call backend%allocator%release_block(fld) call backend%allocator%release_block(fld_c) + call MPI_Barrier(MPI_COMM_WORLD, ierr) call MPI_Finalize(ierr) + if (irank == 0) then + print *, "PASS" + end if end program test_setget_field diff --git a/tests/test_time_integrator.f90 b/tests/test_time_integrator.f90 index 5f1ef4ec7..8937ce0db 100644 --- a/tests/test_time_integrator.f90 +++ b/tests/test_time_integrator.f90 @@ -15,8 +15,13 @@ program test_omp_adamsbashforth use m_cuda_backend, only: cuda_backend_t use m_cuda_common, only: SZ #else - use m_omp_backend, only: omp_backend_t use m_omp_common, only: SZ +#ifndef OMP_TGT + use m_omp_backend, only: omp_backend_t +#else + use m_omptgt_backend, only: omptgt_backend_t + use m_omptgt_allocator, only: omptgt_allocator_t +#endif #endif implicit none @@ -48,8 +53,13 @@ program 
test_omp_adamsbashforth type(cuda_allocator_t), target :: cuda_allocator integer :: ndevs, devnum #else - type(omp_backend_t), target :: omp_backend +#ifndef OMP_TGT type(allocator_t), target :: omp_allocator + type(omp_backend_t), target :: omp_backend +#else + type(omptgt_allocator_t), target :: omptgt_allocator + type(omptgt_backend_t), target :: omptgt_backend +#endif #endif class(time_intg_t), allocatable :: time_integrator @@ -89,12 +99,22 @@ program test_omp_adamsbashforth backend => cuda_backend if (nrank == 0) print *, 'CUDA backend instantiated' #else +#ifndef OMP_TGT omp_allocator = allocator_t(mesh%get_dims(VERT), SZ) allocator => omp_allocator +#else + omptgt_allocator = omptgt_allocator_t(mesh%get_dims(VERT), SZ) + allocator => omptgt_allocator +#endif if (nrank == 0) print *, 'OpenMP allocator instantiated' +#ifndef OMP_TGT omp_backend = omp_backend_t(mesh, allocator) backend => omp_backend +#else + omptgt_backend = omptgt_backend_t(mesh, allocator) + backend => omptgt_backend +#endif if (nrank == 0) print *, 'OpenMP backend instantiated' #endif diff --git a/tests/test_vecadd.f90 b/tests/test_vecadd.f90 index df9f336ad..a2a094b67 100644 --- a/tests/test_vecadd.f90 +++ b/tests/test_vecadd.f90 @@ -16,7 +16,12 @@ program test_vecadd use m_cuda_backend, only: cuda_backend_t #else use m_omp_common, only: SZ +#ifndef OMP_TGT use m_omp_backend, only: omp_backend_t +#else + use m_omptgt_backend, only: omptgt_backend_t + use m_omptgt_allocator, only: omptgt_allocator_t +#endif #endif implicit none @@ -35,7 +40,12 @@ program test_vecadd #else type(allocator_t), target :: omp_allocator type(allocator_t), pointer :: host_allocator +#ifndef OMP_TGT type(omp_backend_t), target :: omp_backend +#else + type(omptgt_allocator_t), target :: omptgt_allocator + type(omptgt_backend_t), target :: omptgt_backend +#endif #endif class(field_t), pointer :: a => null() class(field_t), pointer :: b => null() @@ -63,21 +73,7 @@ program test_vecadd mesh = mesh_t(dims_global, 
nproc_dir, L_global, BC_x, BC_y, BC_z) -#ifdef CUDA - cuda_allocator = cuda_allocator_t(mesh%get_dims(VERT), SZ) - allocator => cuda_allocator - host_allocator = allocator_t(mesh%get_dims(VERT), SZ) - - cuda_backend = cuda_backend_t(mesh, allocator) - backend => cuda_backend -#else - omp_allocator = allocator_t(mesh%get_dims(VERT), SZ) - allocator => omp_allocator - host_allocator => omp_allocator - - omp_backend = omp_backend_t(mesh, allocator) - backend => omp_backend -#endif + call initialise_test() test_pass = .true. @@ -234,6 +230,32 @@ subroutine test_addab(d) end subroutine + subroutine initialise_test() + +#ifdef CUDA + cuda_allocator = cuda_allocator_t(mesh%get_dims(VERT), SZ) + allocator => cuda_allocator + host_allocator = allocator_t(mesh%get_dims(VERT), SZ) + + cuda_backend = cuda_backend_t(mesh, allocator) + backend => cuda_backend +#else + omp_allocator = allocator_t(mesh%get_dims(VERT), SZ) +#ifdef OMP_TGT + omptgt_allocator = omptgt_allocator_t(mesh%get_dims(VERT), SZ) + allocator => omptgt_allocator + omptgt_backend = omptgt_backend_t(mesh, allocator) + backend => omptgt_backend +#else + allocator => omp_allocator + omp_backend = omp_backend_t(mesh, allocator) + backend => omp_backend +#endif + host_allocator => omp_allocator +#endif + + end subroutine + subroutine initialise_data(a, b, c, z, d) integer, intent(in) :: d