
Commit a836408

Merge branch 'master' into refactor
2 parents: 7f0c19d + f8830e5

20 files changed: +553 additions, -41 deletions

20 files changed

+553
-41
lines changed

CMakeLists.txt

Lines changed: 25 additions & 3 deletions
@@ -20,6 +20,7 @@ option(MFC_MPI "Build with MPI" ON
 option(MFC_OpenACC "Build with OpenACC" OFF)
 option(MFC_GCov "Build with GCov" OFF)
 option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF)
+option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF)
 option(MFC_PRE_PROCESS "Build pre_process" OFF)
 option(MFC_SIMULATION "Build simulation" OFF)
 option(MFC_POST_PROCESS "Build post_process" OFF)

@@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET)
             "-foffload=amdgcn-amdhsa='-march=gfx90a'"
             "-foffload-options=-lgfortran\ -lm"
             "-fno-exceptions")
+        if (MFC_Fastmath)
+            message(WARNING "--fastmath has no effect with the GNU compiler")
+        endif()
     elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
         foreach (cc ${MFC_CUDA_CC})
             target_compile_options(${a_target}

@@ -498,14 +502,20 @@ function(MFC_SETUP_TARGET)
                 PRIVATE -gpu=keep,ptxinfo,lineinfo
             )

+        if (MFC_Fastmath)
+            target_compile_options(${a_target}
+                PRIVATE -gpu=fastmath
+            )
+        endif()
+
         # GH-200 Unified Memory Support
         if (MFC_Unified)
             target_compile_options(${ARGS_TARGET}
-                PRIVATE -gpu=unified
+                PRIVATE -gpu=mem:unified:managedalloc -cuda
             )
             # "This option must appear in both the compile and link lines" -- NVHPC Docs
             target_link_options(${ARGS_TARGET}
-                PRIVATE -gpu=unified
+                PRIVATE -gpu=mem:unified:managedalloc -cuda
             )
         endif()

@@ -521,16 +531,28 @@ function(MFC_SETUP_TARGET)
                 PRIVATE -DFRONTIER_UNIFIED)
         endif()

+        if (MFC_Fastmath)
+            message(WARNING "--fastmath has no effect with the CCE")
+        endif()
+
         find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
         target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
     endif()
 elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
     target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
+    if (MFC_Fastmath)
+        message(WARNING "--fastmath has no effect with the CCE")
+    endif()
 endif()

 if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
     find_package(CUDAToolkit REQUIRED)
-    target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+    if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
+        target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+    else() # CUDA >= 12.9
+        target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx)
+        target_link_options(${a_target} PRIVATE "-cudalib=nvtx3")
+    endif()
 endif()
 endforeach()

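The new MFC_Fastmath option only changes code generation on NVHPC, where it appends -gpu=fastmath; the GNU and Cray branches emit a warning and otherwise ignore it. A minimal configure sketch using only options defined in this file (a plain CMake invocation is an assumption here; MFC's own build wrapper normally drives the configure step):

    # Configure an NVHPC OpenACC build with fast-math GPU kernels
    cmake -S . -B build -DMFC_SIMULATION=ON -DMFC_OpenACC=ON -DMFC_Fastmath=ON
    cmake --build build -j
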
Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+import math
+import json
+
+N = 799
+Nx = N
+Ny = 2 * (N + 1) - 1
+Nz = 2 * (N + 1) - 1
+
+Re = 1600
+L = 1
+P0 = 101325
+rho0 = 1
+C0 = math.sqrt(1.4 * P0)
+V0 = 0.1 * C0
+mu = V0 * L / Re
+
+cfl = 0.5
+dx = 2 * math.pi * L / (Ny + 1)
+
+dt = cfl * dx / (C0)
+
+tC = L / V0
+tEnd = 20 * tC
+
+Nt = int(tEnd / dt)
+Nt = 10
+
+
+# Configuring case dictionary
+print(
+    json.dumps(
+        {
+            "rdma_mpi": "T",
+            # Logistics
+            "run_time_info": "F",
+            # Computational Domain Parameters
+            "x_domain%beg": -math.pi * L,
+            "x_domain%end": math.pi * L,
+            "y_domain%beg": -math.pi * L,
+            "y_domain%end": math.pi * L,
+            "z_domain%beg": -math.pi * L,
+            "z_domain%end": math.pi * L,
+            "m": Nx,
+            "n": Ny,
+            "p": Nz,
+            "cyl_coord": "F",
+            "dt": dt,
+            "t_step_start": 0,
+            "t_step_stop": 10,  # Nt,
+            "t_step_save": 10,  # int(Nt / 100),
+            # Simulation Algorithm Parameters
+            "num_patches": 1,
+            "model_eqns": 2,
+            "num_fluids": 1,
+            "time_stepper": 3,
+            "bc_x%beg": -1,
+            "bc_x%end": -1,
+            "bc_y%beg": -1,
+            "bc_y%end": -1,
+            "bc_z%beg": -1,
+            "bc_z%end": -1,
+            "igr": "T",
+            "igr_order": 5,
+            "igr_iter_solver": 1,
+            "num_igr_iters": 3,
+            "num_igr_warm_start_iters": 3,
+            "alf_factor": 10,
+            "viscous": "T",
+            # Formatted Database Files Structure Parameters
+            "format": 1,
+            "precision": 2,
+            "prim_vars_wrt": "T",
+            "omega_wrt(1)": "T",
+            "omega_wrt(2)": "T",
+            "omega_wrt(3)": "T",
+            "qm_wrt": "T",
+            "fd_order": 4,
+            "parallel_io": "T",
+            # Patch 1: Background (AIR - 2)
+            "patch_icpp(1)%geometry": 9,
+            "patch_icpp(1)%x_centroid": 0,
+            "patch_icpp(1)%y_centroid": 0,
+            "patch_icpp(1)%z_centroid": 0,
+            "patch_icpp(1)%length_x": 2 * math.pi * L,
+            "patch_icpp(1)%length_y": 2 * math.pi * L,
+            "patch_icpp(1)%length_z": 2 * math.pi * L,
+            "patch_icpp(1)%vel(1)": 0.0,
+            "patch_icpp(1)%vel(2)": 0.0,
+            "patch_icpp(1)%vel(3)": 0,
+            "patch_icpp(1)%pres": 0.0,
+            "patch_icpp(1)%hcid": 380,
+            "patch_icpp(1)%alpha_rho(1)": 1,
+            "patch_icpp(1)%alpha(1)": 1,
+            # Fluids Physical Parameters
+            "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
+            "fluid_pp(1)%pi_inf": 0,
+            "fluid_pp(1)%Re(1)": 1 / mu,
+            # NVIDIA UVM Options
+            "nv_uvm_out_of_core": "T",
+            "nv_uvm_igr_temps_on_gpu": 3,
+            "nv_uvm_pref_gpu": "T",
+        }
+    )
+)

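A quick check of the derived constants above (rounded):

    Ny + 1 = 1600
    C0   = sqrt(1.4 * 101325) ≈ 376.6
    dx   = 2*pi*L / 1600      ≈ 3.93e-3
    dt   = 0.5 * dx / C0      ≈ 5.2e-6
    tEnd = 20 * L / V0        ≈ 0.53  (≈ 1e5 steps of size dt, before Nt is overridden to 10)

With m = 799 and n = p = 1599, the grid is 800 x 1600 x 1600 cells, and every boundary is periodic (bc = -1). The Re = 1600, Mach-0.1, [-pi, pi]^3 setup reads like a Taylor-Green-vortex-style benchmark, though that identification is an inference: the initial condition itself sits behind the hardcoded patch "hcid": 380.
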
misc/nvidia_uvm/bind.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# -------------------------------- #
+# Binding for a single Santis node #
+# -------------------------------- #
+
+# Local rank
+export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"
+
+# Bind to GPU
+export CUDA_VISIBLE_DEVICES="$local_rank"
+
+# Bind to NIC
+export MPICH_OFI_NIC_POLICY=USER
+export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"
+
+# Bind to cores (all cores per socket)
+physcores=(0-71 72-143 144-215 216-287)
+
+#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
+
+#set -x
+numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
+#set +x

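Judging from the core ranges (4 sockets x 72 cores) and the one-to-one rank-to-GPU and rank-to-NIC maps, the script expects exactly four ranks per node, one per GH200 module. A launch sketch (the binary path and Slurm flags are illustrative, not part of this commit):

    # One node, 4 MPI ranks, each pinned by the binding script
    srun -N 1 -n 4 --ntasks-per-node=4 ./misc/nvidia_uvm/bind.sh ./simulation

Because CUDA_VISIBLE_DEVICES is narrowed to a single physical GPU, that GPU enumerates as CUDA device 0 inside each rank; the PREFER_GPU macro in src/common/include/macros.fpp relies on exactly that.
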
misc/nvidia_uvm/nsys.sh

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+#set -x
+set -euo pipefail
+
+rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"
+
+[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
+[[ -z "${NSYS+x}" ]] && NSYS=0
+
+if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
+    exec nsys profile \
+        --cpuctxsw=none -b none -s none \
+        --event-sample=system-wide \
+        --cpu-socket-events=61,71,265,273 \
+        --cpu-socket-metrics=103,104 \
+        --event-sampling-interval=10 \
+        --trace=nvtx,cuda,openacc \
+        --force-overwrite=true \
+        -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
+        -o "$NSYS_FILE" "$@"
+else
+    exec "$@"
+fi

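Only global rank 0 runs under Nsight Systems; every other rank execs the application directly, so each job yields a single report (report.qdrep by default, overridable through NSYS_FILE). One possible composition with the binding wrapper, with ordering and paths as assumptions:

    # Profile rank 0 only; bind.sh still applies GPU/NIC/core pinning
    NSYS=1 NSYS_FILE=tgv.qdrep srun -N 1 -n 4 ./misc/nvidia_uvm/nsys.sh ./misc/nvidia_uvm/bind.sh ./simulation
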
src/common/include/macros.fpp

Lines changed: 49 additions & 0 deletions
@@ -12,6 +12,55 @@
 #endif
 #:enddef

+! Caution:
+! This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI rank.
+! That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0.
+! For an example see misc/nvidia_uvm/bind.sh.
+#:def PREFER_GPU(*args)
+#ifdef MFC_SIMULATION
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    block
+        ! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly.
+        ! The module now includes, or “uses”, 3 submodules: cuda_runtime_api, gpu_reductions, and sort.
+        ! The cudafor functionality has not changed. But for new users, or users who have needed to
+        ! work around name conflicts in the module, it may be better to use cuda_runtime_api to expose
+        ! interfaces to the CUDA runtime calls described in Chapter 4 of this guide.
+        ! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
+#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
+        use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
+#else
+        use cuda_runtime_api
+#endif
+        integer :: istat
+
+        if (nv_uvm_pref_gpu) then
+            #:for arg in args
+                !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
+                ! Set preferred location GPU
+                istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0)
+                if (istat /= cudaSuccess) then
+                    write (*, "('Error code: ',I0, ': ')") istat
+                    !write(*,*) cudaGetErrorString(istat)
+                end if
+                ! Set accessed-by CPU
+                istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)
+                if (istat /= cudaSuccess) then
+                    write (*, "('Error code: ',I0, ': ')") istat
+                    !write(*,*) cudaGetErrorString(istat)
+                end if
+                ! Prefetch to GPU - physically populate memory pages
+                istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0)
+                if (istat /= cudaSuccess) then
+                    write (*, "('Error code: ',I0, ': ')") istat
+                    !write(*,*) cudaGetErrorString(istat)
+                end if
+            #:endfor
+        end if
+    end block
+#endif
+#endif
+#:enddef
+
 #:def ALLOCATE(*args)
     @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
     #:set allocated_variables = ', '.join(args)

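For each argument, PREFER_GPU issues three CUDA runtime hints: set the preferred location to device 0, mark the array as accessed-by the CPU (so stray host touches map the pages rather than migrate them back), and asynchronously prefetch so the pages are physically resident on the GPU. Hard-coding device 0 is safe only under the one-visible-GPU-per-rank binding; a quick way to verify that precondition (a sketch, assuming a 4-rank allocation):

    # Each rank should report a single, distinct GPU index
    srun -n 4 ./misc/nvidia_uvm/bind.sh bash -c 'echo "rank $local_rank -> CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"'
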
src/common/m_mpi_common.fpp

Lines changed: 8 additions & 0 deletions
@@ -38,7 +38,9 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.

+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     $:GPU_DECLARE(create='[buff_send, buff_recv]')
+#endif

     integer :: halo_size
     $:GPU_DECLARE(create='[halo_size]')

@@ -78,7 +80,13 @@ contains

         $:GPU_UPDATE(device='[halo_size, v_size]')

+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+#else
+        allocate (buff_send(0:halo_size), buff_recv(0:halo_size))
+        $:GPU_ENTER_DATA(create='[capture:buff_send]')
+        $:GPU_ENTER_DATA(create='[capture:buff_recv]')
+#endif
 #endif

     end subroutine s_initialize_mpi_common_module

src/simulation/m_checker.fpp

Lines changed: 10 additions & 0 deletions
@@ -30,6 +30,7 @@ contains

         if (igr) then
             call s_check_inputs_igr
+            call s_check_inputs_nvidia_uvm
         else
             if (recon_type == WENO_TYPE) then
                 call s_check_inputs_weno

@@ -411,4 +412,13 @@ contains
         @:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled")
     end subroutine s_check_inputs_mhd

+    impure subroutine s_check_inputs_nvidia_uvm
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        @:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, &
+            "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]")
+        @:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, &
+            "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2")
+#endif
+    end subroutine s_check_inputs_nvidia_uvm
+
 end module m_checker

src/simulation/m_global_parameters.fpp

Lines changed: 24 additions & 0 deletions
@@ -157,6 +157,16 @@ module m_global_parameters
     logical :: viscous !< Viscous effects
 #:endif

+    !> @name Variables for out-of-core IGR computation on NVIDIA
+    !> @{
+    logical :: nv_uvm_out_of_core      ! Enable out-of-core storage of q_cons_ts(2) in time stepping (default FALSE)
+    integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
+                                       ! 1 => jac on GPU; jac_rhs and jac_old on CPU
+                                       ! 2 => jac and jac_rhs on GPU; jac_old on CPU
+                                       ! 3 => jac, jac_rhs, and jac_old on GPU (default)
+    logical :: nv_uvm_pref_gpu         ! Enable explicit GPU memory hints (default FALSE)
+    !> @}
+
     real(wp) :: weno_eps !< Binding for the WENO nonlinear weights
     real(wp) :: teno_CT !< Smoothness threshold for TENO
     logical :: mp_weno !< Monotonicity preserving (MP) WENO

@@ -573,6 +583,11 @@ contains
         t_stop = dflt_real
         t_save = dflt_real

+        ! NVIDIA UVM options
+        nv_uvm_out_of_core = .false.
+        nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
+        nv_uvm_pref_gpu = .false.
+
         ! Simulation algorithm parameters
         model_eqns = dflt_int
         mpp_lim = .false.

@@ -1321,16 +1336,25 @@ contains
         @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
         @:ALLOCATE(x_cc(-buff_size:m + buff_size))
         @:ALLOCATE(dx(-buff_size:m + buff_size))
+        @:PREFER_GPU(x_cb)
+        @:PREFER_GPU(x_cc)
+        @:PREFER_GPU(dx)

         if (n == 0) return;
         @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
         @:ALLOCATE(y_cc(-buff_size:n + buff_size))
         @:ALLOCATE(dy(-buff_size:n + buff_size))
+        @:PREFER_GPU(y_cb)
+        @:PREFER_GPU(y_cc)
+        @:PREFER_GPU(dy)

         if (p == 0) return;
         @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
         @:ALLOCATE(z_cc(-buff_size:p + buff_size))
         @:ALLOCATE(dz(-buff_size:p + buff_size))
+        @:PREFER_GPU(z_cb)
+        @:PREFER_GPU(z_cc)
+        @:PREFER_GPU(dz)

     end subroutine s_initialize_global_parameters_module