Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7ad04f5
ev_primary to GPU
astro-YYH Oct 15, 2024
859db38
tw->reduce to grav_short_reduce_device
astro-YYH Oct 15, 2024
bd086bd
tw->visit to force_treeev_shortrange_device
astro-YYH Oct 16, 2024
f7f50ff
ev_primary interactions count
astro-YYH Oct 16, 2024
43c4534
ForceTree cuda managed
astro-YYH Oct 18, 2024
822159b
test
astro-YYH Oct 18, 2024
080ccbc
mem alloc free fix
astro-YYH Oct 18, 2024
814e096
clean message
astro-YYH Oct 18, 2024
0857031
initialize tree = {0} is necessary
astro-YYH Oct 18, 2024
60cb3e3
initialization fix tree
astro-YYH Oct 18, 2024
fc0b262
gravity softening fix
astro-YYH Oct 19, 2024
b832437
short range table fix device
astro-YYH Oct 21, 2024
7e3958a
test
astro-YYH Oct 21, 2024
fa6980e
runs but incorrect matter power
astro-YYH Oct 22, 2024
e087272
clean up
astro-YYH Oct 23, 2024
148c299
cudaMallocManaged TreeParams
astro-YYH Oct 24, 2024
18632a9
const argu
astro-YYH Oct 24, 2024
fd4d8e9
test kernel runs
astro-YYH Oct 24, 2024
97dcc26
treewalk cpu flag
astro-YYH Oct 26, 2024
614b44e
separate accn by children for accuracy
astro-YYH Oct 27, 2024
eaf6d6b
clean up
astro-YYH Oct 27, 2024
7fe1f6f
fof still uses cpu
astro-YYH Oct 27, 2024
6602817
treewalk secondary
astro-YYH Oct 29, 2024
2987306
Guard OPENMP flag with CUDACC
sbird Feb 6, 2025
44c0335
Restore const flags
sbird Feb 6, 2025
d7803ca
Add some defaults for CUDA compile flags
sbird Feb 6, 2025
1cc6663
Label strings should be static strings, not heap memory
sbird Feb 6, 2025
5bb4284
Merge pull request #3 from MP-Gadget/treewalk_cuda
astro-YYH Feb 12, 2025
d58586e
Merge branch 'master' into treewalk_cuda
sbird Feb 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
*.a
*.png
*.o
*.d
.*.swp
.kdev4/
.gdb_history
Expand Down
14 changes: 11 additions & 3 deletions Makefile.rules
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
AR ?= ar
MPICC ?= mpic++
LOW_PRECISION ?= double
NVCC ?= nvcc
NVOPTIMIZE ?= -O3 -arch=native

OPTIMIZE ?= -O2 -g -fopenmp -Wall
#Explicitly set C++ standard
Expand All @@ -17,6 +19,10 @@ ifneq ($(findstring -DUSE_CFITSIO, $(OPT)),)
FITSIO_INCL ?= $(shell pkg-config --cflags cfitsio)
FITSIO_LIBS ?= $(shell pkg-config --libs cfitsio)
endif
ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
CUDA_INCL ?= $(shell pkg-config --cflags cuda)
CUDA_LIBS ?= $(shell pkg-config --libs cuda)
endif

ifneq ($(findstring -DUSE_CUDA, $(OPT)),)
CUDA_INCL ?=
Expand Down Expand Up @@ -46,6 +52,8 @@ V ?= 0
mkdir -p `dirname $@`; \
echo Compiling $<; $$cmd

# Rule to compile .cu files (using nvcc)
.objs/%.o: %.cu
$(NVCC) $(NVOPTIMIZE) -c $< -o $@
.objs/%.o: %.cu Makefile $(CONFIG)
@cmd="$(NVCC) -MMD -c -o $@ -I../depends/include $<"; \
if test "x$(V)" = "x1" ; then echo $$cmd; fi; \
mkdir -p `dirname $@`; \
echo Compiling $<; $$cmd
1 change: 1 addition & 0 deletions Options.mk.example
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ OPTIMIZE = -fopenmp -O3 -g -Wall -ffast-math
#OPT += -DDEBUG # print a lot of debugging messages
#Disable openmp locking. This means no threading.
#OPT += -DNO_OPENMP_SPINLOCK
#OPT += -DTREE_CPU #ev_primary will run on CPU
#OPT += -DUSE_CUDA #Enable GPU-specific CUDA code
#-----------
#OPT += -DEXCUR_REION # reionization with excursion set
Expand Down
2 changes: 1 addition & 1 deletion libgadget/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ GADGET_OBJS = \
timestep.o init.o checkpoint.o \
sfr_eff.o cooling.o cooling_rates.o cooling_uvfluc.o cooling_qso_lightup.o \
winds.o veldisp.o density.o metal_return.o \
treewalk.o cosmology.o \
treewalk.o treewalk_kernel.o cosmology.o \
gravshort-tree.o gravshort-pair.o hydra.o timefac.o \
gravpm.o powerspectrum.o \
forcetree.o \
Expand Down
2 changes: 1 addition & 1 deletion libgadget/density.c
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ density(const ActiveParticles * act, int update_hsml, int DoEgyDensity, int Blac
tw->query_type_elsize = sizeof(TreeWalkQueryDensity);
tw->result_type_elsize = sizeof(TreeWalkResultDensity);
tw->priv = priv;
tw->tree = tree;
tw->tree = (ForceTree *) tree;

DENSITY_GET_PRIV(tw)->Left = (MyFloat *) mymalloc("DENS_PRIV->Left", PartManager->NumPart * sizeof(MyFloat));
DENSITY_GET_PRIV(tw)->Right = (MyFloat *) mymalloc("DENS_PRIV->Right", PartManager->NumPart * sizeof(MyFloat));
Expand Down
13 changes: 12 additions & 1 deletion libgadget/forcetree.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "utils/endrun.h"
#include "utils/system.h"
#include "utils/mymalloc.h"
#include "cuda_runtime.h"

/*! \file forcetree.c
* \brief gravitational tree
Expand Down Expand Up @@ -125,6 +126,7 @@ force_tree_full(ForceTree * tree, DomainDecomp * ddecomp, const int HybridNuTrac
mask = GASMASK + DMMASK + STARMASK + BHMASK;

/*No father array by default, only need it for hmax. We want moments.*/
message(0, "Tree construction for all types test.\n");
*tree = force_tree_build(mask, ddecomp, &act, 1, 1, EmergencyOutputDir);
/* This is all particles (even if there are neutrinos)*/
tree->full_particle_tree_flag = 1;
Expand Down Expand Up @@ -1410,5 +1412,14 @@ void force_tree_free(ForceTree * tree)
myfree(tree->Father);
/* Zero everything, especially the allocation flag*/
memset(tree, 0, sizeof(ForceTree));
tree->tree_allocated_flag = 0;
tree->tree_allocated_flag = 0;
// cudaFree(tree);

// Check if the memory was allocated with cudaMallocManaged
cudaPointerAttributes attributes;
cudaPointerGetAttributes(&attributes, tree);

// If tree was allocated with cudaMallocManaged, free it with cudaFree
if (attributes.type == cudaMemoryTypeManaged)
cudaFree(tree);
}
12 changes: 9 additions & 3 deletions libgadget/gravpm.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "cosmology.h"
#include "neutrinos_lra.h"
#include "cuda_runtime.h"

static int pm_mark_region_for_node(int startno, int rid, int * RegionInd, const ForceTree * tt);
static void convert_node_to_region(PetaPM * pm, PetaPMRegion * r, struct NODE * Nodes);
Expand Down Expand Up @@ -92,9 +93,14 @@ gravpm_force(PetaPM * pm, DomainDecomp * ddecomp, Cosmology * CP, double Time, d
}

/* Tree freed in PM*/
ForceTree Tree = {0};
// ForceTree Tree = {0};
ForceTree * Tree_ptr;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this stack-allocated again, but pass it by value (and probably const).

cudaMallocManaged(&Tree_ptr, sizeof(ForceTree));
memset(Tree_ptr, 0, sizeof(ForceTree));
message(0, "Tree allocated by cudaMallocManaged test.\n");
/* We include neutrinos in this tree unconditionally, so all particles have a Region.*/
force_tree_full(&Tree, ddecomp, 0, PowerOutputDir);
force_tree_full(Tree_ptr, ddecomp, 0, PowerOutputDir);
message(0, "Tree constructed test.\n");

/* Set up parameters*/
GravPM.Time = Time;
Expand All @@ -106,7 +112,7 @@ gravpm_force(PetaPM * pm, DomainDecomp * ddecomp, Cosmology * CP, double Time, d
* Therefore the force transfer functions are based on the potential,
* not the density.
* */
petapm_force(pm, _prepare, &global_functions, functions, &pstruct, &Tree);
petapm_force(pm, _prepare, &global_functions, functions, &pstruct, Tree_ptr);
powerspectrum_sum(pm->ps);
/*Now save the power spectrum*/
powerspectrum_save(pm->ps, PowerOutputDir, "powerspectrum", Time, GrowthFactor(CP, Time, 1.0));
Expand Down
41 changes: 29 additions & 12 deletions libgadget/gravshort-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "timestep.h"
#include "gravshort.h"
#include "walltime.h"
#include "cuda_runtime.h"

/*! \file gravtree.c
* \brief main driver routines for gravitational (short-range) force computation
Expand Down Expand Up @@ -95,17 +96,23 @@ force_treeev_shortrange(TreeWalkQueryGravShort * input,
void
grav_short_tree(const ActiveParticles * act, PetaPM * pm, ForceTree * tree, MyFloat (* AccelStore)[3], double rho0, inttime_t Ti_Current)
{
TreeWalk tw[1] = {{0}};
struct GravShortPriv priv;
priv.cellsize = tree->BoxSize / pm->Nmesh;
priv.Rcut = TreeParams.Rcut * pm->Asmth * priv.cellsize;;
priv.G = pm->G;
priv.cbrtrho0 = pow(rho0, 1.0 / 3);
priv.Ti_Current = Ti_Current;
priv.Accel = AccelStore;
TreeWalk *tw;
cudaMallocManaged(&tw, sizeof(TreeWalk)); // Allocate TreeWalk structure with Unified Memory
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one also probably stack-allocated, passed by value.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But let me take care of this after it is merged.

memset(tw, 0, sizeof(TreeWalk)); // Zero-initialize the structure
struct GravShortPriv *priv_ptr;
cudaMallocManaged(&priv_ptr, sizeof(struct GravShortPriv));

// Initialize priv_ptr as usual
priv_ptr->cellsize = tree->BoxSize / pm->Nmesh;
priv_ptr->Rcut = TreeParams.Rcut * pm->Asmth * priv_ptr->cellsize;
priv_ptr->G = pm->G;
priv_ptr->cbrtrho0 = pow(rho0, 1.0 / 3);
priv_ptr->Ti_Current = Ti_Current;
priv_ptr->Accel = AccelStore;

int accelstorealloc = 0;
if(!AccelStore) {
priv.Accel = (MyFloat (*) [3]) mymalloc2("GravAccel", PartManager->NumPart * sizeof(priv.Accel[0]));
priv_ptr->Accel = (MyFloat (*) [3]) mymalloc2("GravAccel", PartManager->NumPart * sizeof(priv_ptr->Accel[0]));
accelstorealloc = 1;
}

Expand All @@ -123,9 +130,15 @@ grav_short_tree(const ActiveParticles * act, PetaPM * pm, ForceTree * tree, MyFl
tw->result_type_elsize = sizeof(TreeWalkResultGravShort);
tw->fill = (TreeWalkFillQueryFunction) grav_short_copy;
tw->tree = tree;
tw->priv = &priv;
tw->priv = priv_ptr;
message(0, "gravity_short_tree: tree structure initialized.\n");
struct gravshort_tree_params *TreeParams_ptr;
cudaMallocManaged(&TreeParams_ptr, sizeof(struct gravshort_tree_params));
*TreeParams_ptr = TreeParams;

treewalk_run(tw, act->ActiveParticle, act->NumActiveParticle);
treewalk_run(tw, act->ActiveParticle, act->NumActiveParticle, TreeParams_ptr);
/* Free the memory */
cudaFree(TreeParams_ptr);

/* Now the force computation is finished */
/* gather some diagnostic information */
Expand All @@ -148,8 +161,12 @@ grav_short_tree(const ActiveParticles * act, PetaPM * pm, ForceTree * tree, MyFl
* avoiding the fully open O(N^2) case.*/
if(TreeParams.TreeUseBH > 1)
TreeParams.TreeUseBH = 0;

if(accelstorealloc)
myfree(priv.Accel);
myfree(priv_ptr->Accel);

cudaFree(priv_ptr);
cudaFree(tw);
}

/* Add the acceleration from a node or particle to the output structure,
Expand Down
5 changes: 5 additions & 0 deletions libgadget/gravshort.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ typedef struct {
MyFloat Potential;
} TreeWalkResultGravShort;

typedef struct {
MyFloat Acc[3];
MyFloat Potential;
} TreeWalkResultChildren;

struct GravShortPriv {
/* Size of a PM cell, in internal units. Box / Nmesh */
double cellsize;
Expand Down
2 changes: 1 addition & 1 deletion libgadget/hydra.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ hydro_force(const ActiveParticles * act, const double atime, struct sph_pred_dat
tw->postprocess = (TreeWalkProcessFunction) hydro_postprocess;
tw->query_type_elsize = sizeof(TreeWalkQueryHydro);
tw->result_type_elsize = sizeof(TreeWalkResultHydro);
tw->tree = tree;
tw->tree = (ForceTree *) tree;
tw->priv = priv;

if(!tree->hmax_computed_flag)
Expand Down
16 changes: 10 additions & 6 deletions libgadget/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "cosmology.h"
#include "gravity.h"
#include "physconst.h"
#include "cuda_runtime.h"

/*! \file init.c
* \brief code for initialisation of a simulation from initial conditions
Expand Down Expand Up @@ -481,11 +482,14 @@ setup_smoothinglengths(int RestartSnapNum, DomainDecomp * ddecomp, Cosmology * C
if(tot_sph + tot_bh == 0)
return;

ForceTree Tree = {0};
// ForceTree Tree = {0};
ForceTree * Tree_ptr;
cudaMallocManaged(&Tree_ptr, sizeof(ForceTree));
memset(Tree_ptr, 0, sizeof(ForceTree));
/* Finds fathers for each gas and BH particle, so need BH*/
force_tree_rebuild_mask(&Tree, ddecomp, GASMASK+BHMASK, NULL);
force_tree_rebuild_mask(Tree_ptr, ddecomp, GASMASK+BHMASK, NULL);
/* Set the initial smoothing length for gas and DM, compute tree moments.*/
set_init_hsml(&Tree, ddecomp, MeanGasSeparation);
set_init_hsml(Tree_ptr, ddecomp, MeanGasSeparation);

/* for clean IC with U input only, we need to iterate to find entropy */
double u_init = (1.0 / GAMMA_MINUS1) * (BOLTZMANN / PROTONMASS) * InitParams.InitGasTemp;
Expand All @@ -509,17 +513,17 @@ setup_smoothinglengths(int RestartSnapNum, DomainDecomp * ddecomp, Cosmology * C
DriftKickTimes times = init_driftkicktime(Ti_Current);
/*At the first time step all particles should be active*/
ActiveParticles act = init_empty_active_particles(PartManager);
density(&act, 1, 0, BlackHoleOn, times, CP, &sph_pred, NULL, &Tree);
density(&act, 1, 0, BlackHoleOn, times, CP, &sph_pred, NULL, Tree_ptr);
slots_free_sph_pred_data(&sph_pred);

if(DensityIndependentSphOn()) {
setup_density_indep_entropy(&act, &Tree, CP, &sph_pred, u_init, a3, BlackHoleOn, Ti_Current);
setup_density_indep_entropy(&act, Tree_ptr, CP, &sph_pred, u_init, a3, BlackHoleOn, Ti_Current);
}
else {
/*Initialize to initial energy*/
#pragma omp parallel for
for(i = 0; i < SlotsManager->info[0].size; i++)
SphP[i].Entropy = GAMMA_MINUS1 * u_init / pow(SphP[i].Density / a3 , GAMMA_MINUS1);
}
force_tree_free(&Tree);
force_tree_free(Tree_ptr);
}
2 changes: 1 addition & 1 deletion libgadget/metal_return.c
Original file line number Diff line number Diff line change
Expand Up @@ -943,7 +943,7 @@ stellar_density(const ActiveParticles * act, MyFloat * StarVolumeSPH, MyFloat *
tw->query_type_elsize = sizeof(TreeWalkQueryStellarDensity);
tw->result_type_elsize = sizeof(TreeWalkResultStellarDensity);
tw->priv = priv;
tw->tree = tree;
tw->tree = (ForceTree *) tree;

int i;

Expand Down
21 changes: 16 additions & 5 deletions libgadget/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#include "stats.h"
#include "veldisp.h"
#include "plane.h"
#include "cuda_runtime.h"
#include "treewalk_kernel.h"

static struct ClockTable Clocks;
/* Size of table full of random numbers generated each timestep.*/
Expand Down Expand Up @@ -252,6 +254,7 @@ begrun(const int RestartSnapNum, struct header_data * head)
init_cooling_and_star_formation(All.CoolingOn, All.StarformationOn, &All.CP, head->MassTable[0], head->BoxSize, units);

gravshort_fill_ntab(All.ShortRangeForceWindowType, All.Asmth);
run_gravshort_fill_ntab(All.ShortRangeForceWindowType, All.Asmth);

if(All.LightconeOn)
lightcone_init(&All.CP, head->TimeSnapshot, head->UnitLength_in_cm, All.OutputDir);
Expand Down Expand Up @@ -531,17 +534,23 @@ run(const int RestartSnapNum, const inttime_t ti_init, const struct header_data
/* Gravitational acceleration here*/
if(totgravactive) {
if(All.HierarchicalGravity) {
message(0, "Hierarchical gravity.\n");
/* We need to store a GravAccel for new star particles as well, so we need extra memory.*/
GravAccel.nstore = PartManager->NumPart + SlotsManager->info[0].size;
GravAccel.GravAccel = (MyFloat (*) [3]) mymalloc2("GravAccel", GravAccel.nstore * sizeof(GravAccel.GravAccel[0]));
hierarchical_gravity_accelerations(&Act, &pm, ddecomp, GravAccel, &times, HybridNuTracer, &All.CP, All.OutputDir);
}
else if(All.TreeGravOn && totgravactive) {
ForceTree Tree = {0};
ForceTree * Tree_ptr;
cudaMallocManaged(&Tree_ptr, sizeof(ForceTree));
memset(Tree_ptr, 0, sizeof(ForceTree));
cudaDeviceSynchronize();
message(0, "Tree allocated by cudaMallocManaged.\n");
/* Do a short range pairwise only step if desired*/
const double rho0 = All.CP.Omega0 * 3 * All.CP.Hubble * All.CP.Hubble / (8 * M_PI * All.CP.GravInternal);
force_tree_full(&Tree, ddecomp, HybridNuTracer, All.OutputDir);
grav_short_tree(&Act, &pm, &Tree, NULL, rho0, times.Ti_Current);
force_tree_full(Tree_ptr, ddecomp, HybridNuTracer, All.OutputDir);
grav_short_tree(&Act, &pm, Tree_ptr, NULL, rho0, times.Ti_Current);
cudaFree(Tree_ptr);
}
}
message(0, "Forces computed.\n");
Expand Down Expand Up @@ -832,11 +841,13 @@ runfof(const int RestartSnapNum, const inttime_t Ti_Current, const struct header
force_tree_free(&gasTree);
slots_free_sph_pred_data(&sph_predicted);
}
ForceTree Tree = {0};
ForceTree * Tree_ptr;
cudaMallocManaged(&Tree_ptr, sizeof(ForceTree));
memset(Tree_ptr, 0, sizeof(ForceTree));
struct grav_accel_store gg = {0};
/* Cooling is just for the star formation rate, so does not actually use the random table*/
RandTable rnd = set_random_numbers(All.RandomSeed, RNDTABLE);
cooling_and_starformation(&Act, header->TimeSnapshot, 0, &Tree, gg, ddecomp, &All.CP, GradRho, &rnd, NULL);
cooling_and_starformation(&Act, header->TimeSnapshot, 0, Tree_ptr, gg, ddecomp, &All.CP, GradRho, &rnd, NULL);
free_random_numbers(&rnd);

if(GradRho)
Expand Down
Loading