diff --git a/src/MC_Fast_Timer.cc b/src/MC_Fast_Timer.cc index 70aa7bf9..415860a6 100644 --- a/src/MC_Fast_Timer.cc +++ b/src/MC_Fast_Timer.cc @@ -5,6 +5,10 @@ #include "Globals.hh" #include "portability.hh" +#ifdef USE_CALIPER +#include <adiak.hpp> +#endif + const char *mc_fast_timer_names[MC_Fast_Timer::Num_Timers] = { "main", @@ -101,6 +105,12 @@ void MC_Fast_Timer_Container::Cumulative_Report(int mpi_rank, int num_ranks, MPI "Figure Of Merit", (numSegments / (max_clock[cycleTracking_Index]*1e-6)), "[Num Segments / Cycle Tracking Time]" ); + +#ifdef USE_CALIPER + adiak::value("numSegments", numSegments); + adiak::value("CycleTrackingTime", max_clock[cycleTracking_Index]*1e-6); + adiak::value("FigureOfMerit", numSegments / (max_clock[cycleTracking_Index]*1e-6)); +#endif } } diff --git a/src/MC_Fast_Timer.hh b/src/MC_Fast_Timer.hh index 9cad33a8..88ae46ac 100644 --- a/src/MC_Fast_Timer.hh +++ b/src/MC_Fast_Timer.hh @@ -6,6 +6,10 @@ #include <omp.h> #endif +#ifdef USE_CALIPER +#include <caliper/cali.h> +#endif + #include "portability.hh" // needed for uint64_t in this file #include "utilsMpi.hh" // needed for MPI_Comm type in this file @@ -44,12 +48,13 @@ class MC_Fast_Timer class MC_Fast_Timer_Container { public: - MC_Fast_Timer_Container() {} ; // constructor + MC_Fast_Timer_Container() + {} ; // constructor void Cumulative_Report(int mpi_rank, int num_ranks, MPI_Comm comm_world, uint64_t numSegments); void Last_Cycle_Report(int report_time, int mpi_rank, int num_ranks, MPI_Comm comm_world); void Clear_Last_Cycle_Timers(); MC_Fast_Timer timers[MC_Fast_Timer::Num_Timers]; // timers for various routines - + private: void Print_Cumulative_Heading(int mpi_rank); void Print_Last_Cycle_Heading(int mpi_rank); @@ -87,27 +92,49 @@ extern const char *mc_fast_timer_names[MC_Fast_Timer::Num_Timers]; #define MC_FASTTIMER_GET_LASTCYCLE(timerIndex) (float)mcco->fast_timer->timers[timerIndex].lastCycleClock / 1000000. 
#else // else CHRONO_MISSING is not defined, so high resolution clock is available - - #define MC_FASTTIMER_START(timerIndex) \ - if (omp_get_thread_num() == 0) { \ - mcco->fast_timer->timers[timerIndex].startClock = std::chrono::high_resolution_clock::now(); \ - } - - #define MC_FASTTIMER_STOP(timerIndex) \ - if ( omp_get_thread_num() == 0 ) { \ - mcco->fast_timer->timers[timerIndex].stopClock = std::chrono::high_resolution_clock::now(); \ - mcco->fast_timer->timers[timerIndex].lastCycleClock += \ - std::chrono::duration_cast<std::chrono::microseconds> \ - (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ - mcco->fast_timer->timers[timerIndex].cumulativeClock += \ - std::chrono::duration_cast<std::chrono::microseconds> \ - (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ - mcco->fast_timer->timers[timerIndex].numCalls++; \ - } + #ifdef USE_CALIPER + #define MC_FASTTIMER_START(timerIndex) \ + if (omp_get_thread_num() == 0) { \ + cali_begin_region(mc_fast_timer_names[timerIndex]); \ + mcco->fast_timer->timers[timerIndex].startClock = std::chrono::high_resolution_clock::now(); \ + } + + #define MC_FASTTIMER_STOP(timerIndex) \ + if ( omp_get_thread_num() == 0 ) { \ + mcco->fast_timer->timers[timerIndex].stopClock = std::chrono::high_resolution_clock::now(); \ + mcco->fast_timer->timers[timerIndex].lastCycleClock += \ + std::chrono::duration_cast<std::chrono::microseconds> \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].cumulativeClock += \ + std::chrono::duration_cast<std::chrono::microseconds> \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].numCalls++; \ + cali_end_region(mc_fast_timer_names[timerIndex]); \ + } + + #else // not defined USE_CALIPER + + #define MC_FASTTIMER_START(timerIndex) \ + if (omp_get_thread_num() == 0) { \ + 
mcco->fast_timer->timers[timerIndex].startClock = std::chrono::high_resolution_clock::now(); \ + } + + #define MC_FASTTIMER_STOP(timerIndex) \ + if ( omp_get_thread_num() == 0 ) { \ + mcco->fast_timer->timers[timerIndex].stopClock = std::chrono::high_resolution_clock::now(); \ + mcco->fast_timer->timers[timerIndex].lastCycleClock += \ + std::chrono::duration_cast<std::chrono::microseconds> \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].cumulativeClock += \ + std::chrono::duration_cast<std::chrono::microseconds> \ + (mcco->fast_timer->timers[timerIndex].stopClock - mcco->fast_timer->timers[timerIndex].startClock).count(); \ + mcco->fast_timer->timers[timerIndex].numCalls++; \ + } + + #endif // end ifdef USE_CALIPER else branch #define MC_FASTTIMER_GET_LASTCYCLE(timerIndex) (float)mcco->fast_timer->timers[timerIndex].lastCycleClock / 1000000. - #endif // end ifdef CHRONO_MISSING else section #endif // end if DISABLE_TIMERS diff --git a/src/Makefile b/src/Makefile index 5867c989..4c362938 100644 --- a/src/Makefile +++ b/src/Makefile @@ -84,6 +84,14 @@ # with some Clang compilers, some older Gnu compilers on BG/Q # and older Intel compilers. # +# -DUSE_CALIPER Define this to enable Caliper instrumentation. Caliper +# is a performance profiling / analysis library for +# tracing, sampling, HW counter measurements, and much more. +# When enabled, Quicksilver will link in the Caliper +# and Adiak libraries and export its timed regions +# to Caliper and store run metadata in Adiak. See +# https://github.com/LLNL/Caliper for more information. +# # -DUSE_NVTX Define this for some extra NVProf profiling information. # It will create regions that can be visualized in NVVP. 
# @@ -139,7 +147,6 @@ LDFLAGS = -fgpu-rdc --hip-link --offload-arch=gfx90a #CPPFLAGS = $(OPENMP_FLAGS) #LDFLAGS = $(OPENMP_LDFLAGS) - ############################################################################### ### GCC -- with MPI and OpenMP ############################################################################### @@ -154,6 +161,26 @@ LDFLAGS = -fgpu-rdc --hip-link --offload-arch=gfx90a #LDFLAGS = $(OPENMP_LDFLAGS) +############################################################################### +### GCC -- with MPI and OpenMP and Caliper support +############################################################################### +# ADIAK_DIR = $(shell spack location --install-dir adiak) +# CALIPER_DIR = $(shell spack location --install-dir caliper) + +# CALIPER_FLAGS = -I${CALIPER_DIR}/include -I${ADIAK_DIR}/include -DUSE_CALIPER +# CALIPER_LDFLAGS = -Wl,-rpath ${CALIPER_DIR}/lib64 -Wl,-rpath ${ADIAK_DIR}/lib -L${CALIPER_DIR}/lib64 -L${ADIAK_DIR}/lib -lcaliper -ladiak + +# OPENMP_FLAGS = -DHAVE_OPENMP -fopenmp +# OPENMP_LDFLAGS = -fopenmp +# MPI_FLAGS = -DHAVE_MPI +# OPTFLAGS = -g -O2 + +# CXX=mpicxx +# CXXFLAGS = -std=c++11 $(OPTFLAGS) -Wpedantic +# CPPFLAGS = $(MPI_FLAGS) $(OPENMP_FLAGS) $(CALIPER_FLAGS) +# LDFLAGS = $(OPENMP_LDFLAGS) $(CALIPER_LDFLAGS) + + ############################################################################### # LLNL LC BG/Q Comilers # ############################################################################### diff --git a/src/Parameters.cc b/src/Parameters.cc index a7205da6..80488e94 100644 --- a/src/Parameters.cc +++ b/src/Parameters.cc @@ -33,6 +33,10 @@ #include "InputBlock.hh" #include "utilsMpi.hh" +#ifdef USE_CALIPER +#include <adiak.hpp> +#endif + using std::string; using std::ifstream; using std::make_pair; @@ -94,6 +98,100 @@ Parameters getParameters(int argc, char** argv) return params; } +void saveParametersInAdiak(const Parameters& pp) +{ +#ifdef USE_CALIPER + adiak::value("dt", pp.simulationParams.dt); + adiak::value("fMax", 
pp.simulationParams.fMax); + adiak::value("inputFile", adiak::path(pp.simulationParams.inputFile)); + adiak::value("energySpectrum", pp.simulationParams.energySpectrum); + adiak::value("boundaryCondition", pp.simulationParams.boundaryCondition); + adiak::value("loadBalance", pp.simulationParams.loadBalance); + adiak::value("cycleTimers", pp.simulationParams.cycleTimers); + adiak::value("debugThreads", pp.simulationParams.debugThreads); + adiak::value("lx", pp.simulationParams.lx); + adiak::value("ly", pp.simulationParams.ly); + adiak::value("lz", pp.simulationParams.lz); + adiak::value("nParticles", pp.simulationParams.nParticles); + adiak::value("batchSize", pp.simulationParams.batchSize); + adiak::value("nBatches", pp.simulationParams.nBatches); + adiak::value("nSteps", pp.simulationParams.nSteps); + adiak::value("nx", pp.simulationParams.nx); + adiak::value("ny", pp.simulationParams.ny); + adiak::value("nz", pp.simulationParams.nz); + adiak::value("seed", pp.simulationParams.seed); + adiak::value("xDom", pp.simulationParams.xDom); + adiak::value("yDom", pp.simulationParams.yDom); + adiak::value("zDom", pp.simulationParams.zDom); + adiak::value("eMax", pp.simulationParams.eMax); + adiak::value("eMin", pp.simulationParams.eMin); + adiak::value("nGroups", pp.simulationParams.nGroups); + adiak::value("lowWeightCutoff", pp.simulationParams.lowWeightCutoff); + adiak::value("bTally", pp.simulationParams.balanceTallyReplications); + adiak::value("fTally", pp.simulationParams.fluxTallyReplications); + adiak::value("cTally", pp.simulationParams.cellTallyReplications); + adiak::value("coralBenchmark", pp.simulationParams.coralBenchmark); + adiak::value("crossSectionsOut", pp.simulationParams.crossSectionsOut); + + for (size_t i = 0; i < pp.geometryParams.size(); ++i) { + std::string prefix("geometry."); + prefix.append(std::to_string(i)); + prefix.append("."); + const GeometryParameters& gp = pp.geometryParams[i]; + adiak::value(prefix+"material", gp.materialName); + 
switch (gp.shape) + { + case GeometryParameters::BRICK: + adiak::value(prefix+"shape", "brick"); + adiak::value(prefix+"xMax", gp.xMax); + adiak::value(prefix+"xMin", gp.xMin); + adiak::value(prefix+"yMax", gp.yMax); + adiak::value(prefix+"yMin", gp.yMin); + adiak::value(prefix+"zMax", gp.zMax); + adiak::value(prefix+"zMin", gp.zMin); + break; + case GeometryParameters::SPHERE: + adiak::value(prefix+"shape", "sphere"); + adiak::value(prefix+"xCenter", gp.xCenter); + adiak::value(prefix+"yCenter", gp.yCenter); + adiak::value(prefix+"zCenter", gp.zCenter); + break; + default: + qs_assert(false); + } + } + + for (const auto& material : pp.materialParams) { + std::string prefix("material."); + prefix.append(material.first); + prefix.append("."); + adiak::value(prefix+"mass", material.second.mass); + adiak::value(prefix+"nIsotopes", material.second.nIsotopes); + adiak::value(prefix+"nReactions", material.second.nReactions); + adiak::value(prefix+"sourceRate", material.second.sourceRate); + adiak::value(prefix+"totalCrossSection", material.second.totalCrossSection); + adiak::value(prefix+"absorptionCrossSection", material.second.absorptionCrossSection); + adiak::value(prefix+"fissionCrossSection", material.second.fissionCrossSection); + adiak::value(prefix+"scatteringCrossSection", material.second.scatteringCrossSection); + adiak::value(prefix+"absorptionCrossSectionRatio", material.second.absorptionCrossSectionRatio); + adiak::value(prefix+"fissionCrossSectionRatio", material.second.fissionCrossSectionRatio); + adiak::value(prefix+"scatteringCrossSectionRatio", material.second.scatteringCrossSectionRatio); + } + + for (const auto& csp : pp.crossSectionParams) { + std::string prefix("crossection."); + prefix.append(csp.first); + prefix.append("."); + adiak::value(prefix+"A", csp.second.aa); + adiak::value(prefix+"B", csp.second.bb); + adiak::value(prefix+"C", csp.second.cc); + adiak::value(prefix+"D", csp.second.dd); + adiak::value(prefix+"E", csp.second.ee); + 
adiak::value(prefix+"nuBar", csp.second.nuBar); + } +#endif +} + void printParameters(const Parameters& pp, ostream& out) { int rank = -1; @@ -226,6 +324,8 @@ namespace esName[0] = '\0'; char xsec[1024]; xsec[0] = '\0'; + char calicfg[1024]; + calicfg[0] = '\0'; addArg("help", 'h', 0, 'i', &(help), 0, "print this message"); addArg("dt", 'D', 1, 'd', &(sp.dt), 0, "time step (seconds)"); @@ -253,12 +353,14 @@ namespace addArg("bTally", 'B', 1, 'i', &(sp.balanceTallyReplications), 0, "number of balance tally replications"); addArg("fTally", 'F', 1, 'i', &(sp.fluxTallyReplications), 0, "number of scalar flux tally replications"); addArg("cTally", 'C', 1, 'i', &(sp.cellTallyReplications), 0, "number of scalar cell tally replications"); + addArg("caliper-config", 'P', 1, 's', &(calicfg), sizeof(calicfg), "Caliper configuration"); processArgs(argc, argv); sp.inputFile = name; sp.energySpectrum = esName; sp.crossSectionsOut = xsec; + sp.caliperConfig = calicfg; if (help) { diff --git a/src/Parameters.hh b/src/Parameters.hh index 79dfe3cf..54f9e371 100644 --- a/src/Parameters.hh +++ b/src/Parameters.hh @@ -164,6 +164,7 @@ struct SimulationParameters int fluxTallyReplications; //!< Number of replications for the scalar flux tally int cellTallyReplications; //!< Number of replications for the scalar cell tally int coralBenchmark; //!< enable correctness check for Coral2 benchmark + std::string caliperConfig; //!< Caliper configuration string }; struct Parameters @@ -176,6 +177,7 @@ struct Parameters Parameters getParameters(int argc, char** argv); void printParameters(const Parameters& params, std::ostream& out); +void saveParametersInAdiak(const Parameters& parms); std::ostream& operator<<(std::ostream& out, const SimulationParameters& pp); std::ostream& operator<<(std::ostream& out, const GeometryParameters& pp); diff --git a/src/READ.ME.HOW.TO.RUN b/src/READ.ME.HOW.TO.RUN index 24e2a8e8..dd8240be 100644 --- a/src/READ.ME.HOW.TO.RUN +++ b/src/READ.ME.HOW.TO.RUN @@ -115,6 
+115,16 @@ There is also, at the end of the run, a coarse breakdown of time spent overall in the above mentioned three code phases, as well as a few other sub timings from cycle tracking. +------------------------------------------------------------------------------- +A note on Caliper: + +Caliper is a powerful performance profiling/tracing library. When configured +with Caliper support, Quicksilver adds Caliper annotations for its timed +regions (cycleTracking, cycleTrackingKernel, etc.) as the "mc.timer" +attribute. Performance measurements can be configured through environment +variables or the caliper.config configuration file. For Caliper documentation, +see https://github.com/LLNL/Caliper. + ------------------------------------------------------------------------------- A note on asserts: diff --git a/src/main.cc b/src/main.cc index 765ef62f..b4e16f61 100644 --- a/src/main.cc +++ b/src/main.cc @@ -26,6 +26,16 @@ #include "git_hash.hh" #include "git_vers.hh" +#ifdef USE_CALIPER +#include <adiak.hpp> +#include <caliper/cali.h> +#include <caliper/cali-manager.h> +#if _OPENMP +#include <omp.h> +#endif +#endif + +void setupCaliper(const Parameters&); void gameOver(); void cycleInit( bool loadBalance ); void cycleTracking(MonteCarlo* monteCarlo); @@ -43,8 +53,19 @@ int main(int argc, char** argv) Parameters params = getParameters(argc, argv); printParameters(params, cout); - // mcco stores just about everything. - mcco = initMC(params); +#ifdef USE_CALIPER + setupCaliper(params); + + cali::ConfigManager calimgr(params.simulationParams.caliperConfig.c_str()); + + if (calimgr.error()) + std::cerr << "caliper config error: " << calimgr.error_msg() << std::endl; + + calimgr.start(); +#endif + + // mcco stores just about everything. 
+ mcco = initMC(params); int loadBalance = params.simulationParams.loadBalance; @@ -52,8 +73,15 @@ int main(int argc, char** argv) const int nSteps = params.simulationParams.nSteps; +#ifdef USE_CALIPER + CALI_CXX_MARK_LOOP_BEGIN(mainloop, "mainloop"); +#endif for (int ii=0; ii<nSteps; ii++) { cycleInit( bool(loadBalance) ); cycleTracking(mcco); cycleFinalize(); mcco->fast_timer->Last_Cycle_Report( params.simulationParams.cycleTimers, mcco->processor_info->rank, mcco->processor_info->num_processors, mcco->processor_info->comm_mc_world ); } -+#ifdef USE_CALIPER + CALI_CXX_MARK_LOOP_END(mainloop); +#endif MC_FASTTIMER_STOP(MC_Fast_Timer::main); @@ -79,11 +109,49 @@ int main(int argc, char** argv) delete mcco; #endif +#ifdef USE_CALIPER + calimgr.flush(); +#endif + mpiFinalize(); - + return 0; } +void setupCaliper(const Parameters& params) +{ +#ifdef USE_CALIPER + cali_config_preset("CALI_CALIPER_ATTRIBUTE_DEFAULT_SCOPE", "process"); + + adiak::value("git_version", GIT_VERS); + adiak::value("git_hash", GIT_HASH); + adiak::collect_all(); + +#if _OPENMP + adiak::value("max_threads", omp_get_max_threads()); +#else + adiak::value("max_threads", 1); +#endif +#ifdef GPU_NATIVE + adiak::value("gpu_native", 1); +#else + adiak::value("gpu_native", 0); +#endif +#ifdef HAVE_CUDA + adiak::value("have_cuda", 1); +#else + adiak::value("have_cuda", 0); +#endif +#ifdef HAVE_HIP + adiak::value("have_hip", 1); +#else + adiak::value("have_hip", 0); +#endif + + saveParametersInAdiak(params); +#endif +} + void gameOver() { mcco->fast_timer->Cumulative_Report(mcco->processor_info->rank, @@ -112,7 +180,7 @@ void cycleInit( bool loadBalance ) mcco->particle_buffer->Initialize(); MC_SourceNow(mcco); - + PopulationControl(mcco, loadBalance); // controls particle population RouletteLowWeightParticles(mcco); // Delete particles with low statistical weight @@ -125,7 +193,7 @@ GLOBAL void CycleTrackingKernel( MonteCarlo* monteCarlo, int num_particles, ParticleVault* processingVault, ParticleVault* processedVault ) { - int global_index = getGlobalThreadID(); + int global_index = getGlobalThreadID(); if( global_index < num_particles ) { @@ -167,9 +235,9 @@ 
void cycleTracking(MonteCarlo *monteCarlo) ParticleVault *processingVault = my_particle_vault.getTaskProcessingVault(processing_vault); ParticleVault *processedVault = my_particle_vault.getTaskProcessedVault(processed_vault); - + int numParticles = processingVault->size(); - + if ( numParticles != 0 ) { NVTX_Range trackingKernel("cycleTracking_TrackingKernel"); // range ends at end of scope @@ -187,28 +255,28 @@ void cycleTracking(MonteCarlo *monteCarlo) dim3 grid(1,1,1); dim3 block(1,1,1); int runKernel = ThreadBlockLayout( grid, block, numParticles); - + //Call Cycle Tracking Kernel if( runKernel ) CycleTrackingKernel<<<grid, block>>>( monteCarlo, numParticles, processingVault, processedVault ); - + //Synchronize the stream so that memory is copied back before we begin MPI section gpuPeekAtLastError(); gpuDeviceSynchronize(); #endif } break; - + case gpuWithOpenMP: { int nthreads=128; - if (numParticles < 64*56 ) + if (numParticles < 64*56 ) nthreads = 64; int nteams = (numParticles + nthreads - 1 ) / nthreads; nteams = nteams > 1 ? nteams : 1; #ifdef HAVE_OPENMP_TARGET - #pragma omp target enter data map(to:monteCarlo[0:1]) - #pragma omp target enter data map(to:processingVault[0:1]) + #pragma omp target enter data map(to:monteCarlo[0:1]) + #pragma omp target enter data map(to:processingVault[0:1]) #pragma omp target enter data map(to:processedVault[0:1]) #pragma omp target teams distribute parallel for num_teams(nteams) thread_limit(128) #endif @@ -245,7 +313,7 @@ void cycleTracking(MonteCarlo *monteCarlo) // Next, communicate particles that have crossed onto // other MPI ranks. NVTX_Range cleanAndComm("cycleTracking_clean_and_comm"); - + SendQueue &sendQueue = *(my_particle_vault.getSendQueue()); monteCarlo->particle_buffer->Allocate_Send_Buffer( sendQueue ); @@ -314,7 +382,7 @@ void cycleFinalize() mcco->_tallies->_balanceTask[0]._end = mcco->_particleVaultContainer->sizeProcessed(); // Update the cumulative tally data. 
- mcco->_tallies->CycleFinalize(mcco); + mcco->_tallies->CycleFinalize(mcco); mcco->time_info->cycle++; @@ -322,4 +390,3 @@ void cycleFinalize() MC_FASTTIMER_STOP(MC_Fast_Timer::cycleFinalize); } -