9 changes: 6 additions & 3 deletions .travis.yml
@@ -19,12 +19,13 @@ env:
- BUILD_TYPE=Debug
- TECA_DIR=/travis_teca_dir
- TECA_PYTHON_VERSION=3
- TECA_DATA_REVISION=163
- TECA_DATA_REVISION=164
- TECA_TELITE_REVISION=188dc11d
jobs:
- DOCKER_IMAGE=ubuntu IMAGE_VERSION=22.04 IMAGE_NAME=ubuntu_22_04 REQUIRE_NETCDF_MPI=TRUE
- DOCKER_IMAGE=ubuntu IMAGE_VERSION=22.04 IMAGE_NAME=ubuntu_22_04 REQUIRE_NETCDF_MPI=FALSE
#- DOCKER_IMAGE=ubuntu IMAGE_VERSION=22.04 IMAGE_NAME=ubuntu_22_04 REQUIRE_NETCDF_MPI=FALSE
- DOCKER_IMAGE=fedora IMAGE_VERSION=37 IMAGE_NAME=fedora_37 REQUIRE_NETCDF_MPI=TRUE
- DOCKER_IMAGE=fedora IMAGE_VERSION=37 IMAGE_NAME=fedora_37 REQUIRE_NETCDF_MPI=FALSE
#- DOCKER_IMAGE=fedora IMAGE_VERSION=37 IMAGE_NAME=fedora_37 REQUIRE_NETCDF_MPI=FALSE
- NO_DOCKER=TRUE

jobs:
@@ -66,6 +67,7 @@ install:
"export TECA_PYTHON_VERSION=${TECA_PYTHON_VERSION} &&
export TECA_DATA_REVISION=${TECA_DATA_REVISION} &&
export REQUIRE_NETCDF_MPI=${REQUIRE_NETCDF_MPI} &&
export TECA_TELITE_REVISION=${TECA_TELITE_REVISION} &&
${TECA_DIR}/test/travis_ci/install_${IMAGE_NAME}.sh";
fi

@@ -83,5 +85,6 @@ script:
export DOCKER_IMAGE=${DOCKER_IMAGE} &&
export IMAGE_VERSION=${IMAGE_VERSION} &&
export REQUIRE_NETCDF_MPI=${REQUIRE_NETCDF_MPI} &&
export TECA_TELITE_REVISION=${TECA_TELITE_REVISION} &&
${TECA_DIR}/test/travis_ci/ctest_linux.sh";
fi
1 change: 1 addition & 0 deletions CMake/FindNetCDF.cmake
@@ -95,6 +95,7 @@ if (NOT NC_TMP_FOUND OR NOT NC_TMP_LINK_LIBRARIES OR NOT NC_TMP_LIBRARY_DIRS OR
endif()

# look for header file that indicates MPI support
set(NETCDF_INCLUDE_DIR ${NETCDF_INCLUDE_DIR} /usr/lib/x86_64-linux-gnu/netcdf/mpi/include)
set(NETCDF_IS_PARALLEL FALSE)
find_file(NETCDF_PAR_INCLUDE_DIR netcdf_par.h
PATHS ${NETCDF_INCLUDE_DIR} NO_DEFAULT_PATH)
13 changes: 13 additions & 0 deletions CMakeLists.txt
@@ -281,6 +281,19 @@ if (NOT TECA_HAS_PARAVIEW)
endif()
set(TECA_HAS_VTK ${tmp} CACHE BOOL "VTK features")

# configure for TELite
set(tmp OFF)
find_package(TELite QUIET)
if (TELite_FOUND AND ((DEFINED TECA_HAS_TELITE AND TECA_HAS_TELITE) OR (NOT DEFINED TECA_HAS_TELITE)))
message(STATUS "TELite features -- enabled")
set(tmp ON)
elseif (REQUIRE_TELITE)
message(FATAL_ERROR "TELite features -- required but not found. set TELite_DIR to enable.")
else()
message(STATUS "TELite features -- not found. set TELite_DIR to enable.")
endif()
set(TECA_HAS_TELITE ${tmp} CACHE BOOL "TELite features")

#configure for Boost
set(tmp OFF)
find_package(Boost QUIET COMPONENTS program_options)
8 changes: 8 additions & 0 deletions alg/CMakeLists.txt
@@ -18,6 +18,7 @@ set(teca_alg_cxx_srcs
teca_cartesian_mesh_source.cxx
teca_cartesian_mesh_subset.cxx
teca_cartesian_mesh_regrid.cxx
teca_mesh_join.cxx
teca_connected_components.cxx
teca_2d_component_area.cxx
teca_component_area_filter.cxx
@@ -65,8 +66,11 @@ set(teca_alg_cxx_srcs
teca_vorticity.cxx
teca_dataset_diff.cxx
teca_temporal_reduction.cxx
teca_detect_nodes.cxx
teca_stitch_nodes.cxx
)


set(teca_alg_cuda_srcs)
if (TECA_HAS_CUDA)
set(teca_alg_cuda_srcs
Expand Down Expand Up @@ -107,6 +111,10 @@ if (TECA_HAS_BOOST)
list(APPEND teca_alg_link ${Boost_LIBRARIES})
endif()

if (TECA_HAS_TELITE)
list(APPEND teca_alg_link TELite)
endif()

if (TECA_HAS_CUDA)
set_source_files_properties(${teca_alg_cxx_srcs} PROPERTIES LANGUAGE CUDA)
endif()
165 changes: 165 additions & 0 deletions alg/cuCompactor.cuh
@@ -0,0 +1,165 @@
/*
* cuCompactor.h
*
 * Created on: 21 May 2015
* Author: knotman
*/

#ifndef CUCOMPACTOR_H_
#define CUCOMPACTOR_H_

#include <thrust/scan.h>
#include <thrust/device_vector.h>

#include <stdio.h>
#include <stdlib.h>

// Define this to turn on error checking
#define CUDA_ERROR_CHECK

#define CUDASAFECALL( err ) __cudaSafeCall( err, __FILE__, __LINE__ )

inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );

fprintf( stdout, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif

return;
}

namespace cuCompactor {

#define warpSize (32)
#define FULL_MASK 0xffffffff

__host__ __device__ int divup(int x, int y) { return x / y + (x % y ? 1 : 0); }

__device__ __inline__ int pow2i (int e){
return 1<<e;
}


template <typename T,typename Predicate>
__global__ void computeBlockCounts(T* d_input,int length,int*d_BlockCounts,Predicate predicate){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if(idx < length){
int pred = predicate(d_input[idx]);
int BC=__syncthreads_count(pred);

if(threadIdx.x==0){
d_BlockCounts[blockIdx.x]=BC; // BC will contain the number of valid elements in all threads of this thread block
}
}
}



template <typename T,typename Predicate>
__global__ void compactK(T* d_input,int length, T* d_output,int* d_BlocksOffset,Predicate predicate ){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ int warpTotals[];
if(idx < length){
int pred = predicate(d_input[idx]);
int w_i = threadIdx.x/warpSize; //warp index
int w_l = idx % warpSize;//thread index within a warp

// compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
int t_m = FULL_MASK >> (warpSize-w_l); //thread mask
#if (CUDART_VERSION < 9000)
int b = __ballot(pred) & t_m; //ballot result = number whose ith bit is one if the ith's thread pred is true masked up to the current index in warp
#else
int b = __ballot_sync(FULL_MASK,pred) & t_m;
#endif
int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX

// last thread in warp computes total valid counts for the warp
if(w_l==warpSize-1){
warpTotals[w_i]=t_u+pred;
}

// need all warps in thread block to fill in warpTotals before proceeding
__syncthreads();

// first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
int numWarps = blockDim.x/warpSize;
unsigned int numWarpsMask = FULL_MASK >> (warpSize-numWarps);
if(w_i==0 && w_l<numWarps){
int w_i_u=0;
for(int j=0;j<=5;j++){ // must include j=5 in loop in case any elements of warpTotals are identically equal to 32
#if (CUDART_VERSION < 9000)
int b_j =__ballot( warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
#else
int b_j =__ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j) );
#endif
w_i_u += (__popc(b_j & t_m) ) << j;
//printf("indice %i t_m=%i,j=%i,b_j=%i,w_i_u=%i\n",w_l,t_m,j,b_j,w_i_u);
}
warpTotals[w_l]=w_i_u;
}

// need all warps in thread block to wait until prefix sum is calculated in warpTotals
__syncthreads();

// if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid
if(pred){
d_output[t_u+warpTotals[w_i]+d_BlocksOffset[blockIdx.x]]= d_input[idx];
}


}
}

template <class T>
__global__ void printArray_GPU(T* hd_data, int size,int newline){
int w=0;
for(int i=0;i<size;i++){
if(i%newline==0) {
printf("\n%i -> ",w);
w++;
}
printf("%i ",hd_data[i]);
}
printf("\n");
}

template <typename T,typename Predicate>
int compact(T* d_input,T* d_output,int length, Predicate predicate, int blockSize){
int numBlocks = divup(length,blockSize);
int* d_BlocksCount;
int* d_BlocksOffset;
CUDASAFECALL (cudaMalloc(&d_BlocksCount,sizeof(int)*numBlocks));
CUDASAFECALL (cudaMalloc(&d_BlocksOffset,sizeof(int)*numBlocks));
thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);

//phase 1: count number of valid elements in each thread block
computeBlockCounts<<<numBlocks,blockSize>>>(d_input,length,d_BlocksCount,predicate);

//phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
thrust::exclusive_scan(thrustPrt_bCount, thrustPrt_bCount + numBlocks, thrustPrt_bOffset);

//phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
compactK<<<numBlocks,blockSize,sizeof(int)*(blockSize/warpSize)>>>(d_input,length,d_output,d_BlocksOffset,predicate);

// determine number of elements in the compacted list
int compact_length = thrustPrt_bOffset[numBlocks-1] + thrustPrt_bCount[numBlocks-1];

cudaFree(d_BlocksCount);
cudaFree(d_BlocksOffset);

return compact_length;
}



} /* namespace cuCompactor */
#endif /* CUCOMPACTOR_H_ */
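
For context, a minimal usage sketch of `cuCompactor::compact` (not part of the patch): it compacts a device array through a user-supplied predicate functor. The `is_positive` functor and buffer names below are illustrative assumptions; the input length is kept a multiple of the block size here because `compactK` calls `__syncthreads()` inside its `if (idx < length)` guard.

```cuda
// Hypothetical driver for cuCompactor::compact -- a sketch, not TECA code.
#include <vector>
#include <cstdio>
#include "cuCompactor.cuh"

// The predicate must be callable on the device; here: keep strictly positive values.
struct is_positive
{
    __host__ __device__ bool operator()(int x) const { return x > 0; }
};

int main()
{
    const int block_size = 32;  // a multiple of the warp size (32)
    const int n = 64;           // a multiple of block_size, so every thread is active

    std::vector<int> h_in(n);
    for (int i = 0; i < n; ++i)
        h_in[i] = i - 32;       // values -32 .. 31; 31 of them are > 0

    int *d_in = nullptr, *d_out = nullptr;
    CUDASAFECALL(cudaMalloc(&d_in,  n*sizeof(int)));
    CUDASAFECALL(cudaMalloc(&d_out, n*sizeof(int)));
    CUDASAFECALL(cudaMemcpy(d_in, h_in.data(), n*sizeof(int), cudaMemcpyHostToDevice));

    // count valid elements per block, scan the counts, then scatter the survivors
    int n_valid = cuCompactor::compact(d_in, d_out, n, is_positive(), block_size);

    std::vector<int> h_out(n_valid);
    CUDASAFECALL(cudaMemcpy(h_out.data(), d_out, n_valid*sizeof(int), cudaMemcpyDeviceToHost));
    printf("kept %d of %d elements\n", n_valid, n);  // expected: kept 31 of 64 elements

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```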