9 changes: 6 additions & 3 deletions .travis.yml
@@ -19,12 +19,13 @@ env:
- BUILD_TYPE=Debug
- TECA_DIR=/travis_teca_dir
- TECA_PYTHON_VERSION=3
- TECA_DATA_REVISION=163
- TECA_DATA_REVISION=164
- TECA_TELITE_REVISION=188dc11d
jobs:
- DOCKER_IMAGE=ubuntu IMAGE_VERSION=22.04 IMAGE_NAME=ubuntu_22_04 REQUIRE_NETCDF_MPI=TRUE
- DOCKER_IMAGE=ubuntu IMAGE_VERSION=22.04 IMAGE_NAME=ubuntu_22_04 REQUIRE_NETCDF_MPI=FALSE
#- DOCKER_IMAGE=ubuntu IMAGE_VERSION=22.04 IMAGE_NAME=ubuntu_22_04 REQUIRE_NETCDF_MPI=FALSE
- DOCKER_IMAGE=fedora IMAGE_VERSION=37 IMAGE_NAME=fedora_37 REQUIRE_NETCDF_MPI=TRUE
- DOCKER_IMAGE=fedora IMAGE_VERSION=37 IMAGE_NAME=fedora_37 REQUIRE_NETCDF_MPI=FALSE
#- DOCKER_IMAGE=fedora IMAGE_VERSION=37 IMAGE_NAME=fedora_37 REQUIRE_NETCDF_MPI=FALSE
- NO_DOCKER=TRUE

jobs:
@@ -66,6 +67,7 @@ install:
"export TECA_PYTHON_VERSION=${TECA_PYTHON_VERSION} &&
export TECA_DATA_REVISION=${TECA_DATA_REVISION} &&
export REQUIRE_NETCDF_MPI=${REQUIRE_NETCDF_MPI} &&
export TECA_TELITE_REVISION=${TECA_TELITE_REVISION} &&
${TECA_DIR}/test/travis_ci/install_${IMAGE_NAME}.sh";
fi

@@ -83,5 +85,6 @@ script:
export DOCKER_IMAGE=${DOCKER_IMAGE} &&
export IMAGE_VERSION=${IMAGE_VERSION} &&
export REQUIRE_NETCDF_MPI=${REQUIRE_NETCDF_MPI} &&
export TECA_TELITE_REVISION=${TECA_TELITE_REVISION} &&
${TECA_DIR}/test/travis_ci/ctest_linux.sh";
fi
1 change: 1 addition & 0 deletions CMake/FindNetCDF.cmake
@@ -95,6 +95,7 @@ if (NOT NC_TMP_FOUND OR NOT NC_TMP_LINK_LIBRARIES OR NOT NC_TMP_LIBRARY_DIRS OR
endif()

# look for header file that indicates MPI support
set(NETCDF_INCLUDE_DIR ${NETCDF_INCLUDE_DIR} /usr/lib/x86_64-linux-gnu/netcdf/mpi/include)
set(NETCDF_IS_PARALLEL FALSE)
find_file(NETCDF_PAR_INCLUDE_DIR netcdf_par.h
PATHS ${NETCDF_INCLUDE_DIR} NO_DEFAULT_PATH)
13 changes: 13 additions & 0 deletions CMakeLists.txt
@@ -281,6 +281,19 @@ if (NOT TECA_HAS_PARAVIEW)
endif()
set(TECA_HAS_VTK ${tmp} CACHE BOOL "VTK features")

# configure for TELite
set(tmp OFF)
find_package(TELite QUIET)
if (TELite_FOUND AND ((DEFINED TECA_HAS_TELITE AND TECA_HAS_TELITE) OR (NOT DEFINED TECA_HAS_TELITE)))
message(STATUS "TELite features -- enabled")
set(tmp ON)
elseif (REQUIRE_TELITE)
message(FATAL_ERROR "TELite features -- required but not found. set TELite_DIR to enable.")
else()
message(STATUS "TELite features -- not found. set TELite_DIR to enable.")
endif()
set(TECA_HAS_TELITE ${tmp} CACHE BOOL "TELite features")

#configure for Boost
set(tmp OFF)
find_package(Boost QUIET COMPONENTS program_options)
8 changes: 8 additions & 0 deletions alg/CMakeLists.txt
@@ -18,6 +18,7 @@ set(teca_alg_cxx_srcs
teca_cartesian_mesh_source.cxx
teca_cartesian_mesh_subset.cxx
teca_cartesian_mesh_regrid.cxx
teca_mesh_join.cxx
teca_connected_components.cxx
teca_2d_component_area.cxx
teca_component_area_filter.cxx
@@ -65,8 +66,11 @@ set(teca_alg_cxx_srcs
teca_vorticity.cxx
teca_dataset_diff.cxx
teca_temporal_reduction.cxx
teca_detect_nodes.cxx
teca_stitch_nodes.cxx
)


set(teca_alg_cuda_srcs)
if (TECA_HAS_CUDA)
set(teca_alg_cuda_srcs
Expand Down Expand Up @@ -107,6 +111,10 @@ if (TECA_HAS_BOOST)
list(APPEND teca_alg_link ${Boost_LIBRARIES})
endif()

if (TECA_HAS_TELITE)
list(APPEND teca_alg_link TELite)
endif()

if (TECA_HAS_CUDA)
set_source_files_properties(${teca_alg_cxx_srcs} PROPERTIES LANGUAGE CUDA)
endif()
165 changes: 165 additions & 0 deletions alg/cuCompactor.cuh
@@ -0,0 +1,165 @@
/*
* cuCompactor.h
*
 * Created on: 21 May 2015
* Author: knotman
*/

#ifndef CUCOMPACTOR_H_
#define CUCOMPACTOR_H_

#include <thrust/scan.h>
#include <thrust/device_vector.h>

#include <stdio.h>
#include <stdlib.h>

// Define this to turn on error checking
#define CUDA_ERROR_CHECK

#define CUDASAFECALL( err ) __cudaSafeCall( err, __FILE__, __LINE__ )

inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );

fprintf( stdout, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
#endif

return;
}

namespace cuCompactor {

#define warpSize (32)
#define FULL_MASK 0xffffffff

__host__ __device__ int divup(int x, int y) { return x / y + (x % y ? 1 : 0); }

__device__ __inline__ int pow2i (int e){
return 1<<e;
}


template <typename T,typename Predicate>
__global__ void computeBlockCounts(T* d_input,int length,int*d_BlockCounts,Predicate predicate){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
if(idx < length){
int pred = predicate(d_input[idx]);
int BC=__syncthreads_count(pred);

if(threadIdx.x==0){
d_BlockCounts[blockIdx.x]=BC; // BC will contain the number of valid elements in all threads of this thread block
}
}
}



template <typename T,typename Predicate>
__global__ void compactK(T* d_input,int length, T* d_output,int* d_BlocksOffset,Predicate predicate ){
int idx = threadIdx.x + blockIdx.x*blockDim.x;
extern __shared__ int warpTotals[];
if(idx < length){
int pred = predicate(d_input[idx]);
int w_i = threadIdx.x/warpSize; //warp index
int w_l = idx % warpSize;//thread index within a warp

// compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
int t_m = FULL_MASK >> (warpSize-w_l); //thread mask
#if (CUDART_VERSION < 9000)
int b = __ballot(pred) & t_m; //ballot result = number whose ith bit is one if the ith's thread pred is true masked up to the current index in warp
#else
int b = __ballot_sync(FULL_MASK,pred) & t_m;
#endif
int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX

// last thread in warp computes total valid counts for the warp
if(w_l==warpSize-1){
warpTotals[w_i]=t_u+pred;
}

// need all warps in thread block to fill in warpTotals before proceeding
__syncthreads();

// first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
int numWarps = blockDim.x/warpSize;
unsigned int numWarpsMask = FULL_MASK >> (warpSize-numWarps);
if(w_i==0 && w_l<numWarps){
int w_i_u=0;
for(int j=0;j<=5;j++){ // must include j=5 in loop in case any elements of warpTotals are identically equal to 32
#if (CUDART_VERSION < 9000)
int b_j =__ballot( warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
#else
int b_j =__ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j) );
#endif
w_i_u += (__popc(b_j & t_m) ) << j;
//printf("indice %i t_m=%i,j=%i,b_j=%i,w_i_u=%i\n",w_l,t_m,j,b_j,w_i_u);
}
warpTotals[w_l]=w_i_u;
}

// need all warps in thread block to wait until prefix sum is calculated in warpTotals
__syncthreads();

// if valid element, place the element in proper destination address based on thread offset in warp, warp offset in block, and block offset in grid
if(pred){
d_output[t_u+warpTotals[w_i]+d_BlocksOffset[blockIdx.x]]= d_input[idx];
}


}
}

template <class T>
__global__ void printArray_GPU(T* hd_data, int size,int newline){
int w=0;
for(int i=0;i<size;i++){
if(i%newline==0) {
printf("\n%i -> ",w);
w++;
}
printf("%i ",hd_data[i]);
}
printf("\n");
}

template <typename T,typename Predicate>
int compact(T* d_input,T* d_output,int length, Predicate predicate, int blockSize){
int numBlocks = divup(length,blockSize);
int* d_BlocksCount;
int* d_BlocksOffset;
CUDASAFECALL (cudaMalloc(&d_BlocksCount,sizeof(int)*numBlocks));
CUDASAFECALL (cudaMalloc(&d_BlocksOffset,sizeof(int)*numBlocks));
thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);

//phase 1: count number of valid elements in each thread block
computeBlockCounts<<<numBlocks,blockSize>>>(d_input,length,d_BlocksCount,predicate);

//phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
thrust::exclusive_scan(thrustPrt_bCount, thrustPrt_bCount + numBlocks, thrustPrt_bOffset);

//phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
compactK<<<numBlocks,blockSize,sizeof(int)*(blockSize/warpSize)>>>(d_input,length,d_output,d_BlocksOffset,predicate);

// determine number of elements in the compacted list
int compact_length = thrustPrt_bOffset[numBlocks-1] + thrustPrt_bCount[numBlocks-1];

cudaFree(d_BlocksCount);
cudaFree(d_BlocksOffset);

return compact_length;
}



} /* namespace cuCompactor */
#endif /* CUCOMPACTOR_H_ */
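
For context, a minimal usage sketch of `cuCompactor::compact` (not part of the patch): it compacts a device array through a user-supplied predicate functor. The `is_positive` functor and buffer names below are illustrative assumptions; the input length is kept a multiple of the block size here because `compactK` calls `__syncthreads()` inside its `if (idx < length)` guard.

```cuda
// Hypothetical driver for cuCompactor::compact -- a sketch, not TECA code.
#include <vector>
#include <cstdio>
#include "cuCompactor.cuh"

// The predicate must be callable on the device; here: keep strictly positive values.
struct is_positive
{
    __host__ __device__ bool operator()(int x) const { return x > 0; }
};

int main()
{
    const int block_size = 32;  // a multiple of the warp size (32)
    const int n = 64;           // a multiple of block_size, so every thread is active

    std::vector<int> h_in(n);
    for (int i = 0; i < n; ++i)
        h_in[i] = i - 32;       // values -32 .. 31; 31 of them are > 0

    int *d_in = nullptr, *d_out = nullptr;
    CUDASAFECALL(cudaMalloc(&d_in,  n*sizeof(int)));
    CUDASAFECALL(cudaMalloc(&d_out, n*sizeof(int)));
    CUDASAFECALL(cudaMemcpy(d_in, h_in.data(), n*sizeof(int), cudaMemcpyHostToDevice));

    // count valid elements per block, scan the counts, then scatter the survivors
    int n_valid = cuCompactor::compact(d_in, d_out, n, is_positive(), block_size);

    std::vector<int> h_out(n_valid);
    CUDASAFECALL(cudaMemcpy(h_out.data(), d_out, n_valid*sizeof(int), cudaMemcpyDeviceToHost));
    printf("kept %d of %d elements\n", n_valid, n);  // expected: kept 31 of 64 elements

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```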