#include <stdio.h>

// Recall that __restrict__ is a promise to the compiler that these pointers do
// not alias (overlap in memory), which lets it optimize loads and stores.
__global__ void compact_kernel(const int * __restrict__ input,
                               int * __restrict__ output,
                               int * __restrict__ out_count,
                               int n) {
  extern __shared__ int scan[];  // shared memory for flags + prefix sum

  int tid = threadIdx.x;

  int x = 0;
  int flag = 0;
  if (tid < n) {
    x = input[tid];
    flag = (x != 0);  // 1 = include, 0 = discard
  }

  // Store each thread's flag in shared memory.
  scan[tid] = flag;

  // __syncthreads() is a block-wide barrier: every thread must reach it before
  // any thread continues, which also makes the shared-memory writes above
  // visible to the whole block.
  __syncthreads();

  // Inclusive prefix sum (Hillis-Steele scan) over the flags. After the loop,
  // scan[tid] holds the number of kept elements among indices 0..tid.
  for (int offset = 1; offset < blockDim.x; offset <<= 1) {
    int val = 0;
    if (tid >= offset) {
      val = scan[tid - offset];
    }

    // Barrier: every thread reads its old neighbor value before any writes.
    __syncthreads();

    scan[tid] += val;

    // Barrier: all writes of this round finish before the next round reads.
    __syncthreads();
  }
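
  // Worked example using the input hard-coded in main(), {3, 0, 5, 0, 2, 7, 0, 4}:
  //   flags:            1 0 1 0 1 1 0 1
  //   after offset = 1: 1 1 1 1 1 2 1 1
  //   after offset = 2: 1 1 2 2 2 3 2 3
  //   after offset = 4: 1 1 2 2 3 4 4 5
  // Each kept element's scan value minus one is its slot in output, and the
  // last value (5) is the total number of kept elements.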

  if (tid < n && flag == 1) {
    // Convert the 1-based inclusive scan value to a 0-based output index.
    int outIndex = scan[tid] - 1;
    output[outIndex] = x;
  }

  // The last value of the inclusive scan is the total number of kept elements,
  // much like calling vector.back() on a prefix-sum vector in C++.
  if (tid == blockDim.x - 1) {
    *out_count = scan[tid];
  }
}
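
// Note: this kernel compacts within a single thread block only. It assumes one
// block with blockDim.x >= n, which matches the grid(1), block(N) launch in
// main(); larger inputs would also need a scan across blocks.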

int main() {
  const int N = 8;
  int h_in[N] = {3, 0, 5, 0, 2, 7, 0, 4};

  int * d_in = nullptr;
  int * d_out = nullptr;
  int * d_count = nullptr;

  cudaMalloc(&d_in, N * sizeof(int));
  cudaMalloc(&d_out, N * sizeof(int));
  cudaMalloc(&d_count, sizeof(int));

  cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice);

  dim3 block(N);
  dim3 grid(1);
  size_t shmemBytes = N * sizeof(int);  // shared memory size for scan[]

  compact_kernel<<<grid, block, shmemBytes>>>(d_in, d_out, d_count, N);
  cudaDeviceSynchronize();

  int h_out[N];
  int h_count = 0;
  cudaMemcpy(h_out, d_out, N * sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(&h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost);

  printf("Kept %d elements:\n", h_count);
  for (int i = 0; i < h_count; ++i) {
    printf("%d ", h_out[i]);
  }
  printf("\n");

  cudaFree(d_in);
  cudaFree(d_out);
  cudaFree(d_count);
  return 0;
}
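
// Build and run (the file name here is an assumption; use the actual .cu name):
//   nvcc -o compact compact.cu
//   ./compact
//
// Expected output for the input above:
//   Kept 5 elements:
//   3 5 2 7 4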