Skip to content

Commit fe875d8

Browse files
committed
cuda: add prefix-sum example
1 parent bc4be79 commit fe875d8

File tree

3 files changed

+101
-1
lines changed

3 files changed

+101
-1
lines changed

gpu/cuda/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,7 @@ array-add.ptx
99
matrix-mul
1010
matrix-mul-tiled
1111
template
12+
prefix-sum
13+
*.cubin
14+
*.ptx
15+
*.sass

gpu/cuda/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ array-add-ptx:
4747
template: src/template.cu
4848
nvcc -lnppc -o $@ $<
4949

50+
prefix-sum: src/prefix-sum.cu
51+
nvcc -arch=sm_86 -ptx $< -o $@.ptx
52+
nvcc -arch=sm_86 -cubin $< -o $@.cubin
53+
cuobjdump -sass $@.cubin > $@.sass
54+
nvcc -lnppc -G -g -o $@ $<
55+
5056
.PHONY: clean
5157
clean:
52-
@${RM} threads inc hello-world.ptx info wmma streams graphs array-add matrix-mul
58+
@${RM} threads inc hello-world.ptx info wmma streams graphs array-add matrix-mul prefix-sum

gpu/cuda/src/prefix-sum.cu

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#include <stdio.h>
2+
3+
// Recall that __restrict__ is a hint to the compiler that the pointers do not
// overlap in memory.
//
// Stream compaction: copies the non-zero elements of input[0..n) to the front
// of output[] (preserving order) and writes the number kept to *out_count.
//
// Launch contract (established by the code below):
//  - Single-block kernel: only threadIdx.x is used, so results are only
//    meaningful with one block covering all n elements.
//  - Dynamic shared memory must be at least blockDim.x * sizeof(int):
//    every thread writes scan[tid] unconditionally, even when tid >= n.
__global__ void compact_kernel(const int * __restrict__ input,
                               int * __restrict__ output,
                               int * __restrict__ out_count,
                               int n) {
  extern __shared__ int scan[]; // shared memory for flags + prefix sum

  int tid = threadIdx.x;

  int x = 0;
  int flag = 0;
  if (tid < n) {
    x = input[tid];
    flag = (x != 0); // 1 = include, 0 = discard
  }

  // Store flags in shared memory. Threads with tid >= n store 0, so they
  // contribute nothing to the prefix sums.
  scan[tid] = flag;

  // syncthread is a memory barrier, like a counter for the thread which needs
  // to be reached by all threads before any can proceed.
  __syncthreads();

  // Inclusive prefix sum over the flags (Hillis-Steele style: doubling
  // offsets, log2(blockDim.x) iterations). After the loop, scan[tid] holds
  // the count of kept elements in input[0..tid].
  for (int offset = 1; offset < blockDim.x; offset <<= 1) {
    // Read phase: capture the neighbor's value BEFORE anyone overwrites it.
    int val = 0;
    if (tid >= offset) {
      val = scan[tid - offset];
    }

    // Barrier between read and write phases — without it, another thread
    // could update scan[tid - offset] while we are still reading it.
    __syncthreads();

    scan[tid] += val;

    // Barrier so all writes of this round land before the next round reads.
    __syncthreads();
  }

  // Scatter: a kept element's destination is its inclusive prefix count
  // minus one.
  if (tid < n && flag == 1) {
    // convert to zero based index
    int outIndex = scan[tid] - 1;
    output[outIndex] = x;
  }

  // The last prefix value contains the total number of kept elements, similar
  // to using vector.back() in C++ to get it.
  if (tid == blockDim.x - 1) {
    *out_count = scan[tid];
  }
}
53+
54+
// Abort main with a diagnostic if a CUDA runtime call fails. Kernel launches
// themselves return nothing; their configuration errors are picked up by
// cudaGetLastError() and execution errors by the next synchronizing call.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Host driver: compacts the non-zero elements of a small fixed array on the
// GPU and prints the kept elements. Returns 0 on success, 1 on any CUDA
// runtime failure.
int main() {
  const int N = 8;
  int h_in[N] = {3, 0, 5, 0, 2, 7, 0, 4};

  int * d_in = nullptr;
  int * d_out = nullptr;
  int * d_count = nullptr;

  CUDA_CHECK(cudaMalloc(&d_in, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_out, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_count, sizeof(int)));

  CUDA_CHECK(cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice));

  // One block of N threads; the kernel is single-block by design.
  dim3 block(N);
  dim3 grid(1);
  size_t shmemBytes = N * sizeof(int); // shared memory size for scan[]

  compact_kernel<<<grid, block, shmemBytes>>>(d_in, d_out, d_count, N);
  CUDA_CHECK(cudaGetLastError());       // catch bad launch configuration
  CUDA_CHECK(cudaDeviceSynchronize());  // catch in-kernel execution errors

  int h_out[N];
  int h_count = 0;
  CUDA_CHECK(cudaMemcpy(h_out, d_out, N * sizeof(int),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(&h_count, d_count, sizeof(int),
                        cudaMemcpyDeviceToHost));

  printf("Kept %d elements:\n", h_count);
  for (int i = 0; i < h_count; ++i) {
    printf("%d ", h_out[i]);
  }
  printf("\n");

  CUDA_CHECK(cudaFree(d_in));
  CUDA_CHECK(cudaFree(d_out));
  CUDA_CHECK(cudaFree(d_count));
  return 0;
}

0 commit comments

Comments
 (0)