Skip to content

Commit fe875d8

Browse files
committed
cuda: add prefix-sum example
1 parent bc4be79 commit fe875d8

File tree

3 files changed

+101
-1
lines changed

3 files changed

+101
-1
lines changed

gpu/cuda/.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,7 @@ array-add.ptx
99
matrix-mul
1010
matrix-mul-tiled
1111
template
12+
prefix-sum
13+
*.cubin
14+
*.ptx
15+
*.sass

gpu/cuda/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ array-add-ptx:
4747
template: src/template.cu
4848
nvcc -lnppc -o $@ $<
4949

50+
prefix-sum: src/prefix-sum.cu
51+
nvcc -arch=sm_86 -ptx $< -o $@.ptx
52+
nvcc -arch=sm_86 -cubin $< -o $@.cubin
53+
cuobjdump -sass $@.cubin > $@.sass
54+
nvcc -lnppc -G -g -o $@ $<
55+
5056
.PHONY: clean
5157
clean:
52-
@${RM} threads inc hello-world.ptx info wmma streams graphs array-add matrix-mul
58+
@${RM} threads inc hello-world.ptx info wmma streams graphs array-add matrix-mul prefix-sum

gpu/cuda/src/prefix-sum.cu

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#include <stdio.h>
2+
3+
// Recall that __restrict__ is a hint to the compiler that the pointers do not
// overlap in memory.
//
// Stream compaction: copies the non-zero elements of input[0..n) to the front
// of output[] (preserving order) and writes the number kept to *out_count.
//
// Launch contract (established by the code below):
//  - Single-block kernel: only threadIdx.x is used, so results are only
//    meaningful with one block covering all n elements.
//  - Dynamic shared memory must be at least blockDim.x * sizeof(int):
//    every thread writes scan[tid] unconditionally, even when tid >= n.
__global__ void compact_kernel(const int * __restrict__ input,
                               int * __restrict__ output,
                               int * __restrict__ out_count,
                               int n) {
  extern __shared__ int scan[]; // shared memory for flags + prefix sum

  int tid = threadIdx.x;

  int x = 0;
  int flag = 0;
  if (tid < n) {
    x = input[tid];
    flag = (x != 0); // 1 = include, 0 = discard
  }

  // Store flags in shared memory. Threads with tid >= n store 0, so they
  // contribute nothing to the prefix sums.
  scan[tid] = flag;

  // syncthread is a memory barrier, like a counter for the thread which needs
  // to be reached by all threads before any can proceed.
  __syncthreads();

  // Inclusive prefix sum over the flags (Hillis-Steele style: doubling
  // offsets, log2(blockDim.x) iterations). After the loop, scan[tid] holds
  // the count of kept elements in input[0..tid].
  for (int offset = 1; offset < blockDim.x; offset <<= 1) {
    // Read phase: capture the neighbor's value BEFORE anyone overwrites it.
    int val = 0;
    if (tid >= offset) {
      val = scan[tid - offset];
    }

    // Barrier between read and write phases — without it, another thread
    // could update scan[tid - offset] while we are still reading it.
    __syncthreads();

    scan[tid] += val;

    // Barrier so all writes of this round land before the next round reads.
    __syncthreads();
  }

  // Scatter: a kept element's destination is its inclusive prefix count
  // minus one.
  if (tid < n && flag == 1) {
    // convert to zero based index
    int outIndex = scan[tid] - 1;
    output[outIndex] = x;
  }

  // The last prefix value contains the total number of kept elements, similar
  // to using vector.back() in C++ to get it.
  if (tid == blockDim.x - 1) {
    *out_count = scan[tid];
  }
}
53+
54+
// Abort main with a diagnostic if a CUDA runtime call fails. Kernel launches
// themselves return nothing; their configuration errors are picked up by
// cudaGetLastError() and execution errors by the next synchronizing call.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

// Host driver: compacts the non-zero elements of a small fixed array on the
// GPU and prints the kept elements. Returns 0 on success, 1 on any CUDA
// runtime failure.
int main() {
  const int N = 8;
  int h_in[N] = {3, 0, 5, 0, 2, 7, 0, 4};

  int * d_in = nullptr;
  int * d_out = nullptr;
  int * d_count = nullptr;

  CUDA_CHECK(cudaMalloc(&d_in, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_out, N * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_count, sizeof(int)));

  CUDA_CHECK(cudaMemcpy(d_in, h_in, N * sizeof(int), cudaMemcpyHostToDevice));

  // One block of N threads; the kernel is single-block by design.
  dim3 block(N);
  dim3 grid(1);
  size_t shmemBytes = N * sizeof(int); // shared memory size for scan[]

  compact_kernel<<<grid, block, shmemBytes>>>(d_in, d_out, d_count, N);
  CUDA_CHECK(cudaGetLastError());       // catch bad launch configuration
  CUDA_CHECK(cudaDeviceSynchronize());  // catch in-kernel execution errors

  int h_out[N];
  int h_count = 0;
  CUDA_CHECK(cudaMemcpy(h_out, d_out, N * sizeof(int),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(&h_count, d_count, sizeof(int),
                        cudaMemcpyDeviceToHost));

  printf("Kept %d elements:\n", h_count);
  for (int i = 0; i < h_count; ++i) {
    printf("%d ", h_out[i]);
  }
  printf("\n");

  CUDA_CHECK(cudaFree(d_in));
  CUDA_CHECK(cudaFree(d_out));
  CUDA_CHECK(cudaFree(d_count));
  return 0;
}

0 commit comments

Comments
 (0)