// NOTE(review): removed non-source residue from a web-page scrape
// (site navigation text and the line-number gutter); source begins below.
/*
Program to find the sum of all the array elements given that the length of the array is a power of 2.
Benchmarking is done to compare the performance of the CPU (sequential, with extremely powerful cores) and
the GPU (parallel, with modestly powerful cores).
*/
/*
The sequential code takes n-1 steps whereas the parallel code takes log2(n) steps.
Note: The algorithm can handle only sizes less than or equal to 2^11.
*/
// Importing the required headers
#include<stdio.h>
#include<time.h>
#include<cuda.h>
// Returns the elapsed time between two timestamps, in seconds.
// Combines the whole-second difference with the nanosecond difference
// scaled to seconds (1 ns = 1e-9 s); a negative nanosecond difference
// is absorbed correctly by the floating-point sum.
double time_elapsed(struct timespec *start, struct timespec *end)
{
    double seconds = (double)(end->tv_sec - start->tv_sec);
    double nanos   = (double)(end->tv_nsec - start->tv_nsec);
    return seconds + nanos * 1e-9;
}
// GPU Kernel: in-place tree reduction over global memory (no shared memory).
// Expects a single block of len/2 threads, with len a power of 2; on return
// a[0] holds the sum of the first len elements. Each pass halves the number
// of active threads; the barrier sits outside the divergent branch so every
// thread in the block reaches it.
__global__ void GPU_Sum1(int *a, int len) //GPU code without shared memory.
{
    const int tid = threadIdx.x;
    for (int active = blockDim.x; active > 0; active >>= 1)
    {
        if (tid < active)
            a[tid] += a[tid + active]; // writes [0,active) read [active,2*active): disjoint
        __syncthreads();               // publish this pass before the next one reads
    }
}
// GPU Kernel: tree reduction staged through dynamic shared memory.
// Expects a single block of len/2 threads, len a power of 2, and
// len*sizeof(int) bytes of dynamic shared memory (third launch argument).
// Each thread copies two elements in, the block reduces in shared memory,
// and thread 0 writes the total back to array[0].
__global__ void GPU_Sum2(int *array, int len) //GPU code with shared memory.
{
    extern __shared__ int buf[];
    const int tid  = threadIdx.x;
    const int half = blockDim.x;
    // Stage the input: two elements per thread covers all len values.
    buf[tid]        = array[tid];
    buf[tid + half] = array[tid + half];
    __syncthreads(); // all loads visible before the reduction starts
    for (int active = half; active > 0; active >>= 1)
    {
        if (tid < active)
            buf[tid] += buf[tid + active];
        __syncthreads(); // barrier outside the branch: reached by every thread
    }
    if (tid == 0)
        array[0] = buf[0]; // only the total is written back to global memory
}
// CPU Function: sequential reduction into a[0].
// Accumulates from the back of the array, so after the call a[i] holds the
// suffix sum a[i] + a[i+1] + ... + a[n-1]; in particular a[0] is the total.
void CPU_Sum(int *a, int n)
{
    int i = n - 1;
    while (i > 0)
    {
        a[i - 1] += a[i];
        --i;
    }
}
// Code execution begins here.
// Reads n, fills three identical random arrays, and benchmarks the reduction
// three ways: GPU without shared memory, GPU with shared memory, and CPU.
// Returns 0 on success, 1 on bad input or any CUDA failure.
int main()
{
    struct timespec start1, end1; // timestamps for GPU (no shared memory)
    struct timespec start2, end2; // timestamps for GPU (shared memory)
    struct timespec start3, end3; // timestamps for CPU
    int n;                        // length of the array
    printf("Enter the value of n: ");
    if (scanf("%d", &n) != 1)     // reject non-numeric input instead of using garbage
    {
        printf("Input Error!\n");
        return 1;
    }
    // The kernels launch one block of n/2 threads (max 1024 per block) and
    // assume a power-of-2 length, so n must satisfy 2 <= n <= 2^11 = 2048.
    // Without this check an oversized n makes the launch fail silently.
    if (n < 2 || n > 2048 || (n & (n - 1)) != 0)
    {
        printf("n must be a power of 2 with 2 <= n <= 2048.\n");
        return 1;
    }
    int *a1, *a2, *a3;
    if (cudaMallocManaged(&a1, n * sizeof(int)) != cudaSuccess) // allocate managed memory
    {
        printf("Malloc Error!\n");
        return 1;
    }
    if (cudaMallocManaged(&a2, n * sizeof(int)) != cudaSuccess)
    {
        printf("Malloc Error!\n");
        cudaFree(a1);
        return 1;
    }
    if (cudaMallocManaged(&a3, n * sizeof(int)) != cudaSuccess)
    {
        printf("Malloc Error!\n");
        cudaFree(a1);
        cudaFree(a2);
        return 1;
    }
    for (int i = 0; i < n; i++) // same random values in all three arrays
    {
        a1[i] = rand() % 10;
        a2[i] = a1[i];
        a3[i] = a2[i];
    }
    clock_gettime(CLOCK_REALTIME, &start1); // start timestamp
    GPU_Sum1<<<1, n / 2>>>(a1, n);
    // cudaGetLastError catches launch-configuration failures; the sync result
    // catches asynchronous execution failures. Without both, errors are silent.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
    {
        printf("Kernel Error (GPU_Sum1)!\n");
        cudaFree(a1);
        cudaFree(a2);
        cudaFree(a3);
        return 1;
    }
    clock_gettime(CLOCK_REALTIME, &end1); // end timestamp
    clock_gettime(CLOCK_REALTIME, &start2);
    GPU_Sum2<<<1, n / 2, n * sizeof(int)>>>(a2, n); // third arg: dynamic shared bytes
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
    {
        printf("Kernel Error (GPU_Sum2)!\n");
        cudaFree(a1);
        cudaFree(a2);
        cudaFree(a3);
        return 1;
    }
    clock_gettime(CLOCK_REALTIME, &end2);
    clock_gettime(CLOCK_REALTIME, &start3);
    CPU_Sum(a3, n);
    clock_gettime(CLOCK_REALTIME, &end3);
    printf("\nResult of the GPU (without shared memory) : %d\n", a1[0]);
    printf("Result of the GPU (with shared memory) : %d\n", a2[0]);
    printf("Result of the CPU : %d\n", a3[0]);
    printf("\nTime taken by GPU (no shared memory) is : %lf\n", time_elapsed(&start1, &end1));
    printf("Time taken by GPU (with shared memory) is : %lf\n", time_elapsed(&start2, &end2));
    printf("Time taken by CPU is : %lf\n", time_elapsed(&start3, &end3));
    cudaFree(a1);
    cudaFree(a2);
    cudaFree(a3);
    cudaDeviceReset();
    return 0;
}