// NOTE(review): removed non-source residue from a web-page scrape
// (site navigation text and the line-number gutter); source begins below.
/*
Program to find the sum of all the array elements given that the length of the array is a power of 2.
Benchmarking is done to compare the performance of the CPU (sequential, with extremely powerful cores) and
the GPU (parallel, with modestly powerful cores).
*/
/*
The sequential code takes n-1 steps whereas the parallel code takes log2(n) steps.
Note: The algorithm can handle only sizes less than or equal to 2^11.
*/
// Importing the required headers
#include<stdio.h>
#include<time.h>
#include<cuda.h>
// Returns the elapsed time between two timestamps, in seconds.
// Combines the whole-second difference with the nanosecond difference
// scaled to seconds (1 ns = 1e-9 s); a negative nanosecond difference
// is absorbed correctly by the floating-point sum.
double time_elapsed(struct timespec *start, struct timespec *end)
{
    double seconds = (double)(end->tv_sec - start->tv_sec);
    double nanos   = (double)(end->tv_nsec - start->tv_nsec);
    return seconds + nanos * 1e-9;
}
// GPU Kernel: in-place tree reduction over global memory (no shared memory).
// Expects a single block of len/2 threads, with len a power of 2; on return
// a[0] holds the sum of the first len elements. Each pass halves the number
// of active threads; the barrier sits outside the divergent branch so every
// thread in the block reaches it.
__global__ void GPU_Sum1(int *a, int len) //GPU code without shared memory.
{
    const int tid = threadIdx.x;
    for (int active = blockDim.x; active > 0; active >>= 1)
    {
        if (tid < active)
            a[tid] += a[tid + active]; // writes [0,active) read [active,2*active): disjoint
        __syncthreads();               // publish this pass before the next one reads
    }
}
// GPU Kernel: tree reduction staged through dynamic shared memory.
// Expects a single block of len/2 threads, len a power of 2, and
// len*sizeof(int) bytes of dynamic shared memory (third launch argument).
// Each thread copies two elements in, the block reduces in shared memory,
// and thread 0 writes the total back to array[0].
__global__ void GPU_Sum2(int *array, int len) //GPU code with shared memory.
{
    extern __shared__ int buf[];
    const int tid  = threadIdx.x;
    const int half = blockDim.x;
    // Stage the input: two elements per thread covers all len values.
    buf[tid]        = array[tid];
    buf[tid + half] = array[tid + half];
    __syncthreads(); // all loads visible before the reduction starts
    for (int active = half; active > 0; active >>= 1)
    {
        if (tid < active)
            buf[tid] += buf[tid + active];
        __syncthreads(); // barrier outside the branch: reached by every thread
    }
    if (tid == 0)
        array[0] = buf[0]; // only the total is written back to global memory
}
// CPU Function: sequential reduction into a[0].
// Accumulates from the back of the array, so after the call a[i] holds the
// suffix sum a[i] + a[i+1] + ... + a[n-1]; in particular a[0] is the total.
void CPU_Sum(int *a, int n)
{
    int i = n - 1;
    while (i > 0)
    {
        a[i - 1] += a[i];
        --i;
    }
}
// Code execution begins here.
// Reads n, fills three identical random arrays, and benchmarks the reduction
// three ways: GPU without shared memory, GPU with shared memory, and CPU.
// Returns 0 on success, 1 on bad input or any CUDA failure.
int main()
{
    struct timespec start1, end1; // timestamps for GPU (no shared memory)
    struct timespec start2, end2; // timestamps for GPU (shared memory)
    struct timespec start3, end3; // timestamps for CPU
    int n;                        // length of the array
    printf("Enter the value of n: ");
    if (scanf("%d", &n) != 1)     // reject non-numeric input instead of using garbage
    {
        printf("Input Error!\n");
        return 1;
    }
    // The kernels launch one block of n/2 threads (max 1024 per block) and
    // assume a power-of-2 length, so n must satisfy 2 <= n <= 2^11 = 2048.
    // Without this check an oversized n makes the launch fail silently.
    if (n < 2 || n > 2048 || (n & (n - 1)) != 0)
    {
        printf("n must be a power of 2 with 2 <= n <= 2048.\n");
        return 1;
    }
    int *a1, *a2, *a3;
    if (cudaMallocManaged(&a1, n * sizeof(int)) != cudaSuccess) // allocate managed memory
    {
        printf("Malloc Error!\n");
        return 1;
    }
    if (cudaMallocManaged(&a2, n * sizeof(int)) != cudaSuccess)
    {
        printf("Malloc Error!\n");
        cudaFree(a1);
        return 1;
    }
    if (cudaMallocManaged(&a3, n * sizeof(int)) != cudaSuccess)
    {
        printf("Malloc Error!\n");
        cudaFree(a1);
        cudaFree(a2);
        return 1;
    }
    for (int i = 0; i < n; i++) // same random values in all three arrays
    {
        a1[i] = rand() % 10;
        a2[i] = a1[i];
        a3[i] = a2[i];
    }
    clock_gettime(CLOCK_REALTIME, &start1); // start timestamp
    GPU_Sum1<<<1, n / 2>>>(a1, n);
    // cudaGetLastError catches launch-configuration failures; the sync result
    // catches asynchronous execution failures. Without both, errors are silent.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
    {
        printf("Kernel Error (GPU_Sum1)!\n");
        cudaFree(a1);
        cudaFree(a2);
        cudaFree(a3);
        return 1;
    }
    clock_gettime(CLOCK_REALTIME, &end1); // end timestamp
    clock_gettime(CLOCK_REALTIME, &start2);
    GPU_Sum2<<<1, n / 2, n * sizeof(int)>>>(a2, n); // third arg: dynamic shared bytes
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess)
    {
        printf("Kernel Error (GPU_Sum2)!\n");
        cudaFree(a1);
        cudaFree(a2);
        cudaFree(a3);
        return 1;
    }
    clock_gettime(CLOCK_REALTIME, &end2);
    clock_gettime(CLOCK_REALTIME, &start3);
    CPU_Sum(a3, n);
    clock_gettime(CLOCK_REALTIME, &end3);
    printf("\nResult of the GPU (without shared memory) : %d\n", a1[0]);
    printf("Result of the GPU (with shared memory) : %d\n", a2[0]);
    printf("Result of the CPU : %d\n", a3[0]);
    printf("\nTime taken by GPU (no shared memory) is : %lf\n", time_elapsed(&start1, &end1));
    printf("Time taken by GPU (with shared memory) is : %lf\n", time_elapsed(&start2, &end2));
    printf("Time taken by CPU is : %lf\n", time_elapsed(&start3, &end3));
    cudaFree(a1);
    cudaFree(a2);
    cudaFree(a3);
    cudaDeviceReset();
    return 0;
}