-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu_replication_soa.cu
116 lines (92 loc) · 3.43 KB
/
gpu_replication_soa.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*************************************************************************
Copyright (C) 2016 Evandro Taquary
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*************************************************************************/
#include <sys/time.h>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include "modcpy.h"
using namespace std;
/*
 * CHECK(call): evaluate a CUDA runtime call and terminate the program on failure.
 *
 * `call` must yield a cudaError_t. When the result differs from cudaSuccess the
 * source file, line number, numeric error code and the cudaGetErrorString()
 * description are written to standard output and the process exits with
 * EXIT_FAILURE. On success the macro has no further effect.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` is exactly one
 * statement and stays valid inside an unbraced if/else. `call` is parenthesized
 * so that any expression may be passed, and the temporary carries a trailing
 * underscore so it cannot capture an identifier used inside `call`.
 */
#define CHECK(call)                                                              \
do {                                                                             \
    const cudaError_t checkStatus_ = (call);                                     \
    if (checkStatus_ != cudaSuccess)                                             \
    {                                                                            \
        std::cout << "Error: " << __FILE__ << ": " << __LINE__ << ", ";          \
        std::cout << "code: " << checkStatus_ << ", reason: "                    \
                  << cudaGetErrorString(checkStatus_) << std::endl;              \
        exit(EXIT_FAILURE);                                                      \
    }                                                                            \
} while (0)
/*
 * Benchmark driver for the modcpy replication kernel.
 *
 * Builds one tree on the host as four consecutive arrays (parent, child1,
 * child2 as short; branch as double), pads it to a multiple of sizeof(int4),
 * uploads it, launches modcpy to produce REPS back-to-back copies on the
 * device, reports the wall-clock time of the launch plus synchronization, and
 * finally downloads the copies and compares every element against the values
 * written into the source tree.
 *
 * Command line: <program> #nodes #replications
 *   #nodes         number of tree nodes; must be in [1, SHRT_MAX + 1] because
 *                  node indices are stored as short
 *   #replications  number of copies to produce; must be >= 1
 *
 * Exit status: EXIT_SUCCESS when every replica matches the source tree;
 * EXIT_FAILURE on bad arguments, host allocation failure, a CUDA error, or a
 * data mismatch.
 */
int main(int argc, char *argv[])
{
    if (argc != 3) {
        cout << "Usage: " << argv[0] << " #nodes #replications" << endl;
        exit(EXIT_FAILURE);
    }

    CHECK(cudaSetDevice(0));

    const int NODS = atoi(argv[1]);
    const int REPS = atoi(argv[2]);

    /* The largest stored index is NODS-1 and has to be representable as short;
       non-positive counts would yield an empty grid or a wrapped size_t. */
    const int MAX_NODS = SHRT_MAX + 1;
    if (NODS <= 0 || NODS > MAX_NODS || REPS <= 0) {
        cout << "Error: #nodes must be in [1, " << MAX_NODS
             << "] and #replications must be at least 1" << endl;
        exit(EXIT_FAILURE);
    }

    /* One tree: three short arrays followed by one double array, rounded up to
       a whole number of int4 elements. */
    size_t treeBytes = 3*sizeof(short)*NODS + sizeof(double)*NODS;
    const size_t remainder = treeBytes % sizeof(int4);
    if (remainder)
        treeBytes += sizeof(int4) - remainder;
    const size_t repBytes = treeBytes * REPS;

    /* Launch geometry: one thread per int4 of output, warpSize*32 threads per block. */
    cudaDeviceProp device;
    CHECK(cudaGetDeviceProperties(&device, 0));
    const int blockSize = device.warpSize*32;
    const size_t blocksNeeded = (repBytes/sizeof(int4) + (blockSize-1)) / blockSize;
    if (blocksNeeded > (size_t) device.maxGridSize[0]) {
        cout << "Error: " << blocksNeeded << " blocks exceed the device grid limit of "
             << device.maxGridSize[0] << endl;
        exit(EXIT_FAILURE);
    }
    const int gridSize = (int) blocksNeeded;
    dim3 grid = dim3(gridSize);
    dim3 block = dim3(blockSize);

    void *h_tree = malloc(treeBytes);
    void *h_replics = malloc(repBytes);
    if (h_tree == NULL || h_replics == NULL) {
        cout << "Error: host memory allocation failed" << endl;
        free(h_replics);
        free(h_tree);
        exit(EXIT_FAILURE);
    }

    memset(h_tree, 0, treeBytes);
    short *parent = (short*) h_tree;
    short *child1 = parent+NODS;
    short *child2 = child1+NODS;
    /* The branch array starts 6*NODS bytes into the buffer, which is not a
       multiple of sizeof(double) unless NODS is a multiple of 4, so it is
       accessed through memcpy rather than through a double pointer. */
    char *branch = (char*) (child2+NODS);
    for (int i = 0; i < NODS; i++) {
        const short index = (short) i;
        const double length = i;
        parent[i] = index;
        child1[i] = index;
        child2[i] = index;
        memcpy(branch + (size_t) i*sizeof(double), &length, sizeof(double));
    }

    void *d_tree;
    void *d_replics;
    CHECK(cudaMalloc((void**) &d_tree, treeBytes));
    CHECK(cudaMemcpy(d_tree, h_tree, treeBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMalloc((void **) &d_replics, repBytes));

    struct timeval begin, end;
    float time_spent;

    /*******************************PARALLEL MEASUREMENT*******************************/
    gettimeofday(&begin, NULL);
    modcpy<<<grid, block>>>(d_replics, d_tree, repBytes, treeBytes);
    CHECK(cudaGetLastError());          /* reports an invalid launch configuration */
    CHECK(cudaDeviceSynchronize());     /* waits for the kernel and reports execution errors */
    gettimeofday(&end, NULL);
    time_spent = (float) (end.tv_usec - begin.tv_usec)/1000 + (end.tv_sec - begin.tv_sec)*1000;
    cout << "Time spent:\t" << time_spent << "ms " << endl;
    /*******************************PARALLEL MEASUREMENT*******************************/

    CHECK(cudaMemcpy(h_replics, d_replics, repBytes, cudaMemcpyDeviceToHost));

    /* Walk every replica and compare each field with the value written above. */
    bool match = true;
    const char *replicas = (const char*) h_replics;
    for (int i = 0; i < REPS && match; i++) {
        const short *repParent = (const short*) (replicas + treeBytes*i);
        const short *repChild1 = repParent+NODS;
        const short *repChild2 = repChild1+NODS;
        const char *repBranch = (const char*) (repChild2+NODS);
        for (int j = 0; j < NODS; j++) {
            double length;
            memcpy(&length, repBranch + (size_t) j*sizeof(double), sizeof(double));
            if (repParent[j] != j || repChild1[j] != j || repChild2[j] != j || length != j) {
                match = false;
                break;
            }
        }
    }

    /* Release everything on both the success and the mismatch path. */
    CHECK(cudaFree(d_replics));
    CHECK(cudaFree(d_tree));
    free(h_replics);
    free(h_tree);
    CHECK(cudaDeviceReset());

    if (!match) {
        cout << "Data doesn't match!" << endl;
        exit(EXIT_FAILURE);
    }
    cout << "Data does match!" << endl;
    exit(EXIT_SUCCESS);
}