-
Notifications
You must be signed in to change notification settings - Fork 9
/
device_tools.cu
160 lines (137 loc) · 6.92 KB
/
device_tools.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
/**
* @author Daniel Thun and Christoph Schaefer
*
* @section LICENSE
* Copyright (c) 2019 Christoph Schaefer
*
* This file is part of miluphcuda.
*
* miluphcuda is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* miluphcuda is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with miluphcuda. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include <stdio.h>
#include "device_tools.h"
/*
Device information utilities.
Authors: Daniel Thun and Christoph Schaefer
Mainly taken from the CUDA samples.
*/
/* ********************************************************************* */
inline int _ConvertSMVer2Cores(int major, int minor)
/*!
 * Helper function to look up the number of CUDA cores per multiprocessor
 * for a given compute capability.
 * Adapted from cuda_samples/common/inc/helper_cuda.h
 *
 * \param major  compute capability major version
 * \param minor  compute capability minor version
 * \return cores per multiprocessor of the matching table entry; if the
 *         compute capability is not in the table, a notice is printed to
 *         stdout and the value of the last table entry is returned.
 *********************************************************************** */
{
    /* Maps an SM version to the number of cores per SM */
    typedef struct {
        int SM;    /* 0xMm (hexadecimal notation), M = SM major version, m = SM minor version */
        int Cores;
    } sSMtoCores;

    /* static const: the table is built once instead of on every call */
    static const sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 }, /* Tesla Generation (SM 1.0) G80 class */
        { 0x11,   8 }, /* Tesla Generation (SM 1.1) G8x class */
        { 0x12,   8 }, /* Tesla Generation (SM 1.2) G9x class */
        { 0x13,   8 }, /* Tesla Generation (SM 1.3) GT200 class */
        { 0x20,  32 }, /* Fermi Generation (SM 2.0) GF100 class */
        { 0x21,  48 }, /* Fermi Generation (SM 2.1) GF10x class */
        { 0x30, 192 }, /* Kepler Generation (SM 3.0) GK10x class */
        { 0x32, 192 }, /* Kepler Generation (SM 3.2) GK10x class */
        { 0x35, 192 }, /* Kepler Generation (SM 3.5) GK11x class */
        { 0x37, 192 }, /* Kepler Generation (SM 3.7) GK21x class */
        { 0x50, 128 }, /* Maxwell Generation (SM 5.0) GM10x class */
        { 0x52, 128 }, /* Maxwell Generation (SM 5.2) */
        { 0x53, 128 }, /* Maxwell Generation (SM 5.3) */
        { 0x60,  64 }, /* Pascal Generation (SM 6.0) */
        { 0x61, 128 }, /* Pascal Generation (SM 6.1) */
        { 0x62, 128 }, /* Pascal Generation (SM 6.2) */
        { 0x70,  64 }, /* Volta Generation (SM 7.0) */
        { 0x72,  64 }, /* Volta Generation (SM 7.2) */
        { 0x75,  64 }, /* Turing Generation (SM 7.5) */
        { 0x80,  64 }, /* Ampere Generation (SM 8.0) */
        { 0x86, 128 }, /* Ampere Generation (SM 8.6) */
        { 0x87, 128 }, /* Ampere Generation (SM 8.7) */
        { 0x89, 128 }, /* Ada Lovelace Generation (SM 8.9) */
        { 0x90, 128 }, /* Hopper Generation (SM 9.0) */
        { 0xa0, 128 }, /* Blackwell Generation (SM 10.0) */
        { 0xc0, 128 }  /* Blackwell Generation (SM 12.0) */
    };
    const int numEntries = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));
    const int smVersion = (major << 4) + minor;
    int index;

    for (index = 0; index < numEntries; index++) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    /* Unknown compute capability: fall back to the last (newest) table entry so callers still get a usable value */
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
           major, minor, nGpuArchCoresPerSM[numEntries - 1].Cores);
    return nGpuArchCoresPerSM[numEntries - 1].Cores;
}
/* ********************************************************************* */
void printfDeviceInformation(void)
/*!
 * Prints some basic information about every detected CUDA device.
 * Adapted from cuda samples/1_Utilities/deviceQuery
 *
 * Terminates the process with exit(1) if the device count cannot be
 * queried or if no CUDA device is found. A device whose properties
 * cannot be queried is reported on stderr and skipped.
 *
 * Side effect: for every listed device with compute capability >= 3.x,
 * cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) is called.
 *********************************************************************** */
{
    int i, device_count = 0, driverVersion = 0, runtimeVersion = 0;
    int coresPerSM;
    struct cudaDeviceProp prop;
    cudaError_t err;

    /* device_count is initialised and the call is checked so that a failing
       runtime never leads to a loop over an indeterminate count */
    err = cudaGetDeviceCount(&device_count);
    if (err != cudaSuccess) {
        fprintf(stderr, "\ncudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        exit(1);
    }
    if (device_count == 0) {
        printf("\nNo device(s) that support CUDA found!\n");
        exit(1);
    }

    /* Driver and runtime versions do not depend on the device: query them once.
       On failure the version is reported as 0.0 */
    if (cudaDriverGetVersion(&driverVersion) != cudaSuccess) {
        driverVersion = 0;
    }
    if (cudaRuntimeGetVersion(&runtimeVersion) != cudaSuccess) {
        runtimeVersion = 0;
    }

    for (i = 0; i < device_count; i++) {
        // cudaSetDevice(i);
        err = cudaGetDeviceProperties(&prop, i);
        if (err != cudaSuccess) {
            /* prop would be uninitialised: skip this device instead of printing garbage */
            fprintf(stderr, "\ncudaGetDeviceProperties failed for device %d: %s\n", i, cudaGetErrorString(err));
            continue;
        }

        /* Looked up once so that the "undefined SM" notice is printed at most once per device */
        coresPerSM = _ConvertSMVer2Cores(prop.major, prop.minor);

        printf("\nGeneral Information for Device %d -- %s\n\n", i, prop.name);
        /* CUDA versions are encoded as 1000*major + 10*minor */
        printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
        printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);
        printf(" Compute capability: %d.%d\n\n", prop.major, prop.minor);
        printf(" Multiprocessors: %d\n", prop.multiProcessorCount);
        printf(" CUDA Cores / Multiprocessor: %d\n", coresPerSM);
        printf(" Total amount of CUDA Cores: %d\n", coresPerSM*prop.multiProcessorCount);
        printf(" GPU clock rate: %0.f MHz\n\n", prop.clockRate * 1e-3f);
#if CUDART_VERSION >= 5000
        /* This is supported in CUDA 5.0 (runtime API device properties) */
        printf(" Memory Clock rate: %.0f Mhz\n", prop.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", prop.memoryBusWidth);
        if (prop.l2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n\n", prop.l2CacheSize);
        }
#endif
        /* double keeps the byte count exact before scaling to MBytes */
        printf(" Total amount of global memory: %.0f MBytes\n", (double)prop.totalGlobalMem/1048576.0);
        /* size_t fields: %zu is the portable conversion (%lu is wrong where long is 32 bit) */
        printf(" Total amount of constant memory: %zu bytes\n", prop.totalConstMem);
        printf(" Total amount of shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);
        /* if prop.major >= 3 set shared memory bank size to 8 byte */
        if (prop.major >= 3) {
            /* NOTE(review): no cudaSetDevice(i) is issued in this loop, so this call
               presumably targets the currently active device rather than device i,
               and its return value is not inspected - confirm this is intended */
            cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
            printf(" Shared memory bank size %d bytes\n", 8);
        }
        else {
            printf(" Shared memory bank size %d bytes\n", 4);
        }
        printf(" Total number of registers available per block: %d\n", prop.regsPerBlock);
        printf(" Warp size: %d\n\n", prop.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", prop.ECCEnabled ? "Enabled" : "Disabled");
    }
}