#pragma once

#include <cstdint>
#include <type_traits>
#include <c10/util/Exception.h>
#include <c10/macros/Macros.h>
#include <ATen/detail/FunctionTraits.h>
#include <ATen/cuda/detail/OffsetCalculator.cuh>

// References:
// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/

namespace at { namespace native { namespace memory {

namespace detail {
// What does `static_unroll` do?
//
// We want to do something like:
//
//   using args_t = typename traits::ArgsTuple;
//   args_t args;
//   #pragma unroll
//   for (int i = 0; i < traits::arity; i++) {
//     std::get<i>(args) = ....
//   }
//
// but unfortunately the above code does not work: `i` is a runtime variable,
// while the template argument of `std::get` has to be a compile-time constant.
// So `static_unroll` is created to simulate `#pragma unroll` using template
// metaprogramming. (A small usage sketch follows the definitions below.)
template<template<int i> typename func, int end, int current=0>
struct static_unroll {
  template<typename... Args>
  static inline C10_HOST_DEVICE void with_args(Args&&... args) {
    func<current>::apply(std::forward<Args>(args)...);
    static_unroll<func, end, current+1>::with_args(args...);
  }
};

template<template<int i> typename func, int end>
struct static_unroll<func, end, end> {
  template<typename... Args>
  static inline C10_HOST_DEVICE void with_args(Args... args) {}
};
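
// Usage sketch (illustrative only, not part of this header): a hypothetical
// `print_index` functor applied for indices 0, 1, 2 via static_unroll.
//
//   template<int i>
//   struct print_index {
//     static inline C10_HOST_DEVICE void apply(int base) {
//       printf("%d\n", base + i);
//     }
//   };
//
//   // expands to:
//   //   print_index<0>::apply(10); print_index<1>::apply(10); print_index<2>::apply(10);
//   static_unroll<print_index, 3>::with_args(10);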

// helper structs to be used with static_unroll to load arguments
// one by one

template<int arg_index>
struct vectorized_load_helper {
  template <typename args_t, typename policy_t>
  static __device__ void apply(policy_t &self, args_t *args, int idx) {
    using arg_t = std::tuple_element_t<arg_index, args_t>;
    // `data` holds the data_ptr for tensors [output, input0, input1, ...], so we
    // need a +1 offset to get the input
    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + block_work_size * idx;
    auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get<arg_index>(args[thread_unroll_idx]); };
    self.load_single_arg(args_accessor, ptr);
  }
};

template<int arg_index>
struct unroll_load_helper {
  template <typename args_t, typename policy_t, typename offset_t>
  static __device__ void apply(policy_t &self, args_t *args, offset_t offset, int j) {
    using arg_t = std::tuple_element_t<arg_index, args_t>;
    // `data` holds the data_ptr for tensors [output, input0, input1, ...], so we
    // need a +1 offset to get the input
    auto ptr = reinterpret_cast<arg_t *>(self.data[arg_index + 1]) + offset[arg_index];
    std::get<arg_index>(args[j]) = *ptr;
  }
};

}  // namespace detail

// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
  scalar_t val[vec_size];
};
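
// For illustration (a minimal sketch, not part of this header): reinterpreting a
// suitably aligned float pointer as aligned_vector<float, 4> lets one assignment
// move 16 bytes at a time, which the compiler can emit as a single vectorized
// load/store. The kernel name and indexing below are made up for the example.
//
//   using vec4 = aligned_vector<float, 4>;   // alignas(16), holds 4 floats
//   __global__ void copy_vec4(float *dst, const float *src) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     vec4 v = reinterpret_cast<const vec4 *>(src)[i];  // one 128-bit load
//     reinterpret_cast<vec4 *>(dst)[i] = v;             // one 128-bit store
//   }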

namespace policies {

// Assumption:
// all tensors are contiguous, that is: stride == sizeof(type) for all tensors

template<typename data_t, typename inp_calc_t, typename out_calc_t>
struct unroll {
  data_t data;
  int remaining;
  inp_calc_t input_offset_calculator;
  out_calc_t output_offset_calculator;

  __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc):
    data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {}

  __device__ inline bool check_inbounds(int thread_work_elem) {
    return ((threadIdx.x + thread_work_elem*num_threads) < remaining);
  }

  template<typename args_t>
  __device__ inline void load(args_t *args, int idx) {
    constexpr int arity = std::tuple_size<args_t>::value;
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i < thread_work_size; i++) {
      if (thread_idx >= remaining) {
        return;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      auto offset = input_offset_calculator.get(linear_idx);
      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, i);
      thread_idx += num_threads;
    }
  }

  template<typename scalar_t>
  __device__ inline void store(scalar_t *from, int idx) {
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i < thread_work_size; i++) {
      if (thread_idx >= remaining) {
        return;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      int offset = output_offset_calculator.get(linear_idx)[0];
      // `to` is recomputed per element from the output offset calculator
      scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + offset;
      *to = from[i];
      thread_idx += num_threads;
    }
  }
};

// Assumption:
// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
// Note:
// Functions in the vectorized policy do not do boundary checks. They assume the
// whole block has a full block's worth of work to do, so any remainder must be
// handled by the caller manually.
template <int vec_size, typename data_t>  // vec_size: number of scalars, can be 1, 2, or 4.
struct vectorized {
  static_assert(thread_work_size % vec_size == 0, "The workload per thread must be a multiple of vec_size");
  static constexpr int loop_size = thread_work_size / vec_size;

  data_t data;

  __device__ vectorized(data_t data) : data(data) {}

  __device__ inline constexpr bool check_inbounds(int thread_work_elem) {
    return true;
  }

  template<typename accessor_t, typename scalar_t>
  __device__ inline void load_single_arg(accessor_t to, scalar_t *from) {
    using vec_t = aligned_vector<scalar_t, vec_size>;
    vec_t *from_ = reinterpret_cast<vec_t *>(from);
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i < loop_size; i++) {
      int index = thread_idx + i * num_threads;
      vec_t v = from_[index];
      #pragma unroll
      for (int j = 0; j < vec_size; j++) {
        to(vec_size * i + j) = v.val[j];
      }
    }
  }

  template<typename args_t>
  __device__ inline void load(args_t *args, int idx) {
    constexpr int arity = std::tuple_size<args_t>::value;
    detail::static_unroll<detail::vectorized_load_helper, arity>::with_args(*this, args, idx);
  }

  template<typename scalar_t>
  __device__ inline void store(scalar_t *from, int idx) {
    using vec_t = aligned_vector<scalar_t, vec_size>;
    scalar_t *to = reinterpret_cast<scalar_t *>(data[0]) + block_work_size * idx;
    vec_t *to_ = reinterpret_cast<vec_t *>(to);
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i < loop_size; i++) {
      int index = thread_idx + i * num_threads;
      vec_t v;
      for (int j = 0; j < vec_size; j++) {
        v.val[j] = from[vec_size * i + j];
      }
      to_[index] = v;
    }
  }
};

} // namespace policies
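
// A rough sketch of how a caller might drive one of these policies (the helper
// name and surrounding structure below are assumptions for illustration; the
// actual driver lives in the GPU loops code, not in this header):
//
//   template<typename func_t, typename policy_t>
//   __device__ void elementwise_helper(func_t f, policy_t policy) {
//     using traits = function_traits<func_t>;
//     using return_t = typename traits::result_type;
//     using args_t = typename traits::ArgsTuple;
//
//     return_t results[thread_work_size];
//     args_t args[thread_work_size];
//
//     policy.load(args, blockIdx.x);              // gather inputs for this block
//     #pragma unroll
//     for (int i = 0; i < thread_work_size; i++) {
//       if (policy.check_inbounds(i)) {
//         results[i] = c10::guts::apply(f, args[i]);
//       }
//     }
//     policy.store(results, blockIdx.x);          // scatter outputs
//   }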

// This is only used on the host, but we will wrap it into templates
// that are C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE
// in order to compile.
template<typename scalar_t>
inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) {
  uint64_t address = reinterpret_cast<uint64_t>(pointer);
  constexpr int vec2_alignment = std::alignment_of<aligned_vector<scalar_t, 2>>::value;
  constexpr int vec4_alignment = std::alignment_of<aligned_vector<scalar_t, 4>>::value;
  if (address % vec4_alignment == 0) {
    return 4;
  } else if (address % vec2_alignment == 0) {
    return 2;
  }
  return 1;
}
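
// For example, with scalar_t = float (4 bytes), aligned_vector<float, 4> is
// 16-byte aligned and aligned_vector<float, 2> is 8-byte aligned, so:
//
//   can_vectorize_up_to<float>(p);   // 4 if p is 16-byte aligned,
//                                    // 2 if p is only 8-byte aligned,
//                                    // 1 otherwise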

template<int i>
struct can_vectorize_up_to_helper {
  template <typename array_t, typename traits>
  static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits _) {
    using arg_t = typename traits::template arg<i>::type;
    // `pointers` holds the data_ptr for tensors [output, input0, input1, ...], so we
    // need a +1 offset to get the input
    result = std::min<int>(result, can_vectorize_up_to<arg_t>(pointers[i + 1]));
  }
};

template<typename func_t, typename array_t>
inline int can_vectorize_up_to(array_t pointers) {
  using traits = function_traits<func_t>;
  using return_t = typename traits::result_type;
  constexpr int arity = traits::arity;
  int result = can_vectorize_up_to<return_t>(pointers[0]);
  // We need to get the type for each argument of `func_t`; this can only
  // be done at compile time.
  detail::static_unroll<can_vectorize_up_to_helper, arity>::with_args(result, pointers, traits());
  return result;
}
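
// Usage sketch (the lambda, the pointer names, and the array packing below are
// made-up assumptions for illustration; callers are expected to pass pointers
// in [output, input0, input1, ...] order):
//
//   auto add = [] __device__ (float a, float b) -> float { return a + b; };
//   at::detail::Array<char*, 3> data;   // {out_ptr, in0_ptr, in1_ptr}
//   int vec_size = can_vectorize_up_to<decltype(add)>(data);  // 1, 2, or 4:
//   // the largest vec size whose alignment is satisfied by all three pointers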
}}} // namespace at::native::memory