// LinearAlgebra.cu
#include <ATen/ATen.h>
#include <ATen/LegacyTHFunctionsCUDA.h>
#include <ATen/cuda/CUDABlas.h>
#include <ATen/cuda/CUDAContext.h>

#include <mutex>
namespace at { namespace native {
// The batched matrix-multiply entry points still forward to the legacy THC
// implementations.
Tensor baddbmm_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) {
  return legacy::cuda::_th_baddbmm(self, batch1, batch2, beta, alpha);
}

Tensor& baddbmm_out_cuda(Tensor &result, const Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) {
  return legacy::cuda::_th_baddbmm_out(result, self, batch1, batch2, beta, alpha);
}

// In-place variant: writes the result back into self.
Tensor& baddbmm__cuda(Tensor& self, const Tensor& batch1, const Tensor& batch2, Scalar beta, Scalar alpha) {
  return legacy::cuda::_th_baddbmm_out(self, self, batch1, batch2, beta, alpha);
}

Tensor bmm_cuda(const Tensor& self, const Tensor& mat2) {
  return legacy::cuda::_th_bmm(self, mat2);
}

Tensor& bmm_out_cuda(Tensor &result, const Tensor& batch1, const Tensor& batch2) {
  return legacy::cuda::_th_bmm_out(result, batch1, batch2);
}
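// A minimal usage sketch of the semantics these wrappers implement (the
// tensors below are illustrative, not part of this file): per batch b,
// baddbmm computes out[b] = beta * self[b] + alpha * batch1[b] @ batch2[b],
// and bmm is the same product without the scaled bias term:
//
//   auto a = at::randn({4, 2, 3}, at::kCUDA);
//   auto b = at::randn({4, 3, 5}, at::kCUDA);
//   auto c = at::randn({4, 2, 5}, at::kCUDA);
//   auto y = at::baddbmm(c, a, b, /*beta=*/1, /*alpha=*/1);  // shape {4, 2, 5}
//   auto z = at::bmm(a, b);                                  // shape {4, 2, 5}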
// Returns `tensor` itself when its layout can be consumed by cuBLAS (which is
// column-major), setting `transpose_tensor` accordingly; otherwise returns a
// contiguous clone.
Tensor prepare_matrix_for_cublas(Tensor& tensor, bool& transpose_tensor) {
  Tensor tensor_;
  IntArrayRef tensor_strides = tensor.strides();
  if ((tensor_strides[0] == 1) && (tensor_strides[1] != 0)) {
    // Column-major layout: pass through as-is.
    tensor_ = tensor;
    transpose_tensor = false;
  } else if ((tensor_strides[1] == 1) && (tensor_strides[0] != 0)) {
    // Row-major layout: cuBLAS will read it as its transpose.
    tensor_ = tensor;
    transpose_tensor = true;
  } else {
    // Arbitrary strides: materialize a contiguous (row-major) copy.
    transpose_tensor = true;
    tensor_ = tensor.clone(at::MemoryFormat::Contiguous);
  }
  return tensor_;
}
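// A worked example of the stride check above (a sketch; the tensors below are
// illustrative and not part of this file):
//
//   bool t;
//   auto a = at::randn({2, 3}, at::kCUDA);      // strides {3, 1}: row-major
//   prepare_matrix_for_cublas(a, t);            // t == true (cuBLAS reads it as the transpose)
//   auto b = at::randn({3, 2}, at::kCUDA).t();  // strides {1, 3}: column-major
//   prepare_matrix_for_cublas(b, t);            // t == false (usable as-is)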
// Check https://github.com/pytorch/pytorch/issues/22078
// for information about the bug. We don't know the exact conditions that trigger it,
// but using Sgemm or Hgemm on Maxwell or Pascal seems to be a
// necessary condition.
static void checkCuda90Bug(int i_m, int i_n, int i_k)
{
#if CUDA_VERSION < 9200 && CUDA_VERSION >= 9000
  static std::once_flag alreadyWarned;
  const int LIMIT = 1 << 21;
  if (i_m > LIMIT || i_n > LIMIT || i_k > LIMIT) {
    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
    if (prop->major == 5 || prop->major == 6) {
      std::call_once(alreadyWarned, []() {
        TORCH_WARN("Matrix multiplication for dimensions larger than 2^21 has known bugs on your combination of CUDA version and device type. Please consider upgrading to CUDA 9.2 or later.");
      });
    }
  }
#endif
}
Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, Scalar beta, Scalar alpha) {
  TORCH_CHECK(
    (mat1.dim() == 2) && (mat2.dim() == 2) &&
    (self.dim() == 2) && (result.dim() == 2),
    "tensors must be 2-D"
  );

  IntArrayRef mat1_sizes = mat1.sizes();
  IntArrayRef mat2_sizes = mat2.sizes();
  IntArrayRef self_sizes = self.sizes();
  TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0");
  TORCH_CHECK(self_sizes[0] == mat1_sizes[0], "self dim 0 must match mat1 dim 0");
  TORCH_CHECK(self_sizes[1] == mat2_sizes[1], "self dim 1 must match mat2 dim 1");

  // If beta is zero, self never contributes to the output, so the copy can be
  // skipped. Likewise when result already aliases self with the same layout.
  if (beta.to<double>() != 0.0) {
    if ((result.data_ptr() != self.data_ptr()) || (result.strides() != self.strides())) {
      result.copy_(self);
    }
  }

  IntArrayRef result_sizes = result.sizes();
  if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) {
    return result;
  }

  bool transpose_result;
  Tensor result_ = prepare_matrix_for_cublas(result, transpose_result);
  bool transpose_mat1;
  bool transpose_mat2;

  // cuBLAS is column-major. If result_ is laid out row-major, cuBLAS
  // effectively writes result^T, so we compute result^T = mat2^T @ mat1^T by
  // swapping the operands and flipping their transpose flags.
  Tensor mat1_ = transpose_result ? mat2 : mat1;
  Tensor mat2_ = transpose_result ? mat1 : mat2;
  mat1_ = prepare_matrix_for_cublas(mat1_, transpose_mat1);
  mat2_ = prepare_matrix_for_cublas(mat2_, transpose_mat2);

  if (transpose_result) {
    transpose_mat1 = !transpose_mat1;
    transpose_mat2 = !transpose_mat2;
    mat1_sizes = mat1_.sizes();
    mat2_sizes = mat2_.sizes();
  }

  // GEMM problem sizes and leading dimensions, in cuBLAS (column-major) terms.
  int64_t m = mat1_sizes[transpose_result ? 1 : 0];
  int64_t k = mat1_sizes[transpose_result ? 0 : 1];
  int64_t n = mat2_sizes[transpose_result ? 0 : 1];
  int64_t mat1_ld = mat1_.stride((transpose_mat1 == transpose_result) ? 1 : 0);
  int64_t mat2_ld = mat2_.stride((transpose_mat2 == transpose_result) ? 1 : 0);
  int64_t result_ld = result_.stride(transpose_result ? 0 : 1);

  at::ScalarType scalar_type = self.scalar_type();
  AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] {
    if (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::Float) {
      checkCuda90Bug(static_cast<int>(m), static_cast<int>(n), static_cast<int>(k));
    }
    scalar_t alpha_val = alpha.to<scalar_t>();
    scalar_t beta_val = beta.to<scalar_t>();
    scalar_t* mat1_ptr = mat1_.data_ptr<scalar_t>();
    scalar_t* mat2_ptr = mat2_.data_ptr<scalar_t>();
    scalar_t* result_ptr = result_.data_ptr<scalar_t>();
    at::cuda::blas::gemm<scalar_t>(
      transpose_mat1 ? 't' : 'n',
      transpose_mat2 ? 't' : 'n',
      m, n, k,
      alpha_val,
      mat1_ptr, mat1_ld,
      mat2_ptr, mat2_ld,
      beta_val,
      result_ptr, result_ld
    );
  });

  // prepare_matrix_for_cublas may have handed back a contiguous clone; if so,
  // copy the computed values into the caller's result tensor.
  if (result.data_ptr() != result_.data_ptr()) {
    result.copy_(result_);
  }
  return result;
}
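// A worked trace of the mapping above (a sketch; the shapes are illustrative
// and not taken from this file): with row-contiguous result (M x N), mat1
// (M x K) and mat2 (K x N), result has strides {N, 1}, so transpose_result is
// true and the operands are swapped. The swapped operands are row-contiguous
// too, so after the flip transpose_mat1 == transpose_mat2 == false and the
// call becomes
//
//   gemm('n', 'n', /*m=*/N, /*n=*/M, /*k=*/K,
//        alpha, mat2_data, /*lda=*/N, mat1_data, /*ldb=*/K,
//        beta,  result_data, /*ldc=*/N);
//
// which, read column-major, computes the N x M matrix mat2^T @ mat1^T =
// (mat1 @ mat2)^T; written with leading dimension N, that is exactly
// mat1 @ mat2 in the row-major layout of result.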
Tensor& mm_out_cuda(Tensor& result, const Tensor& self, const Tensor& mat2) {
  result.resize_({ self.size(0), mat2.size(1) });
  // mm is addmm with beta = 0 and alpha = 1, using result itself as the
  // (never-read) bias operand.
  return addmm_out_cuda_impl(result, result, self, mat2, 0, 1);
}

Tensor mm_cuda(const Tensor& self, const Tensor& mat2) {
  Tensor result = at::empty({ self.size(0), mat2.size(1) }, self.options());
  return addmm_out_cuda_impl(result, result, self, mat2, 0, 1);
}

} } // namespace at::native