-
Notifications
You must be signed in to change notification settings - Fork 204
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #633 from Ka-zam/arctan
New kernels for arctan
- Loading branch information
Showing
7 changed files
with
324 additions
and
324 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
# | ||
# Copyright 2011-2020 Free Software Foundation, Inc. | ||
# Copyright 2023 Magnus Lundmark <[email protected]> | ||
# | ||
# This file is part of VOLK | ||
# | ||
|
@@ -249,6 +250,7 @@ install(FILES | |
${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h | ||
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h | ||
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h | ||
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h | ||
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h | ||
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h | ||
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/* -*- c++ -*- */ | ||
/* | ||
* Copyright 2023 Magnus Lundmark <[email protected]> | ||
* | ||
* This file is part of VOLK | ||
* | ||
* SPDX-License-Identifier: LGPL-3.0-or-later | ||
*/ | ||
|
||
/* | ||
 * This file is intended to hold helper functions composed of AVX2 FMA intrinsics. | ||
* They should be used in VOLK kernels to avoid copy-paste. | ||
*/ | ||
|
||
#ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ | ||
#define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ | ||
#include <immintrin.h> | ||
|
||
/*
 * Odd-polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * The result is x * P(x^2), where P is evaluated with Horner's method
 * using one fused multiply-add per step (terms x^13 down to x^1).
 * Maximum relative error ~6.5e-7.
 */
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
{
    /* Coefficients listed from the highest power to the lowest. */
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner recurrence: p = x2 * p + c_next */
    __m256 p = _mm256_fmadd_ps(x2, c13, c11);
    p = _mm256_fmadd_ps(x2, p, c9);
    p = _mm256_fmadd_ps(x2, p, c7);
    p = _mm256_fmadd_ps(x2, p, c5);
    p = _mm256_fmadd_ps(x2, p, c3);
    p = _mm256_fmadd_ps(x2, p, c1);

    /* The final factor of x restores the odd powers. */
    return _mm256_mul_ps(x, p);
}
|
||
#endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
/* -*- c++ -*- */ | ||
/* | ||
* Copyright 2015 Free Software Foundation, Inc. | ||
* Copyright 2023 Magnus Lundmark <[email protected]> | ||
* | ||
* This file is part of VOLK | ||
* | ||
|
@@ -16,6 +17,43 @@ | |
#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ | ||
#include <immintrin.h> | ||
|
||
/*
 * Odd-polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * The result is x * P(x^2), where P is evaluated with Horner's method
 * using a separate multiply and add per step (terms x^13 down to x^1).
 * Maximum relative error ~6.5e-7.
 */
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
{
    /* Coefficients listed from the highest power to the lowest. */
    const __m256 c13 = _mm256_set1_ps(+0x1.01a37cp-7f);
    const __m256 c11 = _mm256_set1_ps(-0x1.2f3004p-5f);
    const __m256 c9 = _mm256_set1_ps(+0x1.5785aap-4f);
    const __m256 c7 = _mm256_set1_ps(-0x1.1436ap-3f);
    const __m256 c5 = _mm256_set1_ps(+0x1.972be6p-3f);
    const __m256 c3 = _mm256_set1_ps(-0x1.55437p-2f);
    const __m256 c1 = _mm256_set1_ps(+0x1.ffffeap-1f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    /* Horner recurrence: p = (x2 * p) + c_next */
    __m256 p = _mm256_add_ps(_mm256_mul_ps(x2, c13), c11);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c9);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c7);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c5);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c3);
    p = _mm256_add_ps(_mm256_mul_ps(x2, p), c1);

    /* The final factor of x restores the odd powers. */
    return _mm256_mul_ps(x, p);
}
|
||
static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) | ||
{ | ||
__m256 yl, yh, tmp1, tmp2; | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
/* -*- c++ -*- */ | ||
/* | ||
* Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc. | ||
* Copyright 2023 Magnus Lundmark <[email protected]> | ||
* | ||
* This file is part of VOLK | ||
* | ||
|
@@ -166,6 +167,50 @@ static inline float log2f_non_ieee(float f) | |
// Constant used to do log10 calculations as faster log2 | ||
//////////////////////////////////////////////////////////////////////// | ||
// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr | ||
// Hex-float literal; its value is approximately 3.0103.
#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120
|
||
//////////////////////////////////////////////////////////////////////// | ||
// arctan(x) | ||
//////////////////////////////////////////////////////////////////////// | ||
static inline float volk_arctan_poly(const float x) | ||
{ | ||
/* | ||
* arctan(x) polynomial expansion on the interval [-1, 1] | ||
* Maximum relative error < 6.6e-7 | ||
*/ | ||
const float a1 = +0x1.ffffeap-1f; | ||
const float a3 = -0x1.55437p-2f; | ||
const float a5 = +0x1.972be6p-3f; | ||
const float a7 = -0x1.1436ap-3f; | ||
const float a9 = +0x1.5785aap-4f; | ||
const float a11 = -0x1.2f3004p-5f; | ||
const float a13 = +0x1.01a37cp-7f; | ||
|
||
const float x_times_x = x * x; | ||
float arctan = a13; | ||
arctan = fmaf(x_times_x, arctan, a11); | ||
arctan = fmaf(x_times_x, arctan, a9); | ||
arctan = fmaf(x_times_x, arctan, a7); | ||
arctan = fmaf(x_times_x, arctan, a5); | ||
arctan = fmaf(x_times_x, arctan, a3); | ||
arctan = fmaf(x_times_x, arctan, a1); | ||
arctan *= x; | ||
|
||
return arctan; | ||
} | ||
|
||
static inline float volk_arctan(const float x)
{
    /* Inside (-1, 1) the polynomial is applied to x directly. */
    if (fabsf(x) < 1.f) {
        return volk_arctan_poly(x);
    }

    /*
     * Otherwise the polynomial is applied to 1 / x and the result is
     * mapped back with the identity
     *     arctan(x) + arctan(1 / x) == sign(x) * pi / 2
     */
    const float half_pi = 0x1.921fb6p0f;
    return copysignf(half_pi, x) - volk_arctan_poly(1.f / x);
}
|
||
#endif /*INCLUDED_LIBVOLK_COMMON_H*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
/* -*- c++ -*- */ | ||
/* | ||
* Copyright 2015 Free Software Foundation, Inc. | ||
* Copyright 2023 Magnus Lundmark <[email protected]> | ||
* | ||
* This file is part of VOLK | ||
* | ||
|
@@ -16,6 +17,43 @@ | |
#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ | ||
#include <xmmintrin.h> | ||
|
||
/*
 * Odd-polynomial approximation of arctan(x), intended for x in [-1, 1].
 *
 * The result is x * P(x^2), where P is evaluated with Horner's method
 * using a separate multiply and add per step (terms x^13 down to x^1).
 * Maximum relative error ~6.5e-7.
 */
static inline __m128 _mm_arctan_poly_sse(const __m128 x)
{
    /* Coefficients listed from the highest power to the lowest. */
    const __m128 c13 = _mm_set1_ps(+0x1.01a37cp-7f);
    const __m128 c11 = _mm_set1_ps(-0x1.2f3004p-5f);
    const __m128 c9 = _mm_set1_ps(+0x1.5785aap-4f);
    const __m128 c7 = _mm_set1_ps(-0x1.1436ap-3f);
    const __m128 c5 = _mm_set1_ps(+0x1.972be6p-3f);
    const __m128 c3 = _mm_set1_ps(-0x1.55437p-2f);
    const __m128 c1 = _mm_set1_ps(+0x1.ffffeap-1f);

    const __m128 x2 = _mm_mul_ps(x, x);

    /* Horner recurrence: p = (x2 * p) + c_next */
    __m128 p = _mm_add_ps(_mm_mul_ps(x2, c13), c11);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c9);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c7);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c5);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c3);
    p = _mm_add_ps(_mm_mul_ps(x2, p), c1);

    /* The final factor of x restores the odd powers. */
    return _mm_mul_ps(x, p);
}
|
||
static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2) | ||
{ | ||
__m128 iValue, qValue; | ||
|
Oops, something went wrong.