From e573b2b5742441c00ff6ad40362da37325a11864 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Kn=C3=A1pek?=
Date: Wed, 21 Aug 2024 16:34:19 +0200
Subject: [PATCH 1/3] LibCrypto: Improve GHash / GCM performance

---
 .../LibCrypto/Authentication/GHash.cpp | 159 ++++++++++++++----
 1 file changed, 129 insertions(+), 30 deletions(-)

diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
index a8db6576cac381..3c5ff6ba6981ab 100644
--- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
+++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
@@ -84,39 +84,138 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher)
     return digest;
 }
 
-/// Galois Field multiplication using <x^127 + x^7 + x^2 + x + 1>.
-/// Note that x, y, and z are strictly BE.
 void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
 {
-    // Note: Copied upfront to stack to avoid memory access in the loop.
-    u32 x[4] { _x[0], _x[1], _x[2], _x[3] };
-    u32 const y[4] { _y[0], _y[1], _y[2], _y[3] };
-    u32 z[4] { 0, 0, 0, 0 };
-
-    // Unrolled by 32, the access in y[3-(i/32)] can be cached throughout the loop.
-#pragma GCC unroll 32
-    for (ssize_t i = 127, j = 0; i > -1; --i, j++) {
-        auto r = -((y[j / 32] >> (i % 32)) & 1);
-        z[0] ^= x[0] & r;
-        z[1] ^= x[1] & r;
-        z[2] ^= x[2] & r;
-        z[3] ^= x[3] & r;
-        auto a0 = x[0] & 1;
-        x[0] >>= 1;
-        auto a1 = x[1] & 1;
-        x[1] >>= 1;
-        x[1] |= a0 << 31;
-        auto a2 = x[2] & 1;
-        x[2] >>= 1;
-        x[2] |= a1 << 31;
-        auto a3 = x[3] & 1;
-        x[3] >>= 1;
-        x[3] |= a2 << 31;
-
-        x[0] ^= 0xe1000000 & -a3;
-    }
+    static auto const mul_32_x_32_64 = [](u32 const& a, u32 const& b) -> u64 {
+        return static_cast<u64>(a) * static_cast<u64>(b);
+    };
+
+    static auto const clmul_32_x_32_64 = [](u32 const& a, u32 const& b, u32& lo, u32& hi) -> void {
+        u32 ta[4];
+        u32 tb[4];
+        u64 tu64[4];
+        u64 tc[4];
+        u64 cc;
+
+        ta[0] = a & static_cast<u32>(0x11111111ul);
+        ta[1] = a & static_cast<u32>(0x22222222ul);
+        ta[2] = a & static_cast<u32>(0x44444444ul);
+        ta[3] = a & static_cast<u32>(0x88888888ul);
+        tb[0] = b & static_cast<u32>(0x11111111ul);
+        tb[1] = b & static_cast<u32>(0x22222222ul);
+        tb[2] = b & static_cast<u32>(0x44444444ul);
+        tb[3] = b & static_cast<u32>(0x88888888ul);
+        tu64[0] = mul_32_x_32_64(ta[0], tb[0]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[3]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[2]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[1]);
+        tc[0] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tu64[0] = mul_32_x_32_64(ta[0], tb[1]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[0]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[3]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[2]);
+        tc[1] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tu64[0] = mul_32_x_32_64(ta[0], tb[2]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[1]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[0]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[3]);
+        tc[2] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tu64[0] = mul_32_x_32_64(ta[0], tb[3]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[2]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[1]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[0]);
+        tc[3] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tc[0] &= static_cast<u64>(0x1111111111111111ull);
+        tc[1] &= static_cast<u64>(0x2222222222222222ull);
+        tc[2] &= static_cast<u64>(0x4444444444444444ull);
+        tc[3] &= static_cast<u64>(0x8888888888888888ull);
+        cc = tc[0] | tc[1] | tc[2] | tc[3];
+        lo = static_cast<u32>((cc >> (0 * 32)) & 0xfffffffful);
+        hi = static_cast<u32>((cc >> (1 * 32)) & 0xfffffffful);
+    };
 
-    memcpy(_z, z, sizeof(z));
+    u32 aa[4];
+    u32 bb[4];
+    u32 ta[9];
+    u32 tb[9];
+    u32 tc[4];
+    u32 tu32[4];
+    u32 td[4];
+    u32 te[4];
+    u32 z[8];
+
+    aa[3] = _x[0];
+    aa[2] = _x[1];
+    aa[1] = _x[2];
+    aa[0] = _x[3];
+    bb[3] = _y[0];
+    bb[2] = _y[1];
+    bb[1] = _y[2];
+    bb[0] = _y[3];
+    ta[0] = aa[0];
+    ta[1] = aa[1];
+    ta[2] = ta[0] ^ ta[1];
+    ta[3] = aa[2];
+    ta[4] = aa[3];
+    ta[5] = ta[3] ^ ta[4];
+    ta[6] = ta[0] ^ ta[3];
+    ta[7] = ta[1] ^ ta[4];
+    ta[8] = ta[6] ^ ta[7];
+    tb[0] = bb[0];
+    tb[1] = bb[1];
+    tb[2] = tb[0] ^ tb[1];
+    tb[3] = bb[2];
+    tb[4] = bb[3];
+    tb[5] = tb[3] ^ tb[4];
+    tb[6] = tb[0] ^ tb[3];
+    tb[7] = tb[1] ^ tb[4];
+    tb[8] = tb[6] ^ tb[7];
+    for (int i = 0; i != 9; ++i) {
+        clmul_32_x_32_64(ta[i], tb[i], ta[i], tb[i]);
+    }
+    tc[0] = ta[0];
+    tc[1] = ta[0] ^ ta[1] ^ ta[2] ^ tb[0];
+    tc[2] = ta[1] ^ tb[0] ^ tb[1] ^ tb[2];
+    tc[3] = tb[1];
+    td[0] = ta[3];
+    td[1] = ta[3] ^ ta[4] ^ ta[5] ^ tb[3];
+    td[2] = ta[4] ^ tb[3] ^ tb[4] ^ tb[5];
+    td[3] = tb[4];
+    te[0] = ta[6];
+    te[1] = ta[6] ^ ta[7] ^ ta[8] ^ tb[6];
+    te[2] = ta[7] ^ tb[6] ^ tb[7] ^ tb[8];
+    te[3] = tb[7];
+    te[0] ^= (tc[0] ^ td[0]);
+    te[1] ^= (tc[1] ^ td[1]);
+    te[2] ^= (tc[2] ^ td[2]);
+    te[3] ^= (tc[3] ^ td[3]);
+    tc[2] ^= te[0];
+    tc[3] ^= te[1];
+    td[0] ^= te[2];
+    td[1] ^= te[3];
+    z[0] = tc[0] << 1;
+    z[1] = (tc[1] << 1) | (tc[0] >> 31);
+    z[2] = (tc[2] << 1) | (tc[1] >> 31);
+    z[3] = (tc[3] << 1) | (tc[2] >> 31);
+    z[4] = (td[0] << 1) | (tc[3] >> 31);
+    z[5] = (td[1] << 1) | (td[0] >> 31);
+    z[6] = (td[2] << 1) | (td[1] >> 31);
+    z[7] = (td[3] << 1) | (td[2] >> 31);
+    for (int i = 0; i != 4; ++i) {
+        tu32[0] = z[i] << 31;
+        tu32[1] = z[i] << 30;
+        tu32[2] = z[i] << 25;
+        z[i + 3] ^= (tu32[0] ^ tu32[1] ^ tu32[2]);
+        tu32[0] = z[i] >> 0;
+        tu32[1] = z[i] >> 1;
+        tu32[2] = z[i] >> 2;
+        tu32[3] = z[i] >> 7;
+        z[i + 4] ^= (tu32[0] ^ tu32[1] ^ tu32[2] ^ tu32[3]);
+    }
+    _z[0] = z[7];
+    _z[1] = z[6];
+    _z[2] = z[5];
+    _z[3] = z[4];
 }
 
 }

From d70e79890e0a107b504bae33bcb080c543211a9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Kn=C3=A1pek?=
Date: Wed, 21 Aug 2024 23:53:51 +0200
Subject: [PATCH 2/3] LibCrypto: SIMDify GHash

---
 AK/SIMD.h                                     |  8 ++
 AK/SIMDExtras.h                               | 70 ++++++++++++++
 .../LibCrypto/Authentication/GHash.cpp        | 93 +++++++++++--------
 3 files changed, 132 insertions(+), 39 deletions(-)

diff --git a/AK/SIMD.h b/AK/SIMD.h
index 28c901016f71e5..7cb4d5cac2a209 100644
--- a/AK/SIMD.h
+++ b/AK/SIMD.h
@@ -108,11 +108,19 @@ struct IndexVectorFor {
 };
 #endif
 
+template<typename T, size_t element_count>
+struct MakeVectorImpl {
+    using Type __attribute__((vector_size(sizeof(T) * element_count))) = T;
+};
+
 }
 
 template<typename T>
 using IndexVectorFor = typename Detail::IndexVectorFor<T>::Type;
 
+template<typename T, size_t element_count>
+using MakeVector = typename Detail::MakeVectorImpl<T, element_count>::Type;
+
 static_assert(IsSame<IndexVectorFor<i8x16>, i8x16>);
 static_assert(IsSame<IndexVectorFor<u32x4>, u32x4>);
 static_assert(IsSame<IndexVectorFor<u64x4>, u64x4>);
diff --git a/AK/SIMDExtras.h b/AK/SIMDExtras.h
index e03c70c6a2d771..f48003ecb9e724 100644
--- a/AK/SIMDExtras.h
+++ b/AK/SIMDExtras.h
@@ -253,6 +253,40 @@ ALWAYS_INLINE static T elementwise_byte_reverse_impl(T a, IndexSequence<Idx...>)
     }
 }
 
+template<SIMDVector T, size_t... Idx>
+ALWAYS_INLINE static ElementOf<T> reduce_or_impl(T const& a, IndexSequence<Idx...> const&)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(vector_length<T> == sizeof...(Idx) * 2);
+
+    using E = ElementOf<T>;
+
+    constexpr size_t N = sizeof...(Idx);
+
+    if constexpr (N == 1) {
+        return a[0] | a[1];
+    } else {
+        return reduce_or_impl(MakeVector<E, N> { (a[Idx])... }, MakeIndexSequence<N / 2>()) | reduce_or_impl(MakeVector<E, N> { (a[N + Idx])... }, MakeIndexSequence<N / 2>());
+    }
+}
+
+template<SIMDVector T, size_t... Idx>
+ALWAYS_INLINE static ElementOf<T> reduce_xor_impl(T const& a, IndexSequence<Idx...> const&)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(vector_length<T> == sizeof...(Idx) * 2);
+
+    using E = ElementOf<T>;
+
+    constexpr size_t N = sizeof...(Idx);
+
+    if constexpr (N == 1) {
+        return a[0] ^ a[1];
+    } else {
+        return reduce_xor_impl(MakeVector<E, N> { (a[Idx])... }, MakeIndexSequence<N / 2>()) ^ reduce_xor_impl(MakeVector<E, N> { (a[N + Idx])... }, MakeIndexSequence<N / 2>());
+    }
+}
+
 }
 
 // FIXME: Shuffles only work with integral types for now
@@ -286,4 +320,40 @@ ALWAYS_INLINE static T elementwise_byte_reverse(T a)
 {
     return Detail::elementwise_byte_reverse_impl(a, MakeIndexSequence<vector_length<T>>());
 }
 
+template<SIMDVector T>
+ALWAYS_INLINE static ElementOf<T> reduce_or(T const& a)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(IsUnsigned<ElementOf<T>>);
+
+#if defined __has_builtin
+# if __has_builtin(__builtin_reduce_or)
+    if (true) {
+        return __builtin_reduce_or(a);
+    } else
+# endif
+#endif
+    {
+        return Detail::reduce_or_impl(a, MakeIndexSequence<vector_length<T> / 2>());
+    }
+}
+
+template<SIMDVector T>
+ALWAYS_INLINE static ElementOf<T> reduce_xor(T const& a)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(IsUnsigned<ElementOf<T>>);
+
+#if defined __has_builtin
+# if __has_builtin(__builtin_reduce_xor)
+    if (true) {
+        return __builtin_reduce_xor(a);
+    } else
+# endif
+#endif
+    {
+        return Detail::reduce_xor_impl(a, MakeIndexSequence<vector_length<T> / 2>());
+    }
+}
+
 }
diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
index 3c5ff6ba6981ab..9f440ff55f2159 100644
--- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
+++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
@@ -6,6 +6,8 @@
 
 #include
 #include
+#include <AK/SIMD.h>
+#include <AK/SIMDExtras.h>
 #include
 #include
 
@@ -86,50 +88,63 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher)
 
 void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
 {
-    static auto const mul_32_x_32_64 = [](u32 const& a, u32 const& b) -> u64 {
-        return static_cast<u64>(a) * static_cast<u64>(b);
+    using namespace AK::SIMD;
+
+    static auto const rotate_left = [](u32x4 const& x) -> u32x4 {
+        return u32x4 { x[3], x[0], x[1], x[2] };
+    };
+
+    static auto const mul_32_x_32_64 = [](u32x4 const& a, u32x4 const& b) -> u64x4 {
+        u64x2 r1;
+        u64x2 r2;
+
+#if defined __has_builtin
+# if __has_builtin(__builtin_ia32_pmuludq128)
+        if (true) {
+            r1 = simd_cast<u64x2>(__builtin_ia32_pmuludq128(simd_cast<i32x4>(u32x4 { a[0], 0, a[1], 0 }), simd_cast<i32x4>(u32x4 { b[0], 0, b[1], 0 })));
+            r2 = simd_cast<u64x2>(__builtin_ia32_pmuludq128(simd_cast<i32x4>(u32x4 { a[2], 0, a[3], 0 }), simd_cast<i32x4>(u32x4 { b[2], 0, b[3], 0 })));
+        } else
+# endif
+#endif
+        {
+            r1 = u64x2 { static_cast<u64>(a[0]) * static_cast<u64>(b[0]), static_cast<u64>(a[1]) * static_cast<u64>(b[1]) };
+            r2 = u64x2 { static_cast<u64>(a[2]) * static_cast<u64>(b[2]), static_cast<u64>(a[3]) * static_cast<u64>(b[3]) };
+        }
+        return u64x4 { r1[0], r1[1], r2[0], r2[1] };
     };
 
     static auto const clmul_32_x_32_64 = [](u32 const& a, u32 const& b, u32& lo, u32& hi) -> void {
-        u32 ta[4];
-        u32 tb[4];
-        u64 tu64[4];
-        u64 tc[4];
+        constexpr u32x4 mask32 = { 0x11111111, 0x22222222, 0x44444444, 0x88888888 };
+        constexpr u64x4 mask64 = { 0x1111111111111111ull, 0x2222222222222222ull, 0x4444444444444444ull, 0x8888888888888888ull };
+
+        u32x4 ta;
+        u32x4 tb;
+        u64x4 tu64;
+        u64x4 tc;
         u64 cc;
 
-        ta[0] = a & static_cast<u32>(0x11111111ul);
-        ta[1] = a & static_cast<u32>(0x22222222ul);
-        ta[2] = a & static_cast<u32>(0x44444444ul);
-        ta[3] = a & static_cast<u32>(0x88888888ul);
-        tb[0] = b & static_cast<u32>(0x11111111ul);
-        tb[1] = b & static_cast<u32>(0x22222222ul);
-        tb[2] = b & static_cast<u32>(0x44444444ul);
-        tb[3] = b & static_cast<u32>(0x88888888ul);
-        tu64[0] = mul_32_x_32_64(ta[0], tb[0]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[3]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[2]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[1]);
-        tc[0] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tu64[0] = mul_32_x_32_64(ta[0], tb[1]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[0]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[3]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[2]);
-        tc[1] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tu64[0] = mul_32_x_32_64(ta[0], tb[2]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[1]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[0]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[3]);
-        tc[2] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tu64[0] = mul_32_x_32_64(ta[0], tb[3]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[2]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[1]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[0]);
-        tc[3] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tc[0] &= static_cast<u64>(0x1111111111111111ull);
-        tc[1] &= static_cast<u64>(0x2222222222222222ull);
-        tc[2] &= static_cast<u64>(0x4444444444444444ull);
-        tc[3] &= static_cast<u64>(0x8888888888888888ull);
-        cc = tc[0] | tc[1] | tc[2] | tc[3];
+        ta = a & mask32;
+        tb = b & mask32;
+        tb = item_reverse(tb);
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[0] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[1] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[2] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[3] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tc &= mask64;
+        cc = reduce_or(tc);
         lo = static_cast<u32>((cc >> (0 * 32)) & 0xfffffffful);
         hi = static_cast<u32>((cc >> (1 * 32)) & 0xfffffffful);
     };

From 34d3c564e82bb56d3f52cc9dec0514b91867d003 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Kn=C3=A1pek?=
Date: Thu, 22 Aug 2024 16:11:33 +0200
Subject: [PATCH 3/3] LibCrypto: Add docs

---
 .../Libraries/LibCrypto/Authentication/GHash.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
index 9f440ff55f2159..bea2c43bbdce3a 100644
--- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
+++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
@@ -88,6 +88,9 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher)
 
 void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
 {
+    /** This function computes 128bit x 128bit unsigned integer multiplication inside Galois finite field, producing 128bit result.
+     * It uses 9 32bit x 32bit to 64bit carry-less multiplications in Karatsuba decomposition.
+     */
     using namespace AK::SIMD;
 
     static auto const rotate_left = [](u32x4 const& x) -> u32x4 {
@@ -95,6 +98,9 @@ void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
     };
 
     static auto const mul_32_x_32_64 = [](u32x4 const& a, u32x4 const& b) -> u64x4 {
+        /** This function computes 32bit x 32bit unsigned integer multiplication, producing 64bit result.
+         * It does this for 4 32bit integers x 4 32bit integers at a time, producing 4 64bit integers result.
+         */
         u64x2 r1;
         u64x2 r2;
 
@@ -114,6 +120,14 @@ void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
     };
 
     static auto const clmul_32_x_32_64 = [](u32 const& a, u32 const& b, u32& lo, u32& hi) -> void {
+        /** This function computes 32bit x 32bit unsigned integer carry-less multiplication, producing 64bit result.
+         * It does this by extracting 4 bits from each integer at a time and multiplying those.
+         * Those 4 bits are packed into 32bit integers with holes, 1 significant bit plus 3 holes, repeated 4 times.
+         * Repeating previous logic 4 times, we are able to multiply all of the input 32 bits.
+         * The holes are there to prevent the carry spill to more significant bits. Respectively, allowing the carry
+         * to spill into holes, the holes are later discarded.
+         * https://www.bearssl.org/constanttime.html#ghash-for-gcm
+         */
         constexpr u32x4 mask32 = { 0x11111111, 0x22222222, 0x44444444, 0x88888888 };
         constexpr u64x4 mask64 = { 0x1111111111111111ull, 0x2222222222222222ull, 0x4444444444444444ull, 0x8888888888888888ull };
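
Note on the technique documented above (illustration only, not part of the patch series): the "holes" trick from the clmul_32_x_32_64 comment and the linked BearSSL page can be shown in isolation with plain standard C++. The sketch below mirrors the scalar carry-less multiply introduced in PATCH 1; the names clmul32_masked and clmul32_naive are invented for the example and do not appear in the patches.

    // Illustrative sketch of the masked carry-less multiply; not from the patches.
    #include <cstdint>
    #include <cstdio>

    // 32x32 -> 64 bit carry-less multiply built from ordinary integer multiplies.
    // Each operand is split into four "combs" holding every 4th bit; the three
    // empty hole bits between significant bits absorb the carries of the integer
    // multiplications, so the bits at each comb's own positions are exactly the
    // carry-less (XOR) sums. The holes are masked away at the end.
    static uint64_t clmul32_masked(uint32_t a, uint32_t b)
    {
        uint32_t const a0 = a & 0x11111111u, a1 = a & 0x22222222u, a2 = a & 0x44444444u, a3 = a & 0x88888888u;
        uint32_t const b0 = b & 0x11111111u, b1 = b & 0x22222222u, b2 = b & 0x44444444u, b3 = b & 0x88888888u;

        // Group the partial products by (result bit position mod 4).
        uint64_t z0 = ((uint64_t)a0 * b0) ^ ((uint64_t)a1 * b3) ^ ((uint64_t)a2 * b2) ^ ((uint64_t)a3 * b1);
        uint64_t z1 = ((uint64_t)a0 * b1) ^ ((uint64_t)a1 * b0) ^ ((uint64_t)a2 * b3) ^ ((uint64_t)a3 * b2);
        uint64_t z2 = ((uint64_t)a0 * b2) ^ ((uint64_t)a1 * b1) ^ ((uint64_t)a2 * b0) ^ ((uint64_t)a3 * b3);
        uint64_t z3 = ((uint64_t)a0 * b3) ^ ((uint64_t)a1 * b2) ^ ((uint64_t)a2 * b1) ^ ((uint64_t)a3 * b0);

        // Discard whatever spilled into the holes; keep one comb of bits per group.
        z0 &= 0x1111111111111111ull;
        z1 &= 0x2222222222222222ull;
        z2 &= 0x4444444444444444ull;
        z3 &= 0x8888888888888888ull;
        return z0 | z1 | z2 | z3;
    }

    // Reference: textbook bit-by-bit carry-less (GF(2)[x]) multiplication.
    static uint64_t clmul32_naive(uint32_t a, uint32_t b)
    {
        uint64_t r = 0;
        for (int i = 0; i < 32; ++i)
            if ((b >> i) & 1)
                r ^= (uint64_t)a << i;
        return r;
    }

    int main()
    {
        uint32_t const a = 0xdeadbeefu;
        uint32_t const b = 0x12345678u;
        std::printf("masked: %016llx\n", (unsigned long long)clmul32_masked(a, b));
        std::printf("naive:  %016llx\n", (unsigned long long)clmul32_naive(a, b));
        return 0;
    }

Both functions print the same value. The patches perform the same per-column computation four lanes at a time with u32x4/u64x4 vectors and combine nine such 32x32 carry-less products through the Karatsuba decomposition described in the galois_multiply comment.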