From e573b2b5742441c00ff6ad40362da37325a11864 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Kn=C3=A1pek?=
Date: Wed, 21 Aug 2024 16:34:19 +0200
Subject: [PATCH 1/3] LibCrypto: Improve GHash / GCM performance

---
 .../LibCrypto/Authentication/GHash.cpp | 159 ++++++++++++++----
 1 file changed, 129 insertions(+), 30 deletions(-)

diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
index a8db6576cac381..3c5ff6ba6981ab 100644
--- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
+++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
@@ -84,39 +84,138 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher)
     return digest;
 }
 
-/// Galois Field multiplication using <x^127 + x^7 + x^2 + x + 1>.
-/// Note that x, y, and z are strictly BE.
 void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
 {
-    // Note: Copied upfront to stack to avoid memory access in the loop.
-    u32 x[4] { _x[0], _x[1], _x[2], _x[3] };
-    u32 const y[4] { _y[0], _y[1], _y[2], _y[3] };
-    u32 z[4] { 0, 0, 0, 0 };
-
-    // Unrolled by 32, the access in y[3-(i/32)] can be cached throughout the loop.
-#pragma GCC unroll 32
-    for (ssize_t i = 127, j = 0; i > -1; --i, j++) {
-        auto r = -((y[j / 32] >> (i % 32)) & 1);
-        z[0] ^= x[0] & r;
-        z[1] ^= x[1] & r;
-        z[2] ^= x[2] & r;
-        z[3] ^= x[3] & r;
-        auto a0 = x[0] & 1;
-        x[0] >>= 1;
-        auto a1 = x[1] & 1;
-        x[1] >>= 1;
-        x[1] |= a0 << 31;
-        auto a2 = x[2] & 1;
-        x[2] >>= 1;
-        x[2] |= a1 << 31;
-        auto a3 = x[3] & 1;
-        x[3] >>= 1;
-        x[3] |= a2 << 31;
-
-        x[0] ^= 0xe1000000 & -a3;
-    }
+    static auto const mul_32_x_32_64 = [](u32 const& a, u32 const& b) -> u64 {
+        return static_cast<u64>(a) * static_cast<u64>(b);
+    };
+
+    static auto const clmul_32_x_32_64 = [](u32 const& a, u32 const& b, u32& lo, u32& hi) -> void {
+        u32 ta[4];
+        u32 tb[4];
+        u64 tu64[4];
+        u64 tc[4];
+        u64 cc;
+
+        ta[0] = a & static_cast<u32>(0x11111111ul);
+        ta[1] = a & static_cast<u32>(0x22222222ul);
+        ta[2] = a & static_cast<u32>(0x44444444ul);
+        ta[3] = a & static_cast<u32>(0x88888888ul);
+        tb[0] = b & static_cast<u32>(0x11111111ul);
+        tb[1] = b & static_cast<u32>(0x22222222ul);
+        tb[2] = b & static_cast<u32>(0x44444444ul);
+        tb[3] = b & static_cast<u32>(0x88888888ul);
+        tu64[0] = mul_32_x_32_64(ta[0], tb[0]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[3]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[2]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[1]);
+        tc[0] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tu64[0] = mul_32_x_32_64(ta[0], tb[1]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[0]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[3]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[2]);
+        tc[1] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tu64[0] = mul_32_x_32_64(ta[0], tb[2]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[1]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[0]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[3]);
+        tc[2] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tu64[0] = mul_32_x_32_64(ta[0], tb[3]);
+        tu64[1] = mul_32_x_32_64(ta[1], tb[2]);
+        tu64[2] = mul_32_x_32_64(ta[2], tb[1]);
+        tu64[3] = mul_32_x_32_64(ta[3], tb[0]);
+        tc[3] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
+        tc[0] &= static_cast<u64>(0x1111111111111111ull);
+        tc[1] &= static_cast<u64>(0x2222222222222222ull);
+        tc[2] &= static_cast<u64>(0x4444444444444444ull);
+        tc[3] &= static_cast<u64>(0x8888888888888888ull);
+        cc = tc[0] | tc[1] | tc[2] | tc[3];
+        lo = static_cast<u32>((cc >> (0 * 32)) & 0xfffffffful);
+        hi = static_cast<u32>((cc >> (1 * 32)) & 0xfffffffful);
+    };
 
-    memcpy(_z, z, sizeof(z));
+    u32 aa[4];
+    u32 bb[4];
+    u32 ta[9];
+    u32 tb[9];
+    u32 tc[4];
+    u32 tu32[4];
+    u32 td[4];
+    u32 te[4];
+    u32 z[8];
+
+    aa[3] = _x[0];
+    aa[2] = _x[1];
+    aa[1] = _x[2];
+    aa[0] = _x[3];
+    bb[3] = _y[0];
+    bb[2] = _y[1];
+    bb[1] = _y[2];
+    bb[0] = _y[3];
+    ta[0] = aa[0];
+    ta[1] = aa[1];
+    ta[2] = ta[0] ^ ta[1];
+    ta[3] = aa[2];
+    ta[4] = aa[3];
+    ta[5] = ta[3] ^ ta[4];
+    ta[6] = ta[0] ^ ta[3];
+    ta[7] = ta[1] ^ ta[4];
+    ta[8] = ta[6] ^ ta[7];
+    tb[0] = bb[0];
+    tb[1] = bb[1];
+    tb[2] = tb[0] ^ tb[1];
+    tb[3] = bb[2];
+    tb[4] = bb[3];
+    tb[5] = tb[3] ^ tb[4];
+    tb[6] = tb[0] ^ tb[3];
+    tb[7] = tb[1] ^ tb[4];
+    tb[8] = tb[6] ^ tb[7];
+    for (int i = 0; i != 9; ++i) {
+        clmul_32_x_32_64(ta[i], tb[i], ta[i], tb[i]);
+    }
+    tc[0] = ta[0];
+    tc[1] = ta[0] ^ ta[1] ^ ta[2] ^ tb[0];
+    tc[2] = ta[1] ^ tb[0] ^ tb[1] ^ tb[2];
+    tc[3] = tb[1];
+    td[0] = ta[3];
+    td[1] = ta[3] ^ ta[4] ^ ta[5] ^ tb[3];
+    td[2] = ta[4] ^ tb[3] ^ tb[4] ^ tb[5];
+    td[3] = tb[4];
+    te[0] = ta[6];
+    te[1] = ta[6] ^ ta[7] ^ ta[8] ^ tb[6];
+    te[2] = ta[7] ^ tb[6] ^ tb[7] ^ tb[8];
+    te[3] = tb[7];
+    te[0] ^= (tc[0] ^ td[0]);
+    te[1] ^= (tc[1] ^ td[1]);
+    te[2] ^= (tc[2] ^ td[2]);
+    te[3] ^= (tc[3] ^ td[3]);
+    tc[2] ^= te[0];
+    tc[3] ^= te[1];
+    td[0] ^= te[2];
+    td[1] ^= te[3];
+    z[0] = tc[0] << 1;
+    z[1] = (tc[1] << 1) | (tc[0] >> 31);
+    z[2] = (tc[2] << 1) | (tc[1] >> 31);
+    z[3] = (tc[3] << 1) | (tc[2] >> 31);
+    z[4] = (td[0] << 1) | (tc[3] >> 31);
+    z[5] = (td[1] << 1) | (td[0] >> 31);
+    z[6] = (td[2] << 1) | (td[1] >> 31);
+    z[7] = (td[3] << 1) | (td[2] >> 31);
+    for (int i = 0; i != 4; ++i) {
+        tu32[0] = z[i] << 31;
+        tu32[1] = z[i] << 30;
+        tu32[2] = z[i] << 25;
+        z[i + 3] ^= (tu32[0] ^ tu32[1] ^ tu32[2]);
+        tu32[0] = z[i] >> 0;
+        tu32[1] = z[i] >> 1;
+        tu32[2] = z[i] >> 2;
+        tu32[3] = z[i] >> 7;
+        z[i + 4] ^= (tu32[0] ^ tu32[1] ^ tu32[2] ^ tu32[3]);
+    }
+    _z[0] = z[7];
+    _z[1] = z[6];
+    _z[2] = z[5];
+    _z[3] = z[4];
 }
 
 }

From d70e79890e0a107b504bae33bcb080c543211a9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Kn=C3=A1pek?=
Date: Wed, 21 Aug 2024 23:53:51 +0200
Subject: [PATCH 2/3] LibCrypto: SIMDify GHash

---
 AK/SIMD.h                                     |  8 ++
 AK/SIMDExtras.h                               | 70 ++++++++++++++
 .../LibCrypto/Authentication/GHash.cpp        | 93 +++++++++++--------
 3 files changed, 132 insertions(+), 39 deletions(-)

diff --git a/AK/SIMD.h b/AK/SIMD.h
index 28c901016f71e5..7cb4d5cac2a209 100644
--- a/AK/SIMD.h
+++ b/AK/SIMD.h
@@ -108,11 +108,19 @@ struct IndexVectorFor {
 };
 #endif
 
+template<typename T, size_t element_count>
+struct MakeVectorImpl {
+    using Type __attribute__((vector_size(sizeof(T) * element_count))) = T;
+};
+
 }
 
 template<typename T>
 using IndexVectorFor = typename Detail::IndexVectorFor<T>::Type;
 
+template<typename T, size_t element_count>
+using MakeVector = typename Detail::MakeVectorImpl<T, element_count>::Type;
+
 static_assert(IsSame<IndexVectorFor<i8x16>, i8x16>);
 static_assert(IsSame<IndexVectorFor<u32x4>, u32x4>);
 static_assert(IsSame<IndexVectorFor<u64x4>, u64x4>);
diff --git a/AK/SIMDExtras.h b/AK/SIMDExtras.h
index e03c70c6a2d771..f48003ecb9e724 100644
--- a/AK/SIMDExtras.h
+++ b/AK/SIMDExtras.h
@@ -253,6 +253,40 @@ ALWAYS_INLINE static T elementwise_byte_reverse_impl(T a, IndexSequence<Idx...>)
     }
 }
 
+template<SIMDVector T, size_t... Idx>
+ALWAYS_INLINE static ElementOf<T> reduce_or_impl(T const& a, IndexSequence<Idx...> const&)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(vector_length<T> == sizeof...(Idx) * 2);
+
+    using E = ElementOf<T>;
+
+    constexpr size_t N = sizeof...(Idx);
+
+    if constexpr (N == 1) {
+        return a[0] | a[1];
+    } else {
+        return reduce_or_impl(MakeVector<E, N> { (a[Idx])... }, MakeIndexSequence<N / 2>()) | reduce_or_impl(MakeVector<E, N> { (a[N + Idx])... }, MakeIndexSequence<N / 2>());
+    }
+}
+
+template<SIMDVector T, size_t... Idx>
+ALWAYS_INLINE static ElementOf<T> reduce_xor_impl(T const& a, IndexSequence<Idx...> const&)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(vector_length<T> == sizeof...(Idx) * 2);
+
+    using E = ElementOf<T>;
+
+    constexpr size_t N = sizeof...(Idx);
+
+    if constexpr (N == 1) {
+        return a[0] ^ a[1];
+    } else {
+        return reduce_xor_impl(MakeVector<E, N> { (a[Idx])... }, MakeIndexSequence<N / 2>()) ^ reduce_xor_impl(MakeVector<E, N> { (a[N + Idx])... }, MakeIndexSequence<N / 2>());
+    }
+}
+
 }
 
 // FIXME: Shuffles only work with integral types for now
@@ -286,4 +320,40 @@ ALWAYS_INLINE static T elementwise_byte_reverse(T a)
 {
     return Detail::elementwise_byte_reverse_impl(a, MakeIndexSequence<vector_length<T>>());
 }
 
+template<SIMDVector T>
+ALWAYS_INLINE static ElementOf<T> reduce_or(T const& a)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(IsUnsigned<ElementOf<T>>);
+
+#if defined __has_builtin
+# if __has_builtin(__builtin_reduce_or)
+    if (true) {
+        return __builtin_reduce_or(a);
+    } else
+# endif
+#endif
+    {
+        return Detail::reduce_or_impl(a, MakeIndexSequence<vector_length<T> / 2>());
+    }
+}
+
+template<SIMDVector T>
+ALWAYS_INLINE static ElementOf<T> reduce_xor(T const& a)
+{
+    static_assert(is_power_of_two(vector_length<T>));
+    static_assert(IsUnsigned<ElementOf<T>>);
+
+#if defined __has_builtin
+# if __has_builtin(__builtin_reduce_xor)
+    if (true) {
+        return __builtin_reduce_xor(a);
+    } else
+# endif
+#endif
+    {
+        return Detail::reduce_xor_impl(a, MakeIndexSequence<vector_length<T> / 2>());
+    }
+}
+
 }
diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
index 3c5ff6ba6981ab..9f440ff55f2159 100644
--- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
+++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
@@ -6,6 +6,8 @@
 
 #include
 #include
+#include <AK/SIMD.h>
+#include <AK/SIMDExtras.h>
 #include
 #include
 
@@ -86,50 +88,63 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher)
 
 void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
 {
-    static auto const mul_32_x_32_64 = [](u32 const& a, u32 const& b) -> u64 {
-        return static_cast<u64>(a) * static_cast<u64>(b);
+    using namespace AK::SIMD;
+
+    static auto const rotate_left = [](u32x4 const& x) -> u32x4 {
+        return u32x4 { x[3], x[0], x[1], x[2] };
+    };
+
+    static auto const mul_32_x_32_64 = [](u32x4 const& a, u32x4 const& b) -> u64x4 {
+        u64x2 r1;
+        u64x2 r2;
+
+#if defined __has_builtin
+# if __has_builtin(__builtin_ia32_pmuludq128)
+        if (true) {
+            r1 = simd_cast<u64x2>(__builtin_ia32_pmuludq128(simd_cast<i32x4>(u32x4 { a[0], 0, a[1], 0 }), simd_cast<i32x4>(u32x4 { b[0], 0, b[1], 0 })));
+            r2 = simd_cast<u64x2>(__builtin_ia32_pmuludq128(simd_cast<i32x4>(u32x4 { a[2], 0, a[3], 0 }), simd_cast<i32x4>(u32x4 { b[2], 0, b[3], 0 })));
+        } else
+# endif
+#endif
+        {
+            r1 = u64x2 { static_cast<u64>(a[0]) * static_cast<u64>(b[0]), static_cast<u64>(a[1]) * static_cast<u64>(b[1]) };
+            r2 = u64x2 { static_cast<u64>(a[2]) * static_cast<u64>(b[2]), static_cast<u64>(a[3]) * static_cast<u64>(b[3]) };
+        }
+        return u64x4 { r1[0], r1[1], r2[0], r2[1] };
     };
 
     static auto const clmul_32_x_32_64 = [](u32 const& a, u32 const& b, u32& lo, u32& hi) -> void {
-        u32 ta[4];
-        u32 tb[4];
-        u64 tu64[4];
-        u64 tc[4];
+        constexpr u32x4 mask32 = { 0x11111111, 0x22222222, 0x44444444, 0x88888888 };
+        constexpr u64x4 mask64 = { 0x1111111111111111ull, 0x2222222222222222ull, 0x4444444444444444ull, 0x8888888888888888ull };
+
+        u32x4 ta;
+        u32x4 tb;
+        u64x4 tu64;
+        u64x4 tc;
         u64 cc;
 
-        ta[0] = a & static_cast<u32>(0x11111111ul);
-        ta[1] = a & static_cast<u32>(0x22222222ul);
-        ta[2] = a & static_cast<u32>(0x44444444ul);
-        ta[3] = a & static_cast<u32>(0x88888888ul);
-        tb[0] = b & static_cast<u32>(0x11111111ul);
-        tb[1] = b & static_cast<u32>(0x22222222ul);
-        tb[2] = b & static_cast<u32>(0x44444444ul);
-        tb[3] = b & static_cast<u32>(0x88888888ul);
-        tu64[0] = mul_32_x_32_64(ta[0], tb[0]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[3]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[2]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[1]);
-        tc[0] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tu64[0] = mul_32_x_32_64(ta[0], tb[1]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[0]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[3]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[2]);
-        tc[1] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tu64[0] = mul_32_x_32_64(ta[0], tb[2]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[1]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[0]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[3]);
-        tc[2] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tu64[0] = mul_32_x_32_64(ta[0], tb[3]);
-        tu64[1] = mul_32_x_32_64(ta[1], tb[2]);
-        tu64[2] = mul_32_x_32_64(ta[2], tb[1]);
-        tu64[3] = mul_32_x_32_64(ta[3], tb[0]);
-        tc[3] = tu64[0] ^ tu64[1] ^ tu64[2] ^ tu64[3];
-        tc[0] &= static_cast<u64>(0x1111111111111111ull);
-        tc[1] &= static_cast<u64>(0x2222222222222222ull);
-        tc[2] &= static_cast<u64>(0x4444444444444444ull);
-        tc[3] &= static_cast<u64>(0x8888888888888888ull);
-        cc = tc[0] | tc[1] | tc[2] | tc[3];
+        ta = a & mask32;
+        tb = b & mask32;
+        tb = item_reverse(tb);
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[0] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[1] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[2] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tb = rotate_left(tb);
+        tu64 = mul_32_x_32_64(ta, tb);
+        tc[3] = reduce_xor(u64x4 { tu64[0], tu64[1], tu64[2], tu64[3] });
+
+        tc &= mask64;
+        cc = reduce_or(tc);
         lo = static_cast<u32>((cc >> (0 * 32)) & 0xfffffffful);
         hi = static_cast<u32>((cc >> (1 * 32)) & 0xfffffffful);
     };

From 34d3c564e82bb56d3f52cc9dec0514b91867d003 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Kn=C3=A1pek?=
Date: Thu, 22 Aug 2024 16:11:33 +0200
Subject: [PATCH 3/3] LibCrypto: Add docs

---
 .../Libraries/LibCrypto/Authentication/GHash.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
index 9f440ff55f2159..bea2c43bbdce3a 100644
--- a/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
+++ b/Userland/Libraries/LibCrypto/Authentication/GHash.cpp
@@ -88,6 +88,9 @@ GHash::TagType GHash::process(ReadonlyBytes aad, ReadonlyBytes cipher)
 
 void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
 {
+    /** This function computes 128bit x 128bit unsigned integer multiplication inside Galois finite field, producing 128bit result.
+     * It uses 9 32bit x 32bit to 64bit carry-less multiplications in Karatsuba decomposition.
+     */
     using namespace AK::SIMD;
 
     static auto const rotate_left = [](u32x4 const& x) -> u32x4 {
@@ -95,6 +98,9 @@ void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
     };
 
     static auto const mul_32_x_32_64 = [](u32x4 const& a, u32x4 const& b) -> u64x4 {
+        /** This function computes 32bit x 32bit unsigned integer multiplication, producing 64bit result.
+         * It does this for 4 32bit integers x 4 32bit integers at a time, producing 4 64bit integers result.
+         */
         u64x2 r1;
         u64x2 r2;
 
@@ -114,6 +120,14 @@ void galois_multiply(u32 (&_z)[4], u32 const (&_x)[4], u32 const (&_y)[4])
     };
 
     static auto const clmul_32_x_32_64 = [](u32 const& a, u32 const& b, u32& lo, u32& hi) -> void {
+        /** This function computes 32bit x 32bit unsigned integer carry-less multiplication, producing 64bit result.
+         * It does this by extracting 4 bits from each integer at a time and multiplying those.
+         * Those 4 bits are packed into 32bit integers with holes, 1 significant bit plus 3 holes, repeated 4 times.
+         * Repeating previous logic 4 times, we are able to multiply all of the input 32 bits.
+         * The holes are there to prevent the carry spill to more significant bits. Respectively, allowing the carry
+         * to spill into holes, the holes are later discarded.
+         * https://www.bearssl.org/constanttime.html#ghash-for-gcm
+         */
         constexpr u32x4 mask32 = { 0x11111111, 0x22222222, 0x44444444, 0x88888888 };
         constexpr u64x4 mask64 = { 0x1111111111111111ull, 0x2222222222222222ull, 0x4444444444444444ull, 0x8888888888888888ull };
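
Note on the technique documented above (illustration only, not part of the patch series): the "holes" trick from the clmul_32_x_32_64 comment and the linked BearSSL page can be shown in isolation with plain standard C++. The sketch below mirrors the scalar carry-less multiply introduced in PATCH 1; the names clmul32_masked and clmul32_naive are invented for the example and do not appear in the patches.

    // Illustrative sketch of the masked carry-less multiply; not from the patches.
    #include <cstdint>
    #include <cstdio>

    // 32x32 -> 64 bit carry-less multiply built from ordinary integer multiplies.
    // Each operand is split into four "combs" holding every 4th bit; the three
    // empty hole bits between significant bits absorb the carries of the integer
    // multiplications, so the bits at each comb's own positions are exactly the
    // carry-less (XOR) sums. The holes are masked away at the end.
    static uint64_t clmul32_masked(uint32_t a, uint32_t b)
    {
        uint32_t const a0 = a & 0x11111111u, a1 = a & 0x22222222u, a2 = a & 0x44444444u, a3 = a & 0x88888888u;
        uint32_t const b0 = b & 0x11111111u, b1 = b & 0x22222222u, b2 = b & 0x44444444u, b3 = b & 0x88888888u;

        // Group the partial products by (result bit position mod 4).
        uint64_t z0 = ((uint64_t)a0 * b0) ^ ((uint64_t)a1 * b3) ^ ((uint64_t)a2 * b2) ^ ((uint64_t)a3 * b1);
        uint64_t z1 = ((uint64_t)a0 * b1) ^ ((uint64_t)a1 * b0) ^ ((uint64_t)a2 * b3) ^ ((uint64_t)a3 * b2);
        uint64_t z2 = ((uint64_t)a0 * b2) ^ ((uint64_t)a1 * b1) ^ ((uint64_t)a2 * b0) ^ ((uint64_t)a3 * b3);
        uint64_t z3 = ((uint64_t)a0 * b3) ^ ((uint64_t)a1 * b2) ^ ((uint64_t)a2 * b1) ^ ((uint64_t)a3 * b0);

        // Discard whatever spilled into the holes; keep one comb of bits per group.
        z0 &= 0x1111111111111111ull;
        z1 &= 0x2222222222222222ull;
        z2 &= 0x4444444444444444ull;
        z3 &= 0x8888888888888888ull;
        return z0 | z1 | z2 | z3;
    }

    // Reference: textbook bit-by-bit carry-less (GF(2)[x]) multiplication.
    static uint64_t clmul32_naive(uint32_t a, uint32_t b)
    {
        uint64_t r = 0;
        for (int i = 0; i < 32; ++i)
            if ((b >> i) & 1)
                r ^= (uint64_t)a << i;
        return r;
    }

    int main()
    {
        uint32_t const a = 0xdeadbeefu;
        uint32_t const b = 0x12345678u;
        std::printf("masked: %016llx\n", (unsigned long long)clmul32_masked(a, b));
        std::printf("naive:  %016llx\n", (unsigned long long)clmul32_naive(a, b));
        return 0;
    }

Both functions print the same value. The patches perform the same per-column computation four lanes at a time with u32x4/u64x4 vectors and combine nine such 32x32 carry-less products through the Karatsuba decomposition described in the galois_multiply comment.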