diff --git a/simd/keccakf1600/f1600x.go b/simd/keccakf1600/f1600x.go
new file mode 100644
index 000000000..5f32e59fa
--- /dev/null
+++ b/simd/keccakf1600/f1600x.go
@@ -0,0 +1,146 @@
+// Package keccakf1600 provides a two and four-way Keccak-f[1600] permutation in parallel.
+//
+// Keccak-f[1600] is the permutation underlying several algorithms such as
+// Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
+// useful in some scenarios like in hash-based signatures.
+//
+// Limitations
+//
+// Note that not all the architectures support SIMD instructions. This package
+// uses AVX2 instructions that are available in some AMD64 architectures
+// and NEON instructions that are available in some ARM64 architectures.
+//
+// For those systems not supporting these, the package still provides the
+// expected functionality by means of a generic and slow implementation.
+// The recommendation is to beforehand verify IsEnabledX4() and IsEnabledX2()
+// to determine if the current system supports the SIMD implementation.
+package keccakf1600
+
+import (
+	"unsafe"
+
+	"github.com/cloudflare/circl/internal/sha3"
+	"golang.org/x/sys/cpu"
+)
+
+// StateX4 contains state for the four-way permutation including the four
+// interleaved [25]uint64 buffers. Call Initialize() before use to initialize
+// and get a pointer to the interleaved buffer.
+type StateX4 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance. Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 4 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [103]uint64
+
+	// Offset into a that is 32 byte aligned.
+	offset int
+}
+
+// StateX2 contains state for the two-way permutation including the two
+// interleaved [25]uint64 buffers. Call Initialize() before use to initialize
+// and get a pointer to the interleaved buffer.
+type StateX2 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance. Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 2 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [53]uint64
+
+	// Offset into a that is 32 byte aligned.
+	offset int
+}
+
+// IsEnabledX4 returns true if the architecture supports a four-way SIMD
+// implementation provided in this package.
+func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
+
+// IsEnabledX2 returns true if the architecture supports a two-way SIMD
+// implementation provided in this package.
+func IsEnabledX2() bool { return cpu.ARM64.HasSHA3 }
+
+// Initialize the state and returns the buffer on which the four permutations
+// will act: a uint64 slice of length 100. The first permutation will act
+// on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
+func (s *StateX4) Initialize() []uint64 {
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8. Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	if rem != 0 {
+		s.offset = 4 - rem
+	}
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+100]
+}
+
+// Initialize the state and returns the buffer on which the two permutations
+// will act: a uint64 slice of length 50. The first permutation will act
+// on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
+func (s *StateX2) Initialize() []uint64 {
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8. Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	if rem != 0 {
+		s.offset = 4 - rem
+	}
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+50]
+}
+
+// Permute performs the four parallel Keccak-f[1600]s interleaved on the slice
+// returned from Initialize().
+func (s *StateX4) Permute() {
+	if IsEnabledX4() {
+		permuteSIMDx4(s.a[s.offset:])
+	} else {
+		permuteScalarX4(s.a[s.offset:]) // A slower generic implementation.
+	}
+}
+
+// Permute performs the two parallel Keccak-f[1600]s interleaved on the slice
+// returned from Initialize().
+func (s *StateX2) Permute() {
+	if IsEnabledX2() {
+		permuteSIMDx2(s.a[s.offset:])
+	} else {
+		permuteScalarX2(s.a[s.offset:]) // A slower generic implementation.
+	}
+}
+
+func permuteScalarX4(a []uint64) {
+	var buf [25]uint64
+	for i := 0; i < 4; i++ {
+		for j := 0; j < 25; j++ {
+			buf[j] = a[4*j+i]
+		}
+		sha3.KeccakF1600(&buf)
+		for j := 0; j < 25; j++ {
+			a[4*j+i] = buf[j]
+		}
+	}
+}
+
+func permuteScalarX2(a []uint64) {
+	var buf [25]uint64
+	for i := 0; i < 2; i++ {
+		for j := 0; j < 25; j++ {
+			buf[j] = a[2*j+i]
+		}
+		sha3.KeccakF1600(&buf)
+		for j := 0; j < 25; j++ {
+			a[2*j+i] = buf[j]
+		}
+	}
+}
diff --git a/simd/keccakf1600/f1600x2_arm64.go b/simd/keccakf1600/f1600x2_arm64.go
new file mode 100644
index 000000000..99fe7a2a7
--- /dev/null
+++ b/simd/keccakf1600/f1600x2_arm64.go
@@ -0,0 +1,12 @@
+// +build arm64,go1.16
+
+package keccakf1600
+
+import "github.com/cloudflare/circl/internal/sha3"
+
+func permuteSIMDx2(state []uint64) { f1600x2ARM(&state[0], &sha3.RC) }
+
+func permuteSIMDx4(state []uint64) { permuteScalarX4(state) }
+
+//go:noescape
+func f1600x2ARM(state *uint64, rc *[24]uint64)
diff --git a/simd/keccakf1600/f1600x2_arm64.s b/simd/keccakf1600/f1600x2_arm64.s
new file mode 100644
index 000000000..1e8547f9b
--- /dev/null
+++ b/simd/keccakf1600/f1600x2_arm64.s
@@ -0,0 +1,130 @@
+// +build arm64,go1.16
+
+// Taken from https://github.com/bwesterb/armed-keccak
+
+#include "textflag.h"
+
+// func f1600x2ARM(state *uint64, rc *[24]uint64)
+TEXT ·f1600x2ARM(SB), NOSPLIT, $0-16
+	MOVD state+0(FP), R0
+	MOVD rc+8(FP), R1
+	MOVD R0, R2
+	MOVD $24, R3
+
+	VLD1.P 64(R0), [ V0.B16, V1.B16, V2.B16, V3.B16]
+	VLD1.P 64(R0), [ V4.B16, V5.B16, V6.B16, V7.B16]
+	VLD1.P 64(R0), [ V8.B16, V9.B16, V10.B16, V11.B16]
+	VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
+	VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
+	VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
+	VLD1.P (R0), [V24.B16]
+
+loop:
+	// Execute theta but without xorring into the state yet.
+	VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
+	VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
+	VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
+	VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
+	VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
+
+	VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
+	VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
+	VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
+	VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
+	VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
+
+	// Xor parities from step theta into the state at the same time as
+	// executing rho and pi.
+	VRAX1 V26.D2, V29.D2, V30.D2
+	VRAX1 V29.D2, V27.D2, V29.D2
+	VRAX1 V27.D2, V25.D2, V27.D2
+	VRAX1 V25.D2, V28.D2, V25.D2
+	VRAX1 V28.D2, V26.D2, V28.D2
+
+	VEOR V30.B16, V0.B16, V0.B16
+	VMOV V1.B16, V31.B16
+
+	VXAR $20, V27.D2, V6.D2, V1.D2
+	VXAR $44, V25.D2, V9.D2, V6.D2
+	VXAR $3 , V28.D2, V22.D2, V9.D2
+	VXAR $25, V25.D2, V14.D2, V22.D2
+	VXAR $46, V30.D2, V20.D2, V14.D2
+	VXAR $2 , V28.D2, V2.D2, V20.D2
+	VXAR $21, V28.D2, V12.D2, V2.D2
+	VXAR $39, V29.D2, V13.D2, V12.D2
+	VXAR $56, V25.D2, V19.D2, V13.D2
+	VXAR $8 , V29.D2, V23.D2, V19.D2
+	VXAR $23, V30.D2, V15.D2, V23.D2
+	VXAR $37, V25.D2, V4.D2, V15.D2
+	VXAR $50, V25.D2, V24.D2, V4.D2
+	VXAR $62, V27.D2, V21.D2, V24.D2
+	VXAR $9 , V29.D2, V8.D2, V21.D2
+	VXAR $19, V27.D2, V16.D2, V8.D2
+	VXAR $28, V30.D2, V5.D2, V16.D2
+	VXAR $36, V29.D2, V3.D2, V5.D2
+	VXAR $43, V29.D2, V18.D2, V3.D2
+	VXAR $49, V28.D2, V17.D2, V18.D2
+	VXAR $54, V27.D2, V11.D2, V17.D2
+	VXAR $58, V28.D2, V7.D2, V11.D2
+	VXAR $61, V30.D2, V10.D2, V7.D2
+	VXAR $63, V27.D2, V31.D2, V10.D2
+ + // Chi + VBCAX V1.B16, V2.B16, V0.B16, V25.B16 + VBCAX V2.B16, V3.B16, V1.B16, V26.B16 + VBCAX V3.B16, V4.B16, V2.B16, V2.B16 + VBCAX V4.B16, V0.B16, V3.B16, V3.B16 + VBCAX V0.B16, V1.B16, V4.B16, V4.B16 + VMOV V25.B16, V0.B16 + VMOV V26.B16, V1.B16 + + VBCAX V6.B16, V7.B16, V5.B16, V25.B16 + VBCAX V7.B16, V8.B16, V6.B16, V26.B16 + VBCAX V8.B16, V9.B16, V7.B16, V7.B16 + VBCAX V9.B16, V5.B16, V8.B16, V8.B16 + VBCAX V5.B16, V6.B16, V9.B16, V9.B16 + VMOV V25.B16, V5.B16 + VMOV V26.B16, V6.B16 + + VBCAX V11.B16, V12.B16, V10.B16, V25.B16 + VBCAX V12.B16, V13.B16, V11.B16, V26.B16 + VBCAX V13.B16, V14.B16, V12.B16, V12.B16 + VBCAX V14.B16, V10.B16, V13.B16, V13.B16 + VBCAX V10.B16, V11.B16, V14.B16, V14.B16 + VMOV V25.B16, V10.B16 + VMOV V26.B16, V11.B16 + + VBCAX V16.B16, V17.B16, V15.B16, V25.B16 + VBCAX V17.B16, V18.B16, V16.B16, V26.B16 + VBCAX V18.B16, V19.B16, V17.B16, V17.B16 + VBCAX V19.B16, V15.B16, V18.B16, V18.B16 + VBCAX V15.B16, V16.B16, V19.B16, V19.B16 + VMOV V25.B16, V15.B16 + VMOV V26.B16, V16.B16 + + VBCAX V21.B16, V22.B16, V20.B16, V25.B16 + VBCAX V22.B16, V23.B16, V21.B16, V26.B16 + VBCAX V23.B16, V24.B16, V22.B16, V22.B16 + VBCAX V24.B16, V20.B16, V23.B16, V23.B16 + VBCAX V20.B16, V21.B16, V24.B16, V24.B16 + VMOV V25.B16, V20.B16 + VMOV V26.B16, V21.B16 + + // Iota + VLD1R.P 8(R1), [V25.D2] + VEOR V25.B16, V0.B16, V0.B16 + + SUBS $1, R3, R3 + CBNZ R3, loop + + MOVD R2, R0 + + VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R0) + VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R0) + VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R0) + VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0) + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0) + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0) + VST1.P [V24.B16], (R0) + + RET diff --git a/simd/keccakf1600/f1600x4.go b/simd/keccakf1600/f1600x4.go deleted file mode 100644 index 58838233d..000000000 --- a/simd/keccakf1600/f1600x4.go +++ /dev/null @@ -1,84 +0,0 @@ -// Package keccakf1600 provides a four-way 
Keccak-f[1600] permutation in parallel. -// -// Keccak-f[1600] is the permutation underlying several algorithms such as -// Keccak, SHA3 and SHAKE. Running four permutations in parallel is useful in -// some scenarios like in hash-based signatures. -// -// Limitations -// -// Note that not all the architectures support SIMD instructions. This package -// uses AVX2 instructions that are available in some AMD64 architectures. -// -// For those systems not supporting AVX2, the package still provides the -// expected functionality by means of a generic and slow implementation. -// The recommendation is to beforehand verify IsEnabledX4() to determine if -// the current system supports the SIMD implementation. -package keccakf1600 - -import ( - "unsafe" - - "github.com/cloudflare/circl/internal/sha3" - "golang.org/x/sys/cpu" -) - -// StateX4 contains state for the four-way permutation including the four -// interleaved [25]uint64 buffers. Call Initialize() before use to initialize -// and get a pointer to the interleaved buffer. -type StateX4 struct { - // Go guarantees a to be aligned on 8 bytes, whereas we need it to be - // aligned on 32 bytes for bet performance. Thus we leave some headroom - // to be able to move the start of the state. - - // 4 x 25 uint64s for the interleaved states and three uint64s headroom - // to fix alignment. - a [103]uint64 - - // Offset into a that is 32 byte aligned. - offset int -} - -// IsEnabledX4 returns true if the architecture supports a four-way SIMD -// implementation provided in this package. -func IsEnabledX4() bool { return cpu.X86.HasAVX2 } - -// Initialize the state and returns the buffer on which the four permutations -// will act: a uint64 slice of length 100. The first permutation will act -// on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc. -func (s *StateX4) Initialize() []uint64 { - rp := unsafe.Pointer(&s.a[0]) - - // uint64s are always aligned by a multiple of 8. 
Compute the remainder - // of the address modulo 32 divided by 8. - rem := (int(uintptr(rp)&31) >> 3) - - if rem != 0 { - s.offset = 4 - rem - } - - // The slice we return will be aligned on 32 byte boundary. - return s.a[s.offset : s.offset+100] -} - -// Permute performs the four parallel Keccak-f[1600]s interleaved on the slice -// returned from Initialize(). -func (s *StateX4) Permute() { - if IsEnabledX4() { - permuteSIMD(s.a[s.offset:]) - } else { - permuteScalar(s.a[s.offset:]) // A slower generic implementation. - } -} - -func permuteScalar(a []uint64) { - var buf [25]uint64 - for i := 0; i < 4; i++ { - for j := 0; j < 25; j++ { - buf[j] = a[4*j+i] - } - sha3.KeccakF1600(&buf) - for j := 0; j < 25; j++ { - a[4*j+i] = buf[j] - } - } -} diff --git a/simd/keccakf1600/f1600x4_amd64.go b/simd/keccakf1600/f1600x4_amd64.go index 0ea57a4ff..ac5c658d5 100644 --- a/simd/keccakf1600/f1600x4_amd64.go +++ b/simd/keccakf1600/f1600x4_amd64.go @@ -2,4 +2,6 @@ package keccakf1600 import "github.com/cloudflare/circl/internal/sha3" -func permuteSIMD(state []uint64) { f1600x4AVX2(&state[0], &sha3.RC) } +func permuteSIMDx4(state []uint64) { f1600x4AVX2(&state[0], &sha3.RC) } + +func permuteSIMDx2(state []uint64) { permuteScalarX2(state) } diff --git a/simd/keccakf1600/f1600x4_test.go b/simd/keccakf1600/f1600x_test.go similarity index 57% rename from simd/keccakf1600/f1600x4_test.go rename to simd/keccakf1600/f1600x_test.go index 866a1ae2a..09e83bba6 100644 --- a/simd/keccakf1600/f1600x4_test.go +++ b/simd/keccakf1600/f1600x_test.go @@ -15,6 +15,29 @@ var permutationOfZeroes = [25]uint64{ 0xEAF1FF7B5CECA249, } +func TestKeccakF1600x2(t *testing.T) { + test := func(t *testing.T, f func(s *StateX2, a []uint64)) { + t.Helper() + var state StateX2 + a := state.Initialize() + f(&state, a) + for i := 0; i < 25; i++ { + for j := 0; j < 2; j++ { + if a[2*i+j] != permutationOfZeroes[i] { + t.Fatalf("%X", a) + } + } + } + } + + t.Run("Generic", func(t *testing.T) { + test(t, func(s 
*StateX2, a []uint64) { permuteScalarX2(a) }) + }) + t.Run("SIMD", func(t *testing.T) { + test(t, func(s *StateX2, a []uint64) { s.Permute() }) + }) +} + func TestKeccakF1600x4(t *testing.T) { test := func(t *testing.T, f func(s *StateX4, a []uint64)) { t.Helper() @@ -31,13 +54,31 @@ func TestKeccakF1600x4(t *testing.T) { } t.Run("Generic", func(t *testing.T) { - test(t, func(s *StateX4, a []uint64) { permuteScalar(a) }) + test(t, func(s *StateX4, a []uint64) { permuteScalarX4(a) }) }) t.Run("SIMD", func(t *testing.T) { test(t, func(s *StateX4, a []uint64) { s.Permute() }) }) } +func BenchmarkF1600x2(b *testing.B) { + benchmark := func(b *testing.B, f func(s *StateX2, a []uint64)) { + var state StateX2 + a := state.Initialize() + + for i := 0; i < b.N; i++ { + f(&state, a) + } + } + + b.Run("Generic", func(b *testing.B) { + benchmark(b, func(s *StateX2, a []uint64) { permuteScalarX2(a) }) + }) + b.Run("SIMD", func(b *testing.B) { + benchmark(b, func(s *StateX2, a []uint64) { s.Permute() }) + }) +} + func BenchmarkF1600x4(b *testing.B) { benchmark := func(b *testing.B, f func(s *StateX4, a []uint64)) { var state StateX4 @@ -49,7 +90,7 @@ func BenchmarkF1600x4(b *testing.B) { } b.Run("Generic", func(b *testing.B) { - benchmark(b, func(s *StateX4, a []uint64) { permuteScalar(a) }) + benchmark(b, func(s *StateX4, a []uint64) { permuteScalarX4(a) }) }) b.Run("SIMD", func(b *testing.B) { benchmark(b, func(s *StateX4, a []uint64) { s.Permute() }) diff --git a/simd/keccakf1600/fallback.go b/simd/keccakf1600/fallback.go index f4918011f..70b3070bd 100644 --- a/simd/keccakf1600/fallback.go +++ b/simd/keccakf1600/fallback.go @@ -1,5 +1,7 @@ -// +build !amd64 +// +build !amd64,!arm64 arm64,!go1.16 package keccakf1600 -func permuteSIMD(state []uint64) { permuteScalar(state) } +func permuteSIMDx2(state []uint64) { permuteScalarX2(state) } + +func permuteSIMDx4(state []uint64) { permuteScalarX4(state) }