cloudflare · armfazh · Dec 8, 2020 · Nov 25, 2020
diff --git a/simd/keccakf1600/f1600x.go b/simd/keccakf1600/f1600x.go
@@ -0,0 +1,146 @@
+// Package keccakf1600 provides two-way and four-way parallel Keccak-f[1600] permutations.
+//
+// Keccak-f[1600] is the permutation underlying several algorithms such as
+// Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
+// useful in some scenarios like in hash-based signatures.
+//
+// Limitations
+//
+// Note that not all architectures support SIMD instructions. This package
+// uses AVX2 instructions that are available in some AMD64 architectures
+// and NEON/SHA3 instructions that are available in some ARM64 architectures.
+//
+// For those systems not supporting these, the package still provides the
+// expected functionality by means of a generic and slow implementation.
+// It is recommended to check IsEnabledX4() and IsEnabledX2() beforehand
+// to determine if the current system supports the SIMD implementation.
+package keccakf1600
+
+import (
+	"unsafe"
+
+	"github.com/cloudflare/circl/internal/sha3"
+	"golang.org/x/sys/cpu"
+)
+
+// StateX4 contains state for the four-way permutation including the four
+// interleaved [25]uint64 buffers. Call Initialize() before use to fix the
+// alignment offset and get a slice over the interleaved buffer.
+type StateX4 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance.  Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 4 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [103]uint64
+
+	// Offset into a, in uint64s, that is 32 byte aligned; set by Initialize().
+	offset int
+}
+
+// StateX2 contains state for the two-way permutation including the two
+// interleaved [25]uint64 buffers. Call Initialize() before use to fix the
+// alignment offset and get a slice over the interleaved buffer.
+type StateX2 struct {
+	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
+	// aligned on 32 bytes for best performance.  Thus we leave some headroom
+	// to be able to move the start of the state.
+
+	// 2 x 25 uint64s for the interleaved states and three uint64s headroom
+	// to fix alignment.
+	a [53]uint64
+
+	// Offset into a, in uint64s, that is 32 byte aligned; set by Initialize().
+	offset int
+}
+
+// IsEnabledX4 reports whether the four-way SIMD implementation in this
+// package can be used, i.e. whether the CPU is detected to support AVX2.
+func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
+
+// IsEnabledX2 reports whether the two-way SIMD implementation in this
+// package can be used, i.e. whether the ARM64 SHA3 feature is detected.
+func IsEnabledX2() bool { return cpu.ARM64.HasSHA3 }
+
+// Initialize aligns the state and returns the buffer on which the four
+// permutations will act: a uint64 slice of length 100.  The first acts on
+// {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
+func (s *StateX4) Initialize() []uint64 {
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8.  Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	// Recompute unconditionally (0 when already aligned) so that an offset
+	// left over from a copied or previously initialized state is discarded.
+	s.offset = (4 - rem) & 3
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+100]
+}
+
+// Initialize aligns the state and returns the buffer on which the two
+// permutations will act: a uint64 slice of length 50.  The first acts on
+// {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
+func (s *StateX2) Initialize() []uint64 {
+	rp := unsafe.Pointer(&s.a[0])
+
+	// uint64s are always aligned by a multiple of 8.  Compute the remainder
+	// of the address modulo 32 divided by 8.
+	rem := (int(uintptr(rp)&31) >> 3)
+
+	// Recompute unconditionally (0 when already aligned) so that an offset
+	// left over from a copied or previously initialized state is discarded.
+	s.offset = (4 - rem) & 3
+
+	// The slice we return will be aligned on 32 byte boundary.
+	return s.a[s.offset : s.offset+50]
+}
+
+// Permute runs the four interleaved Keccak-f[1600] permutations in place on
+// the buffer returned from Initialize().
+func (s *StateX4) Permute() {
+	if state := s.a[s.offset:]; IsEnabledX4() {
+		permuteSIMDx4(state)
+	} else {
+		permuteScalarX4(state) // Portable fallback; slower than the SIMD path.
+	}
+}
+
+// Permute runs the two interleaved Keccak-f[1600] permutations in place on
+// the buffer returned from Initialize().
+func (s *StateX2) Permute() {
+	if state := s.a[s.offset:]; IsEnabledX2() {
+		permuteSIMDx2(state)
+	} else {
+		permuteScalarX2(state) // Portable fallback; slower than the SIMD path.
+	}
+}
+
+func permuteScalarX4(a []uint64) {
+	var lane [25]uint64 // scratch copy of one de-interleaved state
+	for way := 0; way < 4; way++ {
+		for k := range lane {
+			lane[k] = a[4*k+way] // gather: state `way` is every 4th word of a
+		}
+		sha3.KeccakF1600(&lane)
+		for k := range lane {
+			a[4*k+way] = lane[k] // scatter the permuted words back in place
+		}
+	}
+}
+
+func permuteScalarX2(a []uint64) {
+	var lane [25]uint64 // scratch copy of one de-interleaved state
+	for way := 0; way < 2; way++ {
+		for k := range lane {
+			lane[k] = a[2*k+way] // gather: state `way` is every 2nd word of a
+		}
+		sha3.KeccakF1600(&lane)
+		for k := range lane {
+			a[2*k+way] = lane[k] // scatter the permuted words back in place
+		}
+	}
+}
diff --git a/simd/keccakf1600/f1600x2_arm64.go b/simd/keccakf1600/f1600x2_arm64.go
@@ -0,0 +1,12 @@
+// +build arm64,go1.16
+
+package keccakf1600
+
+import "github.com/cloudflare/circl/internal/sha3"
+
+func permuteSIMDx2(state []uint64) { f1600x2ARM(&state[0], &sha3.RC) } // assembly kernel; reads and writes 50 uint64s starting at state[0]
+
+func permuteSIMDx4(state []uint64) { permuteScalarX4(state) } // no four-way kernel in this arm64 file; use the scalar code
+
+//go:noescape
+func f1600x2ARM(state *uint64, rc *[24]uint64) // implemented in f1600x2_arm64.s; rc supplies the 24 round constants
diff --git a/simd/keccakf1600/f1600x2_arm64.s b/simd/keccakf1600/f1600x2_arm64.s
@@ -0,0 +1,130 @@
+// +build arm64,go1.16
+
+// Taken from https://github.com/bwesterb/armed-keccak
+
+#include "textflag.h"
+
+// func f1600x2ARM(state *uint64, rc *[24]uint64)
+TEXT ·f1600x2ARM(SB), NOSPLIT, $0-16
+    MOVD state+0(FP), R0 // R0 = state: 50 uint64s holding two interleaved states
+    MOVD rc+8(FP), R1 // R1 = pointer to the next round constant
+    MOVD R0, R2 // R2 keeps the base address; R0 is advanced by the loads below
+    MOVD $24, R3 // R3 = number of rounds left
+
+    VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]
+    VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]
+    VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
+    VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
+    VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
+    VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
+    VLD1.P (R0),   [V24.B16] // Vi now holds word i of both states
+
+loop:
+    // Theta: compute the five column parities (V25-V29), not yet xored into the state.
+    VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
+    VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
+    VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
+    VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
+    VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
+
+    VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
+    VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
+    VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
+    VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
+    VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
+
+    // Xor parities from step theta into the state at the same time as
+    // executing rho and pi.
+    VRAX1 V26.D2, V29.D2, V30.D2
+    VRAX1 V29.D2, V27.D2, V29.D2
+    VRAX1 V27.D2, V25.D2, V27.D2
+    VRAX1 V25.D2, V28.D2, V25.D2
+    VRAX1 V28.D2, V26.D2, V28.D2
+
+    VEOR V30.B16, V0.B16, V0.B16
+    VMOV V1.B16, V31.B16 // save word 1: V1 is overwritten first below but read last (via V31)
+
+    VXAR $20, V27.D2,  V6.D2,  V1.D2   
+    VXAR $44, V25.D2,  V9.D2,  V6.D2   
+    VXAR $3 , V28.D2, V22.D2,  V9.D2   
+    VXAR $25, V25.D2, V14.D2, V22.D2  
+    VXAR $46, V30.D2, V20.D2, V14.D2  
+    VXAR $2 , V28.D2,  V2.D2, V20.D2  
+    VXAR $21, V28.D2, V12.D2,  V2.D2  
+    VXAR $39, V29.D2, V13.D2, V12.D2  
+    VXAR $56, V25.D2, V19.D2, V13.D2  
+    VXAR $8 , V29.D2, V23.D2, V19.D2  
+    VXAR $23, V30.D2, V15.D2, V23.D2  
+    VXAR $37, V25.D2,  V4.D2, V15.D2  
+    VXAR $50, V25.D2, V24.D2,  V4.D2   
+    VXAR $62, V27.D2, V21.D2, V24.D2  
+    VXAR $9 , V29.D2,  V8.D2, V21.D2  
+    VXAR $19, V27.D2, V16.D2,  V8.D2   
+    VXAR $28, V30.D2,  V5.D2, V16.D2  
+    VXAR $36, V29.D2,  V3.D2,  V5.D2   
+    VXAR $43, V29.D2, V18.D2,  V3.D2   
+    VXAR $49, V28.D2, V17.D2, V18.D2  
+    VXAR $54, V27.D2, V11.D2, V17.D2  
+    VXAR $58, V28.D2,  V7.D2, V11.D2  
+    VXAR $61, V30.D2, V10.D2,  V7.D2   
+    VXAR $63, V27.D2, V31.D2, V10.D2  
+
+    // Chi, one row of five words at a time; V25/V26 are temporaries for the first two outputs.
+    VBCAX V1.B16, V2.B16, V0.B16, V25.B16
+    VBCAX V2.B16, V3.B16, V1.B16, V26.B16
+    VBCAX V3.B16, V4.B16, V2.B16,  V2.B16
+    VBCAX V4.B16, V0.B16, V3.B16,  V3.B16
+    VBCAX V0.B16, V1.B16, V4.B16,  V4.B16
+    VMOV V25.B16, V0.B16
+    VMOV V26.B16, V1.B16
+
+    VBCAX V6.B16, V7.B16, V5.B16, V25.B16
+    VBCAX V7.B16, V8.B16, V6.B16, V26.B16
+    VBCAX V8.B16, V9.B16, V7.B16,  V7.B16
+    VBCAX V9.B16, V5.B16, V8.B16,  V8.B16
+    VBCAX V5.B16, V6.B16, V9.B16,  V9.B16
+    VMOV V25.B16, V5.B16
+    VMOV V26.B16, V6.B16
+
+    VBCAX V11.B16, V12.B16, V10.B16, V25.B16
+    VBCAX V12.B16, V13.B16, V11.B16, V26.B16
+    VBCAX V13.B16, V14.B16, V12.B16, V12.B16
+    VBCAX V14.B16, V10.B16, V13.B16, V13.B16
+    VBCAX V10.B16, V11.B16, V14.B16, V14.B16
+    VMOV V25.B16, V10.B16
+    VMOV V26.B16, V11.B16
+
+    VBCAX V16.B16, V17.B16, V15.B16, V25.B16
+    VBCAX V17.B16, V18.B16, V16.B16, V26.B16
+    VBCAX V18.B16, V19.B16, V17.B16, V17.B16
+    VBCAX V19.B16, V15.B16, V18.B16, V18.B16
+    VBCAX V15.B16, V16.B16, V19.B16, V19.B16
+    VMOV V25.B16, V15.B16
+    VMOV V26.B16, V16.B16
+
+    VBCAX V21.B16, V22.B16, V20.B16, V25.B16
+    VBCAX V22.B16, V23.B16, V21.B16, V26.B16
+    VBCAX V23.B16, V24.B16, V22.B16, V22.B16
+    VBCAX V24.B16, V20.B16, V23.B16, V23.B16
+    VBCAX V20.B16, V21.B16, V24.B16, V24.B16
+    VMOV V25.B16, V20.B16
+    VMOV V26.B16, V21.B16
+
+    // Iota: xor this round's constant (replicated to both lanes) into word 0.
+    VLD1R.P 8(R1), [V25.D2]
+    VEOR V25.B16, V0.B16, V0.B16
+
+    SUBS $1, R3, R3
+    CBNZ R3, loop
+
+    MOVD R2, R0 // rewind R0 to the start of the state for the stores
+
+    VST1.P [ V0.B16,  V1.B16,  V2.B16,  V3.B16], 64(R0) 
+    VST1.P [ V4.B16,  V5.B16,  V6.B16,  V7.B16], 64(R0)
+    VST1.P [ V8.B16,  V9.B16, V10.B16, V11.B16], 64(R0)
+    VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
+    VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
+    VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
+    VST1.P [V24.B16], (R0)
+
+    RET
diff --git a/simd/keccakf1600/f1600x4.go b/simd/keccakf1600/f1600x4.go
diff --git a/simd/keccakf1600/f1600x4_amd64.go b/simd/keccakf1600/f1600x4_amd64.go
@@ -2,4 +2,6 @@ package keccakf1600
 
 import "github.com/cloudflare/circl/internal/sha3"
 
-func permuteSIMD(state []uint64) { f1600x4AVX2(&state[0], &sha3.RC) }
+func permuteSIMDx4(state []uint64) { f1600x4AVX2(&state[0], &sha3.RC) } // f1600x4AVX2 is declared outside this diff; state must be non-empty
+
+func permuteSIMDx2(state []uint64) { permuteScalarX2(state) } // no two-way kernel in this amd64 file; use the scalar code