Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

simd: add two-way f1600 using SHA3 extensions for arm64 #196

Merged
merged 1 commit into from
Dec 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions simd/keccakf1600/f1600x.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
// Package keccakf1600 provides a two and four-way Keccak-f[1600] permutation in parallel.
//
// Keccak-f[1600] is the permutation underlying several algorithms such as
// Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
// useful in some scenarios like in hash-based signatures.
//
// Limitations
//
// Note that not all the architectures support SIMD instructions. This package
// uses AVX2 instructions that are available in some AMD64 architectures
// and NEON instructions with the SHA3 extension that are available in some
// ARM64 architectures.
//
// For those systems not supporting these, the package still provides the
// expected functionality by means of a generic and slow implementation.
// It is recommended to check IsEnabledX4() and IsEnabledX2() beforehand
// to determine if the current system supports the SIMD implementation.
package keccakf1600

import (
"unsafe"

"github.com/cloudflare/circl/internal/sha3"
"golang.org/x/sys/cpu"
)

// StateX4 holds the state of the four-way permutation: four interleaved
// [25]uint64 buffers. Call Initialize() before use; it returns the
// interleaved buffer the permutations operate on.
type StateX4 struct {
	// Backing storage: 4 x 25 uint64s for the interleaved states plus three
	// spare uint64s. Go only guarantees that a is aligned on 8 bytes, whereas
	// 32-byte alignment gives the best performance; the spare words leave
	// room to shift the start of the state onto a 32-byte boundary.
	a [4*25 + 3]uint64

	// Index into a at which the 32-byte aligned state begins.
	offset int
}

// StateX2 holds the state of the two-way permutation: two interleaved
// [25]uint64 buffers. Call Initialize() before use; it returns the
// interleaved buffer the permutations operate on.
type StateX2 struct {
	// Backing storage: 2 x 25 uint64s for the interleaved states plus three
	// spare uint64s. Go only guarantees that a is aligned on 8 bytes, whereas
	// 32-byte alignment gives the best performance; the spare words leave
	// room to shift the start of the state onto a 32-byte boundary.
	a [2*25 + 3]uint64

	// Index into a at which the 32-byte aligned state begins.
	offset int
}

// IsEnabledX4 reports whether the four-way SIMD implementation provided in
// this package is supported on the current architecture (AVX2 on x86).
func IsEnabledX4() bool {
	hasAVX2 := cpu.X86.HasAVX2
	return hasAVX2
}

// IsEnabledX2 reports whether the two-way SIMD implementation provided in
// this package is supported on the current architecture (SHA3 extension on
// ARM64).
func IsEnabledX2() bool {
	hasSHA3 := cpu.ARM64.HasSHA3
	return hasSHA3
}

// Initialize prepares the state for use and returns the buffer on which the
// four permutations act: a uint64 slice of length 100 that starts on a
// 32-byte boundary. The first permutation acts on {a[0], a[4], ..., a[96]},
// the second on {a[1], a[5], ..., a[97]}, etc.
//
// Initialize only (re)computes the alignment offset; it does not modify the
// contents of the buffer.
func (s *StateX4) Initialize() []uint64 {
	addr := uintptr(unsafe.Pointer(&s.a[0]))

	// uint64s are always aligned on a multiple of 8 bytes, so the distance
	// past the previous 32-byte boundary, counted in uint64s, is in 0..3.
	rem := int(addr&31) >> 3

	// Number of uint64s to skip to reach the next 32-byte boundary. This is
	// assigned unconditionally: when rem == 0 the offset has to be reset to
	// 0, otherwise a stale non-zero offset (for instance one carried along
	// when an initialized state is copied to a differently aligned address)
	// would survive and the returned slice would be misaligned.
	s.offset = (4 - rem) & 3

	// The slice we return is aligned on a 32-byte boundary.
	return s.a[s.offset : s.offset+100]
}

// Initialize prepares the state for use and returns the buffer on which the
// two permutations act: a uint64 slice of length 50 that starts on a
// 32-byte boundary. The first permutation acts on {a[0], a[2], ..., a[48]}
// and the second on {a[1], a[3], ..., a[49]}.
//
// Initialize only (re)computes the alignment offset; it does not modify the
// contents of the buffer.
func (s *StateX2) Initialize() []uint64 {
	addr := uintptr(unsafe.Pointer(&s.a[0]))

	// uint64s are always aligned on a multiple of 8 bytes, so the distance
	// past the previous 32-byte boundary, counted in uint64s, is in 0..3.
	rem := int(addr&31) >> 3

	// Number of uint64s to skip to reach the next 32-byte boundary. This is
	// assigned unconditionally: when rem == 0 the offset has to be reset to
	// 0, otherwise a stale non-zero offset (for instance one carried along
	// when an initialized state is copied to a differently aligned address)
	// would survive and the returned slice would be misaligned.
	s.offset = (4 - rem) & 3

	// The slice we return is aligned on a 32-byte boundary.
	return s.a[s.offset : s.offset+50]
}

// Permute applies the four parallel, interleaved Keccak-f[1600] permutations
// to the buffer returned from Initialize().
func (s *StateX4) Permute() {
	state := s.a[s.offset:]
	if !IsEnabledX4() {
		// No four-way SIMD support: use the slower generic implementation.
		permuteScalarX4(state)
		return
	}
	permuteSIMDx4(state)
}

// Permute applies the two parallel, interleaved Keccak-f[1600] permutations
// to the buffer returned from Initialize().
func (s *StateX2) Permute() {
	state := s.a[s.offset:]
	if !IsEnabledX2() {
		// No two-way SIMD support: use the slower generic implementation.
		permuteScalarX2(state)
		return
	}
	permuteSIMDx2(state)
}

// permuteScalarX4 is the generic four-way implementation: for each of the
// four interleaved states in a, it gathers the 25 words of that state into a
// contiguous buffer, runs sha3.KeccakF1600 on it and scatters the result
// back into a.
func permuteScalarX4(a []uint64) {
	const ways = 4

	var lanes [25]uint64
	for way := 0; way < ways; way++ {
		// Word k of state number way lives at a[ways*k+way].
		for k := range lanes {
			lanes[k] = a[ways*k+way]
		}
		sha3.KeccakF1600(&lanes)
		for k, v := range lanes {
			a[ways*k+way] = v
		}
	}
}

// permuteScalarX2 is the generic two-way implementation: for each of the
// two interleaved states in a, it gathers the 25 words of that state into a
// contiguous buffer, runs sha3.KeccakF1600 on it and scatters the result
// back into a.
func permuteScalarX2(a []uint64) {
	const ways = 2

	var lanes [25]uint64
	for way := 0; way < ways; way++ {
		// Word k of state number way lives at a[ways*k+way].
		for k := range lanes {
			lanes[k] = a[ways*k+way]
		}
		sha3.KeccakF1600(&lanes)
		for k, v := range lanes {
			a[ways*k+way] = v
		}
	}
}
12 changes: 12 additions & 0 deletions simd/keccakf1600/f1600x2_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// +build arm64,go1.16

package keccakf1600

import "github.com/cloudflare/circl/internal/sha3"

// permuteSIMDx2 runs the two-way assembly permutation f1600x2ARM on state,
// passing the round constants in sha3.RC. The assembly reads and writes 50
// uint64s starting at state[0].
func permuteSIMDx2(state []uint64) {
	first := &state[0]
	f1600x2ARM(first, &sha3.RC)
}

// permuteSIMDx4 has no SIMD implementation in this file; it forwards to the
// generic permuteScalarX4.
func permuteSIMDx4(state []uint64) {
	permuteScalarX4(state)
}

// f1600x2ARM is implemented in assembly. It applies 24 rounds of
// Keccak-f[1600] in place to two interleaved states: it reads and writes 50
// uint64s (400 bytes) starting at state and consumes one round constant
// from rc per round.
//
//go:noescape
func f1600x2ARM(state *uint64, rc *[24]uint64)
130 changes: 130 additions & 0 deletions simd/keccakf1600/f1600x2_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
// +build arm64,go1.16

// Taken from https://github.com/bwesterb/armed-keccak

#include "textflag.h"

// func f1600x2ARM(state *uint64, rc *[24]uint64)
//
// Applies 24 rounds of Keccak-f[1600] in place to the 50 uint64s (400 bytes)
// at state, which hold two interleaved states. rc points to the 24 round
// constants; one is consumed per round.
TEXT ·f1600x2ARM(SB), NOSPLIT, $0-16
// R0 = state (advanced by the post-incrementing loads), R1 = rc,
// R2 = copy of state kept for the final stores, R3 = rounds left.
MOVD state+0(FP), R0
MOVD rc+8(FP), R1
MOVD R0, R2
MOVD $24, R3

// Load the whole state, 6 x 64 + 16 = 400 bytes, into V0..V24. Each register
// holds 16 bytes, i.e. two consecutive uint64s of the interleaved buffer.
VLD1.P 64(R0), [ V0.B16, V1.B16, V2.B16, V3.B16]
VLD1.P 64(R0), [ V4.B16, V5.B16, V6.B16, V7.B16]
VLD1.P 64(R0), [ V8.B16, V9.B16, V10.B16, V11.B16]
VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
VLD1.P (R0), [V24.B16]

loop:
// Execute theta but without xoring into the state yet.
// V25..V29 accumulate the XOR of the five registers of each column
// (V25 = V0^V5^V10^V15^V20, V26 = V1^V6^V11^V16^V21, and so on).
VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
VEOR3 V14.B16, V9.B16, V4.B16, V29.B16

VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
VEOR3 V24.B16, V19.B16, V29.B16, V29.B16

// Xor parities from step theta into the state at the same time as
// executing rho and pi.
// First combine pairs of column parities with VRAX1. The results land in
// V30, V29, V27, V25 and V28; each parity register is only overwritten after
// the instructions that still need its old value have read it.
VRAX1 V26.D2, V29.D2, V30.D2
VRAX1 V29.D2, V27.D2, V29.D2
VRAX1 V27.D2, V25.D2, V27.D2
VRAX1 V25.D2, V28.D2, V25.D2
VRAX1 V28.D2, V26.D2, V28.D2

// V0 is updated in place. V1 is saved in V31 because the first VXAR below
// overwrites V1 while the last VXAR of the chain still needs its old value.
VEOR V30.B16, V0.B16, V0.B16
VMOV V1.B16, V31.B16

// Each VXAR writes the register that the previous VXAR read, so the state
// registers are moved in place along one chain; V31 (the saved V1) closes it.
VXAR $20, V27.D2, V6.D2, V1.D2
VXAR $44, V25.D2, V9.D2, V6.D2
VXAR $3 , V28.D2, V22.D2, V9.D2
VXAR $25, V25.D2, V14.D2, V22.D2
VXAR $46, V30.D2, V20.D2, V14.D2
VXAR $2 , V28.D2, V2.D2, V20.D2
VXAR $21, V28.D2, V12.D2, V2.D2
VXAR $39, V29.D2, V13.D2, V12.D2
VXAR $56, V25.D2, V19.D2, V13.D2
VXAR $8 , V29.D2, V23.D2, V19.D2
VXAR $23, V30.D2, V15.D2, V23.D2
VXAR $37, V25.D2, V4.D2, V15.D2
VXAR $50, V25.D2, V24.D2, V4.D2
VXAR $62, V27.D2, V21.D2, V24.D2
VXAR $9 , V29.D2, V8.D2, V21.D2
VXAR $19, V27.D2, V16.D2, V8.D2
VXAR $28, V30.D2, V5.D2, V16.D2
VXAR $36, V29.D2, V3.D2, V5.D2
VXAR $43, V29.D2, V18.D2, V3.D2
VXAR $49, V28.D2, V17.D2, V18.D2
VXAR $54, V27.D2, V11.D2, V17.D2
VXAR $58, V28.D2, V7.D2, V11.D2
VXAR $61, V30.D2, V10.D2, V7.D2
VXAR $63, V27.D2, V31.D2, V10.D2

// Chi
// Processed in five groups of five registers. In each group the first two
// results go to the temporaries V25/V26 and are copied back at the end,
// because the original first two registers are still read by the last two
// VBCAXs of the group.
VBCAX V1.B16, V2.B16, V0.B16, V25.B16
VBCAX V2.B16, V3.B16, V1.B16, V26.B16
VBCAX V3.B16, V4.B16, V2.B16, V2.B16
VBCAX V4.B16, V0.B16, V3.B16, V3.B16
VBCAX V0.B16, V1.B16, V4.B16, V4.B16
VMOV V25.B16, V0.B16
VMOV V26.B16, V1.B16

VBCAX V6.B16, V7.B16, V5.B16, V25.B16
VBCAX V7.B16, V8.B16, V6.B16, V26.B16
VBCAX V8.B16, V9.B16, V7.B16, V7.B16
VBCAX V9.B16, V5.B16, V8.B16, V8.B16
VBCAX V5.B16, V6.B16, V9.B16, V9.B16
VMOV V25.B16, V5.B16
VMOV V26.B16, V6.B16

VBCAX V11.B16, V12.B16, V10.B16, V25.B16
VBCAX V12.B16, V13.B16, V11.B16, V26.B16
VBCAX V13.B16, V14.B16, V12.B16, V12.B16
VBCAX V14.B16, V10.B16, V13.B16, V13.B16
VBCAX V10.B16, V11.B16, V14.B16, V14.B16
VMOV V25.B16, V10.B16
VMOV V26.B16, V11.B16

VBCAX V16.B16, V17.B16, V15.B16, V25.B16
VBCAX V17.B16, V18.B16, V16.B16, V26.B16
VBCAX V18.B16, V19.B16, V17.B16, V17.B16
VBCAX V19.B16, V15.B16, V18.B16, V18.B16
VBCAX V15.B16, V16.B16, V19.B16, V19.B16
VMOV V25.B16, V15.B16
VMOV V26.B16, V16.B16

VBCAX V21.B16, V22.B16, V20.B16, V25.B16
VBCAX V22.B16, V23.B16, V21.B16, V26.B16
VBCAX V23.B16, V24.B16, V22.B16, V22.B16
VBCAX V24.B16, V20.B16, V23.B16, V23.B16
VBCAX V20.B16, V21.B16, V24.B16, V24.B16
VMOV V25.B16, V20.B16
VMOV V26.B16, V21.B16

// Iota
// Load the next 8-byte round constant replicated into both 64-bit lanes of
// V25, advance R1 by 8, and xor the constant into V0.
VLD1R.P 8(R1), [V25.D2]
VEOR V25.B16, V0.B16, V0.B16

// Decrement the round counter and repeat until it reaches zero.
SUBS $1, R3, R3
CBNZ R3, loop

// Restore the original state pointer and write V0..V24 back over the input.
MOVD R2, R0

VST1.P [ V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
VST1.P [ V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
VST1.P [ V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
VST1.P [V24.B16], (R0)

RET
84 changes: 0 additions & 84 deletions simd/keccakf1600/f1600x4.go

This file was deleted.

4 changes: 3 additions & 1 deletion simd/keccakf1600/f1600x4_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ package keccakf1600

import "github.com/cloudflare/circl/internal/sha3"

// permuteSIMD runs f1600x4AVX2 on the buffer starting at state[0], passing
// the round constants in sha3.RC.
func permuteSIMD(state []uint64) {
	first := &state[0]
	f1600x4AVX2(first, &sha3.RC)
}
// permuteSIMDx4 runs f1600x4AVX2 on the buffer starting at state[0], passing
// the round constants in sha3.RC.
func permuteSIMDx4(state []uint64) {
	first := &state[0]
	f1600x4AVX2(first, &sha3.RC)
}

// permuteSIMDx2 has no SIMD implementation in this file; it forwards to the
// generic permuteScalarX2.
func permuteSIMDx2(state []uint64) {
	permuteScalarX2(state)
}
Loading