From 7b8a7f8272fd1941a199af1adb334bd9996e8909 Mon Sep 17 00:00:00 2001 From: fanzha02 <fannie.zhang@arm.com> Date: Fri, 28 Jul 2017 04:14:45 +0000 Subject: [PATCH] crypto/sha256: optimize arm64 sha256 implemention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimize with ARMv8 SHA256 instructions. Result (Cortex-A72) name old time/op new time/op delta Hash8Bytes-64 1.54µs ± 1% 0.61µs ± 9% -60.67% (p=0.008 n=5+5) Hash1K-64 17.2µs ± 1% 1.4µs ± 2% -91.91% (p=0.008 n=5+5) Hash8K-64 127µs ± 0% 7µs ± 1% -94.42% (p=0.008 n=5+5) name old speed new speed delta Hash8Bytes-64 5.20MB/s ± 1% 13.23MB/s ±10% +154.58% (p=0.008 n=5+5) Hash1K-64 59.4MB/s ± 1% 735.1MB/s ± 2% +1136.96% (p=0.008 n=5+5) Hash8K-64 64.5MB/s ± 0% 1156.3MB/s ± 1% +1692.75% (p=0.008 n=5+5) Change-Id: I47eca6471b75cd07cb0c77477053a07d0de7494f Reviewed-on: https://go-review.googlesource.com/61570 Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org> Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> --- src/crypto/sha256/sha256block_arm64.go | 22 +++++ src/crypto/sha256/sha256block_arm64.s | 119 +++++++++++++++++++++++ src/crypto/sha256/sha256block_generic.go | 2 +- 3 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 src/crypto/sha256/sha256block_arm64.go create mode 100644 src/crypto/sha256/sha256block_arm64.s diff --git a/src/crypto/sha256/sha256block_arm64.go b/src/crypto/sha256/sha256block_arm64.go new file mode 100644 index 00000000000..48c436baf34 --- /dev/null +++ b/src/crypto/sha256/sha256block_arm64.go @@ -0,0 +1,22 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sha256 + +import "internal/cpu" + +var k = _K + +var hasSHA2 = cpu.ARM64.HasSHA2 + +func sha256block(h []uint32, p []byte, k []uint32) + +func block(dig *digest, p []byte) { + if !hasSHA2 { + blockGeneric(dig, p) + } else { + h := dig.h[:] + sha256block(h, p, k) + } +} diff --git a/src/crypto/sha256/sha256block_arm64.s b/src/crypto/sha256/sha256block_arm64.s new file mode 100644 index 00000000000..a88bfa1bcbe --- /dev/null +++ b/src/crypto/sha256/sha256block_arm64.s @@ -0,0 +1,119 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define HASHUPDATE \ + SHA256H V9.S4, V3, V2 \ + SHA256H2 V9.S4, V8, V3 \ + VMOV V2.B16, V8.B16 + +// func sha256block(h []uint32, p []byte, k []uint32) +TEXT ·sha256block(SB),NOSPLIT,$0 + MOVD h_base+0(FP), R0 // Hash value fisrt address + MOVD p_base+24(FP), R1 // message first address + MOVD k_base+48(FP), R2 // k constants first address + MOVD p_len+32(FP), R3 // message length + VLD1 (R0), [V0.S4, V1.S4] // load h(a,b,c,d,e,f,g,h) + VLD1.P 64(R2), [V16.S4, V17.S4, V18.S4, V19.S4] + VLD1.P 64(R2), [V20.S4, V21.S4, V22.S4, V23.S4] + VLD1.P 64(R2), [V24.S4, V25.S4, V26.S4, V27.S4] + VLD1 (R2), [V28.S4, V29.S4, V30.S4, V31.S4] //load 64*4bytes K constant(K0-K63) + +blockloop: + + VLD1.P 16(R1), [V4.B16] // load 16bytes message + VLD1.P 16(R1), [V5.B16] // load 16bytes message + VLD1.P 16(R1), [V6.B16] // load 16bytes message + VLD1.P 16(R1), [V7.B16] // load 16bytes message + VMOV V0.B16, V2.B16 // backup: VO h(dcba) + VMOV V1.B16, V3.B16 // backup: V1 h(hgfe) + VMOV V2.B16, V8.B16 + VREV32 V4.B16, V4.B16 // prepare for using message in Byte format + VREV32 V5.B16, V5.B16 + VREV32 V6.B16, V6.B16 + VREV32 V7.B16, V7.B16 + + VADD V16.S4, V4.S4, V9.S4 // V18(W0+K0...W3+K3) + SHA256SU0 V5.S4, V4.S4 // V4: (su0(W1)+W0,...,su0(W4)+W3) + HASHUPDATE // H4 + + VADD V17.S4, V5.S4, V9.S4 // V18(W4+K4...W7+K7) + SHA256SU0 V6.S4, V5.S4 // V5: (su0(W5)+W4,...,su0(W8)+W7) + SHA256SU1 V7.S4, V6.S4, V4.S4 // V4: W16-W19 + HASHUPDATE // H8 + + VADD V18.S4, V6.S4, V9.S4 // V18(W8+K8...W11+K11) + SHA256SU0 V7.S4, V6.S4 // V6: (su0(W9)+W8,...,su0(W12)+W11) + SHA256SU1 V4.S4, V7.S4, V5.S4 // V5: W20-W23 + HASHUPDATE // H12 + + VADD V19.S4, V7.S4, V9.S4 // V18(W12+K12...W15+K15) + SHA256SU0 V4.S4, V7.S4 // V7: (su0(W13)+W12,...,su0(W16)+W15) + SHA256SU1 V5.S4, V4.S4, V6.S4 // V6: W24-W27 + HASHUPDATE // H16 + + VADD V20.S4, V4.S4, V9.S4 // V18(W16+K16...W19+K19) + SHA256SU0 V5.S4, V4.S4 // V4: (su0(W17)+W16,...,su0(W20)+W19) + SHA256SU1 V6.S4, V5.S4, V7.S4 // V7: W28-W31 + HASHUPDATE // H20 + + VADD V21.S4, V5.S4, V9.S4 // V18(W20+K20...W23+K23) + SHA256SU0 V6.S4, V5.S4 // V5: (su0(W21)+W20,...,su0(W24)+W23) + SHA256SU1 V7.S4, V6.S4, V4.S4 // V4: W32-W35 + HASHUPDATE // H24 + + VADD V22.S4, V6.S4, V9.S4 // V18(W24+K24...W27+K27) + SHA256SU0 V7.S4, V6.S4 // V6: (su0(W25)+W24,...,su0(W28)+W27) + SHA256SU1 V4.S4, V7.S4, V5.S4 // V5: W36-W39 + HASHUPDATE // H28 + + VADD V23.S4, V7.S4, V9.S4 // V18(W28+K28...W31+K31) + SHA256SU0 V4.S4, V7.S4 // V7: (su0(W29)+W28,...,su0(W32)+W31) + SHA256SU1 V5.S4, V4.S4, V6.S4 // V6: W40-W43 + HASHUPDATE // H32 + + VADD V24.S4, V4.S4, V9.S4 // V18(W32+K32...W35+K35) + SHA256SU0 V5.S4, V4.S4 // V4: (su0(W33)+W32,...,su0(W36)+W35) + SHA256SU1 V6.S4, V5.S4, V7.S4 // V7: W44-W47 + HASHUPDATE // H36 + + VADD V25.S4, V5.S4, V9.S4 // V18(W36+K36...W39+K39) + SHA256SU0 V6.S4, V5.S4 // V5: (su0(W37)+W36,...,su0(W40)+W39) + SHA256SU1 V7.S4, V6.S4, V4.S4 // V4: W48-W51 + HASHUPDATE // H40 + + VADD V26.S4, V6.S4, V9.S4 // V18(W40+K40...W43+K43) + SHA256SU0 V7.S4, V6.S4 // V6: (su0(W41)+W40,...,su0(W44)+W43) + SHA256SU1 V4.S4, V7.S4, V5.S4 // V5: W52-W55 + HASHUPDATE // H44 + + VADD V27.S4, V7.S4, V9.S4 // V18(W44+K44...W47+K47) + SHA256SU0 V4.S4, V7.S4 // V7: (su0(W45)+W44,...,su0(W48)+W47) + SHA256SU1 V5.S4, V4.S4, V6.S4 // V6: W56-W59 + HASHUPDATE // H48 + + VADD V28.S4, V4.S4, V9.S4 // V18(W48+K48,...,W51+K51) + HASHUPDATE // H52 + SHA256SU1 V6.S4, V5.S4, V7.S4 // V7: W60-W63 + + VADD V29.S4, V5.S4, V9.S4 // V18(W52+K52,...,W55+K55) + HASHUPDATE // H56 + + VADD V30.S4, V6.S4, V9.S4 // V18(W59+K59,...,W59+K59) + HASHUPDATE // H60 + + VADD V31.S4, V7.S4, V9.S4 // V18(W60+K60,...,W63+K63) + HASHUPDATE // H64 + + SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes + VADD V2.S4, V0.S4, V0.S4 + VADD V3.S4, V1.S4, V1.S4 + CBNZ R3, blockloop + +sha256ret: + + VST1 [V0.S4, V1.S4], (R0) // store hash value H + RET + diff --git a/src/crypto/sha256/sha256block_generic.go b/src/crypto/sha256/sha256block_generic.go index a182a5eacfe..61362f41260 100644 --- a/src/crypto/sha256/sha256block_generic.go +++ b/src/crypto/sha256/sha256block_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!386,!s390x,!ppc64le +// +build !amd64,!386,!s390x,!ppc64le,!arm64 package sha256 -- GitLab