From 7b8a7f8272fd1941a199af1adb334bd9996e8909 Mon Sep 17 00:00:00 2001
From: fanzha02 <fannie.zhang@arm.com>
Date: Fri, 28 Jul 2017 04:14:45 +0000
Subject: [PATCH] crypto/sha256: optimize arm64 sha256 implemention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Optimize with ARMv8 SHA256 instructions.
Result (Cortex-A72)

name           old time/op    new time/op      delta
Hash8Bytes-64    1.54µs ± 1%      0.61µs ± 9%    -60.67%  (p=0.008 n=5+5)
Hash1K-64        17.2µs ± 1%       1.4µs ± 2%    -91.91%  (p=0.008 n=5+5)
Hash8K-64         127µs ± 0%         7µs ± 1%    -94.42%  (p=0.008 n=5+5)

name           old speed      new speed        delta
Hash8Bytes-64  5.20MB/s ± 1%   13.23MB/s ±10%   +154.58%  (p=0.008 n=5+5)
Hash1K-64      59.4MB/s ± 1%   735.1MB/s ± 2%  +1136.96%  (p=0.008 n=5+5)
Hash8K-64      64.5MB/s ± 0%  1156.3MB/s ± 1%  +1692.75%  (p=0.008 n=5+5)

Change-Id: I47eca6471b75cd07cb0c77477053a07d0de7494f
Reviewed-on: https://go-review.googlesource.com/61570
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/crypto/sha256/sha256block_arm64.go   |  22 +++++
 src/crypto/sha256/sha256block_arm64.s    | 119 +++++++++++++++++++++++
 src/crypto/sha256/sha256block_generic.go |   2 +-
 3 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 src/crypto/sha256/sha256block_arm64.go
 create mode 100644 src/crypto/sha256/sha256block_arm64.s

diff --git a/src/crypto/sha256/sha256block_arm64.go b/src/crypto/sha256/sha256block_arm64.go
new file mode 100644
index 00000000000..48c436baf34
--- /dev/null
+++ b/src/crypto/sha256/sha256block_arm64.go
@@ -0,0 +1,22 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha256
+
+import "internal/cpu"
+
+var k = _K
+
+var hasSHA2 = cpu.ARM64.HasSHA2
+
+func sha256block(h []uint32, p []byte, k []uint32)
+
+func block(dig *digest, p []byte) {
+	if !hasSHA2 {
+		blockGeneric(dig, p)
+	} else {
+		h := dig.h[:]
+		sha256block(h, p, k)
+	}
+}
diff --git a/src/crypto/sha256/sha256block_arm64.s b/src/crypto/sha256/sha256block_arm64.s
new file mode 100644
index 00000000000..a88bfa1bcbe
--- /dev/null
+++ b/src/crypto/sha256/sha256block_arm64.s
@@ -0,0 +1,119 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define HASHUPDATE \
+	SHA256H	V9.S4, V3, V2 \
+	SHA256H2	V9.S4, V8, V3 \
+	VMOV	V2.B16, V8.B16
+
+// func sha256block(h []uint32, p []byte, k []uint32)
+TEXT ·sha256block(SB),NOSPLIT,$0
+	MOVD	h_base+0(FP), R0                           // Hash value fisrt address
+	MOVD	p_base+24(FP), R1                          // message first address
+	MOVD	k_base+48(FP), R2                          // k constants first address
+	MOVD	p_len+32(FP), R3                           // message length
+	VLD1	(R0), [V0.S4, V1.S4]                       // load h(a,b,c,d,e,f,g,h)
+	VLD1.P	64(R2), [V16.S4, V17.S4, V18.S4, V19.S4]
+	VLD1.P	64(R2), [V20.S4, V21.S4, V22.S4, V23.S4]
+	VLD1.P	64(R2), [V24.S4, V25.S4, V26.S4, V27.S4]
+	VLD1	(R2), [V28.S4, V29.S4, V30.S4, V31.S4]     //load 64*4bytes K constant(K0-K63)
+
+blockloop:
+
+	VLD1.P	16(R1), [V4.B16]                            // load 16bytes message
+	VLD1.P	16(R1), [V5.B16]                            // load 16bytes message
+	VLD1.P	16(R1), [V6.B16]                            // load 16bytes message
+	VLD1.P	16(R1), [V7.B16]                            // load 16bytes message
+	VMOV	V0.B16, V2.B16                              // backup: VO h(dcba)
+	VMOV	V1.B16, V3.B16                              // backup: V1 h(hgfe)
+	VMOV	V2.B16, V8.B16
+	VREV32	V4.B16, V4.B16                              // prepare for using message in Byte format
+	VREV32	V5.B16, V5.B16
+	VREV32	V6.B16, V6.B16
+	VREV32	V7.B16, V7.B16
+
+	VADD	V16.S4, V4.S4, V9.S4                        // V18(W0+K0...W3+K3)
+	SHA256SU0	V5.S4, V4.S4                        // V4: (su0(W1)+W0,...,su0(W4)+W3)
+	HASHUPDATE                                          // H4
+
+	VADD	V17.S4, V5.S4, V9.S4                        // V18(W4+K4...W7+K7)
+	SHA256SU0	V6.S4, V5.S4                        // V5: (su0(W5)+W4,...,su0(W8)+W7)
+	SHA256SU1	V7.S4, V6.S4, V4.S4                 // V4: W16-W19
+	HASHUPDATE                                          // H8
+
+	VADD	V18.S4, V6.S4, V9.S4                        // V18(W8+K8...W11+K11)
+	SHA256SU0	V7.S4, V6.S4                        // V6: (su0(W9)+W8,...,su0(W12)+W11)
+	SHA256SU1	V4.S4, V7.S4, V5.S4                 // V5: W20-W23
+	HASHUPDATE                                          // H12
+
+	VADD	V19.S4, V7.S4, V9.S4                        // V18(W12+K12...W15+K15)
+	SHA256SU0	V4.S4, V7.S4                        // V7: (su0(W13)+W12,...,su0(W16)+W15)
+	SHA256SU1	V5.S4, V4.S4, V6.S4                 // V6: W24-W27
+	HASHUPDATE                                          // H16
+
+	VADD	V20.S4, V4.S4, V9.S4                        // V18(W16+K16...W19+K19)
+	SHA256SU0	V5.S4, V4.S4                        // V4: (su0(W17)+W16,...,su0(W20)+W19)
+	SHA256SU1	V6.S4, V5.S4, V7.S4                 // V7: W28-W31
+	HASHUPDATE                                          // H20
+
+	VADD	V21.S4, V5.S4, V9.S4                        // V18(W20+K20...W23+K23)
+	SHA256SU0	V6.S4, V5.S4                        // V5: (su0(W21)+W20,...,su0(W24)+W23)
+	SHA256SU1	V7.S4, V6.S4, V4.S4                 // V4: W32-W35
+	HASHUPDATE                                          // H24
+
+	VADD	V22.S4, V6.S4, V9.S4                        // V18(W24+K24...W27+K27)
+	SHA256SU0	V7.S4, V6.S4                        // V6: (su0(W25)+W24,...,su0(W28)+W27)
+	SHA256SU1	V4.S4, V7.S4, V5.S4                 // V5: W36-W39
+	HASHUPDATE                                          // H28
+
+	VADD	V23.S4, V7.S4, V9.S4                        // V18(W28+K28...W31+K31)
+	SHA256SU0	V4.S4, V7.S4                        // V7: (su0(W29)+W28,...,su0(W32)+W31)
+	SHA256SU1	V5.S4, V4.S4, V6.S4                 // V6: W40-W43
+	HASHUPDATE                                          // H32
+
+	VADD	V24.S4, V4.S4, V9.S4                        // V18(W32+K32...W35+K35)
+	SHA256SU0	V5.S4, V4.S4                        // V4: (su0(W33)+W32,...,su0(W36)+W35)
+	SHA256SU1	V6.S4, V5.S4, V7.S4                 // V7: W44-W47
+	HASHUPDATE                                          // H36
+
+	VADD	V25.S4, V5.S4, V9.S4                        // V18(W36+K36...W39+K39)
+	SHA256SU0	V6.S4, V5.S4                        // V5: (su0(W37)+W36,...,su0(W40)+W39)
+	SHA256SU1	V7.S4, V6.S4, V4.S4                 // V4: W48-W51
+	HASHUPDATE                                          // H40
+
+	VADD	V26.S4, V6.S4, V9.S4                        // V18(W40+K40...W43+K43)
+	SHA256SU0	V7.S4, V6.S4                        // V6: (su0(W41)+W40,...,su0(W44)+W43)
+	SHA256SU1	V4.S4, V7.S4, V5.S4                 // V5: W52-W55
+	HASHUPDATE                                          // H44
+
+	VADD	V27.S4, V7.S4, V9.S4                        // V18(W44+K44...W47+K47)
+	SHA256SU0	V4.S4, V7.S4                        // V7: (su0(W45)+W44,...,su0(W48)+W47)
+	SHA256SU1	V5.S4, V4.S4, V6.S4                 // V6: W56-W59
+	HASHUPDATE                                          // H48
+
+	VADD	V28.S4, V4.S4, V9.S4                        // V18(W48+K48,...,W51+K51)
+	HASHUPDATE                                          // H52
+	SHA256SU1	V6.S4, V5.S4, V7.S4                 // V7: W60-W63
+
+	VADD	V29.S4, V5.S4, V9.S4                        // V18(W52+K52,...,W55+K55)
+	HASHUPDATE                                          // H56
+
+	VADD	V30.S4, V6.S4, V9.S4                        // V18(W59+K59,...,W59+K59)
+	HASHUPDATE                                          // H60
+
+	VADD	V31.S4, V7.S4, V9.S4                        // V18(W60+K60,...,W63+K63)
+	HASHUPDATE                                          // H64
+
+	SUB	$64, R3, R3                                 // message length - 64bytes, then compare with 64bytes
+	VADD	V2.S4, V0.S4, V0.S4
+	VADD	V3.S4, V1.S4, V1.S4
+	CBNZ	R3, blockloop
+
+sha256ret:
+
+	VST1	[V0.S4, V1.S4], (R0)                       // store hash value H
+	RET
+
diff --git a/src/crypto/sha256/sha256block_generic.go b/src/crypto/sha256/sha256block_generic.go
index a182a5eacfe..61362f41260 100644
--- a/src/crypto/sha256/sha256block_generic.go
+++ b/src/crypto/sha256/sha256block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!386,!s390x,!ppc64le
+// +build !amd64,!386,!s390x,!ppc64le,!arm64
 
 package sha256
 
-- 
GitLab