From 64a5d1d7de14ad74b1a77614de0e17c659ef12b6 Mon Sep 17 00:00:00 2001
From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Date: Mon, 3 Jun 2024 17:54:18 +0800
Subject: [PATCH] crypto/sha1: implement sha1block in hardware on loong64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

goos: linux
goarch: loong64
pkg: crypto/sha1
cpu: Loongson-3A6000 @ 2500.00MHz
                 │  bench.old   │              bench.new              │
                 │    sec/op    │   sec/op     vs base                │
Hash8Bytes/New      489.8n ± 0%   280.6n ± 0%  -42.71% (p=0.000 n=20)
Hash8Bytes/Sum      496.6n ± 0%   288.9n ± 0%  -41.82% (p=0.000 n=20)
Hash320Bytes/New   2251.0n ± 0%   992.0n ± 0%  -55.93% (p=0.000 n=20)
Hash320Bytes/Sum   2258.0n ± 0%   998.0n ± 0%  -55.80% (p=0.000 n=20)
Hash1K/New          6.113µ ± 0%   2.583µ ± 0%  -57.75% (p=0.000 n=20)
Hash1K/Sum          6.117µ ± 0%   2.588µ ± 0%  -57.69% (p=0.000 n=20)
Hash8K/New          45.42µ ± 0%   18.79µ ± 0%  -58.63% (p=0.000 n=20)
Hash8K/Sum          45.43µ ± 0%   18.80µ ± 0%  -58.62% (p=0.000 n=20)
geomean             4.192µ        1.926µ       -54.05%

                 │  bench.old   │               bench.new               │
                 │     B/s      │     B/s       vs base                 │
Hash8Bytes/New     15.57Mi ± 0%   27.19Mi ± 0%   +74.59% (p=0.000 n=20)
Hash8Bytes/Sum     15.36Mi ± 0%   26.41Mi ± 0%   +71.88% (p=0.000 n=20)
Hash320Bytes/New   135.6Mi ± 0%   307.6Mi ± 0%  +126.90% (p=0.000 n=20)
Hash320Bytes/Sum   135.2Mi ± 0%   305.8Mi ± 0%  +126.22% (p=0.000 n=20)
Hash1K/New         159.8Mi ± 0%   378.1Mi ± 0%  +136.69% (p=0.000 n=20)
Hash1K/Sum         159.7Mi ± 0%   377.4Mi ± 0%  +136.38% (p=0.000 n=20)
Hash8K/New         172.0Mi ± 0%   415.8Mi ± 0%  +141.75% (p=0.000 n=20)
Hash8K/Sum         172.0Mi ± 0%   415.6Mi ± 0%  +141.65% (p=0.000 n=20)
geomean            87.09Mi        189.5Mi       +117.64%

goos: linux
goarch: loong64
pkg: crypto/sha1
cpu: Loongson-3A5000 @ 2500.00MHz
                 │  bench.old  │              bench.new              │
                 │   sec/op    │   sec/op     vs base                │
Hash8Bytes/New     565.9n ± 0%   374.5n ± 1%  -33.82% (p=0.000 n=20)
Hash8Bytes/Sum     571.3n ± 0%   366.7n ± 1%  -35.81% (p=0.000 n=20)
Hash320Bytes/New   2.662µ ± 0%   1.201µ ± 0%  -54.88% (p=0.000 n=20)
Hash320Bytes/Sum   2.662µ ± 0%   1.194µ ± 0%  -55.15% (p=0.000 n=20)
Hash1K/New         7.171µ ± 0%   3.084µ ± 0%  -56.99% (p=0.000 n=20)
Hash1K/Sum         7.171µ ± 0%   3.076µ ± 0%  -57.11% (p=0.000 n=20)
Hash8K/New         53.10µ ± 0%   22.24µ ± 0%  -58.12% (p=0.000 n=20)
Hash8K/Sum         53.09µ ± 0%   22.23µ ± 0%  -58.12% (p=0.000 n=20)
geomean            4.900µ        2.348µ       -52.08%

                 │  bench.old   │               bench.new               │
                 │     B/s      │     B/s       vs base                 │
Hash8Bytes/New     13.48Mi ± 0%   20.38Mi ± 1%   +51.10% (p=0.000 n=20)
Hash8Bytes/Sum     13.35Mi ± 0%   20.80Mi ± 1%   +55.82% (p=0.000 n=20)
Hash320Bytes/New   114.6Mi ± 0%   254.0Mi ± 1%  +121.61% (p=0.000 n=20)
Hash320Bytes/Sum   114.6Mi ± 0%   255.6Mi ± 0%  +123.00% (p=0.000 n=20)
Hash1K/New         136.2Mi ± 0%   316.7Mi ± 0%  +132.54% (p=0.000 n=20)
Hash1K/Sum         136.2Mi ± 0%   317.5Mi ± 0%  +133.19% (p=0.000 n=20)
Hash8K/New         147.1Mi ± 0%   351.3Mi ± 0%  +138.79% (p=0.000 n=20)
Hash8K/Sum         147.2Mi ± 0%   351.4Mi ± 0%  +138.78% (p=0.000 n=20)
geomean            74.51Mi        155.5Mi       +108.69%

Change-Id: I716babd19c18dc2c3314d972ced9d83de2d93cb2
Reviewed-on: https://go-review.googlesource.com/c/go/+/589775
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
---
 src/crypto/sha1/sha1block_decl.go    |   2 +-
 src/crypto/sha1/sha1block_generic.go |   2 +-
 src/crypto/sha1/sha1block_loong64.s  | 226 +++++++++++++++++++++++++++
 3 files changed, 228 insertions(+), 2 deletions(-)
 create mode 100644 src/crypto/sha1/sha1block_loong64.s

diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go
index 3edf5a43606..46f41a1cc26 100644
--- a/src/crypto/sha1/sha1block_decl.go
+++ b/src/crypto/sha1/sha1block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (arm || 386 || s390x) && !purego
+//go:build (386 || arm || loong64 || s390x) && !purego
 
 package sha1
 
diff --git a/src/crypto/sha1/sha1block_generic.go b/src/crypto/sha1/sha1block_generic.go
index 4c6f74d99d8..5989a243476 100644
--- a/src/crypto/sha1/sha1block_generic.go
+++ b/src/crypto/sha1/sha1block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !386 && !arm && !s390x && !arm64) || purego
+//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !s390x) || purego
 
 package sha1
 
diff --git a/src/crypto/sha1/sha1block_loong64.s b/src/crypto/sha1/sha1block_loong64.s
new file mode 100644
index 00000000000..7e9d6e09339
--- /dev/null
+++ b/src/crypto/sha1/sha1block_loong64.s
@@ -0,0 +1,226 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !purego
+
+#include "textflag.h"
+
+// SHA-1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
+//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+#define REGTMP	R30	// scratch: w[i-3] in LOAD, b&c in FUNC3
+#define REGTMP1	R17	// scratch: w[i-8] in LOAD, then the round function f for MIX
+#define REGTMP2	R18	// scratch: w[i-14] in LOAD, (b|c)&d in FUNC3, rotated a in MIX
+#define REGTMP3	R19	// message word w[i] produced by LOAD1/LOAD and consumed by MIX
+
+#define LOAD1(index) \	// w[index] = 32-bit word number index of the input block at R5
+	MOVW	(index*4)(R5), REGTMP3; \	// read the word in memory byte order
+	WORD	$0x3a73; \	// REVB2W REGTMP3, REGTMP3   to big-endian
+	MOVW	REGTMP3, (index*4)(R3)	// save it in the schedule at R3; REGTMP3 stays live for MIX
+
+#define LOAD(index) \	// w[i&15] = rotl(w[i&15] ^ w[(i-3)&15] ^ w[(i-8)&15] ^ w[(i-14)&15], 1)
+	MOVW	(((index)&0xf)*4)(R3), REGTMP3; \	// slot i&15 still holds w[i-16]
+	MOVW	(((index-3)&0xf)*4)(R3), REGTMP; \	// w[i-3]
+	MOVW	(((index-8)&0xf)*4)(R3), REGTMP1; \	// w[i-8]
+	MOVW	(((index-14)&0xf)*4)(R3), REGTMP2; \	// w[i-14]
+	XOR	REGTMP, REGTMP3; \
+	XOR	REGTMP1, REGTMP3; \
+	XOR	REGTMP2, REGTMP3; \
+	ROTR	$31, REGTMP3; \	// rotate right by 31 == rotate left by 1
+	MOVW	REGTMP3, (((index)&0xf)*4)(R3)	// overwrite the oldest slot; REGTMP3 stays live for MIX
+
+// f = d ^ (b & (c ^ d)), left in REGTMP1: each bit is taken from c where b is 1, from d where b is 0.
+#define FUNC1(a, b, c, d, e) \
+	XOR	c, d, REGTMP1; \	// REGTMP1 = c ^ d
+	AND	b, REGTMP1; \	// REGTMP1 = b & (c ^ d)
+	XOR	d, REGTMP1	// REGTMP1 = d ^ (b & (c ^ d))
+
+// f = b ^ c ^ d, left in REGTMP1.
+#define FUNC2(a, b, c, d, e) \
+	XOR	b, c, REGTMP1; \	// REGTMP1 = b ^ c
+	XOR	d, REGTMP1	// REGTMP1 = b ^ c ^ d
+
+// f = (b & c) | ((b | c) & d), left in REGTMP1; clobbers REGTMP and REGTMP2.
+#define FUNC3(a, b, c, d, e) \
+	OR	b, c, REGTMP2; \	// REGTMP2 = b | c
+	AND	b, c, REGTMP; \	// REGTMP = b & c
+	AND	d, REGTMP2; \	// REGTMP2 = (b | c) & d
+	OR	REGTMP, REGTMP2, REGTMP1	// REGTMP1 = (b & c) | ((b | c) & d)
+
+#define FUNC4 FUNC2	// type 4 rounds reuse f = b ^ c ^ d; only the MIX constant differs
+
+#define MIX(a, b, c, d, e, const) \	// expects f in REGTMP1 and w[i] in REGTMP3; clobbers REGTMP2
+	ROTR	$2, b; \	// b = b rotated left by 30 (right by 2), in place
+	ADD	REGTMP1, e; \	// e = e + f
+	ROTR	$27, a, REGTMP2; \	// REGTMP2 = a rotated left by 5 (right by 27)
+	ADD	REGTMP3, e; \	// e = e + w[i]
+	ADDV	$const, e; \	// e = e + k
+	ADD	REGTMP2, e	// e = e + (a rotated left by 5)
+
+#define ROUND1(a, b, c, d, e, index) \	// rounds 0-15: w[index] is read from the input block
+	LOAD1(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \	// rounds 16-19: same f and constant as ROUND1, w[index] is derived by LOAD
+	LOAD(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \	// rounds 20-39: f = b ^ c ^ d
+	LOAD(index); \
+	FUNC2(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \	// rounds 40-59: f = (b & c) | ((b | c) & d)
+	LOAD(index); \
+	FUNC3(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \	// rounds 60-79: f = b ^ c ^ d again, with the last constant
+	LOAD(index); \
+	FUNC4(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0xCA62C1D6)
+
+// block hashes every whole 64-byte block of p into the five state words at dig.
+// The 64-byte frame holds the 16-word (16 x 4 bytes) message schedule w at R3:
+// LOAD1 fills it from the input and LOAD updates it in place as a circular
+// buffer. It mirrors the local variable w in the generic code (sha1block.go).
+TEXT ·block(SB),NOSPLIT,$64-32
+	MOVV	dig+0(FP),	R4	// R4 = dig; the five 32-bit state words live at offsets 0-16
+	MOVV	p_base+8(FP),	R5	// R5 = current input position
+	MOVV	p_len+16(FP),	R6
+	AND	$~63, R6	// round the length down to a multiple of 64
+	BEQ	R6, zero	// no whole block: return without touching dig
+
+	// p_len >= 64
+	ADDV    R5, R6, R24	// R24 = end of the last whole block
+	MOVW	(0*4)(R4), R7	// a
+	MOVW	(1*4)(R4), R8	// b
+	MOVW	(2*4)(R4), R9	// c
+	MOVW	(3*4)(R4), R10	// d
+	MOVW	(4*4)(R4), R11	// e
+
+loop:
+	MOVW	R7,	R12	// keep the incoming state in R12-R16 for the final addition
+	MOVW	R8,	R13
+	MOVW	R9,	R14
+	MOVW	R10,	R15
+	MOVW	R11,	R16
+
+	ROUND1(R7,  R8,  R9,  R10, R11, 0)
+	ROUND1(R11, R7,  R8,  R9,  R10, 1)
+	ROUND1(R10, R11, R7,  R8,  R9,  2)
+	ROUND1(R9,  R10, R11, R7,  R8,  3)
+	ROUND1(R8,  R9,  R10, R11, R7,  4)
+	ROUND1(R7,  R8,  R9,  R10, R11, 5)
+	ROUND1(R11, R7,  R8,  R9,  R10, 6)
+	ROUND1(R10, R11, R7,  R8,  R9,  7)
+	ROUND1(R9,  R10, R11, R7,  R8,  8)
+	ROUND1(R8,  R9,  R10, R11, R7,  9)
+	ROUND1(R7,  R8,  R9,  R10, R11, 10)
+	ROUND1(R11, R7,  R8,  R9,  R10, 11)
+	ROUND1(R10, R11, R7,  R8,  R9,  12)
+	ROUND1(R9,  R10, R11, R7,  R8,  13)
+	ROUND1(R8,  R9,  R10, R11, R7,  14)
+	ROUND1(R7,  R8,  R9,  R10, R11, 15)
+
+	ROUND1x(R11, R7,  R8,  R9,  R10, 16)
+	ROUND1x(R10, R11, R7,  R8,  R9,  17)
+	ROUND1x(R9,  R10, R11, R7,  R8,  18)
+	ROUND1x(R8,  R9,  R10, R11, R7,  19)
+
+	ROUND2(R7,  R8,  R9,  R10, R11, 20)
+	ROUND2(R11, R7,  R8,  R9,  R10, 21)
+	ROUND2(R10, R11, R7,  R8,  R9,  22)
+	ROUND2(R9,  R10, R11, R7,  R8,  23)
+	ROUND2(R8,  R9,  R10, R11, R7,  24)
+	ROUND2(R7,  R8,  R9,  R10, R11, 25)
+	ROUND2(R11, R7,  R8,  R9,  R10, 26)
+	ROUND2(R10, R11, R7,  R8,  R9,  27)
+	ROUND2(R9,  R10, R11, R7,  R8,  28)
+	ROUND2(R8,  R9,  R10, R11, R7,  29)
+	ROUND2(R7,  R8,  R9,  R10, R11, 30)
+	ROUND2(R11, R7,  R8,  R9,  R10, 31)
+	ROUND2(R10, R11, R7,  R8,  R9,  32)
+	ROUND2(R9,  R10, R11, R7,  R8,  33)
+	ROUND2(R8,  R9,  R10, R11, R7,  34)
+	ROUND2(R7,  R8,  R9,  R10, R11, 35)
+	ROUND2(R11, R7,  R8,  R9,  R10, 36)
+	ROUND2(R10, R11, R7,  R8,  R9,  37)
+	ROUND2(R9,  R10, R11, R7,  R8,  38)
+	ROUND2(R8,  R9,  R10, R11, R7,  39)
+
+	ROUND3(R7,  R8,  R9,  R10, R11, 40)
+	ROUND3(R11, R7,  R8,  R9,  R10, 41)
+	ROUND3(R10, R11, R7,  R8,  R9,  42)
+	ROUND3(R9,  R10, R11, R7,  R8,  43)
+	ROUND3(R8,  R9,  R10, R11, R7,  44)
+	ROUND3(R7,  R8,  R9,  R10, R11, 45)
+	ROUND3(R11, R7,  R8,  R9,  R10, 46)
+	ROUND3(R10, R11, R7,  R8,  R9,  47)
+	ROUND3(R9,  R10, R11, R7,  R8,  48)
+	ROUND3(R8,  R9,  R10, R11, R7,  49)
+	ROUND3(R7,  R8,  R9,  R10, R11, 50)
+	ROUND3(R11, R7,  R8,  R9,  R10, 51)
+	ROUND3(R10, R11, R7,  R8,  R9,  52)
+	ROUND3(R9,  R10, R11, R7,  R8,  53)
+	ROUND3(R8,  R9,  R10, R11, R7,  54)
+	ROUND3(R7,  R8,  R9,  R10, R11, 55)
+	ROUND3(R11, R7,  R8,  R9,  R10, 56)
+	ROUND3(R10, R11, R7,  R8,  R9,  57)
+	ROUND3(R9,  R10, R11, R7,  R8,  58)
+	ROUND3(R8,  R9,  R10, R11, R7,  59)
+
+	ROUND4(R7,  R8,  R9,  R10, R11, 60)
+	ROUND4(R11, R7,  R8,  R9,  R10, 61)
+	ROUND4(R10, R11, R7,  R8,  R9,  62)
+	ROUND4(R9,  R10, R11, R7,  R8,  63)
+	ROUND4(R8,  R9,  R10, R11, R7,  64)
+	ROUND4(R7,  R8,  R9,  R10, R11, 65)
+	ROUND4(R11, R7,  R8,  R9,  R10, 66)
+	ROUND4(R10, R11, R7,  R8,  R9,  67)
+	ROUND4(R9,  R10, R11, R7,  R8,  68)
+	ROUND4(R8,  R9,  R10, R11, R7,  69)
+	ROUND4(R7,  R8,  R9,  R10, R11, 70)
+	ROUND4(R11, R7,  R8,  R9,  R10, 71)
+	ROUND4(R10, R11, R7,  R8,  R9,  72)
+	ROUND4(R9,  R10, R11, R7,  R8,  73)
+	ROUND4(R8,  R9,  R10, R11, R7,  74)
+	ROUND4(R7,  R8,  R9,  R10, R11, 75)
+	ROUND4(R11, R7,  R8,  R9,  R10, 76)
+	ROUND4(R10, R11, R7,  R8,  R9,  77)
+	ROUND4(R9,  R10, R11, R7,  R8,  78)
+	ROUND4(R8,  R9,  R10, R11, R7,  79)
+
+	ADD	R12, R7	// add the state saved at the top of the loop back in
+	ADD	R13, R8
+	ADD	R14, R9
+	ADD	R15, R10
+	ADD	R16, R11
+
+	ADDV	$64, R5	// advance to the next 64-byte block
+	BNE	R5, R24, loop	// repeat until the end of the last whole block
+
+end:
+	MOVW	R7, (0*4)(R4)	// write the updated state back to dig
+	MOVW	R8, (1*4)(R4)
+	MOVW	R9, (2*4)(R4)
+	MOVW	R10, (3*4)(R4)
+	MOVW	R11, (4*4)(R4)
+zero:
+	RET
-- 
GitLab