From 8d7c43b51722f5841f8c0907797f24d4d71dae19 Mon Sep 17 00:00:00 2001
From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Date: Tue, 29 Apr 2025 17:15:45 +0800
Subject: [PATCH] crypto/sha256: improve performance of loong64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Replaced WORD with instruction REVB2W.
2. Simplified the implementation of Ch and Maj by reducing instructions, refer to the implementation of riscv64.

goos: linux
goarch: loong64
pkg: crypto/sha256
cpu: Loongson-3A6000-HV @ 2500.00MHz
                  |  bench.old  |             bench.new              |
                  |   sec/op    |   sec/op     vs base               |
Hash8Bytes/New      313.9n ± 0%   293.4n ± 0%  -6.53% (p=0.000 n=10)
Hash8Bytes/Sum224   324.0n ± 0%   304.2n ± 0%  -6.11% (p=0.000 n=10)
Hash8Bytes/Sum256   322.8n ± 0%   301.8n ± 0%  -6.51% (p=0.000 n=10)
Hash1K/New          4.513µ ± 0%   4.183µ ± 0%  -7.31% (p=0.000 n=10)
Hash1K/Sum224       4.522µ ± 0%   4.189µ ± 0%  -7.36% (p=0.000 n=10)
Hash1K/Sum256       4.522µ ± 0%   4.190µ ± 0%  -7.34% (p=0.000 n=10)
Hash8K/New          33.92µ ± 0%   31.42µ ± 0%  -7.38% (p=0.000 n=10)
Hash8K/Sum224       33.94µ ± 0%   31.42µ ± 0%  -7.40% (p=0.000 n=10)
Hash8K/Sum256       33.94µ ± 0%   31.42µ ± 0%  -7.41% (p=0.000 n=10)
geomean             3.662µ        3.404µ       -7.04%

goos: linux
goarch: loong64
pkg: crypto/sha256
cpu: Loongson-3A5000 @ 2500.00MHz
                  |  bench.old  |             bench.new              |
                  |   sec/op    |   sec/op     vs base               |
Hash8Bytes/New      382.2n ± 0%   357.3n ± 0%  -6.51% (p=0.000 n=10)
Hash8Bytes/Sum224   392.3n ± 0%   367.0n ± 0%  -6.45% (p=0.000 n=10)
Hash8Bytes/Sum256   393.9n ± 0%   368.8n ± 0%  -6.37% (p=0.000 n=10)
Hash1K/New          5.173µ ± 0%   4.725µ ± 0%  -8.66% (p=0.000 n=10)
Hash1K/Sum224       5.189µ ± 0%   4.742µ ± 0%  -8.62% (p=0.000 n=10)
Hash1K/Sum256       5.188µ ± 0%   4.742µ ± 0%  -8.60% (p=0.000 n=10)
Hash8K/New          38.75µ ± 0%   35.34µ ± 0%  -8.78% (p=0.000 n=10)
Hash8K/Sum224       38.77µ ± 0%   35.35µ ± 0%  -8.80% (p=0.000 n=10)
Hash8K/Sum256       38.76µ ± 0%   35.35µ ± 0%  -8.80% (p=0.000 n=10)
geomean             4.277µ        3.936µ       -7.96%

Change-Id: I561f6db118d05fe44485af8ea25df85afa6905a3
Reviewed-on: https://go-review.googlesource.com/c/go/+/668775
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
---
 .../fips140/sha256/sha256block_loong64.s      | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/src/crypto/internal/fips140/sha256/sha256block_loong64.s b/src/crypto/internal/fips140/sha256/sha256block_loong64.s
index 971ad97ab82..e171d93e0ba 100644
--- a/src/crypto/internal/fips140/sha256/sha256block_loong64.s
+++ b/src/crypto/internal/fips140/sha256/sha256block_loong64.s
@@ -56,7 +56,7 @@
 // W[i] = M[i]; for 0 <= i <= 15
 #define LOAD0(index) \
 	MOVW	(index*4)(R5), REGTMP4; \
-	WORD	$0x38e7; \	// REVB2W REGTMP4, REGTMP4 to big-endian
+	REVB2W	REGTMP4, REGTMP4; \
 	MOVW	REGTMP4, (index*4)(R3)
 
 // W[i] = SIGMA1(W[i-2]) + W[i-7] + SIGMA0(W[i-15]) + W[i-16]; for 16 <= i <= 63
@@ -87,38 +87,37 @@
 // T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + K[i] + W[i]
 // BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
 // Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
+//             = ((y XOR z) AND x) XOR z
 // Calculate T1 in REGTMP4
 #define SHA256T1(const, e, f, g, h) \
 	ADDV	$const, h; \
 	ADD	REGTMP4, h; \
-	ROTR	$6, e, REGTMP4; \
+	ROTR	$6, e, REGTMP5; \
 	ROTR	$11, e, REGTMP; \
 	ROTR	$25, e, REGTMP3; \
-	AND	f, e, REGTMP2; \
-	XOR	REGTMP, REGTMP4; \
-	MOVV	$0xffffffff, REGTMP; \
-	XOR	REGTMP4, REGTMP3; \
-	XOR	REGTMP, e, REGTMP5; \
+	XOR	f, g, REGTMP2; \
+	XOR	REGTMP, REGTMP5; \
+	AND	e, REGTMP2; \
+	XOR	REGTMP5, REGTMP3; \
+	XOR	g, REGTMP2; \
 	ADD	REGTMP3, h; \
-	AND	g, REGTMP5; \
-	XOR	REGTMP2, REGTMP5; \
-	ADD	h, REGTMP5, REGTMP4
+	ADD	h, REGTMP2, REGTMP4
 
 // T2 = BIGSIGMA0(a) + Maj(a, b, c)
 // BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
 // Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
+//              = ((y XOR z) AND x) XOR (y AND z)
 // Calculate T2 in REGTMP1
 #define SHA256T2(a, b, c) \
 	ROTR	$2, a, REGTMP5; \
-	AND	b, c, REGTMP1; \
 	ROTR	$13, a, REGTMP3; \
-	AND	c, a, REGTMP; \
-	XOR	REGTMP3, REGTMP5; \
-	XOR	REGTMP, REGTMP1; \
 	ROTR	$22, a, REGTMP2; \
-	AND	a, b, REGTMP3; \
+	XOR	b, c, REGTMP; \
+	AND	b, c, REGTMP1; \
+	XOR	REGTMP3, REGTMP5; \
+	AND	REGTMP, a, REGTMP; \
 	XOR	REGTMP2, REGTMP5; \
-	XOR	REGTMP3, REGTMP1; \
+	XOR	REGTMP, REGTMP1; \
 	ADD	REGTMP5, REGTMP1
 
 // Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
-- 
GitLab