From 6edc1c23ed078386bfbf7978f6cb5891cc2aa241 Mon Sep 17 00:00:00 2001
From: Xiaolin Zhao <zhaoxiaolin@loongson.cn>
Date: Mon, 3 Jun 2024 17:04:53 +0800
Subject: [PATCH] crypto/md5: implement md5block in assembly on loong64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

goos: linux
goarch: loong64
pkg: crypto/md5
cpu: Loongson-3A6000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
Hash8Bytes             276.6n ± 0%   219.7n ± 0%  -20.57% (p=0.000 n=20)
Hash64                 445.8n ± 0%   339.9n ± 0%  -23.76% (p=0.000 n=20)
Hash128                632.0n ± 0%   468.1n ± 0%  -25.93% (p=0.000 n=20)
Hash256               1005.0n ± 0%   723.8n ± 0%  -27.98% (p=0.000 n=20)
Hash512                1.749µ ± 0%   1.238µ ± 0%  -29.22% (p=0.000 n=20)
Hash1K                 3.238µ ± 0%   2.265µ ± 0%  -30.05% (p=0.000 n=20)
Hash8K                 24.09µ ± 0%   16.66µ ± 0%  -30.83% (p=0.000 n=20)
Hash1M                 3.049m ± 0%   2.105m ± 0%  -30.97% (p=0.000 n=20)
Hash8M                 24.39m ± 0%   16.84m ± 0%  -30.97% (p=0.000 n=20)
Hash8BytesUnaligned    284.1n ± 0%   227.2n ± 0%  -20.03% (p=0.000 n=20)
Hash1KUnaligned        3.238µ ± 0%   2.265µ ± 0%  -30.05% (p=0.000 n=20)
Hash8KUnaligned        24.09µ ± 0%   16.66µ ± 0%  -30.82% (p=0.000 n=20)
geomean                7.142µ        5.164µ       -27.70%

                    │  bench.old   │              bench.new               │
                    │     B/s      │     B/s       vs base                │
Hash8Bytes            27.58Mi ± 0%   34.73Mi ± 0%  +25.93% (p=0.000 n=20)
Hash64                136.9Mi ± 0%   179.6Mi ± 0%  +31.15% (p=0.000 n=20)
Hash128               193.1Mi ± 0%   260.8Mi ± 0%  +35.03% (p=0.000 n=20)
Hash256               243.0Mi ± 0%   337.3Mi ± 0%  +38.82% (p=0.000 n=20)
Hash512               279.1Mi ± 0%   394.3Mi ± 0%  +41.25% (p=0.000 n=20)
Hash1K                301.6Mi ± 0%   431.1Mi ± 0%  +42.94% (p=0.000 n=20)
Hash8K                324.3Mi ± 0%   468.9Mi ± 0%  +44.56% (p=0.000 n=20)
Hash1M                327.9Mi ± 0%   475.0Mi ± 0%  +44.86% (p=0.000 n=20)
Hash8M                328.0Mi ± 0%   475.1Mi ± 0%  +44.86% (p=0.000 n=20)
Hash8BytesUnaligned   26.86Mi ± 0%   33.58Mi ± 0%  +25.04% (p=0.000 n=20)
Hash1KUnaligned       301.6Mi ± 0%   431.1Mi ± 0%  +42.95% (p=0.000 n=20)
Hash8KUnaligned       324.3Mi ± 0%   468.9Mi ± 0%  +44.56% (p=0.000 n=20)
geomean               182.5Mi        252.4Mi       +38.31%

goos: linux
goarch: loong64
pkg: crypto/md5
cpu: Loongson-3A5000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
Hash8Bytes             346.0n ± 0%   289.1n ± 0%  -16.45% (p=0.000 n=20)
Hash64                 521.2n ± 0%   409.3n ± 0%  -21.47% (p=0.000 n=20)
Hash128                707.1n ± 0%   537.8n ± 0%  -23.94% (p=0.000 n=20)
Hash256               1080.0n ± 0%   795.8n ± 0%  -26.31% (p=0.000 n=20)
Hash512                1.826µ ± 0%   1.311µ ± 0%  -28.20% (p=0.000 n=20)
Hash1K                 3.315µ ± 0%   2.342µ ± 0%  -29.35% (p=0.000 n=20)
Hash8K                 24.19µ ± 0%   16.78µ ± 0%  -30.65% (p=0.000 n=20)
Hash1M                 3.052m ± 0%   2.110m ± 0%  -30.86% (p=0.000 n=20)
Hash8M                 24.41m ± 0%   16.88m ± 0%  -30.85% (p=0.000 n=20)
Hash8BytesUnaligned    345.9n ± 0%   289.0n ± 0%  -16.45% (p=0.000 n=20)
Hash1KUnaligned        3.316µ ± 0%   2.342µ ± 0%  -29.37% (p=0.000 n=20)
Hash8KUnaligned        24.19µ ± 0%   16.78µ ± 0%  -30.66% (p=0.000 n=20)
geomean                7.673µ        5.648µ       -26.39%

                    │  bench.old   │              bench.new               │
                    │     B/s      │     B/s       vs base                │
Hash8Bytes            22.05Mi ± 0%   26.39Mi ± 0%  +19.68% (p=0.000 n=20)
Hash64                117.1Mi ± 0%   149.1Mi ± 0%  +27.32% (p=0.000 n=20)
Hash128               172.6Mi ± 0%   227.0Mi ± 0%  +31.49% (p=0.000 n=20)
Hash256               226.0Mi ± 0%   306.8Mi ± 0%  +35.77% (p=0.000 n=20)
Hash512               267.4Mi ± 0%   372.5Mi ± 0%  +39.26% (p=0.000 n=20)
Hash1K                294.6Mi ± 0%   417.0Mi ± 0%  +41.53% (p=0.000 n=20)
Hash8K                322.9Mi ± 0%   465.7Mi ± 0%  +44.20% (p=0.000 n=20)
Hash1M                327.7Mi ± 0%   474.0Mi ± 0%  +44.64% (p=0.000 n=20)
Hash8M                327.8Mi ± 0%   474.1Mi ± 0%  +44.62% (p=0.000 n=20)
Hash8BytesUnaligned   22.06Mi ± 0%   26.40Mi ± 0%  +19.67% (p=0.000 n=20)
Hash1KUnaligned       294.5Mi ± 0%   417.0Mi ± 0%  +41.60% (p=0.000 n=20)
Hash8KUnaligned       322.9Mi ± 0%   465.7Mi ± 0%  +44.21% (p=0.000 n=20)
geomean               169.9Mi        230.8Mi       +35.85%

Change-Id: Iffddd60e3fc0b3bb265289f836a2d875f0805f64
Reviewed-on: https://go-review.googlesource.com/c/go/+/589540
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
---
 src/crypto/md5/md5block_decl.go    |   2 +-
 src/crypto/md5/md5block_generic.go |   2 +-
 src/crypto/md5/md5block_loong64.s  | 180 +++++++++++++++++++++++++++++
 3 files changed, 182 insertions(+), 2 deletions(-)
 create mode 100644 src/crypto/md5/md5block_loong64.s

diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go
index 3664542fb92..57b7462bb22 100644
--- a/src/crypto/md5/md5block_decl.go
+++ b/src/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64) && !purego
+//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || s390x) && !purego
 
 package md5
 
diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go
index 43cfebd38a9..d6b852db91c 100644
--- a/src/crypto/md5/md5block_generic.go
+++ b/src/crypto/md5/md5block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64) || purego
+//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !s390x) || purego
 
 package md5
 
diff --git a/src/crypto/md5/md5block_loong64.s b/src/crypto/md5/md5block_loong64.s
new file mode 100644
index 00000000000..c16aa23cfe6
--- /dev/null
+++ b/src/crypto/md5/md5block_loong64.s
@@ -0,0 +1,180 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Loong64 version of md5block.go
+// derived from crypto/md5/md5block_amd64.s
+
+//go:build !purego
+
+#define REGTMP	R30
+#define REGTMP1 R12
+#define REGTMP2 R18
+
+#include "textflag.h"
+
+// func block(dig *digest, p []byte) - folds every whole 64-byte chunk of p into the four 32-bit words at dig+0..dig+12; a trailing partial chunk is ignored.
+TEXT	·block(SB),NOSPLIT,$0-32
+	MOVV	dig+0(FP), R4	// R4 = dig
+	MOVV	p+8(FP), R5	// R5 = read cursor, starts at the first byte of p
+	MOVV	p_len+16(FP), R6	// R6 = len(p)
+	AND	$~63, R6	// round the length down to a multiple of 64
+	BEQ	R6, zero	// no whole 64-byte chunk: return without touching dig
+
+	// p_len >= 64: R6 now holds the byte count of the whole chunks
+	ADDV	R5, R6, R24	// R24 = address just past the last whole chunk
+	MOVW	(0*4)(R4), R7	// R7 = a
+	MOVW	(1*4)(R4), R8	// R8 = b
+	MOVW	(2*4)(R4), R9	// R9 = c
+	MOVW	(3*4)(R4), R10	// R10 = d
+
+loop:
+	MOVW	R7, R14	// R14..R17 keep a, b, c, d as they were on entry to this chunk;
+	MOVW	R8, R15	// they are added back after the 64 steps below
+	MOVW	R9, R16
+	MOVW	R10, R17
+
+	MOVW	(0*4)(R5), R11	// R11 = word 0 of the chunk, consumed by the first ROUND1 step
+	MOVW	R10, REGTMP1	// REGTMP1 = d, the entry condition of ROUND1
+
+// ROUND1 step: a = b + rot(a + const + R11 + F), F = ((c ^ d) & b) ^ d, rot = ROTR by 32-shift. Needs REGTMP1 == d on entry; leaves REGTMP1 = c (the next step's d) and R11 = word `index` of the chunk for the next step.
+#define ROUND1(a, b, c, d, index, const, shift) \
+	ADDV	$const, a; \
+	ADD	R11, a; \
+	MOVW	(index*4)(R5), R11; \
+	XOR	c, REGTMP1; \
+	AND	b, REGTMP1; \
+	XOR	d, REGTMP1; \
+	ADD	REGTMP1, a; \
+	ROTR	$(32-shift), a; \
+	MOVW	c, REGTMP1; \
+	ADD	b, a
+
+	ROUND1(R7,  R8,  R9,  R10,  1, 0xd76aa478,  7);
+	ROUND1(R10, R7,  R8,  R9,   2, 0xe8c7b756, 12);
+	ROUND1(R9,  R10, R7,  R8,   3, 0x242070db, 17);
+	ROUND1(R8,  R9,  R10, R7,   4, 0xc1bdceee, 22);
+	ROUND1(R7,  R8,  R9,  R10,  5, 0xf57c0faf,  7);
+	ROUND1(R10, R7,  R8,  R9,   6, 0x4787c62a, 12);
+	ROUND1(R9,  R10, R7,  R8,   7, 0xa8304613, 17);
+	ROUND1(R8,  R9,  R10, R7,   8, 0xfd469501, 22);
+	ROUND1(R7,  R8,  R9,  R10,  9, 0x698098d8,  7);
+	ROUND1(R10, R7,  R8,  R9,  10, 0x8b44f7af, 12);
+	ROUND1(R9,  R10, R7,  R8,  11, 0xffff5bb1, 17);
+	ROUND1(R8,  R9,  R10, R7,  12, 0x895cd7be, 22);
+	ROUND1(R7,  R8,  R9,  R10, 13, 0x6b901122,  7);
+	ROUND1(R10, R7,  R8,  R9,  14, 0xfd987193, 12);
+	ROUND1(R9,  R10, R7,  R8,  15, 0xa679438e, 17);
+	ROUND1(R8,  R9,  R10, R7,   1, 0x49b40821, 22);
+
+	MOVW	(1*4)(R5), R11	// R11 = word 1 for the first ROUND2 step (the last ROUND1 step above already loaded it)
+
+// ROUND2 step: a = b + rot(a + const + R11 + F), F = ((b ^ c) & d) ^ c, rot = ROTR by 32-shift. F is built in REGTMP, so nothing is carried between steps except R11 = word `index` for the next step.
+#define ROUND2(a, b, c, d, index, const, shift) \
+	ADDV	$const, a; \
+	ADD	R11, a; \
+	MOVW	(index*4)(R5), R11; \
+	XOR	b, c, REGTMP; \
+	AND	REGTMP, d, REGTMP; \
+	XOR	REGTMP, c, REGTMP; \
+	ADD	REGTMP, a; \
+	ROTR	$(32-shift), a; \
+	ADD	b, a
+
+	ROUND2(R7,  R8,  R9,  R10,  6, 0xf61e2562,  5);
+	ROUND2(R10, R7,  R8,  R9,  11, 0xc040b340,  9);
+	ROUND2(R9,  R10, R7,  R8,   0, 0x265e5a51, 14);
+	ROUND2(R8,  R9,  R10, R7,   5, 0xe9b6c7aa, 20);
+	ROUND2(R7,  R8,  R9,  R10, 10, 0xd62f105d,  5);
+	ROUND2(R10, R7,  R8,  R9,  15,  0x2441453,  9);
+	ROUND2(R9,  R10, R7,  R8,   4, 0xd8a1e681, 14);
+	ROUND2(R8,  R9,  R10, R7,   9, 0xe7d3fbc8, 20);
+	ROUND2(R7,  R8,  R9,  R10, 14, 0x21e1cde6,  5);
+	ROUND2(R10, R7,  R8,  R9,   3, 0xc33707d6,  9);
+	ROUND2(R9,  R10, R7,  R8,   8, 0xf4d50d87, 14);
+	ROUND2(R8,  R9,  R10, R7,  13, 0x455a14ed, 20);
+	ROUND2(R7,  R8,  R9,  R10,  2, 0xa9e3e905,  5);
+	ROUND2(R10, R7,  R8,  R9,   7, 0xfcefa3f8,  9);
+	ROUND2(R9,  R10, R7,  R8,  12, 0x676f02d9, 14);
+	ROUND2(R8,  R9,  R10, R7,   5, 0x8d2a4c8a, 20);
+
+	MOVW	(5*4)(R5), R11	// R11 = word 5 for the first ROUND3 step (the last ROUND2 step above already loaded it)
+	MOVW	R9, REGTMP1	// REGTMP1 = c, the entry condition of ROUND3
+
+// ROUND3 step: a = b + rot(a + const + R11 + F), F = b ^ c ^ d, rot = ROTR by 32-shift. Needs REGTMP1 == c on entry; leaves REGTMP1 = b (the next step's c) and R11 = word `index` of the chunk for the next step.
+#define ROUND3(a, b, c, d, index, const, shift) \
+	ADDV	$const, a; \
+	ADD	R11, a; \
+	MOVW	(index*4)(R5), R11; \
+	XOR	d, REGTMP1; \
+	XOR	b, REGTMP1; \
+	ADD	REGTMP1, a; \
+	ROTR	$(32-shift), a; \
+	MOVW	b, REGTMP1; \
+	ADD	b, a
+
+	ROUND3(R7,  R8,  R9,  R10,  8, 0xfffa3942,  4);
+	ROUND3(R10, R7,  R8,  R9,  11, 0x8771f681, 11);
+	ROUND3(R9,  R10, R7,  R8,  14, 0x6d9d6122, 16);
+	ROUND3(R8,  R9,  R10, R7,   1, 0xfde5380c, 23);
+	ROUND3(R7,  R8,  R9,  R10,  4, 0xa4beea44,  4);
+	ROUND3(R10, R7,  R8,  R9,   7, 0x4bdecfa9, 11);
+	ROUND3(R9,  R10, R7,  R8,  10, 0xf6bb4b60, 16);
+	ROUND3(R8,  R9,  R10, R7,  13, 0xbebfbc70, 23);
+	ROUND3(R7,  R8,  R9,  R10,  0, 0x289b7ec6,  4);
+	ROUND3(R10, R7,  R8,  R9,   3, 0xeaa127fa, 11);
+	ROUND3(R9,  R10, R7,  R8,   6, 0xd4ef3085, 16);
+	ROUND3(R8,  R9,  R10, R7,   9,  0x4881d05, 23);
+	ROUND3(R7,  R8,  R9,  R10, 12, 0xd9d4d039,  4);
+	ROUND3(R10, R7,  R8,  R9,  15, 0xe6db99e5, 11);
+	ROUND3(R9,  R10, R7,  R8,   2, 0x1fa27cf8, 16);
+	ROUND3(R8,  R9,  R10, R7,   0, 0xc4ac5665, 23);
+
+	MOVW	(0*4)(R5), R11	// R11 = word 0 for the first ROUND4 step (the last ROUND3 step above already loaded it)
+	MOVV	$0xffffffff, REGTMP2	// all-ones in the low 32 bits; XOR with it inverts a 32-bit value
+	XOR	R10, REGTMP2, REGTMP1	// REGTMP1 = ~d
+
+// ROUND4 step: a = b + rot(a + const + R11 + F), F = c ^ (b | (~d)), rot = ROTR by 32-shift. Needs REGTMP1 == ~d on entry; leaves REGTMP1 = ~c (the next step's ~d) and R11 = word `index` of the chunk for the next step.
+#define ROUND4(a, b, c, d, index, const, shift) \
+	ADDV	$const, a; \
+	ADD	R11, a; \
+	MOVW	(index*4)(R5), R11; \
+	OR	b, REGTMP1; \
+	XOR	c, REGTMP1; \
+	ADD	REGTMP1, a; \
+	ROTR	$(32-shift), a; \
+	MOVV	$0xffffffff, REGTMP2; \
+	XOR	c, REGTMP2, REGTMP1; \
+	ADD	b, a
+
+	ROUND4(R7,  R8,  R9,  R10,  7, 0xf4292244,  6);
+	ROUND4(R10, R7,  R8,  R9,  14, 0x432aff97, 10);
+	ROUND4(R9,  R10, R7,  R8,   5, 0xab9423a7, 15);
+	ROUND4(R8,  R9,  R10, R7,  12, 0xfc93a039, 21);
+	ROUND4(R7,  R8,  R9,  R10,  3, 0x655b59c3,  6);
+	ROUND4(R10, R7,  R8,  R9,  10, 0x8f0ccc92, 10);
+	ROUND4(R9,  R10, R7,  R8,   1, 0xffeff47d, 15);
+	ROUND4(R8,  R9,  R10, R7,   8, 0x85845dd1, 21);
+	ROUND4(R7,  R8,  R9,  R10, 15, 0x6fa87e4f,  6);
+	ROUND4(R10, R7,  R8,  R9,   6, 0xfe2ce6e0, 10);
+	ROUND4(R9,  R10, R7,  R8,  13, 0xa3014314, 15);
+	ROUND4(R8,  R9,  R10, R7,   4, 0x4e0811a1, 21);
+	ROUND4(R7,  R8,  R9,  R10, 11, 0xf7537e82,  6);
+	ROUND4(R10, R7,  R8,  R9,   2, 0xbd3af235, 10);
+	ROUND4(R9,  R10, R7,  R8,   9, 0x2ad7d2bb, 15);
+	ROUND4(R8,  R9,  R10, R7,   0, 0xeb86d391, 21);
+
+	ADD	R14, R7	// add the values saved at the top of the loop back into a, b, c, d
+	ADD	R15, R8
+	ADD	R16, R9
+	ADD	R17, R10
+
+	ADDV	$64, R5	// advance the cursor to the next 64-byte chunk
+	BNE	R5, R24, loop	// keep going until the cursor reaches the end computed before the loop
+
+	MOVW	R7, (0*4)(R4)	// write a, b, c, d back to dig once, after all chunks are processed
+	MOVW	R8, (1*4)(R4)
+	MOVW	R9, (2*4)(R4)
+	MOVW	R10, (3*4)(R4)
+zero:
+	RET
-- 
GitLab