From 6edc1c23ed078386bfbf7978f6cb5891cc2aa241 Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao <zhaoxiaolin@loongson.cn> Date: Mon, 3 Jun 2024 17:04:53 +0800 Subject: [PATCH] crypto/md5: implement md5block in hardware on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit goos: linux goarch: loong64 pkg: crypto/md5 cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes 276.6n ± 0% 219.7n ± 0% -20.57% (p=0.000 n=20) Hash64 445.8n ± 0% 339.9n ± 0% -23.76% (p=0.000 n=20) Hash128 632.0n ± 0% 468.1n ± 0% -25.93% (p=0.000 n=20) Hash256 1005.0n ± 0% 723.8n ± 0% -27.98% (p=0.000 n=20) Hash512 1.749µ ± 0% 1.238µ ± 0% -29.22% (p=0.000 n=20) Hash1K 3.238µ ± 0% 2.265µ ± 0% -30.05% (p=0.000 n=20) Hash8K 24.09µ ± 0% 16.66µ ± 0% -30.83% (p=0.000 n=20) Hash1M 3.049m ± 0% 2.105m ± 0% -30.97% (p=0.000 n=20) Hash8M 24.39m ± 0% 16.84m ± 0% -30.97% (p=0.000 n=20) Hash8BytesUnaligned 284.1n ± 0% 227.2n ± 0% -20.03% (p=0.000 n=20) Hash1KUnaligned 3.238µ ± 0% 2.265µ ± 0% -30.05% (p=0.000 n=20) Hash8KUnaligned 24.09µ ± 0% 16.66µ ± 0% -30.82% (p=0.000 n=20) geomean 7.142µ 5.164µ -27.70% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes 27.58Mi ± 0% 34.73Mi ± 0% +25.93% (p=0.000 n=20) Hash64 136.9Mi ± 0% 179.6Mi ± 0% +31.15% (p=0.000 n=20) Hash128 193.1Mi ± 0% 260.8Mi ± 0% +35.03% (p=0.000 n=20) Hash256 243.0Mi ± 0% 337.3Mi ± 0% +38.82% (p=0.000 n=20) Hash512 279.1Mi ± 0% 394.3Mi ± 0% +41.25% (p=0.000 n=20) Hash1K 301.6Mi ± 0% 431.1Mi ± 0% +42.94% (p=0.000 n=20) Hash8K 324.3Mi ± 0% 468.9Mi ± 0% +44.56% (p=0.000 n=20) Hash1M 327.9Mi ± 0% 475.0Mi ± 0% +44.86% (p=0.000 n=20) Hash8M 328.0Mi ± 0% 475.1Mi ± 0% +44.86% (p=0.000 n=20) Hash8BytesUnaligned 26.86Mi ± 0% 33.58Mi ± 0% +25.04% (p=0.000 n=20) Hash1KUnaligned 301.6Mi ± 0% 431.1Mi ± 0% +42.95% (p=0.000 n=20) Hash8KUnaligned 324.3Mi ± 0% 468.9Mi ± 0% +44.56% (p=0.000 n=20) geomean 182.5Mi 252.4Mi +38.31% goos: linux goarch: loong64 pkg: crypto/md5 cpu: Loongson-3A5000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes 346.0n ± 0% 289.1n ± 0% -16.45% (p=0.000 n=20) Hash64 521.2n ± 0% 409.3n ± 0% -21.47% (p=0.000 n=20) Hash128 707.1n ± 0% 537.8n ± 0% -23.94% (p=0.000 n=20) Hash256 1080.0n ± 0% 795.8n ± 0% -26.31% (p=0.000 n=20) Hash512 1.826µ ± 0% 1.311µ ± 0% -28.20% (p=0.000 n=20) Hash1K 3.315µ ± 0% 2.342µ ± 0% -29.35% (p=0.000 n=20) Hash8K 24.19µ ± 0% 16.78µ ± 0% -30.65% (p=0.000 n=20) Hash1M 3.052m ± 0% 2.110m ± 0% -30.86% (p=0.000 n=20) Hash8M 24.41m ± 0% 16.88m ± 0% -30.85% (p=0.000 n=20) Hash8BytesUnaligned 345.9n ± 0% 289.0n ± 0% -16.45% (p=0.000 n=20) Hash1KUnaligned 3.316µ ± 0% 2.342µ ± 0% -29.37% (p=0.000 n=20) Hash8KUnaligned 24.19µ ± 0% 16.78µ ± 0% -30.66% (p=0.000 n=20) geomean 7.673µ 5.648µ -26.39% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes 22.05Mi ± 0% 26.39Mi ± 0% +19.68% (p=0.000 n=20) Hash64 117.1Mi ± 0% 149.1Mi ± 0% +27.32% (p=0.000 n=20) Hash128 172.6Mi ± 0% 227.0Mi ± 0% +31.49% (p=0.000 n=20) Hash256 226.0Mi ± 0% 306.8Mi ± 0% +35.77% (p=0.000 n=20) Hash512 267.4Mi ± 0% 372.5Mi ± 0% +39.26% (p=0.000 n=20) Hash1K 294.6Mi ± 0% 417.0Mi ± 0% +41.53% (p=0.000 n=20) Hash8K 322.9Mi ± 0% 465.7Mi ± 0% +44.20% (p=0.000 n=20) Hash1M 327.7Mi ± 0% 474.0Mi ± 0% +44.64% (p=0.000 n=20) Hash8M 327.8Mi ± 0% 474.1Mi ± 0% +44.62% (p=0.000 n=20) Hash8BytesUnaligned 22.06Mi ± 0% 26.40Mi ± 0% +19.67% (p=0.000 n=20) Hash1KUnaligned 294.5Mi ± 0% 417.0Mi ± 0% +41.60% (p=0.000 n=20) Hash8KUnaligned 322.9Mi ± 0% 465.7Mi ± 0% +44.21% (p=0.000 n=20) geomean 169.9Mi 230.8Mi +35.85% Change-Id: Iffddd60e3fc0b3bb265289f836a2d875f0805f64 Reviewed-on: https://go-review.googlesource.com/c/go/+/589540 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> --- src/crypto/md5/md5block_decl.go | 2 +- src/crypto/md5/md5block_generic.go | 2 +- src/crypto/md5/md5block_loong64.s | 180 +++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 src/crypto/md5/md5block_loong64.s diff --git a/src/crypto/md5/md5block_decl.go b/src/crypto/md5/md5block_decl.go index 3664542fb92..57b7462bb22 100644 --- a/src/crypto/md5/md5block_decl.go +++ b/src/crypto/md5/md5block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64) && !purego +//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || s390x) && !purego package md5 diff --git a/src/crypto/md5/md5block_generic.go b/src/crypto/md5/md5block_generic.go index 43cfebd38a9..d6b852db91c 100644 --- a/src/crypto/md5/md5block_generic.go +++ b/src/crypto/md5/md5block_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64) || purego +//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !s390x) || purego package md5 diff --git a/src/crypto/md5/md5block_loong64.s b/src/crypto/md5/md5block_loong64.s new file mode 100644 index 00000000000..c16aa23cfe6 --- /dev/null +++ b/src/crypto/md5/md5block_loong64.s @@ -0,0 +1,180 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Loong64 version of md5block.go +// derived from crypto/md5/md5block_amd64.s + +//go:build !purego + +#define REGTMP R30 +#define REGTMP1 R12 +#define REGTMP2 R18 + +#include "textflag.h" + +// func block(dig *digest, p []byte) +TEXT ·block(SB),NOSPLIT,$0-32 + MOVV dig+0(FP), R4 + MOVV p+8(FP), R5 + MOVV p_len+16(FP), R6 + AND $~63, R6 + BEQ R6, zero + + // p_len >= 64 + ADDV R5, R6, R24 + MOVW (0*4)(R4), R7 + MOVW (1*4)(R4), R8 + MOVW (2*4)(R4), R9 + MOVW (3*4)(R4), R10 + +loop: + MOVW R7, R14 + MOVW R8, R15 + MOVW R9, R16 + MOVW R10, R17 + + MOVW (0*4)(R5), R11 + MOVW R10, REGTMP1 + +// F = ((c ^ d) & b) ^ d +#define ROUND1(a, b, c, d, index, const, shift) \ + ADDV $const, a; \ + ADD R11, a; \ + MOVW (index*4)(R5), R11; \ + XOR c, REGTMP1; \ + AND b, REGTMP1; \ + XOR d, REGTMP1; \ + ADD REGTMP1, a; \ + ROTR $(32-shift), a; \ + MOVW c, REGTMP1; \ + ADD b, a + + ROUND1(R7, R8, R9, R10, 1, 0xd76aa478, 7); + ROUND1(R10, R7, R8, R9, 2, 0xe8c7b756, 12); + ROUND1(R9, R10, R7, R8, 3, 0x242070db, 17); + ROUND1(R8, R9, R10, R7, 4, 0xc1bdceee, 22); + ROUND1(R7, R8, R9, R10, 5, 0xf57c0faf, 7); + ROUND1(R10, R7, R8, R9, 6, 0x4787c62a, 12); + ROUND1(R9, R10, R7, R8, 7, 0xa8304613, 17); + ROUND1(R8, R9, R10, R7, 8, 0xfd469501, 22); + ROUND1(R7, R8, R9, R10, 9, 0x698098d8, 7); + ROUND1(R10, R7, R8, R9, 10, 0x8b44f7af, 12); + ROUND1(R9, R10, R7, R8, 11, 0xffff5bb1, 17); + ROUND1(R8, R9, R10, R7, 12, 0x895cd7be, 22); + ROUND1(R7, R8, R9, R10, 13, 0x6b901122, 7); + ROUND1(R10, R7, R8, R9, 14, 0xfd987193, 12); + ROUND1(R9, R10, R7, R8, 15, 0xa679438e, 17); + ROUND1(R8, R9, R10, R7, 1, 0x49b40821, 22); + + MOVW (1*4)(R5), R11 + +// F = ((b ^ c) & d) ^ c +#define ROUND2(a, b, c, d, index, const, shift) \ + ADDV $const, a; \ + ADD R11, a; \ + MOVW (index*4)(R5), R11; \ + XOR b, c, REGTMP; \ + AND REGTMP, d, REGTMP; \ + XOR REGTMP, c, REGTMP; \ + ADD REGTMP, a; \ + ROTR $(32-shift), a; \ + ADD b, a + + ROUND2(R7, R8, R9, R10, 6, 0xf61e2562, 5); + ROUND2(R10, R7, R8, R9, 11, 0xc040b340, 9); + ROUND2(R9, R10, R7, R8, 0, 0x265e5a51, 14); + ROUND2(R8, R9, R10, R7, 5, 0xe9b6c7aa, 20); + ROUND2(R7, R8, R9, R10, 10, 0xd62f105d, 5); + ROUND2(R10, R7, R8, R9, 15, 0x2441453, 9); + ROUND2(R9, R10, R7, R8, 4, 0xd8a1e681, 14); + ROUND2(R8, R9, R10, R7, 9, 0xe7d3fbc8, 20); + ROUND2(R7, R8, R9, R10, 14, 0x21e1cde6, 5); + ROUND2(R10, R7, R8, R9, 3, 0xc33707d6, 9); + ROUND2(R9, R10, R7, R8, 8, 0xf4d50d87, 14); + ROUND2(R8, R9, R10, R7, 13, 0x455a14ed, 20); + ROUND2(R7, R8, R9, R10, 2, 0xa9e3e905, 5); + ROUND2(R10, R7, R8, R9, 7, 0xfcefa3f8, 9); + ROUND2(R9, R10, R7, R8, 12, 0x676f02d9, 14); + ROUND2(R8, R9, R10, R7, 5, 0x8d2a4c8a, 20); + + MOVW (5*4)(R5), R11 + MOVW R9, REGTMP1 + +// F = b ^ c ^ d +#define ROUND3(a, b, c, d, index, const, shift) \ + ADDV $const, a; \ + ADD R11, a; \ + MOVW (index*4)(R5), R11; \ + XOR d, REGTMP1; \ + XOR b, REGTMP1; \ + ADD REGTMP1, a; \ + ROTR $(32-shift), a; \ + MOVW b, REGTMP1; \ + ADD b, a + + ROUND3(R7, R8, R9, R10, 8, 0xfffa3942, 4); + ROUND3(R10, R7, R8, R9, 11, 0x8771f681, 11); + ROUND3(R9, R10, R7, R8, 14, 0x6d9d6122, 16); + ROUND3(R8, R9, R10, R7, 1, 0xfde5380c, 23); + ROUND3(R7, R8, R9, R10, 4, 0xa4beea44, 4); + ROUND3(R10, R7, R8, R9, 7, 0x4bdecfa9, 11); + ROUND3(R9, R10, R7, R8, 10, 0xf6bb4b60, 16); + ROUND3(R8, R9, R10, R7, 13, 0xbebfbc70, 23); + ROUND3(R7, R8, R9, R10, 0, 0x289b7ec6, 4); + ROUND3(R10, R7, R8, R9, 3, 0xeaa127fa, 11); + ROUND3(R9, R10, R7, R8, 6, 0xd4ef3085, 16); + ROUND3(R8, R9, R10, R7, 9, 0x4881d05, 23); + ROUND3(R7, R8, R9, R10, 12, 0xd9d4d039, 4); + ROUND3(R10, R7, R8, R9, 15, 0xe6db99e5, 11); + ROUND3(R9, R10, R7, R8, 2, 0x1fa27cf8, 16); + ROUND3(R8, R9, R10, R7, 0, 0xc4ac5665, 23); + + MOVW (0*4)(R5), R11 + MOVV $0xffffffff, REGTMP2 + XOR R10, REGTMP2, REGTMP1 // REGTMP1 = ~d + +// F = c ^ (b | (~d)) +#define ROUND4(a, b, c, d, index, const, shift) \ + ADDV $const, a; \ + ADD R11, a; \ + MOVW (index*4)(R5), R11; \ + OR b, REGTMP1; \ + XOR c, REGTMP1; \ + ADD REGTMP1, a; \ + ROTR $(32-shift), a; \ + MOVV $0xffffffff, REGTMP2; \ + XOR c, REGTMP2, REGTMP1; \ + ADD b, a + + ROUND4(R7, R8, R9, R10, 7, 0xf4292244, 6); + ROUND4(R10, R7, R8, R9, 14, 0x432aff97, 10); + ROUND4(R9, R10, R7, R8, 5, 0xab9423a7, 15); + ROUND4(R8, R9, R10, R7, 12, 0xfc93a039, 21); + ROUND4(R7, R8, R9, R10, 3, 0x655b59c3, 6); + ROUND4(R10, R7, R8, R9, 10, 0x8f0ccc92, 10); + ROUND4(R9, R10, R7, R8, 1, 0xffeff47d, 15); + ROUND4(R8, R9, R10, R7, 8, 0x85845dd1, 21); + ROUND4(R7, R8, R9, R10, 15, 0x6fa87e4f, 6); + ROUND4(R10, R7, R8, R9, 6, 0xfe2ce6e0, 10); + ROUND4(R9, R10, R7, R8, 13, 0xa3014314, 15); + ROUND4(R8, R9, R10, R7, 4, 0x4e0811a1, 21); + ROUND4(R7, R8, R9, R10, 11, 0xf7537e82, 6); + ROUND4(R10, R7, R8, R9, 2, 0xbd3af235, 10); + ROUND4(R9, R10, R7, R8, 9, 0x2ad7d2bb, 15); + ROUND4(R8, R9, R10, R7, 0, 0xeb86d391, 21); + + ADD R14, R7 + ADD R15, R8 + ADD R16, R9 + ADD R17, R10 + + ADDV $64, R5 + BNE R5, R24, loop + + MOVW R7, (0*4)(R4) + MOVW R8, (1*4)(R4) + MOVW R9, (2*4)(R4) + MOVW R10, (3*4)(R4) +zero: + RET -- GitLab