From 64a5d1d7de14ad74b1a77614de0e17c659ef12b6 Mon Sep 17 00:00:00 2001 From: Xiaolin Zhao <zhaoxiaolin@loongson.cn> Date: Mon, 3 Jun 2024 17:54:18 +0800 Subject: [PATCH] crypto/sha1: implement sha1block in hardware on loong64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit goos: linux goarch: loong64 pkg: crypto/sha1 cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes/New 489.8n ± 0% 280.6n ± 0% -42.71% (p=0.000 n=20) Hash8Bytes/Sum 496.6n ± 0% 288.9n ± 0% -41.82% (p=0.000 n=20) Hash320Bytes/New 2251.0n ± 0% 992.0n ± 0% -55.93% (p=0.000 n=20) Hash320Bytes/Sum 2258.0n ± 0% 998.0n ± 0% -55.80% (p=0.000 n=20) Hash1K/New 6.113µ ± 0% 2.583µ ± 0% -57.75% (p=0.000 n=20) Hash1K/Sum 6.117µ ± 0% 2.588µ ± 0% -57.69% (p=0.000 n=20) Hash8K/New 45.42µ ± 0% 18.79µ ± 0% -58.63% (p=0.000 n=20) Hash8K/Sum 45.43µ ± 0% 18.80µ ± 0% -58.62% (p=0.000 n=20) geomean 4.192µ 1.926µ -54.05% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes/New 15.57Mi ± 0% 27.19Mi ± 0% +74.59% (p=0.000 n=20) Hash8Bytes/Sum 15.36Mi ± 0% 26.41Mi ± 0% +71.88% (p=0.000 n=20) Hash320Bytes/New 135.6Mi ± 0% 307.6Mi ± 0% +126.90% (p=0.000 n=20) Hash320Bytes/Sum 135.2Mi ± 0% 305.8Mi ± 0% +126.22% (p=0.000 n=20) Hash1K/New 159.8Mi ± 0% 378.1Mi ± 0% +136.69% (p=0.000 n=20) Hash1K/Sum 159.7Mi ± 0% 377.4Mi ± 0% +136.38% (p=0.000 n=20) Hash8K/New 172.0Mi ± 0% 415.8Mi ± 0% +141.75% (p=0.000 n=20) Hash8K/Sum 172.0Mi ± 0% 415.6Mi ± 0% +141.65% (p=0.000 n=20) geomean 87.09Mi 189.5Mi +117.64% goos: linux goarch: loong64 pkg: crypto/sha1 cpu: Loongson-3A5000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes/New 565.9n ± 0% 374.5n ± 1% -33.82% (p=0.000 n=20) Hash8Bytes/Sum 571.3n ± 0% 366.7n ± 1% -35.81% (p=0.000 n=20) Hash320Bytes/New 2.662µ ± 0% 1.201µ ± 0% -54.88% (p=0.000 n=20) Hash320Bytes/Sum 2.662µ ± 0% 1.194µ ± 0% -55.15% (p=0.000 n=20) Hash1K/New 7.171µ ± 0% 3.084µ ± 0% -56.99% (p=0.000 n=20) Hash1K/Sum 7.171µ ± 0% 3.076µ ± 0% -57.11% (p=0.000 n=20) Hash8K/New 53.10µ ± 0% 22.24µ ± 0% -58.12% (p=0.000 n=20) Hash8K/Sum 53.09µ ± 0% 22.23µ ± 0% -58.12% (p=0.000 n=20) geomean 4.900µ 2.348µ -52.08% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes/New 13.48Mi ± 0% 20.38Mi ± 1% +51.10% (p=0.000 n=20) Hash8Bytes/Sum 13.35Mi ± 0% 20.80Mi ± 1% +55.82% (p=0.000 n=20) Hash320Bytes/New 114.6Mi ± 0% 254.0Mi ± 1% +121.61% (p=0.000 n=20) Hash320Bytes/Sum 114.6Mi ± 0% 255.6Mi ± 0% +123.00% (p=0.000 n=20) Hash1K/New 136.2Mi ± 0% 316.7Mi ± 0% +132.54% (p=0.000 n=20) Hash1K/Sum 136.2Mi ± 0% 317.5Mi ± 0% +133.19% (p=0.000 n=20) Hash8K/New 147.1Mi ± 0% 351.3Mi ± 0% +138.79% (p=0.000 n=20) Hash8K/Sum 147.2Mi ± 0% 351.4Mi ± 0% +138.78% (p=0.000 n=20) geomean 74.51Mi 155.5Mi +108.69% Change-Id: I716babd19c18dc2c3314d972ced9d83de2d93cb2 Reviewed-on: https://go-review.googlesource.com/c/go/+/589775 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> --- src/crypto/sha1/sha1block_decl.go | 2 +- src/crypto/sha1/sha1block_generic.go | 2 +- src/crypto/sha1/sha1block_loong64.s | 226 +++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 src/crypto/sha1/sha1block_loong64.s diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go index 3edf5a43606..46f41a1cc26 100644 --- a/src/crypto/sha1/sha1block_decl.go +++ b/src/crypto/sha1/sha1block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (arm || 386 || s390x) && !purego +//go:build (386 || arm || loong64 || s390x) && !purego package sha1 diff --git a/src/crypto/sha1/sha1block_generic.go b/src/crypto/sha1/sha1block_generic.go index 4c6f74d99d8..5989a243476 100644 --- a/src/crypto/sha1/sha1block_generic.go +++ b/src/crypto/sha1/sha1block_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !386 && !arm && !s390x && !arm64) || purego +//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !s390x) || purego package sha1 diff --git a/src/crypto/sha1/sha1block_loong64.s b/src/crypto/sha1/sha1block_loong64.s new file mode 100644 index 00000000000..7e9d6e09339 --- /dev/null +++ b/src/crypto/sha1/sha1block_loong64.s @@ -0,0 +1,226 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego + +#include "textflag.h" + +// SHA-1 block routine. See sha1block.go for Go equivalent. +// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). +// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. + +#define REGTMP R30 +#define REGTMP1 R17 +#define REGTMP2 R18 +#define REGTMP3 R19 + +#define LOAD1(index) \ + MOVW (index*4)(R5), REGTMP3; \ + WORD $0x3a73; \ // REVB2W REGTMP3, REGTMP3 to big-endian + MOVW REGTMP3, (index*4)(R3) + +#define LOAD(index) \ + MOVW (((index)&0xf)*4)(R3), REGTMP3; \ + MOVW (((index-3)&0xf)*4)(R3), REGTMP; \ + MOVW (((index-8)&0xf)*4)(R3), REGTMP1; \ + MOVW (((index-14)&0xf)*4)(R3), REGTMP2; \ + XOR REGTMP, REGTMP3; \ + XOR REGTMP1, REGTMP3; \ + XOR REGTMP2, REGTMP3; \ + ROTR $31, REGTMP3; \ + MOVW REGTMP3, (((index)&0xf)*4)(R3) + +// f = d ^ (b & (c ^ d)) +#define FUNC1(a, b, c, d, e) \ + XOR c, d, REGTMP1; \ + AND b, REGTMP1; \ + XOR d, REGTMP1 + +// f = b ^ c ^ d +#define FUNC2(a, b, c, d, e) \ + XOR b, c, REGTMP1; \ + XOR d, REGTMP1 + +// f = (b & c) | ((b | c) & d) +#define FUNC3(a, b, c, d, e) \ + OR b, c, REGTMP2; \ + AND b, c, REGTMP; \ + AND d, REGTMP2; \ + OR REGTMP, REGTMP2, REGTMP1 + +#define FUNC4 FUNC2 + +#define MIX(a, b, c, d, e, const) \ + ROTR $2, b; \ // b << 30 + ADD REGTMP1, e; \ // e = e + f + ROTR $27, a, REGTMP2; \ // a << 5 + ADD REGTMP3, e; \ // e = e + w[i] + ADDV $const, e; \ // e = e + k + ADD REGTMP2, e // e = e + a<<5 + +#define ROUND1(a, b, c, d, e, index) \ + LOAD1(index); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND1x(a, b, c, d, e, index) \ + LOAD(index); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND2(a, b, c, d, e, index) \ + LOAD(index); \ + FUNC2(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x6ED9EBA1) + +#define ROUND3(a, b, c, d, e, index) \ + LOAD(index); \ + FUNC3(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x8F1BBCDC) + +#define ROUND4(a, b, c, d, e, index) \ + LOAD(index); \ + FUNC4(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0xCA62C1D6) + +// A stack frame size of 64 bytes is required here, because +// the frame size used for data expansion is 64 bytes. +// See the definition of the macro LOAD above, and the definition +// of the local variable w in the general implementation (sha1block.go). +TEXT ·block(SB),NOSPLIT,$64-32 + MOVV dig+0(FP), R4 + MOVV p_base+8(FP), R5 + MOVV p_len+16(FP), R6 + AND $~63, R6 + BEQ R6, zero + + // p_len >= 64 + ADDV R5, R6, R24 + MOVW (0*4)(R4), R7 + MOVW (1*4)(R4), R8 + MOVW (2*4)(R4), R9 + MOVW (3*4)(R4), R10 + MOVW (4*4)(R4), R11 + +loop: + MOVW R7, R12 + MOVW R8, R13 + MOVW R9, R14 + MOVW R10, R15 + MOVW R11, R16 + + ROUND1(R7, R8, R9, R10, R11, 0) + ROUND1(R11, R7, R8, R9, R10, 1) + ROUND1(R10, R11, R7, R8, R9, 2) + ROUND1(R9, R10, R11, R7, R8, 3) + ROUND1(R8, R9, R10, R11, R7, 4) + ROUND1(R7, R8, R9, R10, R11, 5) + ROUND1(R11, R7, R8, R9, R10, 6) + ROUND1(R10, R11, R7, R8, R9, 7) + ROUND1(R9, R10, R11, R7, R8, 8) + ROUND1(R8, R9, R10, R11, R7, 9) + ROUND1(R7, R8, R9, R10, R11, 10) + ROUND1(R11, R7, R8, R9, R10, 11) + ROUND1(R10, R11, R7, R8, R9, 12) + ROUND1(R9, R10, R11, R7, R8, 13) + ROUND1(R8, R9, R10, R11, R7, 14) + ROUND1(R7, R8, R9, R10, R11, 15) + + ROUND1x(R11, R7, R8, R9, R10, 16) + ROUND1x(R10, R11, R7, R8, R9, 17) + ROUND1x(R9, R10, R11, R7, R8, 18) + ROUND1x(R8, R9, R10, R11, R7, 19) + + ROUND2(R7, R8, R9, R10, R11, 20) + ROUND2(R11, R7, R8, R9, R10, 21) + ROUND2(R10, R11, R7, R8, R9, 22) + ROUND2(R9, R10, R11, R7, R8, 23) + ROUND2(R8, R9, R10, R11, R7, 24) + ROUND2(R7, R8, R9, R10, R11, 25) + ROUND2(R11, R7, R8, R9, R10, 26) + ROUND2(R10, R11, R7, R8, R9, 27) + ROUND2(R9, R10, R11, R7, R8, 28) + ROUND2(R8, R9, R10, R11, R7, 29) + ROUND2(R7, R8, R9, R10, R11, 30) + ROUND2(R11, R7, R8, R9, R10, 31) + ROUND2(R10, R11, R7, R8, R9, 32) + ROUND2(R9, R10, R11, R7, R8, 33) + ROUND2(R8, R9, R10, R11, R7, 34) + ROUND2(R7, R8, R9, R10, R11, 35) + ROUND2(R11, R7, R8, R9, R10, 36) + ROUND2(R10, R11, R7, R8, R9, 37) + ROUND2(R9, R10, R11, R7, R8, 38) + ROUND2(R8, R9, R10, R11, R7, 39) + + ROUND3(R7, R8, R9, R10, R11, 40) + ROUND3(R11, R7, R8, R9, R10, 41) + ROUND3(R10, R11, R7, R8, R9, 42) + ROUND3(R9, R10, R11, R7, R8, 43) + ROUND3(R8, R9, R10, R11, R7, 44) + ROUND3(R7, R8, R9, R10, R11, 45) + ROUND3(R11, R7, R8, R9, R10, 46) + ROUND3(R10, R11, R7, R8, R9, 47) + ROUND3(R9, R10, R11, R7, R8, 48) + ROUND3(R8, R9, R10, R11, R7, 49) + ROUND3(R7, R8, R9, R10, R11, 50) + ROUND3(R11, R7, R8, R9, R10, 51) + ROUND3(R10, R11, R7, R8, R9, 52) + ROUND3(R9, R10, R11, R7, R8, 53) + ROUND3(R8, R9, R10, R11, R7, 54) + ROUND3(R7, R8, R9, R10, R11, 55) + ROUND3(R11, R7, R8, R9, R10, 56) + ROUND3(R10, R11, R7, R8, R9, 57) + ROUND3(R9, R10, R11, R7, R8, 58) + ROUND3(R8, R9, R10, R11, R7, 59) + + ROUND4(R7, R8, R9, R10, R11, 60) + ROUND4(R11, R7, R8, R9, R10, 61) + ROUND4(R10, R11, R7, R8, R9, 62) + ROUND4(R9, R10, R11, R7, R8, 63) + ROUND4(R8, R9, R10, R11, R7, 64) + ROUND4(R7, R8, R9, R10, R11, 65) + ROUND4(R11, R7, R8, R9, R10, 66) + ROUND4(R10, R11, R7, R8, R9, 67) + ROUND4(R9, R10, R11, R7, R8, 68) + ROUND4(R8, R9, R10, R11, R7, 69) + ROUND4(R7, R8, R9, R10, R11, 70) + ROUND4(R11, R7, R8, R9, R10, 71) + ROUND4(R10, R11, R7, R8, R9, 72) + ROUND4(R9, R10, R11, R7, R8, 73) + ROUND4(R8, R9, R10, R11, R7, 74) + ROUND4(R7, R8, R9, R10, R11, 75) + ROUND4(R11, R7, R8, R9, R10, 76) + ROUND4(R10, R11, R7, R8, R9, 77) + ROUND4(R9, R10, R11, R7, R8, 78) + ROUND4(R8, R9, R10, R11, R7, 79) + + ADD R12, R7 + ADD R13, R8 + ADD R14, R9 + ADD R15, R10 + ADD R16, R11 + + ADDV $64, R5 + BNE R5, R24, loop + +end: + MOVW R7, (0*4)(R4) + MOVW R8, (1*4)(R4) + MOVW R9, (2*4)(R4) + MOVW R10, (3*4)(R4) + MOVW R11, (4*4)(R4) +zero: + RET -- GitLab