diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go index 3edf5a43606d417e55b100f3b7aae7d410006fd5..46f41a1cc2634ad762b86475dc121a306a382b55 100644 --- a/src/crypto/sha1/sha1block_decl.go +++ b/src/crypto/sha1/sha1block_decl.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (arm || 386 || s390x) && !purego +//go:build (386 || arm || loong64 || s390x) && !purego package sha1 diff --git a/src/crypto/sha1/sha1block_generic.go b/src/crypto/sha1/sha1block_generic.go index 4c6f74d99d8f23eb273530277a3dd880a15448d9..5989a2434760b53fa3e9cc478f5adbc19ef4a5d4 100644 --- a/src/crypto/sha1/sha1block_generic.go +++ b/src/crypto/sha1/sha1block_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build (!amd64 && !386 && !arm && !s390x && !arm64) || purego +//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !s390x) || purego package sha1 diff --git a/src/crypto/sha1/sha1block_loong64.s b/src/crypto/sha1/sha1block_loong64.s new file mode 100644 index 0000000000000000000000000000000000000000..7e9d6e0933994e8abf5b2a1ccf8d4a5e71373ebd --- /dev/null +++ b/src/crypto/sha1/sha1block_loong64.s @@ -0,0 +1,226 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego + +#include "textflag.h" + +// SHA-1 block routine. See sha1block.go for Go equivalent. +// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). +// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. 
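+//
+// As a rough Go sketch (mirroring the generic sha1block.go; the
+// variable names are illustrative), one type-1 round together with
+// the register rotation is:
+//
+//	f := d ^ (b & (c ^ d))
+//	t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + 0x5A827999
+//	a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d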
+
+#define REGTMP	R30
+#define REGTMP1	R17
+#define REGTMP2	R18
+#define REGTMP3	R19
+
+// Load one big-endian message word into the 16-word window at 0(R3)
+// (R5 points at the current message block).
+#define LOAD1(index) \
+	MOVW	(index*4)(R5), REGTMP3; \
+	WORD	$0x3a73; \ // REVB2W REGTMP3, REGTMP3 (byte-swap the big-endian word)
+	MOVW	REGTMP3, (index*4)(R3)
+
+// Message schedule expansion for rounds 16..79; ROTR $31 is a rotate
+// left by 1. A Go sketch follows the macros below.
+#define LOAD(index) \
+	MOVW	(((index)&0xf)*4)(R3), REGTMP3; \
+	MOVW	(((index-3)&0xf)*4)(R3), REGTMP; \
+	MOVW	(((index-8)&0xf)*4)(R3), REGTMP1; \
+	MOVW	(((index-14)&0xf)*4)(R3), REGTMP2; \
+	XOR	REGTMP, REGTMP3; \
+	XOR	REGTMP1, REGTMP3; \
+	XOR	REGTMP2, REGTMP3; \
+	ROTR	$31, REGTMP3; \
+	MOVW	REGTMP3, (((index)&0xf)*4)(R3)
+
+// f = d ^ (b & (c ^ d))
+#define FUNC1(a, b, c, d, e) \
+	XOR	c, d, REGTMP1; \
+	AND	b, REGTMP1; \
+	XOR	d, REGTMP1
+
+// f = b ^ c ^ d
+#define FUNC2(a, b, c, d, e) \
+	XOR	b, c, REGTMP1; \
+	XOR	d, REGTMP1
+
+// f = (b & c) | ((b | c) & d)
+#define FUNC3(a, b, c, d, e) \
+	OR	b, c, REGTMP2; \
+	AND	b, c, REGTMP; \
+	AND	d, REGTMP2; \
+	OR	REGTMP, REGTMP2, REGTMP1
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+	ROTR	$2, b; \ // b = b <<< 30 (rotate)
+	ADD	REGTMP1, e; \ // e = e + f
+	ROTR	$27, a, REGTMP2; \ // REGTMP2 = a <<< 5 (rotate)
+	ADD	REGTMP3, e; \ // e = e + w[i]
+	ADDV	$const, e; \ // e = e + k
+	ADD	REGTMP2, e // e = e + a<<<5
+
+#define ROUND1(a, b, c, d, e, index) \
+	LOAD1(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC1(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC2(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC3(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+	LOAD(index); \
+	FUNC4(a, b, c, d, e); \
+	MIX(a, b, c, d, e, 0xCA62C1D6)
+
+// A 64-byte stack frame is required: the 16-word (16*4 = 64-byte)
+// circular buffer used for message expansion lives in this frame.
+// See the LOAD macro above and the local variable w in the generic
+// implementation (sha1block.go).
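+//
+// Rough Go equivalent of the expansion performed by LOAD above
+// (illustrative; compare the local variable w in sha1block.go):
+//
+//	tmp := w[i&0xf] ^ w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf]
+//	w[i&0xf] = bits.RotateLeft32(tmp, 1)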
+// func block(dig *digest, p []byte)
+TEXT ·block(SB),NOSPLIT,$64-32
+	MOVV	dig+0(FP), R4
+	MOVV	p_base+8(FP), R5
+	MOVV	p_len+16(FP), R6
+	AND	$~63, R6	// round p_len down to a multiple of the 64-byte block size
+	BEQ	R6, zero
+
+	// p_len >= 64
+	ADDV	R5, R6, R24	// R24 = end of input
+	MOVW	(0*4)(R4), R7	// load the current state h0..h4 into a..e
+	MOVW	(1*4)(R4), R8
+	MOVW	(2*4)(R4), R9
+	MOVW	(3*4)(R4), R10
+	MOVW	(4*4)(R4), R11
+
+loop:
+	// Save the incoming state; it is added back in after the 80 rounds.
+	MOVW	R7, R12
+	MOVW	R8, R13
+	MOVW	R9, R14
+	MOVW	R10, R15
+	MOVW	R11, R16
+
+	ROUND1(R7, R8, R9, R10, R11, 0)
+	ROUND1(R11, R7, R8, R9, R10, 1)
+	ROUND1(R10, R11, R7, R8, R9, 2)
+	ROUND1(R9, R10, R11, R7, R8, 3)
+	ROUND1(R8, R9, R10, R11, R7, 4)
+	ROUND1(R7, R8, R9, R10, R11, 5)
+	ROUND1(R11, R7, R8, R9, R10, 6)
+	ROUND1(R10, R11, R7, R8, R9, 7)
+	ROUND1(R9, R10, R11, R7, R8, 8)
+	ROUND1(R8, R9, R10, R11, R7, 9)
+	ROUND1(R7, R8, R9, R10, R11, 10)
+	ROUND1(R11, R7, R8, R9, R10, 11)
+	ROUND1(R10, R11, R7, R8, R9, 12)
+	ROUND1(R9, R10, R11, R7, R8, 13)
+	ROUND1(R8, R9, R10, R11, R7, 14)
+	ROUND1(R7, R8, R9, R10, R11, 15)
+
+	ROUND1x(R11, R7, R8, R9, R10, 16)
+	ROUND1x(R10, R11, R7, R8, R9, 17)
+	ROUND1x(R9, R10, R11, R7, R8, 18)
+	ROUND1x(R8, R9, R10, R11, R7, 19)
+
+	ROUND2(R7, R8, R9, R10, R11, 20)
+	ROUND2(R11, R7, R8, R9, R10, 21)
+	ROUND2(R10, R11, R7, R8, R9, 22)
+	ROUND2(R9, R10, R11, R7, R8, 23)
+	ROUND2(R8, R9, R10, R11, R7, 24)
+	ROUND2(R7, R8, R9, R10, R11, 25)
+	ROUND2(R11, R7, R8, R9, R10, 26)
+	ROUND2(R10, R11, R7, R8, R9, 27)
+	ROUND2(R9, R10, R11, R7, R8, 28)
+	ROUND2(R8, R9, R10, R11, R7, 29)
+	ROUND2(R7, R8, R9, R10, R11, 30)
+	ROUND2(R11, R7, R8, R9, R10, 31)
+	ROUND2(R10, R11, R7, R8, R9, 32)
+	ROUND2(R9, R10, R11, R7, R8, 33)
+	ROUND2(R8, R9, R10, R11, R7, 34)
+	ROUND2(R7, R8, R9, R10, R11, 35)
+	ROUND2(R11, R7, R8, R9, R10, 36)
+	ROUND2(R10, R11, R7, R8, R9, 37)
+	ROUND2(R9, R10, R11, R7, R8, 38)
+	ROUND2(R8, R9, R10, R11, R7, 39)
+
+	ROUND3(R7, R8, R9, R10, R11, 40)
+	ROUND3(R11, R7, R8, R9, R10, 41)
+	ROUND3(R10, R11, R7, R8, R9, 42)
+	ROUND3(R9, R10, R11, R7, R8, 43)
+	ROUND3(R8, R9, R10, R11, R7, 44)
+	ROUND3(R7, R8, R9, R10, R11, 45)
+	ROUND3(R11, R7, R8, R9, R10, 46)
+	ROUND3(R10, R11, R7, R8, R9, 47)
+	ROUND3(R9, R10, R11, R7, R8, 48)
+	ROUND3(R8, R9, R10, R11, R7, 49)
+	ROUND3(R7, R8, R9, R10, R11, 50)
+	ROUND3(R11, R7, R8, R9, R10, 51)
+	ROUND3(R10, R11, R7, R8, R9, 52)
+	ROUND3(R9, R10, R11, R7, R8, 53)
+	ROUND3(R8, R9, R10, R11, R7, 54)
+	ROUND3(R7, R8, R9, R10, R11, 55)
+	ROUND3(R11, R7, R8, R9, R10, 56)
+	ROUND3(R10, R11, R7, R8, R9, 57)
+	ROUND3(R9, R10, R11, R7, R8, 58)
+	ROUND3(R8, R9, R10, R11, R7, 59)
+
+	ROUND4(R7, R8, R9, R10, R11, 60)
+	ROUND4(R11, R7, R8, R9, R10, 61)
+	ROUND4(R10, R11, R7, R8, R9, 62)
+	ROUND4(R9, R10, R11, R7, R8, 63)
+	ROUND4(R8, R9, R10, R11, R7, 64)
+	ROUND4(R7, R8, R9, R10, R11, 65)
+	ROUND4(R11, R7, R8, R9, R10, 66)
+	ROUND4(R10, R11, R7, R8, R9, 67)
+	ROUND4(R9, R10, R11, R7, R8, 68)
+	ROUND4(R8, R9, R10, R11, R7, 69)
+	ROUND4(R7, R8, R9, R10, R11, 70)
+	ROUND4(R11, R7, R8, R9, R10, 71)
+	ROUND4(R10, R11, R7, R8, R9, 72)
+	ROUND4(R9, R10, R11, R7, R8, 73)
+	ROUND4(R8, R9, R10, R11, R7, 74)
+	ROUND4(R7, R8, R9, R10, R11, 75)
+	ROUND4(R11, R7, R8, R9, R10, 76)
+	ROUND4(R10, R11, R7, R8, R9, 77)
+	ROUND4(R9, R10, R11, R7, R8, 78)
+	ROUND4(R8, R9, R10, R11, R7, 79)
+
+	// Mix the result of this block into the running state.
+	ADD	R12, R7
+	ADD	R13, R8
+	ADD	R14, R9
+	ADD	R15, R10
+	ADD	R16, R11
+
+	ADDV	$64, R5	// advance to the next 64-byte block
+	BNE	R5, R24, loop
+
+end:
+	MOVW	R7, (0*4)(R4)	// store the updated state back into dig
+	MOVW	R8, (1*4)(R4)
+	MOVW	R9, (2*4)(R4)
+	MOVW	R10, (3*4)(R4)
+	MOVW	R11, (4*4)(R4)
+zero:
+	RET
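
A quick smoke test through the package's public API, which this patch
does not change (a usage sketch, not part of the patch; the expected
digest is the well-known SHA-1 of "hello world"):

	package main

	import (
		"crypto/sha1"
		"fmt"
	)

	func main() {
		sum := sha1.Sum([]byte("hello world"))
		// The same value must come out of the generic Go code and of the
		// loong64 assembly path selected by the build tags above.
		fmt.Printf("%x\n", sum) // 2aae6c35c94fcfb415dbe95f408b9ce91ee846ed
	}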