diff --git a/src/crypto/sha256/sha256block_decl.go b/src/crypto/sha256/sha256block_decl.go
index c9c11944876d49d4c2e4efdee5982d555586729d..18ba1c0ec121d10e450c0d139f370de7fd469ab8 100644
--- a/src/crypto/sha256/sha256block_decl.go
+++ b/src/crypto/sha256/sha256block_decl.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build 386 || amd64 || s390x || ppc64le
+//go:build 386 || amd64 || s390x || ppc64le || ppc64

 package sha256

diff --git a/src/crypto/sha256/sha256block_generic.go b/src/crypto/sha256/sha256block_generic.go
index a8878c2eeea21fe34f4ebc84de8e9ad4066582b0..fd098bec894be5215b803627d39861984765391f 100644
--- a/src/crypto/sha256/sha256block_generic.go
+++ b/src/crypto/sha256/sha256block_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build !amd64 && !386 && !s390x && !ppc64le && !arm64
+//go:build !amd64 && !386 && !s390x && !ppc64le && !ppc64 && !arm64

 package sha256

diff --git a/src/crypto/sha256/sha256block_ppc64le.s b/src/crypto/sha256/sha256block_ppc64x.s
similarity index 92%
rename from src/crypto/sha256/sha256block_ppc64le.s
rename to src/crypto/sha256/sha256block_ppc64x.s
index 77e63c073fd4457a5fcb1e3c68768a6b42b24cf2..617d42e1d735fc0be5808f8893dfca2562236b92 100644
--- a/src/crypto/sha256/sha256block_ppc64le.s
+++ b/src/crypto/sha256/sha256block_ppc64x.s
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+//go:build ppc64 || ppc64le
+
 // Based on CRYPTOGAMS code with the following comment:
 // # ====================================================================
 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -57,19 +59,11 @@
 #define END	R5
 #define TBL	R6
 #define IDX	R7
-#define CNT	R8
 #define LEN	R9
-#define OFFLOAD	R11
 #define TEMP	R12

 #define HEX00	R0
 #define HEX10	R10
-#define HEX20	R25
-#define HEX30	R26
-#define HEX40	R27
-#define HEX50	R28
-#define HEX60	R29
-#define HEX70	R31

 // V0-V7 are A-H
 // V8-V23 are used for the message schedule
@@ -212,12 +206,23 @@ DATA ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
 DATA ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
 DATA ·kcon+0x400(SB)/8, $0x0000000000000000
 DATA ·kcon+0x408(SB)/8, $0x0000000000000000
+
+#ifdef GOARCH_ppc64le
 DATA ·kcon+0x410(SB)/8, $0x1011121310111213	// permutation control vectors
 DATA ·kcon+0x418(SB)/8, $0x1011121300010203
 DATA ·kcon+0x420(SB)/8, $0x1011121310111213
 DATA ·kcon+0x428(SB)/8, $0x0405060700010203
 DATA ·kcon+0x430(SB)/8, $0x1011121308090a0b
 DATA ·kcon+0x438(SB)/8, $0x0405060700010203
+#else
+DATA ·kcon+0x410(SB)/8, $0x1011121300010203
+DATA ·kcon+0x418(SB)/8, $0x1011121310111213	// permutation control vectors
+DATA ·kcon+0x420(SB)/8, $0x0405060700010203
+DATA ·kcon+0x428(SB)/8, $0x1011121310111213
+DATA ·kcon+0x430(SB)/8, $0x0001020304050607
+DATA ·kcon+0x438(SB)/8, $0x08090a0b10111213
+#endif
+
 GLOBL ·kcon(SB), RODATA, $1088

 #define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
@@ -257,36 +262,34 @@ GLOBL ·kcon(SB), RODATA, $1088
 	VADDUWM	S0, h, h; \
 	VADDUWM	s1, xj, xj

+#ifdef GOARCH_ppc64le
+#define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
+#else
+#define VPERMLE(va,vb,vc,vt)
+#endif
+
 // func block(dig *digest, p []byte)
-TEXT ·block(SB),0,$128-32
+TEXT ·block(SB),0,$0-32
 	MOVD	dig+0(FP), CTX
 	MOVD	p_base+8(FP), INP
 	MOVD	p_len+16(FP), LEN

 	SRD	$6, LEN
 	SLD	$6, LEN
-
 	ADD	INP, LEN, END

 	CMP	INP, END
 	BEQ	end

 	MOVD	$·kcon(SB), TBL
-	MOVD	R1, OFFLOAD
-
-	MOVD	R0, CNT
 	MOVWZ	$0x10, HEX10
-	MOVWZ	$0x20, HEX20
-	MOVWZ	$0x30, HEX30
-	MOVWZ	$0x40, HEX40
-	MOVWZ	$0x50, HEX50
-	MOVWZ	$0x60, HEX60
-	MOVWZ	$0x70, HEX70
-
 	MOVWZ	$8, IDX
+
+#ifdef GOARCH_ppc64le
 	LVSL	(IDX)(R0), LEMASK
 	VSPLTISB	$0x0F, KI
 	VXOR	KI, LEMASK, LEMASK
+#endif

 	LXVW4X	(CTX)(HEX00), VS32	// v0 = vs32
 	LXVW4X	(CTX)(HEX10), VS36	// v4 = vs36
@@ -306,20 +309,21 @@ loop:
 	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
 	ADD	$16, INP

-	STVX	V0, (OFFLOAD+HEX00)
-	STVX	V1, (OFFLOAD+HEX10)
-	STVX	V2, (OFFLOAD+HEX20)
-	STVX	V3, (OFFLOAD+HEX30)
-	STVX	V4, (OFFLOAD+HEX40)
-	STVX	V5, (OFFLOAD+HEX50)
-	STVX	V6, (OFFLOAD+HEX60)
-	STVX	V7, (OFFLOAD+HEX70)
+	// Offload to VSR24-31 (aka FPR24-31)
+	XXLOR	V0, V0, VS24
+	XXLOR	V1, V1, VS25
+	XXLOR	V2, V2, VS26
+	XXLOR	V3, V3, VS27
+	XXLOR	V4, V4, VS28
+	XXLOR	V5, V5, VS29
+	XXLOR	V6, V6, VS30
+	XXLOR	V7, V7, VS31

 	VADDUWM	KI, V7, V7	// h+K[i]
 	LVX	(TBL)(IDX), KI
 	ADD	$16, IDX

-	VPERM	V8, V8, LEMASK, V8
+	VPERMLE(V8, V8, LEMASK, V8)
 	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
 	VSLDOI	$4, V8, V8, V9
 	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
@@ -329,7 +333,7 @@ loop:
 	ADD	$16, INP, INP
 	VSLDOI	$4, V10, V10, V11
 	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
-	VPERM	V12, V12, LEMASK, V12
+	VPERMLE(V12, V12, LEMASK, V12)
 	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
 	VSLDOI	$4, V12, V12, V13
 	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
@@ -339,7 +343,7 @@ loop:
 	ADD	$16, INP, INP
 	VSLDOI	$4, V14, V14, V15
 	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
-	VPERM	V16, V16, LEMASK, V16
+	VPERMLE(V16, V16, LEMASK, V16)
 	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
 	VSLDOI	$4, V16, V16, V17
 	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
@@ -349,7 +353,7 @@ loop:
 	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
 	ADD	$16, INP, INP
 	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
-	VPERM	V20, V20, LEMASK, V20
+	VPERMLE(V20, V20, LEMASK, V20)
 	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
 	VSLDOI	$4, V20, V20, V21
 	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
@@ -381,21 +385,21 @@ L16_xx:
 	BC	0x10, 0, L16_xx		// bdnz

-	LVX	(OFFLOAD)(HEX00), V10
+	XXLOR	VS24, VS24, V10

-	LVX	(OFFLOAD)(HEX10), V11
+	XXLOR	VS25, VS25, V11
 	VADDUWM	V10, V0, V0
-	LVX	(OFFLOAD)(HEX20), V12
+	XXLOR	VS26, VS26, V12
 	VADDUWM	V11, V1, V1
-	LVX	(OFFLOAD)(HEX30), V13
+	XXLOR	VS27, VS27, V13
 	VADDUWM	V12, V2, V2
-	LVX	(OFFLOAD)(HEX40), V14
+	XXLOR	VS28, VS28, V14
 	VADDUWM	V13, V3, V3
-	LVX	(OFFLOAD)(HEX50), V15
+	XXLOR	VS29, VS29, V15
 	VADDUWM	V14, V4, V4
-	LVX	(OFFLOAD)(HEX60), V16
+	XXLOR	VS30, VS30, V16
 	VADDUWM	V15, V5, V5
-	LVX	(OFFLOAD)(HEX70), V17
+	XXLOR	VS31, VS31, V17
 	VADDUWM	V16, V6, V6
 	VADDUWM	V17, V7, V7
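
Two illustrative Go sketches follow. Neither is part of the CL, and the names in them are invented for illustration.

First, the endianness invariant behind VPERMLE: SHA-256 consumes its message block as big-endian 32-bit words. On ppc64 the vector load already yields that layout, so VPERMLE expands to nothing; on ppc64le the LEMASK VPERM performs the byte reshuffle. (The same split is why kcon now carries two sets of permutation control vectors.) In Go terms, both builds must agree with:

package main

import (
	"encoding/binary"
	"fmt"
)

// beWords mirrors what the assembly's message loads must produce on
// either byte order: each 4-byte chunk of the block interpreted as a
// big-endian 32-bit word.
func beWords(block [16]byte) [4]uint32 {
	var w [4]uint32
	for i := range w {
		w[i] = binary.BigEndian.Uint32(block[4*i : 4*i+4])
	}
	return w
}

func main() {
	fmt.Printf("%08x\n", beWords([16]byte{'a', 'b', 'c', 'd'}))
}

Second, a minimal end-to-end check that exercises ·block through the public API on both GOARCHes (the test name is hypothetical; the digest is the standard FIPS 180 vector for "abc"). A wrong kcon permutation entry, or a byte swap missing on either endianness, would change it:

package sha256_test

import (
	"crypto/sha256"
	"encoding/hex"
	"testing"
)

func TestABCVector(t *testing.T) {
	got := sha256.Sum256([]byte("abc"))
	const want = "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
	if hex.EncodeToString(got[:]) != want {
		t.Fatalf("Sum256(\"abc\") = %x, want %s", got, want)
	}
}

Note also that with the per-block working state offloaded to VS24-VS31 instead of the stack, the frame size in the TEXT directive drops from $128 to $0.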