Skip to content
Snippets Groups Projects
Commit 0c8615be authored by Xiaolin Zhao's avatar Xiaolin Zhao Committed by abner chenc
Browse files

crypto/sha512: improve performance of loong64

1. Replaced WORD with instruction REVBV.
2. Simplified the implementation of Ch and Maj by reducing instructions, refer to the implementation of riscv64.

goos: linux
goarch: loong64
pkg: crypto/sha512
cpu: Loongson-3A6000-HV @ 2500.00MHz
                  |  bench.old  |             bench.new              |
                  |   sec/op    |   sec/op     vs base               |
Hash8Bytes/New      415.6n ± 0%   398.9n ± 0%  -4.01% (p=0.000 n=10)
Hash8Bytes/Sum384   427.6n ± 0%   409.7n ± 0%  -4.20% (p=0.000 n=10)
Hash8Bytes/Sum512   432.1n ± 0%   415.3n ± 0%  -3.89% (p=0.000 n=10)
Hash1K/New          3.087µ ± 0%   2.931µ ± 0%  -5.05% (p=0.000 n=10)
Hash1K/Sum384       3.094µ ± 0%   2.938µ ± 0%  -5.04% (p=0.000 n=10)
Hash1K/Sum512       3.102µ ± 0%   2.946µ ± 0%  -5.01% (p=0.000 n=10)
Hash8K/New          21.81µ ± 0%   20.67µ ± 0%  -5.25% (p=0.000 n=10)
Hash8K/Sum384       21.81µ ± 0%   20.66µ ± 0%  -5.26% (p=0.000 n=10)
Hash8K/Sum512       21.82µ ± 0%   20.69µ ± 0%  -5.21% (p=0.000 n=10)
geomean             3.061µ        2.915µ       -4.77%

goos: linux
goarch: loong64
pkg: crypto/sha512
cpu: Loongson-3A5000 @ 2500.00MHz
                  |  bench.old  |             bench.new              |
                  |   sec/op    |   sec/op     vs base               |
Hash8Bytes/New      509.4n ± 0%   484.9n ± 0%  -4.79% (p=0.000 n=10)
Hash8Bytes/Sum384   522.9n ± 0%   498.2n ± 0%  -4.71% (p=0.000 n=10)
Hash8Bytes/Sum512   529.0n ± 0%   504.5n ± 0%  -4.63% (p=0.000 n=10)
Hash1K/New          3.578µ ± 0%   3.364µ ± 0%  -5.98% (p=0.000 n=10)
Hash1K/Sum384       3.593µ ± 0%   3.382µ ± 0%  -5.87% (p=0.000 n=10)
Hash1K/Sum512       3.599µ ± 0%   3.386µ ± 0%  -5.93% (p=0.000 n=10)
Hash8K/New          25.10µ ± 0%   23.56µ ± 0%  -6.14% (p=0.000 n=10)
Hash8K/Sum384       25.12µ ± 0%   23.58µ ± 0%  -6.13% (p=0.000 n=10)
Hash8K/Sum512       25.12µ ± 0%   23.59µ ± 0%  -6.12% (p=0.000 n=10)
geomean             3.607µ        3.405µ       -5.59%

Change-Id: I8307ea0fd2d474671f1eef2da2ba5fe899c645d5
Reviewed-on: https://go-review.googlesource.com/c/go/+/668835


Reviewed-by: default avatarabner chenc <chenguoqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: default avatarCarlos Amedee <carlos@golang.org>
Reviewed-by: default avatarCherry Mui <cherryyz@google.com>
parent 8d7c43b5
Branches
No related tags found
No related merge requests found
......@@ -18,7 +18,7 @@
// W[i] = M[i]; for 0 <= i <= 15
#define LOAD0(index) \
MOVV (index*8)(R5), REGTMP4; \
WORD $0x3ce7; \ //REVBV REGTMP4, REGTMP4
REVBV REGTMP4, REGTMP4; \
MOVV REGTMP4, (index*8)(R3)
// W[i] = SIGMA1(W[i-2]) + W[i-7] + SIGMA0(W[i-15]) + W[i-16]; for 16 <= i <= 79
......@@ -50,38 +50,37 @@
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + K[i] + W[i]
// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
// = ((y XOR z) AND x) XOR z
// Calculate T1 in REGTMP4
#define SHA512T1(const, e, f, g, h) \
ADDV $const, h; \
ADDV REGTMP4, h; \
ROTRV $14, e, REGTMP4; \
ROTRV $14, e, REGTMP5; \
ROTRV $18, e, REGTMP; \
ROTRV $41, e, REGTMP3; \
AND f, e, REGTMP2; \
XOR REGTMP, REGTMP4; \
MOVV $0xffffffffffffffff, REGTMP; \
XOR REGTMP4, REGTMP3; \
XOR REGTMP, e, REGTMP5; \
XOR f, g, REGTMP2; \
XOR REGTMP, REGTMP5; \
AND e, REGTMP2; \
XOR REGTMP5, REGTMP3; \
XOR g, REGTMP2; \
ADDV REGTMP3, h; \
AND g, REGTMP5; \
XOR REGTMP2, REGTMP5; \
ADDV h, REGTMP5, REGTMP4
ADDV h, REGTMP2, REGTMP4
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
// = ((y XOR z) AND x) XOR (y AND z)
// Calculate T2 in REGTMP1
#define SHA512T2(a, b, c) \
ROTRV $28, a, REGTMP5; \
AND b, c, REGTMP1; \
ROTRV $34, a, REGTMP3; \
AND c, a, REGTMP; \
XOR REGTMP3, REGTMP5; \
XOR REGTMP, REGTMP1; \
ROTRV $39, a, REGTMP2; \
AND a, b, REGTMP3; \
XOR REGTMP3, REGTMP1; \
XOR b, c, REGTMP; \
AND b, c, REGTMP1; \
XOR REGTMP3, REGTMP5; \
AND REGTMP, a, REGTMP; \
XOR REGTMP2, REGTMP5; \
XOR REGTMP, REGTMP1; \
ADDV REGTMP5, REGTMP1
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment