Skip to content
Snippets Groups Projects
Commit 2cee5d81 authored by Joel Sing's avatar Joel Sing
Browse files

math/big: implement addMulVVW in riscv64 assembly

This provides an assembly implementation of addMulVVW for riscv64,
processing up to four words per loop, resulting in a significant
performance gain.

On a StarFive VisionFive 2:

                   │ addmulvvw.1  │             addmulvvw.2             │
                   │    sec/op    │   sec/op     vs base                │
AddMulVVW/1-4         65.49n ± 0%   50.79n ± 0%  -22.44% (p=0.000 n=10)
AddMulVVW/2-4         82.81n ± 0%   66.83n ± 0%  -19.29% (p=0.000 n=10)
AddMulVVW/3-4        100.20n ± 0%   82.87n ± 0%  -17.30% (p=0.000 n=10)
AddMulVVW/4-4        117.50n ± 0%   84.20n ± 0%  -28.34% (p=0.000 n=10)
AddMulVVW/5-4         134.9n ± 0%   100.3n ± 0%  -25.69% (p=0.000 n=10)
AddMulVVW/10-4        221.7n ± 0%   164.4n ± 0%  -25.85% (p=0.000 n=10)
AddMulVVW/100-4       1.794µ ± 0%   1.250µ ± 0%  -30.32% (p=0.000 n=10)
AddMulVVW/1000-4      17.42µ ± 0%   12.08µ ± 0%  -30.68% (p=0.000 n=10)
AddMulVVW/10000-4     254.9µ ± 0%   214.8µ ± 0%  -15.75% (p=0.000 n=10)
AddMulVVW/100000-4    2.569m ± 0%   2.178m ± 0%  -15.20% (p=0.000 n=10)
geomean               1.443µ        1.107µ       -23.29%

                   │ addmulvvw.1  │              addmulvvw.2              │
                   │     B/s      │      B/s       vs base                │
AddMulVVW/1-4        932.0Mi ± 0%   1201.6Mi ± 0%  +28.93% (p=0.000 n=10)
AddMulVVW/2-4        1.440Gi ± 0%    1.784Gi ± 0%  +23.90% (p=0.000 n=10)
AddMulVVW/3-4        1.785Gi ± 0%    2.158Gi ± 0%  +20.87% (p=0.000 n=10)
AddMulVVW/4-4        2.029Gi ± 0%    2.832Gi ± 0%  +39.59% (p=0.000 n=10)
AddMulVVW/5-4        2.209Gi ± 0%    2.973Gi ± 0%  +34.55% (p=0.000 n=10)
AddMulVVW/10-4       2.689Gi ± 0%    3.626Gi ± 0%  +34.86% (p=0.000 n=10)
AddMulVVW/100-4      3.323Gi ± 0%    4.770Gi ± 0%  +43.54% (p=0.000 n=10)
AddMulVVW/1000-4     3.421Gi ± 0%    4.936Gi ± 0%  +44.27% (p=0.000 n=10)
AddMulVVW/10000-4    2.338Gi ± 0%    2.776Gi ± 0%  +18.69% (p=0.000 n=10)
AddMulVVW/100000-4   2.320Gi ± 0%    2.736Gi ± 0%  +17.93% (p=0.000 n=10)
geomean              2.109Gi         2.749Gi       +30.36%

Change-Id: I6c7ee48233c53ff9b6a5a9002675886cd9bff5af
Reviewed-on: https://go-review.googlesource.com/c/go/+/595400


Reviewed-by: default avatarMeng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: default avatarCherry Mui <cherryyz@google.com>
Reviewed-by: default avatarDmitri Shuralyov <dmitshur@google.com>
Reviewed-by: default avatarMark Ryan <markdryan@rivosinc.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
parent 86cd5c40
No related branches found
No related tags found
No related merge requests found
......@@ -375,5 +375,96 @@ done:
RET
TEXT ·addMulVVW(SB),NOSPLIT,$0
JMP ·addMulVVW_g(SB)
MOV x+24(FP), X5
MOV y+48(FP), X6
MOV z+0(FP), X7
MOV z_len+8(FP), X30
MOV $4, X28
MOV $0, X29 // c = 0
BEQZ X30, done
BLTU X30, X28, loop1
loop4:
MOV 0(X5), X8 // x[0]
MOV 0(X7), X10 // z[0]
MOV 8(X5), X11 // x[1]
MOV 8(X7), X13 // z[1]
MOV 16(X5), X14 // x[2]
MOV 16(X7), X16 // z[2]
MOV 24(X5), X17 // x[3]
MOV 24(X7), X19 // z[3]
MULHU X8, X6, X9 // z_hi[0] = x[0] * y
MUL X8, X6, X8 // z_lo[0] = x[0] * y
ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0]
SLTU X8, X21, X22
ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0]
ADD X21, X29, X10 // z[0] = x[0] * y + z[0] + c
SLTU X21, X10, X22
ADD X9, X22, X29 // next c
MULHU X11, X6, X12 // z_hi[1] = x[1] * y
MUL X11, X6, X11 // z_lo[1] = x[1] * y
ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1]
SLTU X11, X21, X22
ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1]
ADD X21, X29, X13 // z[1] = x[1] * y + z[1] + c
SLTU X21, X13, X22
ADD X12, X22, X29 // next c
MULHU X14, X6, X15 // z_hi[2] = x[2] * y
MUL X14, X6, X14 // z_lo[2] = x[2] * y
ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2]
SLTU X14, X21, X22
ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2]
ADD X21, X29, X16 // z[2] = x[2] * y + z[2] + c
SLTU X21, X16, X22
ADD X15, X22, X29 // next c
MULHU X17, X6, X18 // z_hi[3] = x[3] * y
MUL X17, X6, X17 // z_lo[3] = x[3] * y
ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3]
SLTU X17, X21, X22
ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3]
ADD X21, X29, X19 // z[3] = x[3] * y + z[3] + c
SLTU X21, X19, X22
ADD X18, X22, X29 // next c
MOV X10, 0(X7) // z[0]
MOV X13, 8(X7) // z[1]
MOV X16, 16(X7) // z[2]
MOV X19, 24(X7) // z[3]
ADD $32, X5
ADD $32, X7
SUB $4, X30
BGEU X30, X28, loop4
BEQZ X30, done
loop1:
MOV 0(X5), X10 // x
MOV 0(X7), X11 // z
MULHU X10, X6, X12 // z_hi = x * y
MUL X10, X6, X10 // z_lo = x * y
ADD X10, X11, X13 // z_lo = x * y + z
SLTU X10, X13, X15
ADD X12, X15, X12 // z_hi = x * y + z
ADD X13, X29, X10 // z = x * y + z + c
SLTU X13, X10, X15
ADD X12, X15, X29 // next c
MOV X10, 0(X7) // z
ADD $8, X5
ADD $8, X7
SUB $1, X30
BNEZ X30, loop1
done:
MOV X29, c+56(FP) // return c
RET
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment