From c6f56985ade937ab4221b8b470139a0ebc4a1825 Mon Sep 17 00:00:00 2001 From: Joel Sing <joel@sing.id.au> Date: Thu, 27 Jun 2024 20:02:38 +1000 Subject: [PATCH] math/big: implement addVW in riscv64 assembly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This provides an assembly implementation of addVW for riscv64, processing up to four words per loop, resulting in a significant performance gain. On a StarFive VisionFive 2: │ addvw.1 │ addvw.2 │ │ sec/op │ sec/op vs base │ AddVW/1-4 57.43n ± 0% 41.45n ± 0% -27.83% (p=0.000 n=10) AddVW/2-4 69.31n ± 0% 48.15n ± 0% -30.53% (p=0.000 n=10) AddVW/3-4 76.12n ± 0% 54.97n ± 0% -27.79% (p=0.000 n=10) AddVW/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) AddVW/5-4 96.16n ± 0% 62.82n ± 0% -34.67% (p=0.000 n=10) AddVW/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) AddVW/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) AddVW/1000-4 10.732µ ± 0% 5.060µ ± 0% -52.85% (p=0.000 n=10) AddVW/10000-4 151.7µ ± 0% 103.7µ ± 0% -31.63% (p=0.000 n=10) AddVW/100000-4 1.523m ± 0% 1.050m ± 0% -31.03% (p=0.000 n=10) AddVWext/1-4 57.42n ± 0% 41.45n ± 0% -27.81% (p=0.000 n=10) AddVWext/2-4 69.32n ± 0% 48.15n ± 0% -30.54% (p=0.000 n=10) AddVWext/3-4 76.12n ± 0% 54.87n ± 0% -27.92% (p=0.000 n=10) AddVWext/4-4 85.47n ± 0% 56.14n ± 0% -34.32% (p=0.000 n=10) AddVWext/5-4 96.15n ± 0% 62.82n ± 0% -34.66% (p=0.000 n=10) AddVWext/10-4 149.60n ± 0% 89.55n ± 0% -40.14% (p=0.000 n=10) AddVWext/100-4 1115.0n ± 0% 549.3n ± 0% -50.74% (p=0.000 n=10) AddVWext/1000-4 10.732µ ± 0% 5.060µ ± 0% -52.85% (p=0.000 n=10) AddVWext/10000-4 150.5µ ± 0% 103.7µ ± 0% -31.10% (p=0.000 n=10) AddVWext/100000-4 1.530m ± 0% 1.049m ± 0% -31.41% (p=0.000 n=10) geomean 1.003µ 633.9n -36.79% │ addvw.1 │ addvw.2 │ │ B/s │ B/s vs base │ AddVW/1-4 132.8Mi ± 0% 184.1Mi ± 0% +38.55% (p=0.000 n=10) AddVW/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.96% (p=0.000 n=10) AddVW/3-4 300.7Mi ± 0% 416.4Mi ± 0% +38.48% (p=0.000 n=10) AddVW/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.25% (p=0.000 n=10) AddVW/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.06% (p=0.000 n=10) AddVW/10-4 510.1Mi ± 0% 852.0Mi ± 0% +67.02% (p=0.000 n=10) AddVW/100-4 684.1Mi ± 0% 1389.0Mi ± 0% +103.03% (p=0.000 n=10) AddVW/1000-4 710.9Mi ± 0% 1507.8Mi ± 0% +112.08% (p=0.000 n=10) AddVW/10000-4 503.1Mi ± 0% 735.8Mi ± 0% +46.26% (p=0.000 n=10) AddVW/100000-4 501.0Mi ± 0% 726.5Mi ± 0% +45.00% (p=0.000 n=10) AddVWext/1-4 132.9Mi ± 0% 184.1Mi ± 0% +38.55% (p=0.000 n=10) AddVWext/2-4 220.1Mi ± 0% 316.9Mi ± 0% +43.98% (p=0.000 n=10) AddVWext/3-4 300.7Mi ± 0% 417.1Mi ± 0% +38.73% (p=0.000 n=10) AddVWext/4-4 357.1Mi ± 0% 543.6Mi ± 0% +52.25% (p=0.000 n=10) AddVWext/5-4 396.7Mi ± 0% 607.2Mi ± 0% +53.05% (p=0.000 n=10) AddVWext/10-4 510.1Mi ± 0% 852.0Mi ± 0% +67.02% (p=0.000 n=10) AddVWext/100-4 684.2Mi ± 0% 1389.0Mi ± 0% +103.02% (p=0.000 n=10) AddVWext/1000-4 710.9Mi ± 0% 1507.7Mi ± 0% +112.08% (p=0.000 n=10) AddVWext/10000-4 506.9Mi ± 0% 735.8Mi ± 0% +45.15% (p=0.000 n=10) AddVWext/100000-4 498.6Mi ± 0% 727.0Mi ± 0% +45.79% (p=0.000 n=10) geomean 388.3Mi 614.3Mi +58.19% Change-Id: Ib14a4b8c1d81e710753bbf6dd5546bbca44fe3f1 Reviewed-on: https://go-review.googlesource.com/c/go/+/595397 Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> --- src/math/big/arith_riscv64.s | 59 +++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s index 897d08229ee..0029a5b6b44 100644 --- a/src/math/big/arith_riscv64.s +++ b/src/math/big/arith_riscv64.s @@ -174,7 +174,64 @@ done: RET TEXT ·addVW(SB),NOSPLIT,$0 - JMP ·addVW_g(SB) + MOV x+24(FP), X5 + MOV y+48(FP), X6 + MOV z+0(FP), X7 + MOV z_len+8(FP), X30 + + MOV $4, X28 + MOV X6, X29 // c = y + + BEQZ X30, done + BLTU X30, X28, loop1 + +loop4: + MOV 0(X5), X8 // x[0] + MOV 8(X5), X11 // x[1] + MOV 16(X5), X14 // x[2] + MOV 24(X5), X17 // x[3] + + ADD X8, X29, X10 // z[0] = x[0] + c + SLTU X8, X10, X29 // next c + + ADD X11, X29, X13 // z[1] = x[1] + c + SLTU X11, X13, X29 // next c + + ADD X14, X29, X16 // z[2] = x[2] + c + SLTU X14, X16, X29 // next c + + ADD X17, X29, X19 // z[3] = x[3] + c + SLTU X17, X19, X29 // next c + + MOV X10, 0(X7) // z[0] + MOV X13, 8(X7) // z[1] + MOV X16, 16(X7) // z[2] + MOV X19, 24(X7) // z[3] + + ADD $32, X5 + ADD $32, X7 + SUB $4, X30 + + BGEU X30, X28, loop4 + BEQZ X30, done + +loop1: + MOV 0(X5), X10 // x + + ADD X10, X29, X12 // z = x + c + SLTU X10, X12, X29 // next c + + MOV X12, 0(X7) // z + + ADD $8, X5 + ADD $8, X7 + SUB $1, X30 + + BNEZ X30, loop1 + +done: + MOV X29, c+56(FP) // return c + RET TEXT ·subVW(SB),NOSPLIT,$0 JMP ·subVW_g(SB) -- GitLab