From 4f18477db6d6c67abb9dfbf0c9077fecfd7e5ba8 Mon Sep 17 00:00:00 2001
From: Joel Sing <joel@sing.id.au>
Date: Thu, 27 Jun 2024 20:02:57 +1000
Subject: [PATCH] math/big: implement subVW in riscv64 assembly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This provides an assembly implementation of subVW for riscv64,
processing up to four words per loop, resulting in a significant
performance gain.

On a StarFive VisionFive 2:

                  │   subvw.1    │               subvw.2               │
                  │    sec/op    │   sec/op     vs base                │
SubVW/1-4            57.43n ± 0%   41.45n ± 0%  -27.82% (p=0.000 n=10)
SubVW/2-4            69.31n ± 0%   48.15n ± 0%  -30.53% (p=0.000 n=10)
SubVW/3-4            76.12n ± 0%   54.87n ± 0%  -27.92% (p=0.000 n=10)
SubVW/4-4            85.47n ± 0%   56.14n ± 0%  -34.32% (p=0.000 n=10)
SubVW/5-4            96.15n ± 0%   62.83n ± 0%  -34.65% (p=0.000 n=10)
SubVW/10-4          149.60n ± 0%   89.55n ± 0%  -40.14% (p=0.000 n=10)
SubVW/100-4         1115.0n ± 0%   549.3n ± 0%  -50.74% (p=0.000 n=10)
SubVW/1000-4        10.732µ ± 0%   5.071µ ± 0%  -52.75% (p=0.000 n=10)
SubVW/10000-4        153.0µ ± 0%   103.7µ ± 0%  -32.21% (p=0.000 n=10)
SubVW/100000-4       1.542m ± 0%   1.046m ± 0%  -32.13% (p=0.000 n=10)
SubVWext/1-4         57.42n ± 0%   41.45n ± 0%  -27.81% (p=0.000 n=10)
SubVWext/2-4         69.33n ± 0%   48.15n ± 0%  -30.55% (p=0.000 n=10)
SubVWext/3-4         76.12n ± 0%   54.93n ± 0%  -27.84% (p=0.000 n=10)
SubVWext/4-4         85.47n ± 0%   56.14n ± 0%  -34.32% (p=0.000 n=10)
SubVWext/5-4         96.15n ± 0%   62.83n ± 0%  -34.65% (p=0.000 n=10)
SubVWext/10-4       149.60n ± 0%   89.56n ± 0%  -40.14% (p=0.000 n=10)
SubVWext/100-4      1115.0n ± 0%   549.3n ± 0%  -50.74% (p=0.000 n=10)
SubVWext/1000-4     10.732µ ± 0%   5.061µ ± 0%  -52.84% (p=0.000 n=10)
SubVWext/10000-4     152.5µ ± 0%   103.7µ ± 0%  -32.02% (p=0.000 n=10)
SubVWext/100000-4    1.533m ± 0%   1.046m ± 0%  -31.75% (p=0.000 n=10)
geomean              1.005µ        633.7n       -36.92%

                  │   subvw.1    │                subvw.2                 │
                  │     B/s      │      B/s       vs base                 │
SubVW/1-4           132.9Mi ± 0%    184.1Mi ± 0%   +38.54% (p=0.000 n=10)
SubVW/2-4           220.1Mi ± 0%    316.9Mi ± 0%   +43.95% (p=0.000 n=10)
SubVW/3-4           300.7Mi ± 0%    417.1Mi ± 0%   +38.72% (p=0.000 n=10)
SubVW/4-4           357.1Mi ± 0%    543.6Mi ± 0%   +52.24% (p=0.000 n=10)
SubVW/5-4           396.7Mi ± 0%    607.2Mi ± 0%   +53.03% (p=0.000 n=10)
SubVW/10-4          510.1Mi ± 0%    851.9Mi ± 0%   +67.01% (p=0.000 n=10)
SubVW/100-4         684.2Mi ± 0%   1388.9Mi ± 0%  +102.99% (p=0.000 n=10)
SubVW/1000-4        710.9Mi ± 0%   1504.5Mi ± 0%  +111.63% (p=0.000 n=10)
SubVW/10000-4       498.7Mi ± 0%    735.7Mi ± 0%   +47.52% (p=0.000 n=10)
SubVW/100000-4      494.8Mi ± 0%    729.1Mi ± 0%   +47.34% (p=0.000 n=10)
SubVWext/1-4        132.9Mi ± 0%    184.1Mi ± 0%   +38.53% (p=0.000 n=10)
SubVWext/2-4        220.1Mi ± 0%    316.9Mi ± 0%   +44.00% (p=0.000 n=10)
SubVWext/3-4        300.7Mi ± 0%    416.7Mi ± 0%   +38.57% (p=0.000 n=10)
SubVWext/4-4        357.1Mi ± 0%    543.6Mi ± 0%   +52.24% (p=0.000 n=10)
SubVWext/5-4        396.7Mi ± 0%    607.2Mi ± 0%   +53.04% (p=0.000 n=10)
SubVWext/10-4       510.1Mi ± 0%    851.9Mi ± 0%   +67.01% (p=0.000 n=10)
SubVWext/100-4      684.2Mi ± 0%   1388.9Mi ± 0%  +102.99% (p=0.000 n=10)
SubVWext/1000-4     710.9Mi ± 0%   1507.6Mi ± 0%  +112.07% (p=0.000 n=10)
SubVWext/10000-4    500.1Mi ± 0%    735.7Mi ± 0%   +47.10% (p=0.000 n=10)
SubVWext/100000-4   497.8Mi ± 0%    729.4Mi ± 0%   +46.52% (p=0.000 n=10)
geomean             387.6Mi         614.5Mi        +58.51%

Change-Id: I9d7fac719e977710ad9db9121fa298db6df605de
Reviewed-on: https://go-review.googlesource.com/c/go/+/595398
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 src/math/big/arith_riscv64.s | 59 +++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s
index 0029a5b6b44..e856d2e9c0e 100644
--- a/src/math/big/arith_riscv64.s
+++ b/src/math/big/arith_riscv64.s
@@ -234,7 +234,64 @@ done:
 	RET
 
 TEXT ·subVW(SB),NOSPLIT,$0
-	JMP ·subVW_g(SB)
+	MOV	x+24(FP), X5
+	MOV	y+48(FP), X6
+	MOV	z+0(FP), X7
+	MOV	z_len+8(FP), X30
+
+	MOV	$4, X28
+	MOV	X6, X29		// b = y
+
+	BEQZ	X30, done
+	BLTU	X30, X28, loop1
+
+loop4:
+	MOV	0(X5), X8	// x[0]
+	MOV	8(X5), X11	// x[1]
+	MOV	16(X5), X14	// x[2]
+	MOV	24(X5), X17	// x[3]
+
+	SUB	X29, X8, X10	// z[0] = x[0] - b
+	SLTU	X10, X8, X29	// next b
+
+	SUB	X29, X11, X13	// z[1] = x[1] - b
+	SLTU	X13, X11, X29	// next b
+
+	SUB	X29, X14, X16	// z[2] = x[2] - b
+	SLTU	X16, X14, X29	// next b
+
+	SUB	X29, X17, X19	// z[3] = x[3] - b
+	SLTU	X19, X17, X29	// next b
+
+	MOV	X10, 0(X7)	// z[0]
+	MOV	X13, 8(X7)	// z[1]
+	MOV	X16, 16(X7)	// z[2]
+	MOV	X19, 24(X7)	// z[3]
+
+	ADD	$32, X5
+	ADD	$32, X7
+	SUB	$4, X30
+
+	BGEU	X30, X28, loop4
+	BEQZ	X30, done
+
+loop1:
+	MOV	0(X5), X10	// x
+
+	SUB	X29, X10, X12	// z = x - b
+	SLTU	X12, X10, X29	// next b
+
+	MOV	X12, 0(X7)	// z
+
+	ADD	$8, X5
+	ADD	$8, X7
+	SUB	$1, X30
+
+	BNEZ	X30, loop1
+
+done:
+	MOV	X29, c+56(FP)	// return b
+	RET
 
 TEXT ·shlVU(SB),NOSPLIT,$0
 	JMP ·shlVU_g(SB)
-- 
GitLab