From a4d0269a4f74e8a66ee7487491857e1a1c582231 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Fri, 11 Apr 2025 08:54:58 -0400
Subject: [PATCH] math/big: use clearer loop bounds check elimination
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Checking that the lengths are equal and panicking teaches the compiler
that it can assume “i in range for z” implies “i in range for x”, letting us
simplify the actual loops a bit.

It also turns up a few places in math/big that were playing maybe a little
too fast and loose with slice lengths. Update those to explicitly set all the
input slices to the same length.

These speedups are basically irrelevant, since they only happen
in real code if people are compiling with -tags math_big_pure_go.
But at least the code is clearer.

benchmark \ system                   c3h88    c2s16       s7      386   s7-386   c4as16      mac      arm  loong64  ppc64le  riscv64    s390x
AddVV/words=1/impl=go                    ~  +11.20%   +5.11%   -7.67%   -7.77%   +1.90%  +10.76%  -33.22%        ~  +10.98%        ~   +6.60%
AddVV/words=10/impl=go             -22.12%  -13.48%  -10.37%  -17.95%  -18.07%  -24.58%  -22.04%  -29.95%  -14.22%        ~   -6.33%   +3.66%
AddVV/words=16/impl=go              -9.75%  -13.73%        ~  -21.90%  -18.66%  -30.03%  -20.45%  -28.09%  -17.33%   -7.15%   -8.96%  +12.55%
AddVV/words=100/impl=go             -5.91%   -1.02%        ~  -29.23%  -22.18%  -25.62%   -6.49%  -23.59%  -22.31%   -1.88%  -14.13%   +9.23%
AddVV/words=1000/impl=go            -0.52%   -0.19%   -3.58%  -33.89%  -23.46%  -22.46%        ~  -24.00%  -24.73%   +0.93%  -15.79%  +12.32%
AddVV/words=10000/impl=go                ~        ~        ~  -33.79%  -23.72%  -23.79%   -5.98%  -23.92%        ~   +0.78%  -15.45%   +8.59%
AddVV/words=100000/impl=go               ~        ~        ~  -33.90%  -24.25%  -22.82%   -4.09%  -24.63%        ~   +1.00%  -13.56%        ~
SubVV/words=1/impl=go                    ~  +11.64%  +14.05%        ~   -4.07%        ~  +10.79%  -33.69%        ~        ~   +3.89%  +12.33%
SubVV/words=10/impl=go             -10.31%  -14.09%   -7.38%  +13.76%  -13.25%  -18.05%  -20.08%  -24.97%  -14.15%  +10.13%   -0.97%   -2.51%
SubVV/words=16/impl=go              -8.06%  -13.73%   -5.70%  +17.00%  -12.83%  -23.76%  -17.52%  -25.25%  -17.30%   -2.80%   -4.96%  -18.25%
SubVV/words=100/impl=go             -9.22%   -1.30%   -2.76%  +20.88%  -14.35%  -15.29%   -8.49%  -19.64%  -22.31%   -0.68%  -14.30%   -9.04%
SubVV/words=1000/impl=go            -0.60%        ~   -3.43%  +23.08%  -16.14%  -11.96%        ~  -28.52%  -24.73%        ~  -15.95%   -9.91%
SubVV/words=10000/impl=go                ~        ~        ~  +26.01%  -15.24%  -11.92%        ~  -28.26%   +4.25%        ~  -15.42%   -5.95%
SubVV/words=100000/impl=go               ~        ~        ~  +25.71%  -15.83%  -12.13%        ~  -27.88%   -1.27%        ~  -13.57%   -6.72%
LshVU/words=1/impl=go               +0.56%   +0.36%        ~        ~        ~        ~        ~        ~        ~        ~        ~        ~
LshVU/words=10/impl=go             +13.37%   +4.63%        ~        ~        ~        ~        ~   -2.90%        ~        ~        ~        ~
LshVU/words=16/impl=go             +22.83%   +6.47%        ~        ~        ~        ~        ~        ~   +0.80%        ~        ~   +5.88%
LshVU/words=100/impl=go             +7.56%  +13.95%        ~        ~        ~        ~        ~        ~   +0.33%   -2.50%        ~        ~
LshVU/words=1000/impl=go            +0.64%  +17.92%        ~        ~        ~        ~        ~   -6.52%        ~   -2.58%        ~        ~
LshVU/words=10000/impl=go                ~  +17.60%        ~        ~        ~        ~        ~   -6.64%   -6.22%   -1.40%        ~        ~
LshVU/words=100000/impl=go               ~  +14.57%        ~        ~        ~        ~        ~        ~   -5.47%        ~        ~        ~
RshVU/words=1/impl=go                    ~        ~        ~        ~        ~        ~        ~        ~        ~        ~        ~   +2.72%
RshVU/words=10/impl=go                   ~        ~        ~        ~        ~        ~        ~   +2.50%        ~        ~        ~        ~
RshVU/words=16/impl=go                   ~   +0.53%        ~        ~        ~        ~        ~   +3.82%        ~        ~        ~        ~
RshVU/words=100/impl=go                  ~        ~        ~        ~        ~        ~        ~   +6.18%        ~        ~        ~        ~
RshVU/words=1000/impl=go                 ~        ~        ~        ~        ~        ~        ~   +7.00%        ~        ~        ~        ~
RshVU/words=10000/impl=go                ~        ~        ~        ~        ~        ~        ~        ~        ~        ~        ~        ~
RshVU/words=100000/impl=go               ~        ~        ~        ~        ~        ~        ~   +7.05%        ~        ~        ~        ~
MulAddVWW/words=1/impl=go          -10.34%   +4.43%  +10.62%   -1.62%   -4.74%   -2.86%  +11.75%        ~   -8.00%   +8.89%   +3.87%        ~
MulAddVWW/words=10/impl=go          -1.61%   -5.87%        ~   -8.30%   -4.55%   +0.87%        ~   -5.28%  -20.82%        ~        ~   -2.32%
MulAddVWW/words=16/impl=go          -2.96%   -5.28%        ~   -9.22%   -5.28%        ~        ~   -3.74%  -19.52%   -1.48%   -2.53%   -9.52%
MulAddVWW/words=100/impl=go         -3.89%   -7.53%   +1.93%  -10.49%   -4.87%   -8.27%        ~        ~   -0.65%   -0.61%   -7.59%  -20.61%
MulAddVWW/words=1000/impl=go        -0.45%   -3.91%   +4.54%  -11.46%   -4.69%   -8.53%        ~        ~   -0.05%        ~   -8.88%  -19.77%
MulAddVWW/words=10000/impl=go            ~   -3.30%   +4.10%  -11.34%   -4.10%   -9.43%        ~   -0.61%        ~   -0.55%   -8.21%  -18.48%
MulAddVWW/words=100000/impl=go      -0.30%   -3.03%   +4.31%  -11.55%   -4.41%   -9.74%        ~   -0.75%   +0.63%        ~   -7.80%  -19.82%
AddMulVVWW/words=1/impl=go               ~  +13.09%  +12.50%   -7.05%  -10.41%   +2.53%  +13.32%   -3.49%        ~  +15.56%   +3.62%        ~
AddMulVVWW/words=10/impl=go        -15.96%   -9.06%   -5.06%  -14.56%  -11.83%   -5.44%  -26.30%  -14.23%  -11.44%   -1.79%   -5.93%   -6.60%
AddMulVVWW/words=16/impl=go        -19.05%  -12.43%   -6.19%  -14.24%  -12.67%   -8.65%  -18.64%  -16.56%  -10.64%   -3.00%   -7.61%  -12.80%
AddMulVVWW/words=100/impl=go       -22.13%  -16.59%  -13.04%  -13.79%  -11.46%  -12.01%   -6.46%  -21.80%   -5.08%   -3.13%  -13.60%  -22.53%
AddMulVVWW/words=1000/impl=go      -17.07%  -17.05%  -14.08%  -13.59%  -12.13%  -11.21%        ~  -22.81%   -4.27%   -1.27%  -16.35%  -23.47%
AddMulVVWW/words=10000/impl=go     -15.03%  -16.78%  -14.23%  -13.86%  -11.84%  -11.69%        ~  -22.75%  -13.39%   -1.10%  -14.37%  -22.01%
AddMulVVWW/words=100000/impl=go    -13.70%  -14.90%  -14.26%  -13.55%  -12.04%  -11.63%        ~  -22.61%        ~   -2.53%  -10.42%  -23.16%

Change-Id: Ic6f64344484a762b818c7090d1396afceb638607
Reviewed-on: https://go-review.googlesource.com/c/go/+/665155
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
---
 src/math/big/arith.go  | 67 ++++++++++++++++++++++++++----------------
 src/math/big/nat.go    |  6 ++--
 src/math/big/natdiv.go |  4 +--
 3 files changed, 47 insertions(+), 30 deletions(-)

diff --git a/src/math/big/arith.go b/src/math/big/arith.go
index e2cd99f602b..bc27ca6a562 100644
--- a/src/math/big/arith.go
+++ b/src/math/big/arith.go
@@ -26,17 +26,13 @@ const (
 	_M = _B - 1        // digit mask
 )
 
-// Many of the loops in this file are of the form
-//   for i := 0; i < len(z) && i < len(x) && i < len(y); i++
-// i < len(z) is the real condition.
-// However, checking i < len(x) && i < len(y) as well is faster than
-// having the compiler do a bounds check in the body of the loop;
-// remarkably it is even faster than hoisting the bounds check
-// out of the loop, by doing something like
-//   _, _ = x[len(z)-1], y[len(z)-1]
-// There are other ways to hoist the bounds check out of the loop,
-// but the compiler's BCE isn't powerful enough for them (yet?).
-// See the discussion in CL 164966.
+// In these routines, it is the caller's responsibility to arrange for
+// x, y, and z to all have the same length. We check this and panic if the
+// lengths differ. The assembly versions of these routines omit that check.
+//
+// The check+panic also has the effect of teaching the compiler that
+// “i in range for z” implies “i in range for x and y”, eliminating all
+// bounds checks in loops from 0 to len(z) and vice versa.
 
 // ----------------------------------------------------------------------------
 // Elementary operations on words
@@ -65,8 +61,11 @@ func nlz(x Word) uint {
 
 // The resulting carry c is either 0 or 1.
 func addVV_g(z, x, y []Word) (c Word) {
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
+	if len(x) != len(z) || len(y) != len(z) {
+		panic("addVV len")
+	}
+
+	for i := range z {
 		zi, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c))
 		z[i] = Word(zi)
 		c = Word(cc)
@@ -76,8 +75,11 @@ func addVV_g(z, x, y []Word) (c Word) {
 
 // The resulting carry c is either 0 or 1.
 func subVV_g(z, x, y []Word) (c Word) {
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
+	if len(x) != len(z) || len(y) != len(z) {
+		panic("subVV len")
+	}
+
+	for i := range z {
 		zi, cc := bits.Sub(uint(x[i]), uint(y[i]), uint(c))
 		z[i] = Word(zi)
 		c = Word(cc)
@@ -99,7 +101,10 @@ func subVV_g(z, x, y []Word) (c Word) {
 //
 //go:linkname addVW
 func addVW(z, x []Word, y Word) (c Word) {
-	x = x[:len(z)]
+	if len(x) != len(z) {
+		panic("addVW len")
+	}
+
 	if len(z) == 0 {
 		return y
 	}
@@ -150,7 +155,10 @@ func addVW_ref(z, x []Word, y Word) (c Word) {
 //
 //go:linkname subVW
 func subVW(z, x []Word, y Word) (c Word) {
-	x = x[:len(z)]
+	if len(x) != len(z) {
+		panic("subVW len")
+	}
+
 	if len(z) == 0 {
 		return y
 	}
@@ -188,6 +196,10 @@ func subVW_ref(z, x []Word, y Word) (c Word) {
 }
 
 func lshVU_g(z, x []Word, s uint) (c Word) {
+	if len(x) != len(z) {
+		panic("lshVU len")
+	}
+
 	if s == 0 {
 		copy(z, x)
 		return
@@ -207,6 +219,10 @@ func lshVU_g(z, x []Word, s uint) (c Word) {
 }
 
 func rshVU_g(z, x []Word, s uint) (c Word) {
+	if len(x) != len(z) {
+		panic("rshVU len")
+	}
+
 	if s == 0 {
 		copy(z, x)
 		return
@@ -214,10 +230,6 @@ func rshVU_g(z, x []Word, s uint) (c Word) {
 	if len(z) == 0 {
 		return
 	}
-	if len(x) != len(z) {
-		// This is an invariant guaranteed by the caller.
-		panic("len(x) != len(z)")
-	}
 	s &= _W - 1 // hint to the compiler that shifts by s don't need guard code
 	ŝ := _W - s
 	ŝ &= _W - 1 // ditto
@@ -230,18 +242,23 @@ func rshVU_g(z, x []Word, s uint) (c Word) {
 }
 
 func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
+	if len(x) != len(z) {
+		panic("mulAddVWW len")
+	}
 	c = r
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x); i++ {
+	for i := range z {
 		c, z[i] = mulAddWWW_g(x[i], y, c)
 	}
 	return
 }
 
 func addMulVVWW_g(z, x, y []Word, m, a Word) (c Word) {
+	if len(x) != len(z) || len(y) != len(z) {
+		panic("addMulVVWW len")
+	}
+
 	c = a
-	// The comment near the top of this file discusses this for loop condition.
-	for i := 0; i < len(z) && i < len(x) && i < len(y); i++ {
+	for i := range z {
 		z1, z0 := mulAddWWW_g(y[i], m, x[i])
 		lo, cc := bits.Add(uint(z0), uint(c), 0)
 		c, z[i] = Word(cc), Word(lo)
diff --git a/src/math/big/nat.go b/src/math/big/nat.go
index feff4835da4..43e36d30938 100644
--- a/src/math/big/nat.go
+++ b/src/math/big/nat.go
@@ -111,7 +111,7 @@ func (z nat) add(x, y nat) nat {
 	// m > 0
 
 	z = z.make(m + 1)
-	c := addVV(z[0:n], x, y)
+	c := addVV(z[:n], x[:n], y[:n])
 	if m > n {
 		c = addVW(z[n:m], x[n:], c)
 	}
@@ -137,7 +137,7 @@ func (z nat) sub(x, y nat) nat {
 	// m > 0
 
 	z = z.make(m)
-	c := subVV(z[0:n], x, y)
+	c := subVV(z[:n], x[:n], y[:n])
 	if m > n {
 		c = subVW(z[n:], x[n:], c)
 	}
@@ -232,7 +232,7 @@ func alias(x, y nat) bool {
 // slice, and we don't need to normalize z after each addition)
 func addTo(z, x nat) {
 	if n := len(x); n > 0 {
-		if c := addVV(z[:n], z, x); c != 0 {
+		if c := addVV(z[:n], z[:n], x[:n]); c != 0 {
 			if n < len(z) {
 				addVW(z[n:], z[n:], c)
 			}
diff --git a/src/math/big/natdiv.go b/src/math/big/natdiv.go
index c9b7f4e3556..88cb5d9e2e3 100644
--- a/src/math/big/natdiv.go
+++ b/src/math/big/natdiv.go
@@ -699,9 +699,9 @@ func (q nat) divBasic(stk *stack, u, v nat) {
 		// Subtract q̂·v from the current section of u.
 		// If it underflows, q̂·v > u, which we fix up
 		// by decrementing q̂ and adding v back.
-		c := subVV(u[j:j+qhl], u[j:], qhatv)
+		c := subVV(u[j:j+qhl], u[j:j+qhl], qhatv[:qhl])
 		if c != 0 {
-			c := addVV(u[j:j+n], u[j:], v)
+			c := addVV(u[j:j+n], u[j:j+n], v)
 			// If n == qhl, the carry from subVV and the carry from addVV
 			// cancel out and don't affect u[j+n].
 			if n < qhl {
-- 
GitLab