From a4d0269a4f74e8a66ee7487491857e1a1c582231 Mon Sep 17 00:00:00 2001 From: Russ Cox <rsc@golang.org> Date: Fri, 11 Apr 2025 08:54:58 -0400 Subject: [PATCH] math/big: use clearer loop bounds check elimination MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checking that the lengths are equal and panicking teaches the compiler that it can assume “i in range for z” implies “i in range for x”, letting us simplify the actual loops a bit. It also turns up a few places in math/big that were playing maybe a little too fast and loose with slice lengths. Update those to explicitly set all the input slices to the same length. These speedups are basically irrelevant, since they only happen in real code if people are compiling with -tags math_big_pure_go. But at least the code is clearer. benchmark \ system c3h88 c2s16 s7 386 s7-386 c4as16 mac arm loong64 ppc64le riscv64 s390x AddVV/words=1/impl=go ~ +11.20% +5.11% -7.67% -7.77% +1.90% +10.76% -33.22% ~ +10.98% ~ +6.60% AddVV/words=10/impl=go -22.12% -13.48% -10.37% -17.95% -18.07% -24.58% -22.04% -29.95% -14.22% ~ -6.33% +3.66% AddVV/words=16/impl=go -9.75% -13.73% ~ -21.90% -18.66% -30.03% -20.45% -28.09% -17.33% -7.15% -8.96% +12.55% AddVV/words=100/impl=go -5.91% -1.02% ~ -29.23% -22.18% -25.62% -6.49% -23.59% -22.31% -1.88% -14.13% +9.23% AddVV/words=1000/impl=go -0.52% -0.19% -3.58% -33.89% -23.46% -22.46% ~ -24.00% -24.73% +0.93% -15.79% +12.32% AddVV/words=10000/impl=go ~ ~ ~ -33.79% -23.72% -23.79% -5.98% -23.92% ~ +0.78% -15.45% +8.59% AddVV/words=100000/impl=go ~ ~ ~ -33.90% -24.25% -22.82% -4.09% -24.63% ~ +1.00% -13.56% ~ SubVV/words=1/impl=go ~ +11.64% +14.05% ~ -4.07% ~ +10.79% -33.69% ~ ~ +3.89% +12.33% SubVV/words=10/impl=go -10.31% -14.09% -7.38% +13.76% -13.25% -18.05% -20.08% -24.97% -14.15% +10.13% -0.97% -2.51% SubVV/words=16/impl=go -8.06% -13.73% -5.70% +17.00% -12.83% -23.76% -17.52% -25.25% -17.30% -2.80% -4.96% -18.25% SubVV/words=100/impl=go -9.22% -1.30% -2.76% +20.88% -14.35% -15.29% -8.49% -19.64% -22.31% -0.68% -14.30% -9.04% SubVV/words=1000/impl=go -0.60% ~ -3.43% +23.08% -16.14% -11.96% ~ -28.52% -24.73% ~ -15.95% -9.91% SubVV/words=10000/impl=go ~ ~ ~ +26.01% -15.24% -11.92% ~ -28.26% +4.25% ~ -15.42% -5.95% SubVV/words=100000/impl=go ~ ~ ~ +25.71% -15.83% -12.13% ~ -27.88% -1.27% ~ -13.57% -6.72% LshVU/words=1/impl=go +0.56% +0.36% ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ LshVU/words=10/impl=go +13.37% +4.63% ~ ~ ~ ~ ~ -2.90% ~ ~ ~ ~ LshVU/words=16/impl=go +22.83% +6.47% ~ ~ ~ ~ ~ ~ +0.80% ~ ~ +5.88% LshVU/words=100/impl=go +7.56% +13.95% ~ ~ ~ ~ ~ ~ +0.33% -2.50% ~ ~ LshVU/words=1000/impl=go +0.64% +17.92% ~ ~ ~ ~ ~ -6.52% ~ -2.58% ~ ~ LshVU/words=10000/impl=go ~ +17.60% ~ ~ ~ ~ ~ -6.64% -6.22% -1.40% ~ ~ LshVU/words=100000/impl=go ~ +14.57% ~ ~ ~ ~ ~ ~ -5.47% ~ ~ ~ RshVU/words=1/impl=go ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ +2.72% RshVU/words=10/impl=go ~ ~ ~ ~ ~ ~ ~ +2.50% ~ ~ ~ ~ RshVU/words=16/impl=go ~ +0.53% ~ ~ ~ ~ ~ +3.82% ~ ~ ~ ~ RshVU/words=100/impl=go ~ ~ ~ ~ ~ ~ ~ +6.18% ~ ~ ~ ~ RshVU/words=1000/impl=go ~ ~ ~ ~ ~ ~ ~ +7.00% ~ ~ ~ ~ RshVU/words=10000/impl=go ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ RshVU/words=100000/impl=go ~ ~ ~ ~ ~ ~ ~ +7.05% ~ ~ ~ ~ MulAddVWW/words=1/impl=go -10.34% +4.43% +10.62% -1.62% -4.74% -2.86% +11.75% ~ -8.00% +8.89% +3.87% ~ MulAddVWW/words=10/impl=go -1.61% -5.87% ~ -8.30% -4.55% +0.87% ~ -5.28% -20.82% ~ ~ -2.32% MulAddVWW/words=16/impl=go -2.96% -5.28% ~ -9.22% -5.28% ~ ~ -3.74% -19.52% -1.48% -2.53% -9.52% MulAddVWW/words=100/impl=go -3.89% -7.53% +1.93% -10.49% -4.87% -8.27% ~ ~ -0.65% -0.61% -7.59% -20.61% MulAddVWW/words=1000/impl=go -0.45% -3.91% +4.54% -11.46% -4.69% -8.53% ~ ~ -0.05% ~ -8.88% -19.77% MulAddVWW/words=10000/impl=go ~ -3.30% +4.10% -11.34% -4.10% -9.43% ~ -0.61% ~ -0.55% -8.21% -18.48% MulAddVWW/words=100000/impl=go -0.30% -3.03% +4.31% -11.55% -4.41% -9.74% ~ -0.75% +0.63% ~ -7.80% -19.82% AddMulVVWW/words=1/impl=go ~ +13.09% +12.50% -7.05% -10.41% +2.53% +13.32% -3.49% ~ +15.56% +3.62% ~ AddMulVVWW/words=10/impl=go -15.96% -9.06% -5.06% -14.56% -11.83% -5.44% -26.30% -14.23% -11.44% -1.79% -5.93% -6.60% AddMulVVWW/words=16/impl=go -19.05% -12.43% -6.19% -14.24% -12.67% -8.65% -18.64% -16.56% -10.64% -3.00% -7.61% -12.80% AddMulVVWW/words=100/impl=go -22.13% -16.59% -13.04% -13.79% -11.46% -12.01% -6.46% -21.80% -5.08% -3.13% -13.60% -22.53% AddMulVVWW/words=1000/impl=go -17.07% -17.05% -14.08% -13.59% -12.13% -11.21% ~ -22.81% -4.27% -1.27% -16.35% -23.47% AddMulVVWW/words=10000/impl=go -15.03% -16.78% -14.23% -13.86% -11.84% -11.69% ~ -22.75% -13.39% -1.10% -14.37% -22.01% AddMulVVWW/words=100000/impl=go -13.70% -14.90% -14.26% -13.55% -12.04% -11.63% ~ -22.61% ~ -2.53% -10.42% -23.16% Change-Id: Ic6f64344484a762b818c7090d1396afceb638607 Reviewed-on: https://go-review.googlesource.com/c/go/+/665155 Auto-Submit: Russ Cox <rsc@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Alan Donovan <adonovan@google.com> --- src/math/big/arith.go | 67 ++++++++++++++++++++++++++---------------- src/math/big/nat.go | 6 ++-- src/math/big/natdiv.go | 4 +-- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/math/big/arith.go b/src/math/big/arith.go index e2cd99f602b..bc27ca6a562 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -26,17 +26,13 @@ const ( _M = _B - 1 // digit mask ) -// Many of the loops in this file are of the form -// for i := 0; i < len(z) && i < len(x) && i < len(y); i++ -// i < len(z) is the real condition. -// However, checking i < len(x) && i < len(y) as well is faster than -// having the compiler do a bounds check in the body of the loop; -// remarkably it is even faster than hoisting the bounds check -// out of the loop, by doing something like -// _, _ = x[len(z)-1], y[len(z)-1] -// There are other ways to hoist the bounds check out of the loop, -// but the compiler's BCE isn't powerful enough for them (yet?). -// See the discussion in CL 164966. +// In these routines, it is the caller's responsibility to arrange for +// x, y, and z to all have the same length. We check this and panic. +// The assembly versions of these routines do not include that check. +// +// The check+panic also has the effect of teaching the compiler that +// “i in range for z” implies “i in range for x and y”, eliminating all +// bounds checks in loops from 0 to len(z) and vice versa. // ---------------------------------------------------------------------------- // Elementary operations on words @@ -65,8 +61,11 @@ func nlz(x Word) uint { // The resulting carry c is either 0 or 1. func addVV_g(z, x, y []Word) (c Word) { - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + if len(x) != len(z) || len(y) != len(z) { + panic("addVV len") + } + + for i := range z { zi, cc := bits.Add(uint(x[i]), uint(y[i]), uint(c)) z[i] = Word(zi) c = Word(cc) @@ -76,8 +75,11 @@ func addVV_g(z, x, y []Word) (c Word) { // The resulting carry c is either 0 or 1. func subVV_g(z, x, y []Word) (c Word) { - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + if len(x) != len(z) || len(y) != len(z) { + panic("subVV len") + } + + for i := range z { zi, cc := bits.Sub(uint(x[i]), uint(y[i]), uint(c)) z[i] = Word(zi) c = Word(cc) @@ -99,7 +101,10 @@ func subVV_g(z, x, y []Word) (c Word) { // //go:linkname addVW func addVW(z, x []Word, y Word) (c Word) { - x = x[:len(z)] + if len(x) != len(z) { + panic("addVW len") + } + if len(z) == 0 { return y } @@ -150,7 +155,10 @@ func addVW_ref(z, x []Word, y Word) (c Word) { // //go:linkname subVW func subVW(z, x []Word, y Word) (c Word) { - x = x[:len(z)] + if len(x) != len(z) { + panic("subVW len") + } + if len(z) == 0 { return y } @@ -188,6 +196,10 @@ func subVW_ref(z, x []Word, y Word) (c Word) { } func lshVU_g(z, x []Word, s uint) (c Word) { + if len(x) != len(z) { + panic("lshVU len") + } + if s == 0 { copy(z, x) return @@ -207,6 +219,10 @@ func lshVU_g(z, x []Word, s uint) (c Word) { } func rshVU_g(z, x []Word, s uint) (c Word) { + if len(x) != len(z) { + panic("rshVU len") + } + if s == 0 { copy(z, x) return @@ -214,10 +230,6 @@ func rshVU_g(z, x []Word, s uint) (c Word) { if len(z) == 0 { return } - if len(x) != len(z) { - // This is an invariant guaranteed by the caller. - panic("len(x) != len(z)") - } s &= _W - 1 // hint to the compiler that shifts by s don't need guard code ŝ := _W - s ŝ &= _W - 1 // ditto @@ -230,18 +242,23 @@ func rshVU_g(z, x []Word, s uint) (c Word) { } func mulAddVWW_g(z, x []Word, y, r Word) (c Word) { + if len(x) != len(z) { + panic("mulAddVWW len") + } c = r - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x); i++ { + for i := range z { c, z[i] = mulAddWWW_g(x[i], y, c) } return } func addMulVVWW_g(z, x, y []Word, m, a Word) (c Word) { + if len(x) != len(z) || len(y) != len(z) { + panic("rshVU len") + } + c = a - // The comment near the top of this file discusses this for loop condition. - for i := 0; i < len(z) && i < len(x) && i < len(y); i++ { + for i := range z { z1, z0 := mulAddWWW_g(y[i], m, x[i]) lo, cc := bits.Add(uint(z0), uint(c), 0) c, z[i] = Word(cc), Word(lo) diff --git a/src/math/big/nat.go b/src/math/big/nat.go index feff4835da4..43e36d30938 100644 --- a/src/math/big/nat.go +++ b/src/math/big/nat.go @@ -111,7 +111,7 @@ func (z nat) add(x, y nat) nat { // m > 0 z = z.make(m + 1) - c := addVV(z[0:n], x, y) + c := addVV(z[:n], x[:n], y[:n]) if m > n { c = addVW(z[n:m], x[n:], c) } @@ -137,7 +137,7 @@ func (z nat) sub(x, y nat) nat { // m > 0 z = z.make(m) - c := subVV(z[0:n], x, y) + c := subVV(z[:n], x[:n], y[:n]) if m > n { c = subVW(z[n:], x[n:], c) } @@ -232,7 +232,7 @@ func alias(x, y nat) bool { // slice, and we don't need to normalize z after each addition) func addTo(z, x nat) { if n := len(x); n > 0 { - if c := addVV(z[:n], z, x); c != 0 { + if c := addVV(z[:n], z[:n], x[:n]); c != 0 { if n < len(z) { addVW(z[n:], z[n:], c) } diff --git a/src/math/big/natdiv.go b/src/math/big/natdiv.go index c9b7f4e3556..88cb5d9e2e3 100644 --- a/src/math/big/natdiv.go +++ b/src/math/big/natdiv.go @@ -699,9 +699,9 @@ func (q nat) divBasic(stk *stack, u, v nat) { // Subtract q̂·v from the current section of u. // If it underflows, q̂·v > u, which we fix up // by decrementing q̂ and adding v back. - c := subVV(u[j:j+qhl], u[j:], qhatv) + c := subVV(u[j:j+qhl], u[j:j+qhl], qhatv[:qhl]) if c != 0 { - c := addVV(u[j:j+n], u[j:], v) + c := addVV(u[j:j+n], u[j:j+n], v) // If n == qhl, the carry from subVV and the carry from addVV // cancel out and don't affect u[j+n]. if n < qhl { -- GitLab