diff --git a/src/cmd/compile/internal/gc/ssa.go b/src/cmd/compile/internal/gc/ssa.go index d0b3e8df94e164f068291c26691d61daf93a37a1..815ff7f99fb358c5aad6603d3f42cd900ac4757b 100644 --- a/src/cmd/compile/internal/gc/ssa.go +++ b/src/cmd/compile/internal/gc/ssa.go @@ -4022,11 +4022,6 @@ func init() { return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1]) }, sys.ArchAMD64, sys.ArchARM64, sys.ArchPPC64LE, sys.ArchPPC64, sys.ArchS390X) - add("math/big", "divWW", - func(s *state, n *Node, args []*ssa.Value) *ssa.Value { - return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2]) - }, - sys.ArchAMD64) } // findIntrinsic returns a function which builds the SSA equivalent of the diff --git a/src/math/big/arith.go b/src/math/big/arith.go index b0885f261fe9bad8e35290b94200a98a2048505b..750ce8aa398df4ba6d32244ea9e211c592e38821 100644 --- a/src/math/big/arith.go +++ b/src/math/big/arith.go @@ -60,12 +60,6 @@ func nlz(x Word) uint { return uint(bits.LeadingZeros(uint(x))) } -// q = (u1<<_W + u0 - r)/v -func divWW_g(u1, u0, v Word) (q, r Word) { - qq, rr := bits.Div(uint(u1), uint(u0), uint(v)) - return Word(qq), Word(rr) -} - // The resulting carry c is either 0 or 1. func addVV_g(z, x, y []Word) (c Word) { // The comment near the top of this file discusses this for loop condition. @@ -207,10 +201,87 @@ func addMulVVW_g(z, x []Word, y Word) (c Word) { return } -func divWVW_g(z []Word, xn Word, x []Word, y Word) (r Word) { +// q = ( x1 << _W + x0 - r)/y. m = floor(( _B^2 - 1 ) / d - _B). Requiring x1<y. +// An approximate reciprocal with a reference to "Improved Division by Invariant Integers +// (IEEE Transactions on Computers, 11 Jun. 2010)" +func divWW(x1, x0, y, m Word) (q, r Word) { + s := nlz(y) + if s != 0 { + x1 = x1<<s | x0>>(_W-s) + x0 <<= s + y <<= s + } + d := uint(y) + // We know that + // m = ⎣(B^2-1)/d⎦-B + // ⎣(B^2-1)/d⎦ = m+B + // (B^2-1)/d = m+B+delta1 0 <= delta1 <= (d-1)/d + // B^2/d = m+B+delta2 0 <= delta2 <= 1 + // The quotient we're trying to compute is + // quotient = ⎣(x1*B+x0)/d⎦ + // = ⎣(x1*B*(B^2/d)+x0*(B^2/d))/B^2⎦ + // = ⎣(x1*B*(m+B+delta2)+x0*(m+B+delta2))/B^2⎦ + // = ⎣(x1*m+x1*B+x0)/B + x0*m/B^2 + delta2*(x1*B+x0)/B^2⎦ + // The latter two terms of this three-term sum are between 0 and 1. + // So we can compute just the first term, and we will be low by at most 2. + t1, t0 := bits.Mul(uint(m), uint(x1)) + _, c := bits.Add(t0, uint(x0), 0) + t1, _ = bits.Add(t1, uint(x1), c) + // The quotient is either t1, t1+1, or t1+2. + // We'll try t1 and adjust if needed. + qq := t1 + // compute remainder r=x-d*q. + dq1, dq0 := bits.Mul(d, qq) + r0, b := bits.Sub(uint(x0), dq0, 0) + r1, _ := bits.Sub(uint(x1), dq1, b) + // The remainder we just computed is bounded above by B+d: + // r = x1*B + x0 - d*q. + // = x1*B + x0 - d*⎣(x1*m+x1*B+x0)/B⎦ + // = x1*B + x0 - d*((x1*m+x1*B+x0)/B-alpha) 0 <= alpha < 1 + // = x1*B + x0 - x1*d/B*m - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*⎣(B^2-1)/d-B⎦ - x1*d - x0*d/B + d*alpha + // = x1*B + x0 - x1*d/B*((B^2-1)/d-B-beta) - x1*d - x0*d/B + d*alpha 0 <= beta < 1 + // = x1*B + x0 - x1*B + x1/B + x1*d + x1*d/B*beta - x1*d - x0*d/B + d*alpha + // = x0 + x1/B + x1*d/B*beta - x0*d/B + d*alpha + // = x0*(1-d/B) + x1*(1+d*beta)/B + d*alpha + // < B*(1-d/B) + d*B/B + d because x0<B (and 1-d/B>0), x1<d, 1+d*beta<=B, alpha<1 + // = B - d + d + d + // = B+d + // So r1 can only be 0 or 1. If r1 is 1, then we know q was too small. + // Add 1 to q and subtract d from r. That guarantees that r is <B, so + // we no longer need to keep track of r1. + if r1 != 0 { + qq++ + r0 -= d + } + // If the remainder is still too large, increment q one more time. + if r0 >= d { + qq++ + r0 -= d + } + return Word(qq), Word(r0 >> s) +} + +func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { r = xn + if len(x) == 1 { + qq, rr := bits.Div(uint(r), uint(x[0]), uint(y)) + z[0] = Word(qq) + return Word(rr) + } + rec := reciprocalWord(y) for i := len(z) - 1; i >= 0; i-- { - z[i], r = divWW_g(r, x[i], y) + z[i], r = divWW(r, x[i], y, rec) } - return + return r +} + +// reciprocalWord return the reciprocal of the divisor. rec = floor(( _B^2 - 1 ) / u - _B). u = d1 << nlz(d1). +func reciprocalWord(d1 Word) Word { + u := uint(d1 << nlz(d1)) + x1 := ^u + x0 := uint(_M) + rec, _ := bits.Div(x1, x0, u) // (_B^2-1)/U-_B = (_B*(_M-C)+_M)/U + return Word(rec) } diff --git a/src/math/big/arith_386.s b/src/math/big/arith_386.s index f61da2aba7251a46087b394fc2f9e9502f25b057..d0ea949fe6689c950d4fd1a12d875ee8ebdbf91a 100644 --- a/src/math/big/arith_386.s +++ b/src/math/big/arith_386.s @@ -18,16 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - MOVL x1+0(FP), DX - MOVL x0+4(FP), AX - DIVL y+8(FP) - MOVL AX, q+12(FP) - MOVL DX, r+16(FP) - RET - - // func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),NOSPLIT,$0 MOVL z+0(FP), DI @@ -251,21 +241,4 @@ E6: CMPL BX, $0 // i < 0 RET -// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - MOVL z+0(FP), DI - MOVL xn+12(FP), DX // r = xn - MOVL x+16(FP), SI - MOVL y+28(FP), CX - MOVL z_len+4(FP), BX // i = z - JMP E7 -L7: MOVL (SI)(BX*4), AX - DIVL CX - MOVL AX, (DI)(BX*4) - -E7: SUBL $1, BX // i-- - JGE L7 // i >= 0 - - MOVL DX, r+32(FP) - RET diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s index b75639f5406c260ca0f0e76d7f00ff3249aa739b..61043ca2d97c491c8bd9591ae2f2fcdb45a7f0f1 100644 --- a/src/math/big/arith_amd64.s +++ b/src/math/big/arith_amd64.s @@ -18,14 +18,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - MOVQ x1+0(FP), DX - MOVQ x0+8(FP), AX - DIVQ y+16(FP) - MOVQ AX, q+24(FP) - MOVQ DX, r+32(FP) - RET // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0. // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared. @@ -531,21 +523,3 @@ adx_short: -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - MOVQ z+0(FP), R10 - MOVQ xn+24(FP), DX // r = xn - MOVQ x+32(FP), R8 - MOVQ y+56(FP), R9 - MOVQ z_len+8(FP), BX // i = z - JMP E7 - -L7: MOVQ (R8)(BX*8), AX - DIVQ R9 - MOVQ AX, (R10)(BX*8) - -E7: SUBQ $1, BX // i-- - JGE L7 // i >= 0 - - MOVQ DX, r+64(FP) - RET diff --git a/src/math/big/arith_arm.s b/src/math/big/arith_arm.s index 33aa36f7090fb80f8322e9f027300fb3383aedf7..cbf7445e7abded24bca3fbbd2aa27ce896707852 100644 --- a/src/math/big/arith_arm.s +++ b/src/math/big/arith_arm.s @@ -272,17 +272,6 @@ E9: RET -// func divWVW(z* Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - // ARM has no multiword division, so use portable code. - B ·divWVW_g(SB) - - -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - // ARM has no multiword division, so use portable code. - B ·divWW_g(SB) - // func mulWW(x, y Word) (z1, z0 Word) TEXT ·mulWW(SB),NOSPLIT,$0 diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index da6e408e19f726c76d3a27eb804849fcb2034947..22357d088e4c9b9d179dcf22fb638be51350997b 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -23,11 +23,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - B ·divWW_g(SB) // ARM64 has no multiword division - - // func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),NOSPLIT,$0 MOVD z_len+8(FP), R0 @@ -585,6 +580,4 @@ done: MOVD R4, c+56(FP) RET -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -TEXT ·divWVW(SB),NOSPLIT,$0 - B ·divWVW_g(SB) + diff --git a/src/math/big/arith_decl.go b/src/math/big/arith_decl.go index 41e592334c376ecc532bd5bf870c7fe89f3c12ab..d519bdc87b636328d568f5a34d1d77b396c296fe 100644 --- a/src/math/big/arith_decl.go +++ b/src/math/big/arith_decl.go @@ -8,7 +8,6 @@ package big // implemented in arith_$GOARCH.s func mulWW(x, y Word) (z1, z0 Word) -func divWW(x1, x0, y Word) (q, r Word) func addVV(z, x, y []Word) (c Word) func subVV(z, x, y []Word) (c Word) func addVW(z, x []Word, y Word) (c Word) @@ -17,4 +16,3 @@ func shlVU(z, x []Word, s uint) (c Word) func shrVU(z, x []Word, s uint) (c Word) func mulAddVWW(z, x []Word, y, r Word) (c Word) func addMulVVW(z, x []Word, y Word) (c Word) -func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) diff --git a/src/math/big/arith_decl_pure.go b/src/math/big/arith_decl_pure.go index 305f7ee03b42d8078fc43dc33a1a7b8f0c58f8b2..5faa3bd281999eab511cb7384b2d0f7efc80b34b 100644 --- a/src/math/big/arith_decl_pure.go +++ b/src/math/big/arith_decl_pure.go @@ -10,10 +10,6 @@ func mulWW(x, y Word) (z1, z0 Word) { return mulWW_g(x, y) } -func divWW(x1, x0, y Word) (q, r Word) { - return divWW_g(x1, x0, y) -} - func addVV(z, x, y []Word) (c Word) { return addVV_g(z, x, y) } @@ -55,7 +51,3 @@ func mulAddVWW(z, x []Word, y, r Word) (c Word) { func addMulVVW(z, x []Word, y Word) (c Word) { return addMulVVW_g(z, x, y) } - -func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) { - return divWVW_g(z, xn, x, y) -} diff --git a/src/math/big/arith_mips64x.s b/src/math/big/arith_mips64x.s index 983510ee3d42d1e9f8d7b44334fc650b1f22afdd..804b9fe06edc26e9cdac916a06a0d9e631fc6186 100644 --- a/src/math/big/arith_mips64x.s +++ b/src/math/big/arith_mips64x.s @@ -12,9 +12,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 JMP ·mulWW_g(SB) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) - TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/arith_mipsx.s b/src/math/big/arith_mipsx.s index 54cafbd9c0c80cc6108bd2dde5596ab1ff9e359e..efdecb80f3291edf7a75464ed9109924747c4aae 100644 --- a/src/math/big/arith_mipsx.s +++ b/src/math/big/arith_mipsx.s @@ -12,9 +12,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 JMP ·mulWW_g(SB) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) - TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -39,5 +36,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s index 409e10ab48bd4741c168998be6839d5f2fef2e75..b299ccc2fb824cfbbd326f9e2ea79292d4ddad1b 100644 --- a/src/math/big/arith_ppc64x.s +++ b/src/math/big/arith_ppc64x.s @@ -478,44 +478,4 @@ done: MOVD R4, c+56(FP) RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB), NOSPLIT, $0 - MOVD x1+0(FP), R4 - MOVD x0+8(FP), R5 - MOVD y+16(FP), R6 - - CMPU R4, R6 - BGE divbigger - - // from the programmer's note in ch. 3 of the ISA manual, p.74 - DIVDEU R6, R4, R3 - DIVDU R6, R5, R7 - MULLD R6, R3, R8 - MULLD R6, R7, R20 - SUB R20, R5, R10 - ADD R7, R3, R3 - SUB R8, R10, R4 - CMPU R4, R10 - BLT adjust - CMPU R4, R6 - BLT end - -adjust: - MOVD $1, R21 - ADD R21, R3, R3 - SUB R6, R4, R4 - -end: - MOVD R3, q+24(FP) - MOVD R4, r+32(FP) - RET - -divbigger: - MOVD $-1, R7 - MOVD R7, q+24(FP) - MOVD R7, r+32(FP) - RET - -TEXT ·divWVW(SB), NOSPLIT, $0 - BR ·divWVW_g(SB) diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s index 59065c3f7bac37cf6011b15dfd2838d739f450f5..a2f7666c7b9acfe023d3d628f2fb53b0b9130ce3 100644 --- a/src/math/big/arith_riscv64.s +++ b/src/math/big/arith_riscv64.s @@ -19,9 +19,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 MOV X8, z0+24(FP) RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) // riscv64 has no multiword division TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -47,5 +44,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s index 48917681112a981b9685dba693e62a943c26e843..242aca7434a1345f24a5b7bd240029b8134bfbaf 100644 --- a/src/math/big/arith_s390x.s +++ b/src/math/big/arith_s390x.s @@ -17,15 +17,6 @@ TEXT ·mulWW(SB), NOSPLIT, $0 MOVD R11, z0+24(FP) RET -// func divWW(x1, x0, y Word) (q, r Word) -TEXT ·divWW(SB), NOSPLIT, $0 - MOVD x1+0(FP), R10 - MOVD x0+8(FP), R11 - MOVD y+16(FP), R5 - WORD $0xb98700a5 // dlgr r10,r5 - MOVD R11, q+24(FP) - MOVD R10, r+32(FP) - RET // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11 // func addVV(z, x, y []Word) (c Word) @@ -990,27 +981,3 @@ E6: MOVD R4, c+56(FP) RET -// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word) -// CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i -TEXT ·divWVW(SB), NOSPLIT, $0 - MOVD z+0(FP), R2 - MOVD xn+24(FP), R10 // r = xn - MOVD x+32(FP), R8 - MOVD y+56(FP), R9 - MOVD z_len+8(FP), R7 // i = z - SLD $3, R7, R1 // i*8 - MOVD $0, R0 // make sure it's zero - BR E7 - -L7: - MOVD (R8)(R1*1), R11 - WORD $0xB98700A9 // DLGR R10,R9 - MOVD R11, (R2)(R1*1) - -E7: - SUB $1, R7 // i-- - SUB $8, R1 - BGE L7 // i >= 0 - - MOVD R10, r+64(FP) - RET diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go index fc205934c5ca92f568c60a1f2a347e3746efaa33..808d17845927e72bbd0ad7e8fd15bb7cbf94a17d 100644 --- a/src/math/big/arith_test.go +++ b/src/math/big/arith_test.go @@ -7,6 +7,7 @@ package big import ( "fmt" "internal/testenv" + "math/bits" "math/rand" "strings" "testing" @@ -493,7 +494,6 @@ func TestFunVWW(t *testing.T) { if a.y != 0 && a.r < a.y { arg := argWVW{a.x, a.c, a.z, a.y, a.r} - testFunWVW(t, "divWVW_g", divWVW_g, arg) testFunWVW(t, "divWVW", divWVW, arg) } } @@ -536,6 +536,42 @@ func TestMulAddWWW(t *testing.T) { } } +var divWWTests = []struct { + x1, x0, y Word + q, r Word +}{ + {_M >> 1, 0, _M, _M >> 1, _M >> 1}, + {_M - (1 << (_W - 2)), _M, 3 << (_W - 2), _M, _M - (1 << (_W - 2))}, +} + +const testsNumber = 1 << 16 + +func TestDivWW(t *testing.T) { + i := 0 + for i, test := range divWWTests { + rec := reciprocalWord(test.y) + q, r := divWW(test.x1, test.x0, test.y, rec) + if q != test.q || r != test.r { + t.Errorf("#%d got (%x, %x) want (%x, %x)", i, q, r, test.q, test.r) + } + } + //random tests + for ; i < testsNumber; i++ { + x1 := rndW() + x0 := rndW() + y := rndW() + if x1 >= y { + continue + } + rec := reciprocalWord(y) + qGot, rGot := divWW(x1, x0, y, rec) + qWant, rWant := bits.Div(uint(x1), uint(x0), uint(y)) + if uint(qGot) != qWant || uint(rGot) != rWant { + t.Errorf("#%d got (%x, %x) want (%x, %x)", i, qGot, rGot, qWant, rWant) + } + } +} + func BenchmarkMulAddVWW(b *testing.B) { for _, n := range benchSizes { if isRaceBuilder && n > 1e3 { @@ -570,3 +606,19 @@ func BenchmarkAddMulVVW(b *testing.B) { }) } } +func BenchmarkDivWVW(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := rndV(n) + y := rndW() + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _W)) + for i := 0; i < b.N; i++ { + divWVW(z, 0, x, y) + } + }) + } +} diff --git a/src/math/big/arith_wasm.s b/src/math/big/arith_wasm.s index 382597c694535996a215277758121ff90a88ef50..add106446909d6fc9fa0f9c24fee8efc4cde343f 100644 --- a/src/math/big/arith_wasm.s +++ b/src/math/big/arith_wasm.s @@ -9,9 +9,6 @@ TEXT ·mulWW(SB),NOSPLIT,$0 JMP ·mulWW_g(SB) -TEXT ·divWW(SB),NOSPLIT,$0 - JMP ·divWW_g(SB) - TEXT ·addVV(SB),NOSPLIT,$0 JMP ·addVV_g(SB) @@ -36,5 +33,3 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0 TEXT ·addMulVVW(SB),NOSPLIT,$0 JMP ·addMulVVW_g(SB) -TEXT ·divWVW(SB),NOSPLIT,$0 - JMP ·divWVW_g(SB) diff --git a/src/math/big/nat.go b/src/math/big/nat.go index 6a3989bf9d82bf1c9ec3b57aedead466002bfb8e..c2f3787848ccab922d81e56666e24832faa4af48 100644 --- a/src/math/big/nat.go +++ b/src/math/big/nat.go @@ -751,6 +751,7 @@ func (q nat) divBasic(u, v nat) { // D2. vn1 := v[n-1] + rec := reciprocalWord(vn1) for j := m; j >= 0; j-- { // D3. qhat := Word(_M) @@ -760,7 +761,7 @@ func (q nat) divBasic(u, v nat) { } if ujn != vn1 { var rhat Word - qhat, rhat = divWW(ujn, u[j+n-1], vn1) + qhat, rhat = divWW(ujn, u[j+n-1], vn1, rec) // x1 | x2 = q̂v_{n-2} vn2 := v[n-2]