diff --git a/src/math/big/internal/asmgen/386.go b/src/math/big/internal/asmgen/386.go
new file mode 100644
index 0000000000000000000000000000000000000000..f8f67ba52697283726e2252cc2b472e92dcd225b
--- /dev/null
+++ b/src/math/big/internal/asmgen/386.go
@@ -0,0 +1,58 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+import "fmt"
+
+var Arch386 = &Arch{
+	Name:      "386",
+	WordBits:  32,
+	WordBytes: 4,
+
+	regs: []string{
+		"BX", "SI", "DI", "BP",
+		"CX", "DX", "AX", // last, to leave available for hinted allocation
+	},
+	op3:              x86Op3,
+	hint:             x86Hint,
+	memOK:            true,
+	subCarryIsBorrow: true,
+	maxColumns:       1, // not enough registers for more
+
+	// Note: It would be nice to not set memIndex and then
+	// delete all the code in pipe.go that supports it.
+	// But a few routines, notably lshVU and mulAddVWW,
+	// benefit dramatically from the use of index registers.
+	// Perhaps some day we will decide 386 performance
+	// does not matter enough to keep this code.
+	memIndex: _386MemIndex,
+
+	mov:      "MOVL",
+	adds:     "ADDL",
+	adcs:     "ADCL",
+	subs:     "SUBL",
+	sbcs:     "SBBL",
+	lsh:      "SHLL",
+	lshd:     "SHLL",
+	rsh:      "SHRL",
+	rshd:     "SHRL",
+	and:      "ANDL",
+	or:       "ORL",
+	xor:      "XORL",
+	neg:      "NEGL",
+	lea:      "LEAL",
+	mulWideF: x86MulWide,
+
+	addWords: "LEAL (%[2]s)(%[1]s*4), %[3]s",
+
+	jmpZero:       "TESTL %[1]s, %[1]s; JZ %[2]s",
+	jmpNonZero:    "TESTL %[1]s, %[1]s; JNZ %[2]s",
+	loopBottom:    "SUBL $1, %[1]s; JNZ %[2]s",
+	loopBottomNeg: "ADDL $1, %[1]s; JNZ %[2]s",
+}
+
+func _386MemIndex(a *Asm, off int, ix Reg, p RegPtr) Reg {
+	return Reg{fmt.Sprintf("%d(%s)(%s*%d)", off, p, ix, a.Arch.WordBytes)}
+}
diff --git a/src/math/big/internal/asmgen/amd64.go b/src/math/big/internal/asmgen/amd64.go
new file mode 100644
index 0000000000000000000000000000000000000000..36b1b5844b107473064c23172216f7129d4ee6d7
--- /dev/null
+++ b/src/math/big/internal/asmgen/amd64.go
@@ -0,0 +1,146 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+var ArchAMD64 = &Arch{
+	Name:      "amd64",
+	WordBits:  64,
+	WordBytes: 8,
+
+	regs: []string{
+		"BX", "SI", "DI",
+		"R8", "R9", "R10", "R11", "R12", "R13", "R14", "R15",
+		"AX", "DX", "CX", // last to leave available for hinted allocation
+	},
+	op3:              x86Op3,
+	hint:             x86Hint,
+	memOK:            true,
+	subCarryIsBorrow: true,
+
+	// Note: Not setting memIndex, because code generally runs faster
+	// if we avoid the use of scaled-index memory references,
+	// particularly in ADX instructions.
+
+	options: map[Option]func(*Asm, string){
+		OptionAltCarry: amd64JmpADX,
+	},
+
+	mov:      "MOVQ",
+	adds:     "ADDQ",
+	adcs:     "ADCQ",
+	subs:     "SUBQ",
+	sbcs:     "SBBQ",
+	lsh:      "SHLQ",
+	lshd:     "SHLQ",
+	rsh:      "SHRQ",
+	rshd:     "SHRQ",
+	and:      "ANDQ",
+	or:       "ORQ",
+	xor:      "XORQ",
+	neg:      "NEGQ",
+	lea:      "LEAQ",
+	addF:     amd64Add,
+	mulWideF: x86MulWide,
+
+	addWords: "LEAQ (%[2]s)(%[1]s*8), %[3]s",
+
+	jmpZero:       "TESTQ %[1]s, %[1]s; JZ %[2]s",
+	jmpNonZero:    "TESTQ %[1]s, %[1]s; JNZ %[2]s",
+	loopBottom:    "SUBQ $1, %[1]s; JNZ %[2]s",
+	loopBottomNeg: "ADDQ $1, %[1]s; JNZ %[2]s",
+}
+
+func amd64JmpADX(a *Asm, label string) {
+	a.Printf("\tCMPB ·hasADX(SB), $0; JNZ %s\n", label)
+}
+
+func amd64Add(a *Asm, src1, src2 Reg, dst Reg, carry Carry) bool {
+	if a.Enabled(OptionAltCarry) {
+		// If OptionAltCarry is enabled, the generator is emitting ADD instructions
+		// both with and without the AltCarry flag set; the AltCarry flag means to
+		// use ADOX. Otherwise we have to use ADCX.
+		// Using regular ADD/ADC would smash both carry flags,
+		// so we reject anything we can't handle with ADCX/ADOX.
+		if carry&UseCarry != 0 && carry&(SetCarry|SmashCarry) != 0 {
+			if carry&AltCarry != 0 {
+				a.op3("ADOXQ", src1, src2, dst)
+			} else {
+				a.op3("ADCXQ", src1, src2, dst)
+			}
+			return true
+		}
+		if carry&(SetCarry|UseCarry) == SetCarry && a.IsZero(src1) && src2 == dst {
+			// Clearing carry flag. Caller will add EOL comment.
+			a.Printf("\tTESTQ AX, AX\n")
+			return true
+		}
+		if carry != KeepCarry {
+			a.Fatalf("unsupported carry")
+		}
+	}
+	return false
+}
+
+// The x86-prefixed functions are shared with Arch386 in 386.go.
+
+func x86Op3(name string) bool {
+	// As far as a.op3 is concerned, there are no 3-op instructions.
+	// (We print instructions like MULX ourselves.)
+	return false
+}
+
+func x86Hint(a *Asm, h Hint) string {
+	switch h {
+	case HintShiftCount:
+		return "CX"
+	case HintMulSrc:
+		if a.Enabled(OptionAltCarry) { // using MULX
+			return "DX"
+		}
+		return "AX"
+	case HintMulHi:
+		if a.Enabled(OptionAltCarry) { // using MULX
+			return ""
+		}
+		return "DX"
+	}
+	return ""
+}
+
+func x86Suffix(a *Asm) string {
+	// Note: Not using a.Arch == Arch386 to avoid init cycle.
+	if a.Arch.Name == "386" {
+		return "L"
+	}
+	return "Q"
+}
+
+func x86MulWide(a *Asm, src1, src2, dstlo, dsthi Reg) {
+	if a.Enabled(OptionAltCarry) {
+		// Using ADCX/ADOX; use MULX to avoid clearing carry flag.
+		if src1.name != "DX" {
+			if src2.name != "DX" {
+				a.Fatalf("mul src1 or src2 must be DX")
+			}
+			src2 = src1
+		}
+		a.Printf("\tMULXQ %s, %s, %s\n", src2, dstlo, dsthi)
+		return
+	}
+
+	if src1.name != "AX" {
+		if src2.name != "AX" {
+			a.Fatalf("mulwide src1 or src2 must be AX")
+		}
+		src2 = src1
+	}
+	if dstlo.name != "AX" {
+		a.Fatalf("mulwide dstlo must be AX")
+	}
+	if dsthi.name != "DX" {
+		a.Fatalf("mulwide dsthi must be DX")
+	}
+	a.Printf("\tMUL%s %s\n", x86Suffix(a), src2)
+}
diff --git a/src/math/big/internal/asmgen/arm64.go b/src/math/big/internal/asmgen/arm64.go
new file mode 100644
index 0000000000000000000000000000000000000000..ce70d5a1f7d133b01014956fa16d14018cc96906
--- /dev/null
+++ b/src/math/big/internal/asmgen/arm64.go
@@ -0,0 +1,111 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+var ArchARM64 = &Arch{
+	Name:          "arm64",
+	WordBits:      64,
+	WordBytes:     8,
+	CarrySafeLoop: true,
+
+	regs: []string{
+		// R18 is the platform register.
+		// R27 is the assembler/linker temporary (which we could potentially use but don't).
+		// R28 is g.
+		// R29 is FP.
+		// R30 is LR.
+ "R0", "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", + "R10", "R11", "R12", "R13", "R14", "R15", "R16", "R17", "R19", + "R20", "R21", "R22", "R23", "R24", "R25", "R26", + }, + reg0: "ZR", + + mov: "MOVD", + add: "ADD", + adds: "ADDS", + adc: "ADC", + adcs: "ADCS", + sub: "SUB", + subs: "SUBS", + sbc: "SBC", + sbcs: "SBCS", + mul: "MUL", + mulhi: "UMULH", + lsh: "LSL", + rsh: "LSR", + and: "AND", + or: "ORR", + xor: "EOR", + + addWords: "ADD %[1]s<<3, %[2]s, %[3]s", + + jmpZero: "CBZ %s, %s", + jmpNonZero: "CBNZ %s, %s", + + loadIncN: arm64LoadIncN, + loadDecN: arm64LoadDecN, + storeIncN: arm64StoreIncN, + storeDecN: arm64StoreDecN, +} + +func arm64LoadIncN(a *Asm, p RegPtr, regs []Reg) { + if len(regs) == 1 { + a.Printf("\tMOVD.P %d(%s), %s\n", a.Arch.WordBytes, p, regs[0]) + return + } + a.Printf("\tLDP.P %d(%s), (%s, %s)\n", len(regs)*a.Arch.WordBytes, p, regs[0], regs[1]) + var i int + for i = 2; i+2 <= len(regs); i += 2 { + a.Printf("\tLDP %d(%s), (%s, %s)\n", (i-len(regs))*a.Arch.WordBytes, p, regs[i], regs[i+1]) + } + if i < len(regs) { + a.Printf("\tMOVD %d(%s), %s\n", -1*a.Arch.WordBytes, p, regs[i]) + } +} + +func arm64LoadDecN(a *Asm, p RegPtr, regs []Reg) { + if len(regs) == 1 { + a.Printf("\tMOVD.W -%d(%s), %s\n", a.Arch.WordBytes, p, regs[0]) + return + } + a.Printf("\tLDP.W %d(%s), (%s, %s)\n", -len(regs)*a.Arch.WordBytes, p, regs[len(regs)-1], regs[len(regs)-2]) + var i int + for i = 2; i+2 <= len(regs); i += 2 { + a.Printf("\tLDP %d(%s), (%s, %s)\n", i*a.Arch.WordBytes, p, regs[len(regs)-1-i], regs[len(regs)-2-i]) + } + if i < len(regs) { + a.Printf("\tMOVD %d(%s), %s\n", i*a.Arch.WordBytes, p, regs[0]) + } +} + +func arm64StoreIncN(a *Asm, p RegPtr, regs []Reg) { + if len(regs) == 1 { + a.Printf("\tMOVD.P %s, %d(%s)\n", regs[0], a.Arch.WordBytes, p) + return + } + a.Printf("\tSTP.P (%s, %s), %d(%s)\n", regs[0], regs[1], len(regs)*a.Arch.WordBytes, p) + var i int + for i = 2; i+2 <= len(regs); i += 2 { + a.Printf("\tSTP (%s, %s), %d(%s)\n", 
regs[i], regs[i+1], (i-len(regs))*a.Arch.WordBytes, p) + } + if i < len(regs) { + a.Printf("\tMOVD %s, %d(%s)\n", regs[i], -1*a.Arch.WordBytes, p) + } +} + +func arm64StoreDecN(a *Asm, p RegPtr, regs []Reg) { + if len(regs) == 1 { + a.Printf("\tMOVD.W %s, -%d(%s)\n", regs[0], a.Arch.WordBytes, p) + return + } + a.Printf("\tSTP.W (%s, %s), %d(%s)\n", regs[len(regs)-1], regs[len(regs)-2], -len(regs)*a.Arch.WordBytes, p) + var i int + for i = 2; i+2 <= len(regs); i += 2 { + a.Printf("\tSTP (%s, %s), %d(%s)\n", regs[len(regs)-1-i], regs[len(regs)-2-i], i*a.Arch.WordBytes, p) + } + if i < len(regs) { + a.Printf("\tMOVD %s, %d(%s)\n", regs[0], i*a.Arch.WordBytes, p) + } +} diff --git a/src/math/big/internal/asmgen/cheat.go b/src/math/big/internal/asmgen/cheat.go new file mode 100644 index 0000000000000000000000000000000000000000..0149d9ac565a9da13b70b575a5af25eab628d761 --- /dev/null +++ b/src/math/big/internal/asmgen/cheat.go @@ -0,0 +1,52 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ignore + +// This program can be compiled with -S to produce a “cheat sheet” +// for filling out a new Arch: the compiler will show you how to implement +// the various operations. 
+//
+// Usage (replace TARGET with your target architecture):
+//
+//	GOOS=linux GOARCH=TARGET go build -gcflags='-p=cheat -S' cheat.go
+
+package p
+
+import "math/bits"
+
+func mov(x, y uint) uint             { return y }
+func zero() uint                     { return 0 }
+func add(x, y uint) uint             { return x + y }
+func adds(x, y, c uint) (uint, uint) { return bits.Add(x, y, 0) }
+func adcs(x, y, c uint) (uint, uint) { return bits.Add(x, y, c) }
+func sub(x, y uint) uint             { return x - y }
+func subs(x, y uint) (uint, uint)    { return bits.Sub(x, y, 0) }
+func sbcs(x, y, c uint) (uint, uint) { return bits.Sub(x, y, c) }
+func mul(x, y uint) uint             { return x * y }
+func mulWide(x, y uint) (uint, uint) { return bits.Mul(x, y) }
+func lsh(x, s uint) uint             { return x << s }
+func rsh(x, s uint) uint             { return x >> s }
+func and(x, y uint) uint             { return x & y }
+func or(x, y uint) uint              { return x | y }
+func xor(x, y uint) uint             { return x ^ y }
+func neg(x uint) uint                { return -x }
+func loop(x int) int {
+	s := 0
+	for i := 1; i < x; i++ {
+		s += i
+		if s == 98 {
+			return 99
+		}
+		if s == 99 {
+			return 100
+		}
+		if s == 0 {
+			return 101
+		}
+		s += 2
+	}
+	return s
+}
+func mem(x *[10]struct{ a, b uint }, i int) uint { return x[i].b }
diff --git a/src/math/big/internal/asmgen/loong64.go b/src/math/big/internal/asmgen/loong64.go
new file mode 100644
index 0000000000000000000000000000000000000000..e2d05690ab4de4901f72f2bc9a22df3cb7fdd74f
--- /dev/null
+++ b/src/math/big/internal/asmgen/loong64.go
@@ -0,0 +1,45 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+var ArchLoong64 = &Arch{
+	Name:          "loong64",
+	WordBits:      64,
+	WordBytes:     8,
+	CarrySafeLoop: true,
+
+	regs: []string{
+		// R0 is set to 0.
+		// R1 is LR.
+		// R2 is ???
+		// R3 is SP.
+		// R22 is g.
+		// R28 and R29 are our virtual carry flags.
+		// R30 is the linker/assembler temp, which we use too.
+ "R4", "R5", "R6", "R7", "R8", "R9", + "R10", "R11", "R12", "R13", "R14", "R15", "R16", "R17", "R18", "R19", + "R20", "R21", "R23", "R24", "R25", "R26", "R27", + "R31", + }, + reg0: "R0", + regCarry: "R28", + regAltCarry: "R29", + regTmp: "R30", + + mov: "MOVV", + add: "ADDVU", + sub: "SUBVU", + sltu: "SGTU", + mul: "MULV", + mulhi: "MULHVU", + lsh: "SLLV", + rsh: "SRLV", + and: "AND", + or: "OR", + xor: "XOR", + + jmpZero: "BEQ %s, %s", + jmpNonZero: "BNE %s, %s", +} diff --git a/src/math/big/internal/asmgen/main.go b/src/math/big/internal/asmgen/main.go index 0214a91b1c60d08ed21a5582e11f8f060b7a81d6..7f7f36c89f6306f138f06c68e4b0b0457b0f4c16 100644 --- a/src/math/big/internal/asmgen/main.go +++ b/src/math/big/internal/asmgen/main.go @@ -15,9 +15,16 @@ package asmgen var arches = []*Arch{ + Arch386, + ArchAMD64, ArchARM, + ArchARM64, + ArchLoong64, ArchMIPS, ArchMIPS64x, + ArchPPC64x, + ArchRISCV64, + ArchS390X, } // generate returns the file name and content of the generated assembly for the given architecture. diff --git a/src/math/big/internal/asmgen/ppc64.go b/src/math/big/internal/asmgen/ppc64.go new file mode 100644 index 0000000000000000000000000000000000000000..e2cf7229a3f571b361fb22a7958e509845ff2d37 --- /dev/null +++ b/src/math/big/internal/asmgen/ppc64.go @@ -0,0 +1,64 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +var ArchPPC64x = &Arch{ + Name: "ppc64x", + Build: "ppc64 || ppc64le", + WordBits: 64, + WordBytes: 8, + CarrySafeLoop: true, + + // Note: The old, hand-written ppc64x assembly used MOVDU + // to avoid explicit pointer updates in a few routines, but the new + // generated code runs just as fast, so we haven't bothered to try + // to add that back. (It's not trivial; you'd have to keep the pointers + // shifted one word in order to make the semantics work.) 
+ // + // The old assembly also used some complex vector instructions + // to implement lshVU and rshVU, but the generated code that uses + // ordinary integer instructions is much faster than the vector code was, + // at least on the power10 gomote. + + regs: []string{ + // R0 is 0 by convention. + // R1 is SP. + // R2 is TOC. + // R30 is g. + // R31 is the assembler/linker temporary (which we use too). + "R3", "R4", "R5", "R6", "R7", "R8", "R9", + "R10", "R11", "R12" /*R13 is TLS*/, "R14", "R15", "R16", "R17", "R18", "R19", + "R20", "R21", "R22", "R23", "R24", "R25", "R26", "R27", "R28", "R29", + }, + reg0: "R0", + regTmp: "R31", + + // Note: Could write an addF and subF to use ADDZE and SUBZE, + // but we have R0 so it doesn't seem to matter much. + + mov: "MOVD", + add: "ADD", + adds: "ADDC", + adcs: "ADDE", + sub: "SUB", + subs: "SUBC", + sbcs: "SUBE", + mul: "MULLD", + mulhi: "MULHDU", + lsh: "SLD", + rsh: "SRD", + and: "ANDCC", // regular AND does not accept immediates + or: "OR", + xor: "XOR", + + jmpZero: "CMP %[1]s, $0; BEQ %[2]s", + jmpNonZero: "CMP %s, $0; BNE %s", + + // Note: Using CTR means that we could free the count register + // during the loop body, but the portable logic doesn't know that, + // and we're not hurting for registers. + loopTop: "CMP %[1]s, $0; BEQ %[2]s; MOVD %[1]s, CTR", + loopBottom: "BDNZ %[2]s", +} diff --git a/src/math/big/internal/asmgen/riscv64.go b/src/math/big/internal/asmgen/riscv64.go new file mode 100644 index 0000000000000000000000000000000000000000..8995c4c1592ca708b10221dd3e6fcf1aa2168fbe --- /dev/null +++ b/src/math/big/internal/asmgen/riscv64.go @@ -0,0 +1,47 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package asmgen + +var ArchRISCV64 = &Arch{ + Name: "riscv64", + WordBits: 64, + WordBytes: 8, + CarrySafeLoop: true, + + regs: []string{ + // X0 is zero. + // X1 is LR. + // X2 is SP. 
+		// X3 is SB.
+		// X4 is TP.
+		// X27 is g.
+		// X28 and X29 are our virtual carry flags.
+		// X31 is the assembler/linker temporary (which we use too).
+		"X5", "X6", "X7", "X8", "X9",
+		"X10", "X11", "X12", "X13", "X14", "X15", "X16", "X17", "X18", "X19",
+		"X20", "X21", "X22", "X23", "X24", "X25", "X26",
+		"X30",
+	},
+
+	reg0:        "X0",
+	regCarry:    "X28",
+	regAltCarry: "X29",
+	regTmp:      "X31",
+
+	mov:   "MOV",
+	add:   "ADD",
+	sub:   "SUB",
+	mul:   "MUL",
+	mulhi: "MULHU",
+	lsh:   "SLL",
+	rsh:   "SRL",
+	and:   "AND",
+	or:    "OR",
+	xor:   "XOR",
+	sltu:  "SLTU",
+
+	jmpZero:    "BEQZ %s, %s",
+	jmpNonZero: "BNEZ %s, %s",
+}
diff --git a/src/math/big/internal/asmgen/s390x.go b/src/math/big/internal/asmgen/s390x.go
new file mode 100644
index 0000000000000000000000000000000000000000..71c9b165c6253715e7e25ab0ffea79315fb356c6
--- /dev/null
+++ b/src/math/big/internal/asmgen/s390x.go
@@ -0,0 +1,100 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package asmgen
+
+var ArchS390X = &Arch{
+	Name:          "s390x",
+	WordBits:      64,
+	WordBytes:     8,
+	CarrySafeLoop: true,
+
+	regs: []string{
+		// R0 is 0 by convention in this code (see setup).
+		// R10 is the assembler/linker temporary.
+		// R11 is a second assembler/linker temporary, for wide multiply.
+		// We allow allocating R10 and R11 so that we can use them as
+		// direct multiplication targets while tracking whether they're in use.
+		// R13 is g.
+		// R14 is LR.
+		// R15 is SP.
+		"R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9",
+		"R10", "R11", "R12",
+	},
+	reg0:       "R0",
+	regTmp:     "R10",
+	setup:      s390xSetup,
+	maxColumns: 2,
+	op3:        s390xOp3,
+	hint:       s390xHint,
+
+	// Instruction reference: chapter 7 of
+	// https://www.ibm.com/docs/en/SSQ2R2_15.0.0/com.ibm.tpf.toolkit.hlasm.doc/dz9zr006.pdf
+
+	mov:      "MOVD",
+	adds:     "ADDC", // ADD is an alias for ADDC, sets carry
+	adcs:     "ADDE",
+	subs:     "SUBC", // SUB is an alias for SUBC, sets carry
+	sbcs:     "SUBE",
+	mulWideF: s390MulWide,
+	lsh:      "SLD",
+	rsh:      "SRD",
+	and:      "AND",
+	or:       "OR",
+	xor:      "XOR",
+	neg:      "NEG",
+	lea:      "LAY", // LAY because LA only accepts positive offsets
+
+	jmpZero:    "CMPBEQ %s, $0, %s",
+	jmpNonZero: "CMPBNE %s, $0, %s",
+}
+
+func s390xSetup(f *Func) {
+	a := f.Asm
+	if f.Name == "addVV" || f.Name == "subVV" {
+		// S390x, unlike every other system, has vector instructions
+		// that can propagate carry bits during parallel adds (VACC).
+		// Instead of trying to generate that for this one system,
+		// jump to the hand-written code in arithvec_s390x.s.
+		a.Printf("\tMOVB ·hasVX(SB), R1\n")
+		a.Printf("\tCMPBEQ R1, $0, novec\n")
+		a.Printf("\tJMP ·%svec(SB)\n", f.Name)
+		a.Printf("novec:\n")
+	}
+	a.Printf("\tMOVD $0, R0\n")
+}
+
+func s390xOp3(name string) bool {
+	if name == "AND" { // AND with immediate only takes imm, reg; not imm, reg, reg.
+		return false
+	}
+	return true
+}
+
+func s390xHint(_ *Asm, h Hint) string {
+	switch h {
+	case HintMulSrc:
+		return "R11"
+	case HintMulHi:
+		return "R10"
+	}
+	return ""
+}
+
+func s390MulWide(a *Asm, src1, src2, dstlo, dsthi Reg) {
+	if src1.name != "R11" && src2.name != "R11" {
+		a.Fatalf("mulWide src1 or src2 must be R11")
+	}
+	if dstlo.name != "R11" {
+		a.Fatalf("mulWide dstlo must be R11")
+	}
+	if dsthi.name != "R10" {
+		a.Fatalf("mulWide dsthi must be R10")
+	}
+	src := src1
+	if src.name == "R11" {
+		src = src2
+	}
+	a.Printf("\tMLGR %s, R10\n", src)
+}