diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go
new file mode 100644
index 0000000000000000000000000000000000000000..f44531b88cc88af0f17fd8f1b5f82bfc6fdf690f
--- /dev/null
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@@ -0,0 +1,1047 @@
+// Copyright 2024 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssagen
+
+import (
+	"fmt"
+	"internal/buildcfg"
+
+	"cmd/compile/internal/base"
+	"cmd/compile/internal/ir"
+	"cmd/compile/internal/ssa"
+	"cmd/compile/internal/types"
+	"cmd/internal/sys"
+)
+
+var intrinsics map[intrinsicKey]intrinsicBuilder
+
+// An intrinsicBuilder converts a call node n into an ssa value that
+// implements that call as an intrinsic. args is a list of arguments to the func.
+type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
+
+type intrinsicKey struct {
+	arch *sys.Arch
+	pkg  string
+	fn   string
+}
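+
+// As an illustration, a builder that lowers a call to a single SSA op has
+// this shape (this sketch mirrors the Bswap64 builder registered below):
+//
+//	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+//		// Compute the result directly from the call's arguments.
+//		return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
+//	}
+//
+// Builders that exist only for their memory effect instead update
+// s.vars[memVar] and return nil.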
+
+func initIntrinsics() {
+	intrinsics = map[intrinsicKey]intrinsicBuilder{}
+
+	var p4 []*sys.Arch
+	var p8 []*sys.Arch
+	var lwatomics []*sys.Arch
+	for _, a := range sys.Archs {
+		if a.PtrSize == 4 {
+			p4 = append(p4, a)
+		} else {
+			p8 = append(p8, a)
+		}
+		if a.Family != sys.PPC64 {
+			lwatomics = append(lwatomics, a)
+		}
+	}
+	all := sys.Archs[:]
+
+	// add adds the intrinsic b for pkg.fn for the given list of architectures.
+	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
+		for _, a := range archs {
+			intrinsics[intrinsicKey{a, pkg, fn}] = b
+		}
+	}
+	// addF does the same as add but operates on architecture families.
+	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
+		for _, a := range sys.Archs {
+			if a.InFamily(archFamilies...) {
+				intrinsics[intrinsicKey{a, pkg, fn}] = b
+			}
+		}
+	}
+	// alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists.
+	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
+		aliased := false
+		for _, a := range archs {
+			if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok {
+				intrinsics[intrinsicKey{a, pkg, fn}] = b
+				aliased = true
+			}
+		}
+		if !aliased {
+			panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
+		}
+	}
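+	// For example, the aliases below map Loaduintptr to Load on 4-byte
+	// architectures and to Load64 on 8-byte ones; each call silently skips
+	// any architecture where the target intrinsic is missing:
+	//
+	//	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
+	//	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)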
+
+	/******** runtime ********/
+	if !base.Flag.Cfg.Instrumenting {
+		add("runtime", "slicebytetostringtmp",
+			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
+				// for the backend instead of slicebytetostringtmp calls
+				// when not instrumenting.
+				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
+			},
+			all...)
+	}
+	addF("internal/runtime/math", "MulUintptr",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if s.config.PtrSize == 4 {
+				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
+			}
+			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
+		},
+		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64)
+	add("runtime", "KeepAlive",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
+			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
+			return nil
+		},
+		all...)
+	add("runtime", "getclosureptr",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
+		},
+		all...)
+
+	add("runtime", "getcallerpc",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
+		},
+		all...)
+
+	add("runtime", "getcallersp",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
+		},
+		all...)
+
+	addF("runtime", "publicationBarrier",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
+			return nil
+		},
+		sys.ARM64, sys.PPC64, sys.RISCV64)
+
+	brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X}
+	if buildcfg.GOPPC64 >= 10 {
+		// Use this only on Power10, as the new byte-reverse instructions
+		// that Power10 provides make it worthwhile as an intrinsic.
+		brev_arch = append(brev_arch, sys.PPC64)
+	}
+	/******** internal/runtime/sys ********/
+	addF("internal/runtime/sys", "Bswap32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
+		},
+		brev_arch...)
+	addF("internal/runtime/sys", "Bswap64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
+		},
+		brev_arch...)
+
+	/****** Prefetch ******/
+	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
+			return nil
+		}
+	}
+
+	// Make Prefetch intrinsics for supported platforms.
+	// On unsupported platforms, the stub function will be eliminated.
+	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
+		sys.AMD64, sys.ARM64, sys.PPC64)
+	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
+		sys.AMD64, sys.ARM64, sys.PPC64)
+
+	/******** internal/runtime/atomic ********/
+	addF("internal/runtime/atomic", "Load",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Load8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Load64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "LoadAcq",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+		},
+		sys.PPC64, sys.S390X)
+	addF("internal/runtime/atomic", "LoadAcq64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+		},
+		sys.PPC64)
+	addF("internal/runtime/atomic", "Loadp",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+	addF("internal/runtime/atomic", "Store",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Store8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Store64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "StorepNoWB",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "StoreRel",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.PPC64, sys.S390X)
+	addF("internal/runtime/atomic", "StoreRel64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.PPC64)
+
+	addF("internal/runtime/atomic", "Xchg",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+		},
+		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Xchg64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+		},
+		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
+
+	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
+
+		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if buildcfg.GOARM64.LSE {
+				emit(s, n, args, op1, typ, needReturn)
+			} else {
+				// Support for the LSE atomic instructions is detected dynamically at run time.
+				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
+				v := s.load(types.Types[types.TBOOL], addr)
+				b := s.endBlock()
+				b.Kind = ssa.BlockIf
+				b.SetControl(v)
+				bTrue := s.f.NewBlock(ssa.BlockPlain)
+				bFalse := s.f.NewBlock(ssa.BlockPlain)
+				bEnd := s.f.NewBlock(ssa.BlockPlain)
+				b.AddEdgeTo(bTrue)
+				b.AddEdgeTo(bFalse)
+				b.Likely = ssa.BranchLikely
+
+				// We have the atomic instructions - use them directly.
+				s.startBlock(bTrue)
+				emit(s, n, args, op1, typ, needReturn)
+				s.endBlock().AddEdgeTo(bEnd)
+
+				// Use the original instruction sequence.
+				s.startBlock(bFalse)
+				emit(s, n, args, op0, typ, needReturn)
+				s.endBlock().AddEdgeTo(bEnd)
+
+				// Merge results.
+				s.startBlock(bEnd)
+			}
+			if needReturn {
+				return s.variable(n, types.Types[typ])
+			} else {
+				return nil
+			}
+		}
+	}
+	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
+		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
+	}
+	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
+		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
+	}
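+	// In sketch form, the builders returned above expand to:
+	//
+	//	if ARM64HasATOMICS { // runtime-detected LSE support
+	//		result = op1(args) // a single LSE atomic instruction
+	//	} else {
+	//		result = op0(args) // a load-exclusive/store-exclusive loop
+	//	}
+	//
+	// unless GOARM64 guarantees LSE at build time, in which case only op1 is
+	// emitted.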
+
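+	// atomicEmitterARM64 stores any result into s.vars[n] rather than
+	// returning it, so the guarded wrappers above can merge the branch
+	// results via s.variable(n, ...).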
+	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
+		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
+		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+		if needReturn {
+			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+		}
+	}
+	addF("internal/runtime/atomic", "Xchg",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Xchg64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
+		sys.ARM64)
+
+	addF("internal/runtime/atomic", "Xadd",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
+		},
+		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Xadd64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
+		},
+		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+	addF("internal/runtime/atomic", "Xadd",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Xadd64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
+		sys.ARM64)
+
+	addF("internal/runtime/atomic", "Cas",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
+		},
+		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Cas64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
+		},
+		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "CasRel",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
+		},
+		sys.PPC64)
+
+	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
+		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
+		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
+		if needReturn {
+			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
+		}
+	}
+
+	addF("internal/runtime/atomic", "Cas",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Cas64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
+		sys.ARM64)
+
+	// Old-style atomic logical operation API (all supported archs except arm64).
+	addF("internal/runtime/atomic", "And8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "And",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Or8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("internal/runtime/atomic", "Or",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
+			return nil
+		},
+		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
+
+	// arm64 always uses the new-style atomic logical operations, for both the
+	// old- and new-style APIs.
+	addF("internal/runtime/atomic", "And8",
+		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Or8",
+		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "And64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "And32",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "And",
+		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Or64",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Or32",
+		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
+		sys.ARM64)
+	addF("internal/runtime/atomic", "Or",
+		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
+		sys.ARM64)
+
+	// New-style atomic logical operations, which return the old memory value.
+	addF("internal/runtime/atomic", "And64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+			p0, p1 := s.split(v)
+			s.vars[memVar] = p1
+			return p0
+		},
+		sys.AMD64)
+	addF("internal/runtime/atomic", "And32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+			p0, p1 := s.split(v)
+			s.vars[memVar] = p1
+			return p0
+		},
+		sys.AMD64)
+	addF("internal/runtime/atomic", "Or64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
+			p0, p1 := s.split(v)
+			s.vars[memVar] = p1
+			return p0
+		},
+		sys.AMD64)
+	addF("internal/runtime/atomic", "Or32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
+			p0, p1 := s.split(v)
+			s.vars[memVar] = p1
+			return p0
+		},
+		sys.AMD64)
+
+	// Aliases for atomic load operations
+	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
+	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
+	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
+	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
+	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
+	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
+	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
+	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
+	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
+	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
+	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
+	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
+
+	// Aliases for atomic store operations
+	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
+	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
+	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
+	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
+	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
+	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
+	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
+	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
+	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
+	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
+
+	// Aliases for atomic swap operations
+	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
+	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
+	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
+	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
+
+	// Aliases for atomic add operations
+	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
+	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
+	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
+	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
+
+	// Aliases for atomic CAS operations
+	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
+	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
+	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
+	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
+	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
+	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
+	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
+
+	// Aliases for atomic And/Or operations
+	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64)
+	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64)
+
+	/******** math ********/
+	addF("math", "sqrt",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
+	addF("math", "Trunc",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
+	addF("math", "Ceil",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
+	addF("math", "Floor",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
+	addF("math", "Round",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.ARM64, sys.PPC64, sys.S390X)
+	addF("math", "RoundToEven",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.ARM64, sys.S390X, sys.Wasm)
+	addF("math", "Abs",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
+		},
+		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
+	addF("math", "Copysign",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
+		},
+		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
+	addF("math", "FMA",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+		},
+		sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X)
+	addF("math", "FMA",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if !s.config.UseFMA {
+				s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+				return s.variable(n, types.Types[types.TFLOAT64])
+			}
+
+			if buildcfg.GOAMD64 >= 3 {
+				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+			}
+
+			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely // Haswell and newer CPUs are common
+
+			// We have the intrinsic - use it directly.
+			s.startBlock(bTrue)
+			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Call the pure Go version.
+			s.startBlock(bFalse)
+			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, types.Types[types.TFLOAT64])
+		},
+		sys.AMD64)
+	addF("math", "FMA",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if !s.config.UseFMA {
+				s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+				return s.variable(n, types.Types[types.TFLOAT64])
+			}
+			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
+			v := s.load(types.Types[types.TBOOL], addr)
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely
+
+			// We have the intrinsic - use it directly.
+			s.startBlock(bTrue)
+			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Call the pure Go version.
+			s.startBlock(bFalse)
+			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, types.Types[types.TFLOAT64])
+		},
+		sys.ARM)
+
+	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if buildcfg.GOAMD64 >= 2 {
+				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
+			}
+
+			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely // most machines have SSE4.1 nowadays
+
+			// We have the intrinsic - use it directly.
+			s.startBlock(bTrue)
+			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Call the pure Go version.
+			s.startBlock(bFalse)
+			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, types.Types[types.TFLOAT64])
+		}
+	}
+	addF("math", "RoundToEven",
+		makeRoundAMD64(ssa.OpRoundToEven),
+		sys.AMD64)
+	addF("math", "Floor",
+		makeRoundAMD64(ssa.OpFloor),
+		sys.AMD64)
+	addF("math", "Ceil",
+		makeRoundAMD64(ssa.OpCeil),
+		sys.AMD64)
+	addF("math", "Trunc",
+		makeRoundAMD64(ssa.OpTrunc),
+		sys.AMD64)
+
+	/******** math/bits ********/
+	addF("math/bits", "TrailingZeros64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+	addF("math/bits", "TrailingZeros32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+	addF("math/bits", "TrailingZeros16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
+			c := s.constInt32(types.Types[types.TUINT32], 1<<16)
+			y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
+			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
+		},
+		sys.MIPS)
+	addF("math/bits", "TrailingZeros16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
+	addF("math/bits", "TrailingZeros16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
+			c := s.constInt64(types.Types[types.TUINT64], 1<<16)
+			y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
+			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
+		},
+		sys.S390X, sys.PPC64)
+	addF("math/bits", "TrailingZeros8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
+			c := s.constInt32(types.Types[types.TUINT32], 1<<8)
+			y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
+			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
+		},
+		sys.MIPS)
+	addF("math/bits", "TrailingZeros8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
+	addF("math/bits", "TrailingZeros8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
+			c := s.constInt64(types.Types[types.TUINT64], 1<<8)
+			y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
+			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
+		},
+		sys.S390X)
+	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
+	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
+	// ReverseBytes inlines correctly; there is no need to intrinsify it.
+	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
+	// On Power10, a 16-bit rotate is not available, so use the BRH instruction instead.
+	if buildcfg.GOPPC64 >= 10 {
+		addF("math/bits", "ReverseBytes16",
+			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
+			},
+			sys.PPC64)
+	}
+
+	addF("math/bits", "Len64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+	addF("math/bits", "Len32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.ARM64, sys.PPC64)
+	addF("math/bits", "Len32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if s.config.PtrSize == 4 {
+				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
+			}
+			x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0])
+			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
+		},
+		sys.ARM, sys.S390X, sys.MIPS, sys.Wasm)
+	addF("math/bits", "Len16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if s.config.PtrSize == 4 {
+				x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
+				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
+			}
+			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
+			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
+		},
+		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+	addF("math/bits", "Len16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64)
+	addF("math/bits", "Len8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if s.config.PtrSize == 4 {
+				x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
+				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
+			}
+			x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
+			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
+		},
+		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+	addF("math/bits", "Len8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64)
+	addF("math/bits", "Len",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if s.config.PtrSize == 4 {
+				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
+			}
+			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
+		},
+		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
+	// LeadingZeros is handled because it trivially calls Len.
+	addF("math/bits", "Reverse64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
+		},
+		sys.ARM64)
+	addF("math/bits", "Reverse32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
+		},
+		sys.ARM64)
+	addF("math/bits", "Reverse16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
+		},
+		sys.ARM64)
+	addF("math/bits", "Reverse8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
+		},
+		sys.ARM64)
+	addF("math/bits", "Reverse",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
+		},
+		sys.ARM64)
+	addF("math/bits", "RotateLeft8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
+		},
+		sys.AMD64, sys.RISCV64)
+	addF("math/bits", "RotateLeft16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
+		},
+		sys.AMD64, sys.RISCV64)
+	addF("math/bits", "RotateLeft32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
+		},
+		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
+	addF("math/bits", "RotateLeft64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
+		},
+		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
+	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
+
+	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			if buildcfg.GOAMD64 >= 2 {
+				return s.newValue1(op, types.Types[types.TINT], args[0])
+			}
+
+			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
+			b := s.endBlock()
+			b.Kind = ssa.BlockIf
+			b.SetControl(v)
+			bTrue := s.f.NewBlock(ssa.BlockPlain)
+			bFalse := s.f.NewBlock(ssa.BlockPlain)
+			bEnd := s.f.NewBlock(ssa.BlockPlain)
+			b.AddEdgeTo(bTrue)
+			b.AddEdgeTo(bFalse)
+			b.Likely = ssa.BranchLikely // most machines have POPCNT nowadays
+
+			// We have the intrinsic - use it directly.
+			s.startBlock(bTrue)
+			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Call the pure Go version.
+			s.startBlock(bFalse)
+			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
+			s.endBlock().AddEdgeTo(bEnd)
+
+			// Merge results.
+			s.startBlock(bEnd)
+			return s.variable(n, types.Types[types.TINT])
+		}
+	}
+	addF("math/bits", "OnesCount64",
+		makeOnesCountAMD64(ssa.OpPopCount64),
+		sys.AMD64)
+	addF("math/bits", "OnesCount64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
+		},
+		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
+	addF("math/bits", "OnesCount32",
+		makeOnesCountAMD64(ssa.OpPopCount32),
+		sys.AMD64)
+	addF("math/bits", "OnesCount32",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
+		},
+		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
+	addF("math/bits", "OnesCount16",
+		makeOnesCountAMD64(ssa.OpPopCount16),
+		sys.AMD64)
+	addF("math/bits", "OnesCount16",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
+		},
+		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
+	addF("math/bits", "OnesCount8",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
+		},
+		sys.S390X, sys.PPC64, sys.Wasm)
+	addF("math/bits", "OnesCount",
+		makeOnesCountAMD64(ssa.OpPopCount64),
+		sys.AMD64)
+	addF("math/bits", "Mul64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
+		},
+		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
+	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
+	alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
+	addF("math/bits", "Add64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
+		},
+		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
+	alias("math/bits", "Add", "math/bits", "Add64", p8...)
+	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
+	addF("math/bits", "Sub64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
+		},
+		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
+	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
+	addF("math/bits", "Div64",
+		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+			// Check for divide-by-zero and overflow, and panic with an appropriate message.
+			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
+			s.check(cmpZero, ir.Syms.Panicdivide)
+			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
+			s.check(cmpOverflow, ir.Syms.Panicoverflow)
+			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
+		},
+		sys.AMD64)
+	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
+
+	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
+	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
+	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
+	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
+	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
+	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
+
+	/******** sync/atomic ********/
+
+	// Note: these are disabled when the race detector is enabled; see findIntrinsic below.
+	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
+	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
+	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
+	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
+	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
+	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
+	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
+
+	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
+	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
+	// Note: not StorePointer, which needs a write barrier. Same below for {CompareAnd}Swap.
+	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
+	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
+	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
+	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
+
+	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
+	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
+	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
+	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
+	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
+	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
+
+	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
+	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
+	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
+	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
+	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
+	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
+
+	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
+	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
+	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
+	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
+	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
+	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
+
+	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
+	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
+
+	/******** math/big ********/
+	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
+}
+
+// findIntrinsic returns a function that builds the SSA equivalent of the
+// function identified by the symbol sym. If sym does not name an intrinsic,
+// findIntrinsic returns nil.
+func findIntrinsic(sym *types.Sym) intrinsicBuilder {
+	if sym == nil || sym.Pkg == nil {
+		return nil
+	}
+	pkg := sym.Pkg.Path
+	if sym.Pkg == ir.Pkgs.Runtime {
+		pkg = "runtime"
+	}
+	if base.Flag.Race && pkg == "sync/atomic" {
+		// The race detector needs to be able to intercept these calls.
+		// We can't intrinsify them.
+		return nil
+	}
+	// Skip intrinsifying math functions (which may contain hard-float
+	// instructions) when compiling in soft-float mode.
+	if Arch.SoftFloat && pkg == "math" {
+		return nil
+	}
+
+	fn := sym.Name
+	if ssa.IntrinsicsDisable {
+		if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") {
+			// These runtime functions don't have definitions; they must be intrinsics.
+		} else {
+			return nil
+		}
+	}
+	return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}]
+}
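+
+// A caller in the SSA generator might consult the table roughly like this
+// (a sketch: fn, s, n, and args stand in for the enclosing lowering state):
+//
+//	if b := findIntrinsic(fn.Sym()); b != nil {
+//		return b(s, n, args) // emit the intrinsic instead of a normal call
+//	}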
+
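+// IsIntrinsicCall reports whether n is a call to a function that will be
+// lowered as an intrinsic.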
+func IsIntrinsicCall(n *ir.CallExpr) bool {
+	if n == nil {
+		return false
+	}
+	name, ok := n.Fun.(*ir.Name)
+	if !ok {
+		return false
+	}
+	return findIntrinsic(name.Sym()) != nil
+}
diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go
index c02f5f51291d47f9f1e008d1f53a4d29832f651c..67479ace3b4fd3cbc19a13c00691e2d63c8b6e82 100644
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@@ -221,6 +221,10 @@ func InitConfig() {
 	ir.Syms.SigPanic = typecheck.LookupRuntimeFunc("sigpanic")
 }
 
+func InitTables() {
+	initIntrinsics()
+}
+
 // AbiForBodylessFuncStackMap returns the ABI for a bodyless function's stack map.
 // This is not necessarily the ABI used to call it.
 // Currently (1.17 dev) such a stack map is always ABI0;
@@ -4200,1037 +4204,6 @@ func (s *state) split(v *ssa.Value) (*ssa.Value, *ssa.Value) {
 	return p0, p1
 }
 
-var intrinsics map[intrinsicKey]intrinsicBuilder
-
-// An intrinsicBuilder converts a call node n into an ssa value that
-// implements that call as an intrinsic. args is a list of arguments to the func.
-type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
-
-type intrinsicKey struct {
-	arch *sys.Arch
-	pkg  string
-	fn   string
-}
-
-func InitTables() {
-	intrinsics = map[intrinsicKey]intrinsicBuilder{}
-
-	var p4 []*sys.Arch
-	var p8 []*sys.Arch
-	var lwatomics []*sys.Arch
-	for _, a := range sys.Archs {
-		if a.PtrSize == 4 {
-			p4 = append(p4, a)
-		} else {
-			p8 = append(p8, a)
-		}
-		if a.Family != sys.PPC64 {
-			lwatomics = append(lwatomics, a)
-		}
-	}
-	all := sys.Archs[:]
-
-	// add adds the intrinsic b for pkg.fn for the given list of architectures.
-	add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
-		for _, a := range archs {
-			intrinsics[intrinsicKey{a, pkg, fn}] = b
-		}
-	}
-	// addF does the same as add but operates on architecture families.
-	addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
-		for _, a := range sys.Archs {
-			if a.InFamily(archFamilies...) {
-				intrinsics[intrinsicKey{a, pkg, fn}] = b
-			}
-		}
-	}
-	// alias defines pkg.fn = pkg2.fn2 for all architectures in archs for which pkg2.fn2 exists.
-	alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
-		aliased := false
-		for _, a := range archs {
-			if b, ok := intrinsics[intrinsicKey{a, pkg2, fn2}]; ok {
-				intrinsics[intrinsicKey{a, pkg, fn}] = b
-				aliased = true
-			}
-		}
-		if !aliased {
-			panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
-		}
-	}
-
-	/******** runtime ********/
-	if !base.Flag.Cfg.Instrumenting {
-		add("runtime", "slicebytetostringtmp",
-			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-				// Compiler frontend optimizations emit OBYTES2STRTMP nodes
-				// for the backend instead of slicebytetostringtmp calls
-				// when not instrumenting.
-				return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
-			},
-			all...)
-	}
-	addF("internal/runtime/math", "MulUintptr",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if s.config.PtrSize == 4 {
-				return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
-			}
-			return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
-		},
-		sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.RISCV64, sys.ARM64)
-	add("runtime", "KeepAlive",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
-			s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
-			return nil
-		},
-		all...)
-	add("runtime", "getclosureptr",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
-		},
-		all...)
-
-	add("runtime", "getcallerpc",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
-		},
-		all...)
-
-	add("runtime", "getcallersp",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
-		},
-		all...)
-
-	addF("runtime", "publicationBarrier",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
-			return nil
-		},
-		sys.ARM64, sys.PPC64, sys.RISCV64)
-
-	brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X}
-	if buildcfg.GOPPC64 >= 10 {
-		// Use only on Power10 as the new byte reverse instructions that Power10 provide
-		// make it worthwhile as an intrinsic
-		brev_arch = append(brev_arch, sys.PPC64)
-	}
-	/******** internal/runtime/sys ********/
-	addF("internal/runtime/sys", "Bswap32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
-		},
-		brev_arch...)
-	addF("internal/runtime/sys", "Bswap64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
-		},
-		brev_arch...)
-
-	/****** Prefetch ******/
-	makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
-			return nil
-		}
-	}
-
-	// Make Prefetch intrinsics for supported platforms
-	// On the unsupported platforms stub function will be eliminated
-	addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
-		sys.AMD64, sys.ARM64, sys.PPC64)
-	addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
-		sys.AMD64, sys.ARM64, sys.PPC64)
-
-	/******** internal/runtime/atomic ********/
-	addF("internal/runtime/atomic", "Load",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Load8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Load64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "LoadAcq",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-		},
-		sys.PPC64, sys.S390X)
-	addF("internal/runtime/atomic", "LoadAcq64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-		},
-		sys.PPC64)
-	addF("internal/runtime/atomic", "Loadp",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-	addF("internal/runtime/atomic", "Store",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Store8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Store64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "StorepNoWB",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "StoreRel",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.PPC64, sys.S390X)
-	addF("internal/runtime/atomic", "StoreRel64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.PPC64)
-
-	addF("internal/runtime/atomic", "Xchg",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-		},
-		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Xchg64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-		},
-		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-	type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
-
-	makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
-		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if buildcfg.GOARM64.LSE {
-				emit(s, n, args, op1, typ, needReturn)
-			} else {
-				// Whether the target supports the LSE atomics is determined
-				// by dynamic CPU feature detection (ARM64HasATOMICS).
-				addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
-				v := s.load(types.Types[types.TBOOL], addr)
-				b := s.endBlock()
-				b.Kind = ssa.BlockIf
-				b.SetControl(v)
-				bTrue := s.f.NewBlock(ssa.BlockPlain)
-				bFalse := s.f.NewBlock(ssa.BlockPlain)
-				bEnd := s.f.NewBlock(ssa.BlockPlain)
-				b.AddEdgeTo(bTrue)
-				b.AddEdgeTo(bFalse)
-				b.Likely = ssa.BranchLikely
-
-				// We have the atomic instructions - use them directly.
-				s.startBlock(bTrue)
-				emit(s, n, args, op1, typ, needReturn)
-				s.endBlock().AddEdgeTo(bEnd)
-
-				// Use original instruction sequence.
-				s.startBlock(bFalse)
-				emit(s, n, args, op0, typ, needReturn)
-				s.endBlock().AddEdgeTo(bEnd)
-
-				// Merge results.
-				s.startBlock(bEnd)
-			}
-			if needReturn {
-				return s.variable(n, types.Types[typ])
-			} else {
-				return nil
-			}
-		}
-	}
-	makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
-		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
-	}
-	makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
-		return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
-	}
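-
-	// The guarded ARM64 builders emit control flow shaped like (sketch):
-	//
-	//	if ARM64HasATOMICS { // predicted likely
-	//		emit op1 (the LSE variant)
-	//	} else {
-	//		emit op0 (the original, non-LSE op)
-	//	}
-	//
-	// unless GOARM64 guarantees LSE, in which case op1 is emitted directly.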
-
-	atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
-		v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
-		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-		if needReturn {
-			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
-		}
-	}
-	addF("internal/runtime/atomic", "Xchg",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Xchg64",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
-		sys.ARM64)
-
-	addF("internal/runtime/atomic", "Xadd",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
-		},
-		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Xadd64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
-		},
-		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-	addF("internal/runtime/atomic", "Xadd",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Xadd64",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
-		sys.ARM64)
-
-	addF("internal/runtime/atomic", "Cas",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
-		},
-		sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Cas64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
-		},
-		sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "CasRel",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-			s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-			return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
-		},
-		sys.PPC64)
-
-	atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
-		v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
-		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
-		if needReturn {
-			s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
-		}
-	}
-
-	addF("internal/runtime/atomic", "Cas",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Cas64",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
-		sys.ARM64)
-
-	// Old-style atomic logical operation API (all supported archs except arm64).
-	addF("internal/runtime/atomic", "And8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "And",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Or8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("internal/runtime/atomic", "Or",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
-			return nil
-		},
-		sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
-
-	// arm64 always uses the new-style atomic logical operations, for both
-	// the old-style and the new-style APIs.
-	addF("internal/runtime/atomic", "And8",
-		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Or8",
-		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "And64",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "And32",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "And",
-		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Or64",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Or32",
-		makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
-		sys.ARM64)
-	addF("internal/runtime/atomic", "Or",
-		makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
-		sys.ARM64)
-
-	// New-style atomic logical operations, which return the old memory value.
-	addF("internal/runtime/atomic", "And64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-			p0, p1 := s.split(v)
-			s.vars[memVar] = p1
-			return p0
-		},
-		sys.AMD64)
-	addF("internal/runtime/atomic", "And32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-			p0, p1 := s.split(v)
-			s.vars[memVar] = p1
-			return p0
-		},
-		sys.AMD64)
-	addF("internal/runtime/atomic", "Or64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
-			p0, p1 := s.split(v)
-			s.vars[memVar] = p1
-			return p0
-		},
-		sys.AMD64)
-	addF("internal/runtime/atomic", "Or32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
-			p0, p1 := s.split(v)
-			s.vars[memVar] = p1
-			return p0
-		},
-		sys.AMD64)
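-
-	// s.split separates the tuple produced by the op into its value half
-	// (returned) and its memory half (stored back into memVar), equivalent
-	// to the explicit Select0/Select1 pairs used earlier.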
-
-	// Aliases for atomic load operations
-	alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
-	alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
-	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
-	alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
-	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
-	alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
-	alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
-	alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
-	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
-	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
-	alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
-	alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
-
-	// Aliases for atomic store operations
-	alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
-	alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
-	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
-	alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
-	alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
-	alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
-	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
-	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
-	alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
-	alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
-
-	// Aliases for atomic swap operations
-	alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
-	alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
-	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
-	alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
-
-	// Aliases for atomic add operations
-	alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
-	alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
-	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
-	alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
-
-	// Aliases for atomic CAS operations
-	alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
-	alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
-	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
-	alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
-	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
-	alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
-	alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
-
-	// Aliases for atomic And/Or operations
-	alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64)
-	alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64)
-
-	/******** math ********/
-	addF("math", "sqrt",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
-	addF("math", "Trunc",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
-	addF("math", "Ceil",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
-	addF("math", "Floor",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
-	addF("math", "Round",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.ARM64, sys.PPC64, sys.S390X)
-	addF("math", "RoundToEven",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.ARM64, sys.S390X, sys.Wasm)
-	addF("math", "Abs",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
-		},
-		sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
-	addF("math", "Copysign",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
-		},
-		sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
-	addF("math", "FMA",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-		},
-		sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X)
-	addF("math", "FMA",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if !s.config.UseFMA {
-				s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-				return s.variable(n, types.Types[types.TFLOAT64])
-			}
-
-			if buildcfg.GOAMD64 >= 3 {
-				return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-			}
-
-			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
-			b := s.endBlock()
-			b.Kind = ssa.BlockIf
-			b.SetControl(v)
-			bTrue := s.f.NewBlock(ssa.BlockPlain)
-			bFalse := s.f.NewBlock(ssa.BlockPlain)
-			bEnd := s.f.NewBlock(ssa.BlockPlain)
-			b.AddEdgeTo(bTrue)
-			b.AddEdgeTo(bFalse)
-			b.Likely = ssa.BranchLikely // Haswell and newer CPUs are common
-
-			// We have the intrinsic - use it directly.
-			s.startBlock(bTrue)
-			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Call the pure Go version.
-			s.startBlock(bFalse)
-			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Merge results.
-			s.startBlock(bEnd)
-			return s.variable(n, types.Types[types.TFLOAT64])
-		},
-		sys.AMD64)
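-
-	// When GOAMD64 >= 3, the v3 microarchitecture level guarantees FMA
-	// support at build time, so the CPU-feature check and the pure Go
-	// fallback branch above are not generated.
-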
-	addF("math", "FMA",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if !s.config.UseFMA {
-				s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-				return s.variable(n, types.Types[types.TFLOAT64])
-			}
-			addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
-			v := s.load(types.Types[types.TBOOL], addr)
-			b := s.endBlock()
-			b.Kind = ssa.BlockIf
-			b.SetControl(v)
-			bTrue := s.f.NewBlock(ssa.BlockPlain)
-			bFalse := s.f.NewBlock(ssa.BlockPlain)
-			bEnd := s.f.NewBlock(ssa.BlockPlain)
-			b.AddEdgeTo(bTrue)
-			b.AddEdgeTo(bFalse)
-			b.Likely = ssa.BranchLikely
-
-			// We have the intrinsic - use it directly.
-			s.startBlock(bTrue)
-			s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Call the pure Go version.
-			s.startBlock(bFalse)
-			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Merge results.
-			s.startBlock(bEnd)
-			return s.variable(n, types.Types[types.TFLOAT64])
-		},
-		sys.ARM)
-
-	makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if buildcfg.GOAMD64 >= 2 {
-				return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
-			}
-
-			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
-			b := s.endBlock()
-			b.Kind = ssa.BlockIf
-			b.SetControl(v)
-			bTrue := s.f.NewBlock(ssa.BlockPlain)
-			bFalse := s.f.NewBlock(ssa.BlockPlain)
-			bEnd := s.f.NewBlock(ssa.BlockPlain)
-			b.AddEdgeTo(bTrue)
-			b.AddEdgeTo(bFalse)
-			b.Likely = ssa.BranchLikely // most machines have SSE4.1 nowadays
-
-			// We have the intrinsic - use it directly.
-			s.startBlock(bTrue)
-			s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Call the pure Go version.
-			s.startBlock(bFalse)
-			s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Merge results.
-			s.startBlock(bEnd)
-			return s.variable(n, types.Types[types.TFLOAT64])
-		}
-	}
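-
-	// GOAMD64 >= 2 (the v2 level) guarantees SSE4.1, so the rounding op can
-	// be emitted unconditionally; otherwise the builder dispatches at run
-	// time between the op and the pure Go version, as above.
-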
-	addF("math", "RoundToEven",
-		makeRoundAMD64(ssa.OpRoundToEven),
-		sys.AMD64)
-	addF("math", "Floor",
-		makeRoundAMD64(ssa.OpFloor),
-		sys.AMD64)
-	addF("math", "Ceil",
-		makeRoundAMD64(ssa.OpCeil),
-		sys.AMD64)
-	addF("math", "Trunc",
-		makeRoundAMD64(ssa.OpTrunc),
-		sys.AMD64)
-
-	/******** math/bits ********/
-	addF("math/bits", "TrailingZeros64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-	addF("math/bits", "TrailingZeros32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-	addF("math/bits", "TrailingZeros16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
-			c := s.constInt32(types.Types[types.TUINT32], 1<<16)
-			y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
-			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
-		},
-		sys.MIPS)
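-
-	// Setting bit 16 above makes the count well defined for a zero argument:
-	// Ctz32(x | 1<<16) is 16 when x == 0, which is exactly TrailingZeros16(0).
-	// The 8-bit variants below use 1<<8 the same way.
-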
-	addF("math/bits", "TrailingZeros16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
-	addF("math/bits", "TrailingZeros16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
-			c := s.constInt64(types.Types[types.TUINT64], 1<<16)
-			y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
-			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
-		},
-		sys.S390X, sys.PPC64)
-	addF("math/bits", "TrailingZeros8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
-			c := s.constInt32(types.Types[types.TUINT32], 1<<8)
-			y := s.newValue2(ssa.OpOr32, types.Types[types.TUINT32], x, c)
-			return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], y)
-		},
-		sys.MIPS)
-	addF("math/bits", "TrailingZeros8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
-	addF("math/bits", "TrailingZeros8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
-			c := s.constInt64(types.Types[types.TUINT64], 1<<8)
-			y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
-			return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
-		},
-		sys.S390X)
-	alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
-	alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
-	// ReverseBytes inlines correctly; no need to intrinsify it.
-	// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
-	// On Power10, a 16-bit rotate is not available, so use the BRH instruction.
-	if buildcfg.GOPPC64 >= 10 {
-		addF("math/bits", "ReverseBytes16",
-			func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-				return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
-			},
-			sys.PPC64)
-	}
-
-	addF("math/bits", "Len64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-	addF("math/bits", "Len32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.ARM64, sys.PPC64)
-	addF("math/bits", "Len32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if s.config.PtrSize == 4 {
-				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
-			}
-			x := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0])
-			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
-		},
-		sys.ARM, sys.S390X, sys.MIPS, sys.Wasm)
-	addF("math/bits", "Len16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if s.config.PtrSize == 4 {
-				x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
-				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
-			}
-			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
-			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
-		},
-		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-	addF("math/bits", "Len16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64)
-	addF("math/bits", "Len8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if s.config.PtrSize == 4 {
-				x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
-				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
-			}
-			x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
-			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
-		},
-		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-	addF("math/bits", "Len8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64)
-	addF("math/bits", "Len",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if s.config.PtrSize == 4 {
-				return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
-			}
-			return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
-		},
-		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
-	// LeadingZeros needs no intrinsic of its own: it trivially calls Len and inlines.
-	addF("math/bits", "Reverse64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
-		},
-		sys.ARM64)
-	addF("math/bits", "Reverse32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
-		},
-		sys.ARM64)
-	addF("math/bits", "Reverse16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
-		},
-		sys.ARM64)
-	addF("math/bits", "Reverse8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
-		},
-		sys.ARM64)
-	addF("math/bits", "Reverse",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
-		},
-		sys.ARM64)
-	addF("math/bits", "RotateLeft8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
-		},
-		sys.AMD64, sys.RISCV64)
-	addF("math/bits", "RotateLeft16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
-		},
-		sys.AMD64, sys.RISCV64)
-	addF("math/bits", "RotateLeft32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
-		},
-		sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
-	addF("math/bits", "RotateLeft64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
-		},
-		sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
-	alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
-
-	makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-		return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			if buildcfg.GOAMD64 >= 2 {
-				return s.newValue1(op, types.Types[types.TINT], args[0])
-			}
-
-			v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
-			b := s.endBlock()
-			b.Kind = ssa.BlockIf
-			b.SetControl(v)
-			bTrue := s.f.NewBlock(ssa.BlockPlain)
-			bFalse := s.f.NewBlock(ssa.BlockPlain)
-			bEnd := s.f.NewBlock(ssa.BlockPlain)
-			b.AddEdgeTo(bTrue)
-			b.AddEdgeTo(bFalse)
-			b.Likely = ssa.BranchLikely // most machines have POPCNT nowadays
-
-			// We have the intrinsic - use it directly.
-			s.startBlock(bTrue)
-			s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Call the pure Go version.
-			s.startBlock(bFalse)
-			s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
-			s.endBlock().AddEdgeTo(bEnd)
-
-			// Merge results.
-			s.startBlock(bEnd)
-			return s.variable(n, types.Types[types.TINT])
-		}
-	}
-	addF("math/bits", "OnesCount64",
-		makeOnesCountAMD64(ssa.OpPopCount64),
-		sys.AMD64)
-	addF("math/bits", "OnesCount64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
-		},
-		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
-	addF("math/bits", "OnesCount32",
-		makeOnesCountAMD64(ssa.OpPopCount32),
-		sys.AMD64)
-	addF("math/bits", "OnesCount32",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
-		},
-		sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
-	addF("math/bits", "OnesCount16",
-		makeOnesCountAMD64(ssa.OpPopCount16),
-		sys.AMD64)
-	addF("math/bits", "OnesCount16",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
-		},
-		sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
-	addF("math/bits", "OnesCount8",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
-		},
-		sys.S390X, sys.PPC64, sys.Wasm)
-	addF("math/bits", "OnesCount",
-		makeOnesCountAMD64(ssa.OpPopCount64),
-		sys.AMD64)
-	addF("math/bits", "Mul64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
-		},
-		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
-	alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
-	alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
-	addF("math/bits", "Add64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
-		},
-		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
-	alias("math/bits", "Add", "math/bits", "Add64", p8...)
-	alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
-	addF("math/bits", "Sub64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
-		},
-		sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
-	alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
-	addF("math/bits", "Div64",
-		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
-			// Check for divide-by-zero and overflow, and panic with the appropriate message.
-			cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
-			s.check(cmpZero, ir.Syms.Panicdivide)
-			cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
-			s.check(cmpOverflow, ir.Syms.Panicoverflow)
-			return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
-		},
-		sys.AMD64)
-	alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
-
-	alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
-	alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
-	alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
-	alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
-	alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
-	alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
-
-	/******** sync/atomic ********/
-
-	// Note: these are disabled by base.Flag.Race in findIntrinsic below.
-	alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
-	alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
-	alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
-	alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
-	alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
-	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
-	alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
-
-	alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
-	alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
-	// Note: not StorePointer; that needs a write barrier. Same below for {CompareAnd}Swap.
-	alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
-	alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
-	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
-	alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
-
-	alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
-	alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
-	alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
-	alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
-	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
-	alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
-
-	alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
-	alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
-	alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
-	alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
-	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
-	alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
-
-	alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
-	alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
-	alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
-	alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
-	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
-	alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
-
-	alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
-	alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64)
-
-	/******** math/big ********/
-	alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
-}
-
-// findIntrinsic returns a function which builds the SSA equivalent of the
-// function identified by the symbol sym. If sym is not an intrinsic, it returns nil.
-func findIntrinsic(sym *types.Sym) intrinsicBuilder {
-	if sym == nil || sym.Pkg == nil {
-		return nil
-	}
-	pkg := sym.Pkg.Path
-	if sym.Pkg == ir.Pkgs.Runtime {
-		pkg = "runtime"
-	}
-	if base.Flag.Race && pkg == "sync/atomic" {
-		// The race detector needs to be able to intercept these calls.
-		// We can't intrinsify them.
-		return nil
-	}
-	// Skip intrinsifying math functions (which may contain hard-float
-	// instructions) when compiling in soft-float mode.
-	if Arch.SoftFloat && pkg == "math" {
-		return nil
-	}
-
-	fn := sym.Name
-	if ssa.IntrinsicsDisable {
-		if pkg == "runtime" && (fn == "getcallerpc" || fn == "getcallersp" || fn == "getclosureptr") {
-			// These runtime functions don't have definitions, so they must be intrinsics.
-		} else {
-			return nil
-		}
-	}
-	return intrinsics[intrinsicKey{Arch.LinkArch.Arch, pkg, fn}]
-}
-
-func IsIntrinsicCall(n *ir.CallExpr) bool {
-	if n == nil {
-		return false
-	}
-	name, ok := n.Fun.(*ir.Name)
-	if !ok {
-		return false
-	}
-	return findIntrinsic(name.Sym()) != nil
-}
-
 // intrinsicCall converts a call to a recognized intrinsic function into the intrinsic SSA operation.
 func (s *state) intrinsicCall(n *ir.CallExpr) *ssa.Value {
 	v := findIntrinsic(n.Fun.Sym())(s, n, s.intrinsicArgs(n))