// Copyright 2016 The Go Authors. All rights reserved.
    // Use of this source code is governed by a BSD-style
    // license that can be found in the LICENSE file.
    
    package amd64
    
    import (
    	"fmt"
    	"math"
    
    	"cmd/compile/internal/gc"
    	"cmd/compile/internal/ssa"
    
    	"cmd/compile/internal/types"
    
    	"cmd/internal/obj"
    	"cmd/internal/obj/x86"
    )
    
    // markMoves marks any MOVXconst ops that need to avoid clobbering flags.
    func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
    	flive := b.FlagsLiveAtEnd
    	if b.Control != nil && b.Control.Type.IsFlags() {
    		flive = true
    	}
    	for i := len(b.Values) - 1; i >= 0; i-- {
    		v := b.Values[i]
    
    		if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
    
    			// The "mark" is any non-nil Aux value.
    			v.Aux = v
    		}
    		if v.Type.IsFlags() {
    			flive = false
    		}
    		for _, a := range v.Args {
    			if a.Type.IsFlags() {
    				flive = true
    			}
    		}
    	}
    }
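
// Note: the mark set here is consulted when MOVLconst/MOVQconst are
// assembled in ssaGenValue below: a zero constant is normally rewritten
// to "XORL reg, reg", which clobbers the flags, so that rewrite is
// suppressed whenever v.Aux is non-nil.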
    
// loadByType returns the load instruction of the given type.
func loadByType(t *types.Type) obj.As {
	if !t.IsFloat() && t.Size() <= 2 {
		if t.Size() == 1 {
			return x86.AMOVBLZX
    		} else {
    			return x86.AMOVWLZX
    		}
    	}
    	// Otherwise, there's no difference between load and store opcodes.
    	return storeByType(t)
    }
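
// Byte- and word-sized integer loads are widened to the zero-extending
// MOVBLZX/MOVWLZX forms above so that a load never performs a partial
// register write.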
    
// storeByType returns the store instruction of the given type.
func storeByType(t *types.Type) obj.As {
	width := t.Size()
	if t.IsFloat() {
		switch width {
    		case 4:
    			return x86.AMOVSS
    		case 8:
    			return x86.AMOVSD
    		}
    	} else {
    		switch width {
    		case 1:
    			return x86.AMOVB
    		case 2:
    			return x86.AMOVW
    		case 4:
    			return x86.AMOVL
    		case 8:
    			return x86.AMOVQ
    		}
    	}
    	panic("bad store type")
    }
    
// moveByType returns the reg->reg move instruction of the given type.
func moveByType(t *types.Type) obj.As {
	if t.IsFloat() {
    		// Moving the whole sse2 register is faster
    		// than moving just the correct low portion of it.
    		// There is no xmm->xmm move with 1 byte opcode,
    		// so use movups, which has 2 byte opcode.
    		return x86.AMOVUPS
	} else {
		switch t.Size() {
    		case 1:
    			// Avoids partial register write
    			return x86.AMOVL
    		case 2:
    			return x86.AMOVL
    		case 4:
    			return x86.AMOVL
    		case 8:
    			return x86.AMOVQ
    		case 16:
    			return x86.AMOVUPS // int128s are in SSE registers
    		default:
    
    			panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
    
    		}
    	}
    }
    
    // opregreg emits instructions for
    //     dest := dest(To) op src(From)
    // and also returns the created obj.Prog so it
// may be further adjusted (offset, scale, etc).
func opregreg(s *gc.SSAGenState, op obj.As, dest, src int16) *obj.Prog {
    	p := s.Prog(op)
    
    	p.From.Type = obj.TYPE_REG
    	p.To.Type = obj.TYPE_REG
    	p.To.Reg = dest
    	p.From.Reg = src
    	return p
    }
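
// For example, opregreg(s, x86.AADDQ, x86.REG_AX, x86.REG_BX) emits
// "ADDQ BX, AX" (AX += BX) and returns the Prog so the caller can adjust
// it further.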
    
    
    // memIdx fills out a as an indexed memory reference for v.
    // It assumes that the base register and the index register
    // are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
    // The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
    func memIdx(a *obj.Addr, v *ssa.Value) {
    	r, i := v.Args[0].Reg(), v.Args[1].Reg()
    	a.Type = obj.TYPE_MEM
    	a.Scale = v.Op.Scale()
    	if a.Scale == 1 && i == x86.REG_SP {
    		r, i = i, r
    	}
    	a.Reg = r
    	a.Index = i
    }
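
// Note: the base/index swap in memIdx is needed because SP cannot be
// encoded as an index register in an x86-64 SIB byte; with scale 1 the
// two registers are interchangeable, so (r)(SP*1) is emitted as
// (SP)(r*1) instead.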
    
    
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ.
// See runtime/mkduff.go.
    func duffStart(size int64) int64 {
    	x, _ := duff(size)
    	return x
    }
    func duffAdj(size int64) int64 {
    	_, x := duff(size)
    	return x
    }
    
    // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
    // required to use the duffzero mechanism for a block of the given size.
    func duff(size int64) (int64, int64) {
    	if size < 32 || size > 1024 || size%dzClearStep != 0 {
    		panic("bad duffzero size")
    	}
    	steps := size / dzClearStep
    	blocks := steps / dzBlockLen
    	steps %= dzBlockLen
    	off := dzBlockSize * (dzBlocks - blocks)
    	var adj int64
    	if steps != 0 {
    
    		off -= dzMovSize * steps
    		adj -= dzClearStep * (dzBlockLen - steps)
    	}
    	return off, adj
    }
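
// Roughly, the computation in duff above works out as follows: a size
// that is a whole number of blocks (dzBlockLen clears of dzClearStep
// bytes each) jumps to a block boundary and needs no pointer adjustment.
// A leftover partial block instead enters dzMovSize bytes earlier per
// extra MOV and pre-adjusts DI backwards by the bytes the skipped MOVs
// of that block would have cleared, so the tail of the block zeroes
// exactly the remaining memory.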
    
    
    func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
    	switch v.Op {
    
	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		switch {
		case r == r1:
			p := s.Prog(v.Op.Asm())
			p.From.Type = obj.TYPE_REG
			p.From.Reg = r2
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		case r == r2:
			p := s.Prog(v.Op.Asm())
			p.From.Type = obj.TYPE_REG
			p.From.Reg = r1
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		default:
			var asm obj.As
			if v.Op == ssa.OpAMD64ADDQ {
				asm = x86.ALEAQ
			} else {
				asm = x86.ALEAL
			}
			p := s.Prog(asm)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = r1
			p.From.Scale = 1
			p.From.Index = r2
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		}
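		// Using LEAQ/LEAL (r1)(r2*1), r in the default branch above lets
		// the register allocator place the sum in a register distinct
		// from both inputs, unlike the two-operand ADD forms below.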
    
    	// 2-address opcode arithmetic
    
    	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
    		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
    		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
    		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
    		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
    		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
    
    		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
    		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
    
    		ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
    		ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
    
    		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
    		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
    
    		ssa.OpAMD64PXOR,
    		ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
    		ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
    		ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ:
    
    		r := v.Reg()
		if r != v.Args[0].Reg() {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
    
    	case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
    		// Arg[0] (the dividend) is in AX.
    		// Arg[1] (the divisor) can be in any other register.
    		// Result[0] (the quotient) is in AX.
    		// Result[1] (the remainder) is in DX.
    
    		r := v.Args[1].Reg()
    
    
		// Zero extend dividend.
		c := s.Prog(x86.AXORL)
		c.From.Type = obj.TYPE_REG
    		c.From.Reg = x86.REG_DX
    		c.To.Type = obj.TYPE_REG
    		c.To.Reg = x86.REG_DX
    
		// Issue divide.
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
    		p.From.Reg = r
    
    	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
    		// Arg[0] (the dividend) is in AX.
    		// Arg[1] (the divisor) can be in any other register.
    		// Result[0] (the quotient) is in AX.
    		// Result[1] (the remainder) is in DX.
    
		r := v.Args[1].Reg()
		var j1 *obj.Prog
    
    		// CPU faults upon signed overflow, which occurs when the most
    		// negative int is divided by -1. Handle divide by -1 as a special case.
    
    		if ssa.NeedsFixUp(v) {
    			var c *obj.Prog
    			switch v.Op {
    			case ssa.OpAMD64DIVQ:
    				c = s.Prog(x86.ACMPQ)
    			case ssa.OpAMD64DIVL:
    				c = s.Prog(x86.ACMPL)
    			case ssa.OpAMD64DIVW:
    				c = s.Prog(x86.ACMPW)
    			}
    			c.From.Type = obj.TYPE_REG
    			c.From.Reg = r
    			c.To.Type = obj.TYPE_CONST
    			c.To.Offset = -1
    			j1 = s.Prog(x86.AJEQ)
    			j1.To.Type = obj.TYPE_BRANCH
    
    		}
    
    		// Sign extend dividend.
		switch v.Op {
		case ssa.OpAMD64DIVQ:
			s.Prog(x86.ACQO)
		case ssa.OpAMD64DIVL:
			s.Prog(x86.ACDQ)
		case ssa.OpAMD64DIVW:
			s.Prog(x86.ACWD)
		}

		// Issue divide.
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r

    		if j1 != nil {
    			// Skip over -1 fixup code.
    			j2 := s.Prog(obj.AJMP)
    			j2.To.Type = obj.TYPE_BRANCH
    
    			// Issue -1 fixup code.
    			// n / -1 = -n
    			var n1 *obj.Prog
    			switch v.Op {
    			case ssa.OpAMD64DIVQ:
    				n1 = s.Prog(x86.ANEGQ)
    			case ssa.OpAMD64DIVL:
    				n1 = s.Prog(x86.ANEGL)
    			case ssa.OpAMD64DIVW:
    				n1 = s.Prog(x86.ANEGW)
    			}
    			n1.To.Type = obj.TYPE_REG
    			n1.To.Reg = x86.REG_AX
    
    			// n % -1 == 0
    			n2 := s.Prog(x86.AXORL)
    			n2.From.Type = obj.TYPE_REG
    			n2.From.Reg = x86.REG_DX
    			n2.To.Type = obj.TYPE_REG
    			n2.To.Reg = x86.REG_DX
    
			// TODO(khr): issue only the -1 fixup code we need.
			// For instance, if only the quotient is used, no point in zeroing the remainder.

			j1.To.Val = n1
			j2.To.Val = s.Pc()
		}
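
		// The code generated above for, e.g., DIVQ with the fixup has
		// roughly this shape (operands schematic):
		//	CMPQ  divisor, $-1
		//	JEQ   fixup
		//	CQO
		//	IDIVQ divisor
		//	JMP   done
		// fixup:
		//	NEGQ  AX        // quotient: n / -1 == -n
		//	XORL  DX, DX    // remainder: n % -1 == 0
		// done: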
    
    	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
    
    		// the frontend rewrites constant division by 8/16/32 bit integers into
    		// HMUL by a constant
    		// SSA rewrites generate the 64 bit versions
    
    		// Arg[0] is already in AX as it's the only register we allow
    		// and DX is the only output we care about (the high bits)
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[1].Reg()
    
    
    		// IMULB puts the high portion in AH instead of DL,
    		// so move it to DL for consistency
    
		if v.Type.Size() == 1 {
			m := s.Prog(x86.AMOVB)
			m.From.Type = obj.TYPE_REG
    			m.From.Reg = x86.REG_AH
    			m.To.Type = obj.TYPE_REG
    			m.To.Reg = x86.REG_DX
    		}
    
    
    	case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
    		// Arg[0] is already in AX as it's the only register we allow
    		// results lo in AX
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_REG
    		p.From.Reg = v.Args[1].Reg()
    
    
    	case ssa.OpAMD64MULQU2:
    		// Arg[0] is already in AX as it's the only register we allow
    		// results hi in DX, lo in AX
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
    		p.From.Reg = v.Args[1].Reg()
    
    	case ssa.OpAMD64DIVQU2:
		// Arg[0], Arg[1] are already in DX, AX, as they're the only registers we allow
    		// results q in AX, r in DX
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
    		p.From.Reg = v.Args[2].Reg()
    
    
    	case ssa.OpAMD64AVGQU:
    		// compute (x+y)/2 unsigned.
    		// Do a 64-bit add, the overflow goes into the carry.
    		// Shift right once and pull the carry back into the 63rd bit.
    
    		r := v.Reg()
		if r != v.Args[0].Reg() {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := s.Prog(x86.AADDQ)
		p.From.Type = obj.TYPE_REG
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
		p.From.Reg = v.Args[1].Reg()
		p = s.Prog(x86.ARCRQ)
		p.From.Type = obj.TYPE_CONST
    		p.From.Offset = 1
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
    
    	case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
    		r := v.Reg0()
    		r0 := v.Args[0].Reg()
    		r1 := v.Args[1].Reg()
    		switch r {
    		case r0:
    			p := s.Prog(v.Op.Asm())
    			p.From.Type = obj.TYPE_REG
    			p.From.Reg = r1
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = r
    		case r1:
    			p := s.Prog(v.Op.Asm())
    			p.From.Type = obj.TYPE_REG
    			p.From.Reg = r0
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = r
    		default:
    			v.Fatalf("output not in same register as an input %s", v.LongString())
    		}
    
    
    	case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_REG
    		p.From.Reg = v.Args[1].Reg()
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg0()
    
    	case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
    
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_CONST
    		p.From.Offset = v.AuxInt
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg0()
    
    
	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
		r := v.Reg()
		a := v.Args[0].Reg()
		if r == a {
			switch v.AuxInt {
			case 1:
				var asm obj.As
				// Software optimization manual recommends add $1,reg.
				// But inc/dec is 1 byte smaller. ICC always uses inc.
				// Clang/GCC choose depending on flags, but prefer add.
				// Experiments show that inc/dec is both a little faster
				// and makes the binary a little smaller.
				if v.Op == ssa.OpAMD64ADDQconst {
					asm = x86.AINCQ
				} else {
					asm = x86.AINCL
				}
				p := s.Prog(asm)
				p.To.Type = obj.TYPE_REG
				p.To.Reg = r
				return
			case -1:
				var asm obj.As
				if v.Op == ssa.OpAMD64ADDQconst {
					asm = x86.ADECQ
				} else {
					asm = x86.ADECL
				}
				p := s.Prog(asm)
				p.To.Type = obj.TYPE_REG
				p.To.Reg = r
				return
			case 0x80:
				// 'SUBQ $-0x80, r' is shorter to encode than
				// and functionally equivalent to 'ADDQ $0x80, r'.
				asm := x86.ASUBL
				if v.Op == ssa.OpAMD64ADDQconst {
					asm = x86.ASUBQ
				}
				p := s.Prog(asm)
				p.From.Type = obj.TYPE_CONST
				p.From.Offset = -0x80
				p.To.Type = obj.TYPE_REG
				p.To.Reg = r
				return
			}
			p := s.Prog(v.Op.Asm())
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = v.AuxInt
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
			return
		}
		var asm obj.As
		if v.Op == ssa.OpAMD64ADDQconst {
			asm = x86.ALEAQ
		} else {
			asm = x86.ALEAL
		}
		p := s.Prog(asm)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = a
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
    	case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
    		ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
    		ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
    		ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
    		ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
    		ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
    		ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
    		ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
    		ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
    		ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
    		ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
    		ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
    
    		r := v.Reg()
		if r != v.Args[0].Reg() {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
    
    		p.From.Reg = v.Args[1].Reg()
    
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
    
    	case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
    		r := v.Reg()
    		if r != v.Args[0].Reg() {
    			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    		}
    		// Flag condition: ^ZERO || PARITY
    		// Generate:
    		//   CMOV*NE  SRC,DST
    		//   CMOV*PS  SRC,DST
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_REG
    		p.From.Reg = v.Args[1].Reg()
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    		var q *obj.Prog
    		if v.Op == ssa.OpAMD64CMOVQNEF {
    			q = s.Prog(x86.ACMOVQPS)
    		} else if v.Op == ssa.OpAMD64CMOVLNEF {
    			q = s.Prog(x86.ACMOVLPS)
    		} else {
    			q = s.Prog(x86.ACMOVWPS)
    		}
    		q.From.Type = obj.TYPE_REG
    		q.From.Reg = v.Args[1].Reg()
    		q.To.Type = obj.TYPE_REG
    		q.To.Reg = r
    
    	case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
    		r := v.Reg()
    		if r != v.Args[0].Reg() {
    			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    		}
    
    		// Flag condition: ZERO && !PARITY
    		// Generate:
    		//   MOV      SRC,AX
    		//   CMOV*NE  DST,AX
    		//   CMOV*PC  AX,DST
    		//
    		// TODO(rasky): we could generate:
    		//   CMOV*NE  DST,SRC
    		//   CMOV*PC  SRC,DST
    		// But this requires a way for regalloc to know that SRC might be
    		// clobbered by this instruction.
    		if v.Args[1].Reg() != x86.REG_AX {
    			opregreg(s, moveByType(v.Type), x86.REG_AX, v.Args[1].Reg())
    		}
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_REG
    		p.From.Reg = r
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = x86.REG_AX
    		var q *obj.Prog
    		if v.Op == ssa.OpAMD64CMOVQEQF {
    			q = s.Prog(x86.ACMOVQPC)
    		} else if v.Op == ssa.OpAMD64CMOVLEQF {
    			q = s.Prog(x86.ACMOVLPC)
    		} else {
    			q = s.Prog(x86.ACMOVWPC)
    		}
    		q.From.Type = obj.TYPE_REG
    		q.From.Reg = x86.REG_AX
    		q.To.Type = obj.TYPE_REG
    		q.To.Reg = r
    
    
    	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
    
		r := v.Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
    
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
    		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
    
    	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
    		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
    		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
    		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
    		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
    
    		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
    		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
    		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
    
    		r := v.Reg()
		if r != v.Args[0].Reg() {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
    
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
		r := v.Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
    		p.From.Reg = r
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
    	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
    		ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
    		ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
    		p := s.Prog(v.Op.Asm())
    
    		memIdx(&p.From, v)
		o := v.Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = o
    		if v.AuxInt != 0 && v.Aux == nil {
    			// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
    			switch v.Op {
    			case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
    				p = s.Prog(x86.ALEAQ)
    			case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
    				p = s.Prog(x86.ALEAL)
    			case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
    				p = s.Prog(x86.ALEAW)
    			}
    			p.From.Type = obj.TYPE_MEM
    			p.From.Reg = o
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = o
    		}
    		gc.AddAux(&p.From, v)
    
    	case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
    
    		gc.AddAux(&p.From, v)
    		p.To.Type = obj.TYPE_REG
    
    		p.To.Reg = v.Reg()
    
    	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
    
    		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
    		ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
    
    		opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
    
    	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
    		// Go assembler has swapped operands for UCOMISx relative to CMP,
    		// must account for that right here.
    
    		opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
    
    	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = v.AuxInt
    
    	case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
    		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
    		ssa.OpAMD64BTSLconst, ssa.OpAMD64BTSQconst,
    		ssa.OpAMD64BTCLconst, ssa.OpAMD64BTCQconst,
    		ssa.OpAMD64BTRLconst, ssa.OpAMD64BTRQconst:
    
    		op := v.Op
    		if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
    			// Emit 32-bit version because it's shorter
    			op = ssa.OpAMD64BTLconst
    		}
    		p := s.Prog(op.Asm())
    		p.From.Type = obj.TYPE_CONST
    		p.From.Offset = v.AuxInt
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Args[0].Reg()
    
    	case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
    
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_MEM
    		p.From.Reg = v.Args[0].Reg()
    		gc.AddAux(&p.From, v)
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Args[1].Reg()
    
    	case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
    
    		sc := v.AuxValAndOff()
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_MEM
    		p.From.Reg = v.Args[0].Reg()
    		gc.AddAux2(&p.From, v, sc.Off())
    		p.To.Type = obj.TYPE_CONST
    		p.To.Offset = sc.Val()
    
	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
		x := v.Reg()

    		// If flags aren't live (indicated by v.Aux == nil),
    		// then we can rewrite MOV $0, AX into XOR AX, AX.
    		if v.AuxInt == 0 && v.Aux == nil {
    			p := s.Prog(x86.AXORL)
    			p.From.Type = obj.TYPE_REG
    			p.From.Reg = x
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = x
    			break
    		}
    
    
    		asm := v.Op.Asm()
    		// Use MOVL to move a small constant into a register
    		// when the constant is positive and fits into 32 bits.
    		if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
    			// The upper 32bit are zeroed automatically when using MOVL.
    			asm = x86.AMOVL
    		}
		p := s.Prog(asm)
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
    
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = x
    	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
    
		x := v.Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_FCONST
    		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = x
    
    	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
    
    		gc.AddAux(&p.From, v)
    		p.To.Type = obj.TYPE_REG
    
    		p.To.Reg = v.Reg()
    
    	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
    		ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2:
    
		p := s.Prog(v.Op.Asm())
		memIdx(&p.From, v)
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
    
    	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
    
    		ssa.OpAMD64BTCQmodify, ssa.OpAMD64BTCLmodify, ssa.OpAMD64BTRQmodify, ssa.OpAMD64BTRLmodify, ssa.OpAMD64BTSQmodify, ssa.OpAMD64BTSLmodify,
    
    		ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
    		ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[1].Reg()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		gc.AddAux(&p.To, v)
    
    	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
    		ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[2].Reg()
		memIdx(&p.To, v)
		gc.AddAux(&p.To, v)
    
    	case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
    
    		sc := v.AuxValAndOff()
    		off := sc.Off()
    		val := sc.Val()
    
		if val == 1 || val == -1 {
			var asm obj.As
			if v.Op == ssa.OpAMD64ADDQconstmodify {
				if val == 1 {
					asm = x86.AINCQ
				} else {
					asm = x86.ADECQ
				}
			} else {
				if val == 1 {
					asm = x86.AINCL
				} else {
					asm = x86.ADECL
				}
			}
			p := s.Prog(asm)
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			gc.AddAux2(&p.To, v, off)
			break
		}
		fallthrough
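		// Constants other than 1 and -1 fall through to the generic
		// constant-modify case below, which applies the ALU operation
		// with the immediate directly to memory.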
    
    	case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
    
    		ssa.OpAMD64BTCQconstmodify, ssa.OpAMD64BTCLconstmodify, ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTSLconstmodify,
    		ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTRLconstmodify, ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify:
    
    		sc := v.AuxValAndOff()
    		off := sc.Off()
    		val := sc.Val()
    		p := s.Prog(v.Op.Asm())
    		p.From.Type = obj.TYPE_CONST
    		p.From.Offset = val
    		p.To.Type = obj.TYPE_MEM
    		p.To.Reg = v.Args[0].Reg()
    		gc.AddAux2(&p.To, v, off)
    
    	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
    		sc := v.AuxValAndOff()
    
		p.From.Offset = sc.Val()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		gc.AddAux2(&p.To, v, sc.Off())
    
    	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
    		sc := v.AuxValAndOff()
    
		p.From.Offset = sc.Val()
		memIdx(&p.To, v)
		gc.AddAux2(&p.To, v, sc.Off())
    	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
    		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
    		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
    
    		opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
    
    	case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
    		r := v.Reg()
    		// Break false dependency on destination register.
    
    		opregreg(s, x86.AXORPS, r, r)
    		opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
    
    	case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
    		var p *obj.Prog
    		switch v.Op {
    		case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
    			p = s.Prog(x86.AMOVQ)
    		case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
    			p = s.Prog(x86.AMOVL)
    		}
    
    		p.From.Type = obj.TYPE_REG
    		p.From.Reg = v.Args[0].Reg()
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg()
    
    	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
    		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
    		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
    
    		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
    		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
    		p.From.Reg = v.Args[1].Reg()
    		gc.AddAux(&p.From, v)
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg()
    		if v.Reg() != v.Args[0].Reg() {
    			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    		}
    
	case ssa.OpAMD64DUFFZERO:
		off := duffStart(v.AuxInt)
    		adj := duffAdj(v.AuxInt)
    		var p *obj.Prog
    		if adj != 0 {
    
    			p = s.Prog(x86.ALEAQ)
    			p.From.Type = obj.TYPE_MEM
    
			p.From.Offset = adj
			p.From.Reg = x86.REG_DI
    
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = x86.REG_DI
		}
		p = s.Prog(obj.ADUFFZERO)
		p.To.Type = obj.TYPE_ADDR
		p.To.Sym = gc.Duffzero
		p.To.Offset = off

	case ssa.OpAMD64MOVOconst:
		if v.AuxInt != 0 {
			v.Fatalf("MOVOconst can only do constant=0")
		}
		r := v.Reg()
		opregreg(s, x86.AXORPS, r, r)

    	case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
    
    		if v.Type.IsMemory() {
    			return
    		}
    
		x := v.Args[0].Reg()
		y := v.Reg()
		if x != y {
			opregreg(s, moveByType(v.Type), y, x)
    
    		}
    	case ssa.OpLoadReg:
		if v.Type.IsFlags() {
			v.Fatalf("load flags not implemented: %v", v.LongString())
			return
		}
		p := s.Prog(loadByType(v.Type))
		gc.AddrAuto(&p.From, v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
    
    
    	case ssa.OpStoreReg:
		if v.Type.IsFlags() {
			v.Fatalf("store flags not implemented: %v", v.LongString())
			return
		}
		p := s.Prog(storeByType(v.Type))
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		gc.AddrAuto(&p.To, v)
    
    	case ssa.OpAMD64LoweredGetClosurePtr:
    
    		// Closure pointer is DX.
    		gc.CheckLoweredGetClosurePtr(v)
    
	case ssa.OpAMD64LoweredGetG:
		r := v.Reg()
		// See the comments in cmd/internal/obj/x86/obj6.go
		// near CanUse1InsnTLS for a detailed explanation of these instructions.
    		if x86.CanUse1InsnTLS(gc.Ctxt) {
    			// MOVQ (TLS), r
    
			p := s.Prog(x86.AMOVQ)
			p.From.Type = obj.TYPE_MEM
    			p.From.Reg = x86.REG_TLS
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = r
    		} else {
    			// MOVQ TLS, r
    			// MOVQ (r)(TLS*1), r
    
			p := s.Prog(x86.AMOVQ)
			p.From.Type = obj.TYPE_REG
    			p.From.Reg = x86.REG_TLS
    			p.To.Type = obj.TYPE_REG
    			p.To.Reg = r
    
			q := s.Prog(x86.AMOVQ)
			q.From.Type = obj.TYPE_MEM
    			q.From.Reg = r
    			q.From.Index = x86.REG_TLS
    			q.From.Scale = 1
    			q.To.Type = obj.TYPE_REG
    			q.To.Reg = r
    		}
    
    	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
    		s.Call(v)
    
    
    	case ssa.OpAMD64LoweredGetCallerPC:
    		p := s.Prog(x86.AMOVQ)
    		p.From.Type = obj.TYPE_MEM
    		p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
    		p.From.Name = obj.NAME_PARAM
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg()
    
    
    	case ssa.OpAMD64LoweredGetCallerSP:
    		// caller's SP is the address of the first arg
    		mov := x86.AMOVQ
    		if gc.Widthptr == 4 {
    			mov = x86.AMOVL
    		}
    		p := s.Prog(mov)
    		p.From.Type = obj.TYPE_ADDR
    		p.From.Offset = -gc.Ctxt.FixedFrameSize() // 0 on amd64, just to be consistent with other architectures
    		p.From.Name = obj.NAME_PARAM
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg()
    
    
    	case ssa.OpAMD64LoweredWB:
    		p := s.Prog(obj.ACALL)
    		p.To.Type = obj.TYPE_MEM
    		p.To.Name = obj.NAME_EXTERN
    		p.To.Sym = v.Aux.(*obj.LSym)
    
    
    	case ssa.OpAMD64LoweredPanicBoundsA, ssa.OpAMD64LoweredPanicBoundsB, ssa.OpAMD64LoweredPanicBoundsC:
    		p := s.Prog(obj.ACALL)
    		p.To.Type = obj.TYPE_MEM
    		p.To.Name = obj.NAME_EXTERN
    		p.To.Sym = gc.BoundsCheckFunc[v.AuxInt]
    		s.UseArgs(int64(2 * gc.Widthptr)) // space used in callee args area by assembly stubs
    
    	case ssa.OpAMD64LoweredPanicExtendA, ssa.OpAMD64LoweredPanicExtendB, ssa.OpAMD64LoweredPanicExtendC:
    		p := s.Prog(obj.ACALL)
    		p.To.Type = obj.TYPE_MEM
    		p.To.Name = obj.NAME_EXTERN
    		p.To.Sym = gc.ExtendCheckFunc[v.AuxInt]
    		s.UseArgs(int64(3 * gc.Widthptr)) // space used in callee args area by assembly stubs
    
    
    	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
    
    		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
    
    		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
    
    		r := v.Reg()
		if r != v.Args[0].Reg() {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := s.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
    
    	case ssa.OpAMD64NEGLflags:
    		r := v.Reg0()
    		if r != v.Args[0].Reg() {
    			v.Fatalf("input[0] and output not in same register %s", v.LongString())
    		}
    		p := s.Prog(v.Op.Asm())
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = r
    
    
    	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
    
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		switch v.Op {
    		case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
    			p.To.Reg = v.Reg0()
    		case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD:
    			p.To.Reg = v.Reg()
    		}
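		// BSFQ/BSRQ produce a tuple (result, flags), so the result lives
		// in v.Reg0(); the 32-bit forms and SQRTSD produce a single
		// result in v.Reg().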
    
    	case ssa.OpAMD64ROUNDSD:
    		p := s.Prog(v.Op.Asm())
    		val := v.AuxInt
    
    		// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
    		if val != 0 && val != 1 && val != 2 && val != 3 {
    
    			v.Fatalf("Invalid rounding mode")
    		}
    		p.From.Offset = val
    		p.From.Type = obj.TYPE_CONST
    		p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[0].Reg()})
    		p.To.Type = obj.TYPE_REG
    		p.To.Reg = v.Reg()
    
    	case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL: