From 705f3c74e6af802fcf54f402e6d0862832249712 Mon Sep 17 00:00:00 2001
From: Ben Shi <powerman1st@163.com>
Date: Thu, 21 Jun 2018 10:14:18 +0000
Subject: [PATCH] cmd/compile: optimize AMD64 with DIVSSload and DIVSDload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DIVSSload and DIVSDload operate directly on a memory operand, so the
separate load instruction can be dropped. This reduces binary size
without affecting performance.
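
For illustration, a minimal sketch of the kind of function that
benefits (the register names and exact instruction sequence in the
comments are illustrative; the FloatDivs codegen test added below
checks the analogous float32 case):

	package p

	func div(a []float64) float64 {
		// before: MOVSD 16(AX), X1  // load a[2] into a register
		//         DIVSD X1, X0      // a[1] / a[2]
		// after:  DIVSD 16(AX), X0  // divide by the memory operand directly
		return a[1] / a[2]
	}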

The total size of pkg/linux_amd64 (excluding cmd/compile) decreases
by about 6KB.

There is little regression in the go1 benchmarks (differences are mostly noise).
name                     old time/op    new time/op    delta
BinaryTree17-4              2.63s ± 4%     2.62s ± 4%    ~     (p=0.809 n=30+30)
Fannkuch11-4                2.40s ± 2%     2.40s ± 2%    ~     (p=0.109 n=30+30)
FmtFprintfEmpty-4          43.1ns ± 4%    43.2ns ± 9%    ~     (p=0.168 n=30+30)
FmtFprintfString-4         73.6ns ± 4%    74.1ns ± 4%    ~     (p=0.069 n=30+30)
FmtFprintfInt-4            81.0ns ± 3%    81.4ns ± 5%    ~     (p=0.350 n=30+30)
FmtFprintfIntInt-4          127ns ± 4%     129ns ± 4%  +0.99%  (p=0.021 n=30+30)
FmtFprintfPrefixedInt-4     156ns ± 4%     155ns ± 4%    ~     (p=0.415 n=30+30)
FmtFprintfFloat-4           219ns ± 4%     218ns ± 4%    ~     (p=0.071 n=30+30)
FmtManyArgs-4               522ns ± 3%     518ns ± 3%  -0.68%  (p=0.034 n=30+30)
GobDecode-4                6.49ms ± 6%    6.52ms ± 6%    ~     (p=0.832 n=30+30)
GobEncode-4                6.10ms ± 9%    6.14ms ± 7%    ~     (p=0.485 n=30+30)
Gzip-4                      227ms ± 1%     224ms ± 4%    ~     (p=0.484 n=24+30)
Gunzip-4                   37.2ms ± 3%    36.8ms ± 4%    ~     (p=0.889 n=30+30)
HTTPClientServer-4         58.9µs ± 1%    58.7µs ± 2%  -0.42%  (p=0.003 n=28+28)
JSONEncode-4               12.0ms ± 3%    12.0ms ± 4%    ~     (p=0.523 n=30+30)
JSONDecode-4               54.6ms ± 4%    54.5ms ± 4%    ~     (p=0.708 n=30+30)
Mandelbrot200-4            3.78ms ± 4%    3.81ms ± 3%  +0.99%  (p=0.016 n=30+30)
GoParse-4                  3.20ms ± 4%    3.20ms ± 5%    ~     (p=0.994 n=30+30)
RegexpMatchEasy0_32-4      77.0ns ± 4%    75.9ns ± 3%  -1.39%  (p=0.006 n=29+30)
RegexpMatchEasy0_1K-4       255ns ± 4%     253ns ± 4%    ~     (p=0.091 n=30+30)
RegexpMatchEasy1_32-4      69.7ns ± 3%    70.3ns ± 4%    ~     (p=0.120 n=30+30)
RegexpMatchEasy1_1K-4       373ns ± 2%     378ns ± 3%  +1.43%  (p=0.000 n=21+26)
RegexpMatchMedium_32-4      107ns ± 2%     108ns ± 4%  +1.50%  (p=0.012 n=22+30)
RegexpMatchMedium_1K-4     34.0µs ± 1%    34.3µs ± 3%  +1.08%  (p=0.008 n=24+30)
RegexpMatchHard_32-4       1.53µs ± 3%    1.54µs ± 3%    ~     (p=0.234 n=30+30)
RegexpMatchHard_1K-4       46.7µs ± 4%    47.0µs ± 4%    ~     (p=0.420 n=30+30)
Revcomp-4                   411ms ± 7%     415ms ± 6%    ~     (p=0.059 n=30+30)
Template-4                 65.5ms ± 5%    66.9ms ± 4%  +2.21%  (p=0.001 n=30+30)
TimeParse-4                 317ns ± 3%     311ns ± 3%  -1.97%  (p=0.000 n=30+30)
TimeFormat-4                293ns ± 3%     294ns ± 3%    ~     (p=0.243 n=30+30)
[Geo mean]                 47.4µs         47.5µs       +0.17%

name                     old speed      new speed      delta
GobDecode-4               118MB/s ± 5%   118MB/s ± 6%    ~     (p=0.832 n=30+30)
GobEncode-4               125MB/s ± 7%   125MB/s ± 7%    ~     (p=0.625 n=29+30)
Gzip-4                   85.3MB/s ± 1%  86.6MB/s ± 4%    ~     (p=0.486 n=24+30)
Gunzip-4                  522MB/s ± 3%   527MB/s ± 4%    ~     (p=0.889 n=30+30)
JSONEncode-4              162MB/s ± 3%   162MB/s ± 4%    ~     (p=0.520 n=30+30)
JSONDecode-4             35.5MB/s ± 4%  35.6MB/s ± 4%    ~     (p=0.701 n=30+30)
GoParse-4                18.1MB/s ± 4%  18.1MB/s ± 4%    ~     (p=0.891 n=29+30)
RegexpMatchEasy0_32-4     416MB/s ± 4%   422MB/s ± 3%  +1.43%  (p=0.005 n=29+30)
RegexpMatchEasy0_1K-4    4.01GB/s ± 4%  4.04GB/s ± 4%    ~     (p=0.091 n=30+30)
RegexpMatchEasy1_32-4     460MB/s ± 3%   456MB/s ± 5%    ~     (p=0.123 n=30+30)
RegexpMatchEasy1_1K-4    2.74GB/s ± 2%  2.70GB/s ± 3%  -1.33%  (p=0.000 n=22+26)
RegexpMatchMedium_32-4   9.39MB/s ± 3%  9.19MB/s ± 4%  -2.06%  (p=0.001 n=28+30)
RegexpMatchMedium_1K-4   30.1MB/s ± 1%  29.8MB/s ± 3%  -1.04%  (p=0.008 n=24+30)
RegexpMatchHard_32-4     20.9MB/s ± 3%  20.8MB/s ± 3%    ~     (p=0.234 n=30+30)
RegexpMatchHard_1K-4     21.9MB/s ± 4%  21.8MB/s ± 4%    ~     (p=0.420 n=30+30)
Revcomp-4                 619MB/s ± 7%   612MB/s ± 7%    ~     (p=0.059 n=30+30)
Template-4               29.6MB/s ± 4%  29.0MB/s ± 4%  -2.16%  (p=0.002 n=30+30)
[Geo mean]                123MB/s        123MB/s       -0.33%

Change-Id: Ia59e077feae4f2824df79059daea4d0f678e3e4c
Reviewed-on: https://go-review.googlesource.com/120275
Run-TryBot: Ben Shi <powerman1st@163.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
---
 src/cmd/compile/internal/amd64/ssa.go        |   3 +-
 src/cmd/compile/internal/ssa/gen/AMD64.rules |  20 +--
 src/cmd/compile/internal/ssa/gen/AMD64Ops.go |   2 +
 src/cmd/compile/internal/ssa/opGen.go        |  38 ++++
 src/cmd/compile/internal/ssa/rewriteAMD64.go | 178 +++++++++++++++++++
 test/codegen/arithmetic.go                   |   5 +
 6 files changed, 235 insertions(+), 11 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 584cc4c4bd5..4ecdb769f3a 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -837,7 +837,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 	case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
 		ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
 		ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
-		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload:
+		ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
+		ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = v.Args[1].Reg()
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index bf3a1160459..eab66d17abc 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -1037,10 +1037,10 @@
 	((ADD|SUB|AND|OR|XOR)Qload [off1+off2] {sym} val base mem)
 ((ADD|SUB|AND|OR|XOR)Lload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(off1+off2) ->
 	((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
-((ADD|SUB|MUL)SSload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(off1+off2) ->
-	((ADD|SUB|MUL)SSload [off1+off2] {sym} val base mem)
-((ADD|SUB|MUL)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(off1+off2) ->
-	((ADD|SUB|MUL)SDload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(off1+off2) ->
+	((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(off1+off2) ->
+	((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
 ((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd(off2) ->
 	((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).add(off2)] {sym} base mem)
 ((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd(off2) ->
@@ -1079,12 +1079,12 @@
 ((ADD|SUB|AND|OR|XOR)Lload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
 	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
 	((ADD|SUB|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
-((ADD|SUB|MUL)SSload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
 	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-	((ADD|SUB|MUL)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
-((ADD|SUB|MUL)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+	((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
 	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-	((ADD|SUB|MUL)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+	((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
 ((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem)
 	&& ValAndOff(valoff1).canAdd(off2) && canMergeSym(sym1, sym2) ->
 	((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).add(off2)] {mergeSym(sym1,sym2)} base mem)
@@ -2274,8 +2274,8 @@
 // TODO: add indexed variants?
 ((ADD|SUB|AND|OR|XOR)Q x l:(MOVQload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|AND|OR|XOR)Qload x [off] {sym} ptr mem)
 ((ADD|SUB|AND|OR|XOR)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|AND|OR|XOR)Lload x [off] {sym} ptr mem)
-((ADD|SUB|MUL)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|MUL)SDload x [off] {sym} ptr mem)
-((ADD|SUB|MUL)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|MUL)SSload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoad(v, l, x) && clobber(l) -> ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
 
 // Merge ADDQconst and LEAQ into atomic loads.
 (MOVQatomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(off1+off2) ->
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index 1140958670d..4735ea1bc08 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -189,6 +189,8 @@ func init() {
 		{name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
 		{name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
 		{name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+		{name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
+		{name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
 
 		// binary ops
 		{name: "ADDQ", argLength: 2, reg: gp21sp, asm: "ADDQ", commutative: true, clobberFlags: true},                                                                   // arg0 + arg1
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 34b1d8a4bba..35546238409 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -462,6 +462,8 @@ const (
 	OpAMD64SUBSDload
 	OpAMD64MULSSload
 	OpAMD64MULSDload
+	OpAMD64DIVSSload
+	OpAMD64DIVSDload
 	OpAMD64ADDQ
 	OpAMD64ADDL
 	OpAMD64ADDQconst
@@ -5543,6 +5545,42 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:           "DIVSSload",
+		auxType:        auxSymOff,
+		argLen:         3,
+		resultInArg0:   true,
+		faultOnNilArg1: true,
+		symEffect:      SymRead,
+		asm:            x86.ADIVSS,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
+	{
+		name:           "DIVSDload",
+		auxType:        auxSymOff,
+		argLen:         3,
+		resultInArg0:   true,
+		faultOnNilArg1: true,
+		symEffect:      SymRead,
+		asm:            x86.ADIVSD,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R14 R15 SB
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
 	{
 		name:         "ADDQ",
 		argLen:       2,
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 4ffd6b5d186..245f795d900 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -157,6 +157,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAMD64CMPXCHGLlock_0(v)
 	case OpAMD64CMPXCHGQlock:
 		return rewriteValueAMD64_OpAMD64CMPXCHGQlock_0(v)
+	case OpAMD64DIVSD:
+		return rewriteValueAMD64_OpAMD64DIVSD_0(v)
+	case OpAMD64DIVSDload:
+		return rewriteValueAMD64_OpAMD64DIVSDload_0(v)
+	case OpAMD64DIVSS:
+		return rewriteValueAMD64_OpAMD64DIVSS_0(v)
+	case OpAMD64DIVSSload:
+		return rewriteValueAMD64_OpAMD64DIVSSload_0(v)
 	case OpAMD64LEAL:
 		return rewriteValueAMD64_OpAMD64LEAL_0(v)
 	case OpAMD64LEAL1:
@@ -8808,6 +8816,176 @@ func rewriteValueAMD64_OpAMD64CMPXCHGQlock_0(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpAMD64DIVSD_0(v *Value) bool {
+	// match: (DIVSD x l:(MOVSDload [off] {sym} ptr mem))
+	// cond: canMergeLoad(v, l, x) && clobber(l)
+	// result: (DIVSDload x [off] {sym} ptr mem)
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		l := v.Args[1]
+		if l.Op != OpAMD64MOVSDload {
+			break
+		}
+		off := l.AuxInt
+		sym := l.Aux
+		_ = l.Args[1]
+		ptr := l.Args[0]
+		mem := l.Args[1]
+		if !(canMergeLoad(v, l, x) && clobber(l)) {
+			break
+		}
+		v.reset(OpAMD64DIVSDload)
+		v.AuxInt = off
+		v.Aux = sym
+		v.AddArg(x)
+		v.AddArg(ptr)
+		v.AddArg(mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64DIVSDload_0(v *Value) bool {
+	// match: (DIVSDload [off1] {sym} val (ADDQconst [off2] base) mem)
+	// cond: is32Bit(off1+off2)
+	// result: (DIVSDload [off1+off2] {sym} val base mem)
+	for {
+		off1 := v.AuxInt
+		sym := v.Aux
+		_ = v.Args[2]
+		val := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64ADDQconst {
+			break
+		}
+		off2 := v_1.AuxInt
+		base := v_1.Args[0]
+		mem := v.Args[2]
+		if !(is32Bit(off1 + off2)) {
+			break
+		}
+		v.reset(OpAMD64DIVSDload)
+		v.AuxInt = off1 + off2
+		v.Aux = sym
+		v.AddArg(val)
+		v.AddArg(base)
+		v.AddArg(mem)
+		return true
+	}
+	// match: (DIVSDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+	// cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2)
+	// result: (DIVSDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+	for {
+		off1 := v.AuxInt
+		sym1 := v.Aux
+		_ = v.Args[2]
+		val := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64LEAQ {
+			break
+		}
+		off2 := v_1.AuxInt
+		sym2 := v_1.Aux
+		base := v_1.Args[0]
+		mem := v.Args[2]
+		if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2)) {
+			break
+		}
+		v.reset(OpAMD64DIVSDload)
+		v.AuxInt = off1 + off2
+		v.Aux = mergeSym(sym1, sym2)
+		v.AddArg(val)
+		v.AddArg(base)
+		v.AddArg(mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64DIVSS_0(v *Value) bool {
+	// match: (DIVSS x l:(MOVSSload [off] {sym} ptr mem))
+	// cond: canMergeLoad(v, l, x) && clobber(l)
+	// result: (DIVSSload x [off] {sym} ptr mem)
+	for {
+		_ = v.Args[1]
+		x := v.Args[0]
+		l := v.Args[1]
+		if l.Op != OpAMD64MOVSSload {
+			break
+		}
+		off := l.AuxInt
+		sym := l.Aux
+		_ = l.Args[1]
+		ptr := l.Args[0]
+		mem := l.Args[1]
+		if !(canMergeLoad(v, l, x) && clobber(l)) {
+			break
+		}
+		v.reset(OpAMD64DIVSSload)
+		v.AuxInt = off
+		v.Aux = sym
+		v.AddArg(x)
+		v.AddArg(ptr)
+		v.AddArg(mem)
+		return true
+	}
+	return false
+}
+func rewriteValueAMD64_OpAMD64DIVSSload_0(v *Value) bool {
+	// match: (DIVSSload [off1] {sym} val (ADDQconst [off2] base) mem)
+	// cond: is32Bit(off1+off2)
+	// result: (DIVSSload [off1+off2] {sym} val base mem)
+	for {
+		off1 := v.AuxInt
+		sym := v.Aux
+		_ = v.Args[2]
+		val := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64ADDQconst {
+			break
+		}
+		off2 := v_1.AuxInt
+		base := v_1.Args[0]
+		mem := v.Args[2]
+		if !(is32Bit(off1 + off2)) {
+			break
+		}
+		v.reset(OpAMD64DIVSSload)
+		v.AuxInt = off1 + off2
+		v.Aux = sym
+		v.AddArg(val)
+		v.AddArg(base)
+		v.AddArg(mem)
+		return true
+	}
+	// match: (DIVSSload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem)
+	// cond: is32Bit(off1+off2) && canMergeSym(sym1, sym2)
+	// result: (DIVSSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+	for {
+		off1 := v.AuxInt
+		sym1 := v.Aux
+		_ = v.Args[2]
+		val := v.Args[0]
+		v_1 := v.Args[1]
+		if v_1.Op != OpAMD64LEAQ {
+			break
+		}
+		off2 := v_1.AuxInt
+		sym2 := v_1.Aux
+		base := v_1.Args[0]
+		mem := v.Args[2]
+		if !(is32Bit(off1+off2) && canMergeSym(sym1, sym2)) {
+			break
+		}
+		v.reset(OpAMD64DIVSSload)
+		v.AuxInt = off1 + off2
+		v.Aux = mergeSym(sym1, sym2)
+		v.AddArg(val)
+		v.AddArg(base)
+		v.AddArg(mem)
+		return true
+	}
+	return false
+}
 func rewriteValueAMD64_OpAMD64LEAL_0(v *Value) bool {
 	// match: (LEAL [c] {s} (ADDLconst [d] x))
 	// cond: is32Bit(c+d)
diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go
index f358020f559..a7f2906db9b 100644
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -112,6 +112,11 @@ func ConstDivs(n1 uint, n2 int) (uint, int) {
 	return a, b
 }
 
+func FloatDivs(a []float32) float32 {
+	// amd64:`DIVSS\s8\([A-Z]+\),\sX[0-9]+`
+	return a[1] / a[2]
+}
+
 func Pow2Mods(n1 uint, n2 int) (uint, int) {
 	// 386:"ANDL\t[$]31",-"DIVL"
 	// amd64:"ANDQ\t[$]31",-"DIVQ"
-- 
GitLab