From d82c294da778a789099f3b52cd9c34ef0d798465 Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy" <murp@ibm.com>
Date: Tue, 8 Feb 2022 09:09:36 -0600
Subject: [PATCH] runtime: fix 32B backward copy on ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test to enter the 32b copy loop always fails, and execution
falls back to a single 8B/iteration copy loop for copies of more
than 7 bytes. Likewise, the 32B loop has SRC/DST args mixed,
and fails to truncate DWORDS after completing.

Fix these, and unroll the 8B/iteration loop as it will only
execute 1-3 times if reached.

POWER10 benchmarks:

name                             old speed      new speed       delta
MemmoveOverlap/32                5.28GB/s ± 0%  10.37GB/s ± 0%   +96.22%
MemmoveOverlap/64                5.97GB/s ± 0%  18.15GB/s ± 0%  +203.95%
MemmoveOverlap/128               7.67GB/s ± 0%  24.35GB/s ± 0%  +217.41%
MemmoveOverlap/256               14.1GB/s ± 0%   25.0GB/s ± 0%   +77.48%
MemmoveOverlap/512               14.2GB/s ± 0%   30.9GB/s ± 0%  +118.19%
MemmoveOverlap/1024              12.3GB/s ± 0%   36.4GB/s ± 0%  +194.75%
MemmoveOverlap/2048              13.7GB/s ± 0%   48.8GB/s ± 0%  +255.24%
MemmoveOverlap/4096              14.1GB/s ± 0%   43.4GB/s ± 0%  +208.80%
MemmoveUnalignedDstOverlap/32    5.07GB/s ± 0%   3.78GB/s ± 0%   -25.33%
MemmoveUnalignedDstOverlap/64    6.00GB/s ± 0%   9.59GB/s ± 0%   +59.78%
MemmoveUnalignedDstOverlap/128   7.66GB/s ± 0%  13.51GB/s ± 0%   +76.42%
MemmoveUnalignedDstOverlap/256   13.4GB/s ± 0%   24.3GB/s ± 0%   +80.92%
MemmoveUnalignedDstOverlap/512   13.9GB/s ± 0%   30.3GB/s ± 0%  +118.29%
MemmoveUnalignedDstOverlap/1024  12.3GB/s ± 0%   37.3GB/s ± 0%  +203.07%
MemmoveUnalignedDstOverlap/2048  13.7GB/s ± 0%   45.9GB/s ± 0%  +235.39%
MemmoveUnalignedDstOverlap/4096  13.9GB/s ± 0%   41.2GB/s ± 0%  +196.34%
MemmoveUnalignedSrcOverlap/32    5.13GB/s ± 0%   5.18GB/s ± 0%    +0.98%
MemmoveUnalignedSrcOverlap/64    6.26GB/s ± 0%   9.53GB/s ± 0%   +52.29%
MemmoveUnalignedSrcOverlap/128   7.94GB/s ± 0%  18.40GB/s ± 0%  +131.76%
MemmoveUnalignedSrcOverlap/256   14.1GB/s ± 0%   25.5GB/s ± 0%   +81.40%
MemmoveUnalignedSrcOverlap/512   14.2GB/s ± 0%   30.9GB/s ± 0%  +116.76%
MemmoveUnalignedSrcOverlap/1024  12.4GB/s ± 0%   46.4GB/s ± 0%  +275.22%
MemmoveUnalignedSrcOverlap/2048  13.7GB/s ± 0%   48.7GB/s ± 0%  +255.16%
MemmoveUnalignedSrcOverlap/4096  14.0GB/s ± 0%   43.2GB/s ± 0%  +208.89%

Change-Id: I9fc6956ff454a2856d56077d1014388fb74c1f52
Reviewed-on: https://go-review.googlesource.com/c/go/+/384074
Trust: Paul Murphy <murp@ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
---
 src/runtime/memmove_ppc64x.s | 46 +++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index e69e71a4a18..2152fb4f69f 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -139,36 +139,38 @@ backwardtailloop:
 	BC	16, 0, backwardtailloop // bndz
 
 nobackwardtail:
-	BC	4, 5, LR		// ble CR1 lr
+	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
+	SRDCC	$2,DWORDS,QWORDS	// Compute number of 32B blocks and compare to 0
+	BNE	backward32setup		// If QWORDS != 0, start the 32B copy loop.
 
-backwardlarge:
-	MOVD	DWORDS, CTR
-	SUB	TGT, SRC, TMP		// Use vsx if moving
-	CMP	TMP, $32		// at least 32 byte chunks
-	BLT	backwardlargeloop	// and distance >= 32
-	SRDCC	$2,DWORDS,QWORDS	// 32 byte chunks
-	BNE	backward32setup
+backward24:
+	// DWORDS is a value between 1-3.
+	CMP	DWORDS, $2
 
-backwardlargeloop:
 	MOVD 	-8(SRC), TMP
-	SUB	$8,SRC
 	MOVD 	TMP, -8(TGT)
-	SUB	$8,TGT
-	BC	16, 0, backwardlargeloop // bndz
+	BC	12, 0, LR		// bltlr, return if DWORDS == 1
+
+	MOVD 	-16(SRC), TMP
+	MOVD 	TMP, -16(TGT)
+	BC	12, 2, LR		// beqlr, return if DWORDS == 2
+
+	MOVD 	-24(SRC), TMP
+	MOVD 	TMP, -24(TGT)
 	RET
 
 backward32setup:
-	MOVD	QWORDS, CTR			// set up loop ctr
-	MOVD	$16, IDX16			// 32 bytes at a time
+	ANDCC   $3,DWORDS		// Compute remaining DWORDS and compare to 0
+	MOVD	QWORDS, CTR		// set up loop ctr
+	MOVD	$16, IDX16		// 32 bytes at a time
 
 backward32loop:
 	SUB	$32, TGT
 	SUB	$32, SRC
-	LXVD2X	(R0)(TGT), VS32           // load 16 bytes
-	LXVD2X	(IDX16)(TGT), VS33
-	STXVD2X	VS32, (R0)(SRC)           // store 16 bytes
-	STXVD2X	VS33, (IDX16)(SRC)
-	BC      16, 0, backward32loop   // bndz
-	BC	4, 5, LR		// ble CR1 lr
-	MOVD	DWORDS, CTR
-	BR	backwardlargeloop
+	LXVD2X	(R0)(SRC), VS32		// load 16x2 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
+	STXVD2X	VS33, (IDX16)(TGT)
+	BC      16, 0, backward32loop	// bndz
+	BC	12, 2, LR		// beqlr, return if DWORDS == 0
+	BR	backward24
-- 
GitLab