runtime: fix 32B backward copy on ppc64x

The test to enter the 32b copy loop always fails, and execution falls back to a single 8B/iteration copy loop for copies of more than 7 bytes. Likewise, the 32B loop has SRC/DST args mixed, and fails to truncate DWORDS after completing. Fix these, and unroll the 8B/iteration loop as it will only execute 1-3 times if reached. POWER10 benchmarks: name old speed new speed delta MemmoveOverlap/32 5.28GB/s ± 0% 10.37GB/s ± 0% +96.22% MemmoveOverlap/64 5.97GB/s ± 0% 18.15GB/s ± 0% +203.95% MemmoveOverlap/128 7.67GB/s ± 0% 24.35GB/s ± 0% +217.41% MemmoveOverlap/256 14.1GB/s ± 0% 25.0GB/s ± 0% +77.48% MemmoveOverlap/512 14.2GB/s ± 0% 30.9GB/s ± 0% +118.19% MemmoveOverlap/1024 12.3GB/s ± 0% 36.4GB/s ± 0% +194.75% MemmoveOverlap/2048 13.7GB/s ± 0% 48.8GB/s ± 0% +255.24% MemmoveOverlap/4096 14.1GB/s ± 0% 43.4GB/s ± 0% +208.80% MemmoveUnalignedDstOverlap/32 5.07GB/s ± 0% 3.78GB/s ± 0% -25.33% MemmoveUnalignedDstOverlap/64 6.00GB/s ± 0% 9.59GB/s ± 0% +59.78% MemmoveUnalignedDstOverlap/128 7.66GB/s ± 0% 13.51GB/s ± 0% +76.42% MemmoveUnalignedDstOverlap/256 13.4GB/s ± 0% 24.3GB/s ± 0% +80.92% MemmoveUnalignedDstOverlap/512 13.9GB/s ± 0% 30.3GB/s ± 0% +118.29% MemmoveUnalignedDstOverlap/1024 12.3GB/s ± 0% 37.3GB/s ± 0% +203.07% MemmoveUnalignedDstOverlap/2048 13.7GB/s ± 0% 45.9GB/s ± 0% +235.39% MemmoveUnalignedDstOverlap/4096 13.9GB/s ± 0% 41.2GB/s ± 0% +196.34% MemmoveUnalignedSrcOverlap/32 5.13GB/s ± 0% 5.18GB/s ± 0% +0.98% MemmoveUnalignedSrcOverlap/64 6.26GB/s ± 0% 9.53GB/s ± 0% +52.29% MemmoveUnalignedSrcOverlap/128 7.94GB/s ± 0% 18.40GB/s ± 0% +131.76% MemmoveUnalignedSrcOverlap/256 14.1GB/s ± 0% 25.5GB/s ± 0% +81.40% MemmoveUnalignedSrcOverlap/512 14.2GB/s ± 0% 30.9GB/s ± 0% +116.76% MemmoveUnalignedSrcOverlap/1024 12.4GB/s ± 0% 46.4GB/s ± 0% +275.22% MemmoveUnalignedSrcOverlap/2048 13.7GB/s ± 0% 48.7GB/s ± 0% +255.16% MemmoveUnalignedSrcOverlap/4096 14.0GB/s ± 0% 43.2GB/s ± 0% +208.89% Change-Id: I9fc6956ff454a2856d56077d1014388fb74c1f52 Reviewed-on: https://go-review.googlesource.com/c/go/+/384074 Trust: Paul Murphy <murp@ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Cherry Mui <cherryyz@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>

runtime: fix 32B backward copy on ppc64x
d82c294d · Paul E. Murphy · Paul Murphy · da6f9a54 · d82c294d
Commit d82c294d authored 3 years ago by Paul E. Murphy Committed by Paul Murphy 3 years ago
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -139,36 +139,38 @@ backwardtailloop:
 	BC	16, 0, backwardtailloop // bndz

 nobackwardtail:
-	BC	4, 5, LR		// ble CR1 lr
+	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
+	SRDCC	$2,DWORDS,QWORDS	// Compute number of 32B blocks and compare to 0
+	BNE	backward32setup		// If QWORDS != 0, start the 32B copy loop.

-backwardlarge:
-	MOVD	DWORDS, CTR
-	SUB	TGT, SRC, TMP		// Use vsx if moving
-	CMP	TMP, $32		// at least 32 byte chunks
-	BLT	backwardlargeloop	// and distance >= 32
-	SRDCC	$2,DWORDS,QWORDS	// 32 byte chunks
-	BNE	backward32setup
+backward24:
+	// DWORDS is a value between 1-3.
+	CMP	DWORDS, $2

-backwardlargeloop:
 	MOVD 	-8(SRC), TMP
-	SUB	$8,SRC
 	MOVD 	TMP, -8(TGT)
-	SUB	$8,TGT
-	BC	16, 0, backwardlargeloop // bndz
+	BC	12, 0, LR		// bltlr, return if DWORDS == 1
+
+	MOVD 	-16(SRC), TMP
+	MOVD 	TMP, -16(TGT)
+	BC	12, 2, LR		// beqlr, return if DWORDS == 2
+
+	MOVD 	-24(SRC), TMP
+	MOVD 	TMP, -24(TGT)
 	RET

 backward32setup:
-	MOVD	QWORDS, CTR			// set up loop ctr
-	MOVD	$16, IDX16			// 32 bytes at a time
+	ANDCC   $3,DWORDS		// Compute remaining DWORDS and compare to 0
+	MOVD	QWORDS, CTR		// set up loop ctr
+	MOVD	$16, IDX16		// 32 bytes at a time

 backward32loop:
 	SUB	$32, TGT
 	SUB	$32, SRC
-	LXVD2X	(R0)(TGT), VS32           // load 16 bytes
-	LXVD2X	(IDX16)(TGT), VS33
-	STXVD2X	VS32, (R0)(SRC)           // store 16 bytes
-	STXVD2X	VS33, (IDX16)(SRC)
-	BC      16, 0, backward32loop   // bndz
-	BC	4, 5, LR		// ble CR1 lr
-	MOVD	DWORDS, CTR
-	BR	backwardlargeloop
+	LXVD2X	(R0)(SRC), VS32		// load 16x2 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
+	STXVD2X	VS33, (IDX16)(TGT)
+	BC      16, 0, backward32loop	// bndz
+	BC	12, 2, LR		// beqlr, return if DWORDS == 0
+	BR	backward24