diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index e69e71a4a1872427a870ae3eae0fd728c55a9ee2..2152fb4f69fedf2381a7abbef688119b792cf71e 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -139,36 +139,38 @@ backwardtailloop:
 	BC	16, 0, backwardtailloop // bndz
 
 nobackwardtail:
-	BC	4, 5, LR		// ble CR1 lr
+	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
+	SRDCC	$2,DWORDS,QWORDS	// Compute number of 32B blocks and compare to 0
+	BNE	backward32setup		// If QWORDS != 0, start the 32B copy loop.
 
-backwardlarge:
-	MOVD	DWORDS, CTR
-	SUB	TGT, SRC, TMP		// Use vsx if moving
-	CMP	TMP, $32		// at least 32 byte chunks
-	BLT	backwardlargeloop	// and distance >= 32
-	SRDCC	$2,DWORDS,QWORDS	// 32 byte chunks
-	BNE	backward32setup
+backward24:
+	// DWORDS is a value between 1 and 3.
+	CMP	DWORDS, $2
 
-backwardlargeloop:
 	MOVD 	-8(SRC), TMP
-	SUB	$8,SRC
 	MOVD 	TMP, -8(TGT)
-	SUB	$8,TGT
-	BC	16, 0, backwardlargeloop // bndz
+	BC	12, 0, LR		// bltlr, return if DWORDS == 1
+
+	MOVD 	-16(SRC), TMP
+	MOVD 	TMP, -16(TGT)
+	BC	12, 2, LR		// beqlr, return if DWORDS == 2
+
+	MOVD 	-24(SRC), TMP
+	MOVD 	TMP, -24(TGT)
 	RET
 
 backward32setup:
-	MOVD	QWORDS, CTR			// set up loop ctr
-	MOVD	$16, IDX16			// 32 bytes at a time
+	ANDCC   $3,DWORDS		// Compute remaining DWORDS and compare to 0
+	MOVD	QWORDS, CTR		// set up loop ctr
+	MOVD	$16, IDX16		// 32 bytes at a time
 
 backward32loop:
 	SUB	$32, TGT
 	SUB	$32, SRC
-	LXVD2X	(R0)(TGT), VS32           // load 16 bytes
-	LXVD2X	(IDX16)(TGT), VS33
-	STXVD2X	VS32, (R0)(SRC)           // store 16 bytes
-	STXVD2X	VS33, (IDX16)(SRC)
-	BC      16, 0, backward32loop   // bndz
-	BC	4, 5, LR		// ble CR1 lr
-	MOVD	DWORDS, CTR
-	BR	backwardlargeloop
+	LXVD2X	(R0)(SRC), VS32		// load 16x2 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
+	STXVD2X	VS33, (IDX16)(TGT)
+	BC      16, 0, backward32loop	// bdnz
+	BC	12, 2, LR		// beqlr, return if DWORDS == 0
+	BR	backward24
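
The Go sketch below is only meant to illustrate the control flow that the patched backward path sets up: copy QWORDS = DWORDS >> 2 blocks of 32 bytes backward (two 16-byte VSX load/store pairs per iteration in the assembly), then finish the remaining DWORDS & 3 doublewords (1-3) with single 8-byte moves. It is a hypothetical illustration, not runtime code; the function backwardCopy, its dst/src slice parameters, and the dwords argument are invented for the example.

package main

import "fmt"

// backwardCopy mirrors the structure of the patched backward path.
// dst and src are the doubleword-sized tail region still to be copied
// (the byte tail has already been handled); dwords is its length in
// 8-byte doublewords.
func backwardCopy(dst, src []byte, dwords int) {
	if dwords == 0 { // blelr cr1: nothing left to do
		return
	}
	end := dwords * 8
	// backward32loop: QWORDS = DWORDS >> 2 iterations, 32 bytes each,
	// walking backward from the end of the region.
	for q := dwords >> 2; q > 0; q-- {
		end -= 32
		copy(dst[end:end+32], src[end:end+32])
	}
	// backward24: at most DWORDS & 3 doublewords (1-3) remain.
	for rem := dwords & 3; rem > 0; rem-- {
		end -= 8
		copy(dst[end:end+8], src[end:end+8])
	}
}

func main() {
	src := []byte("0123456789abcdefghijklmnopqrstuvwxyzABCD") // 40 bytes = 5 doublewords
	dst := make([]byte, len(src))
	backwardCopy(dst, src, len(src)/8)
	fmt.Println(string(dst))
}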