From 49d6777d87a0abb3eda032da95eff024156835f7 Mon Sep 17 00:00:00 2001
From: Julian Zhu <jz531210@gmail.com>
Date: Thu, 17 Apr 2025 16:04:08 +0800
Subject: [PATCH] crypto/internal/fips140/subtle: add assembly implementation
 of xorBytes for mips64x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

goos: linux
goarch: mips64le
pkg: crypto/subtle
                                     │  oldsubtle   │             newsubtle              │
                                     │    sec/op    │   sec/op     vs base               │
ConstantTimeByteEq-4                    5.011n ± 0%   5.014n ± 0%        ~ (p=0.110 n=8)
ConstantTimeEq-4                        3.342n ± 0%   3.342n ± 0%        ~ (p=0.993 n=8)
ConstantTimeLessOrEq-4                  4.455n ± 0%   4.458n ± 0%        ~ (p=0.182 n=8)
XORBytes/8Bytes-4                       36.48n ± 0%   26.73n ± 0%  -26.74% (p=0.000 n=8)
XORBytes/128Bytes-4                     70.21n ± 0%   50.14n ± 0%  -28.59% (p=0.000 n=8)
XORBytes/2048Bytes-4                    566.1n ± 0%   257.2n ± 0%  -54.58% (p=0.000 n=8)
XORBytes/8192Bytes-4                   2123.0n ± 0%   966.8n ± 0%  -54.46% (p=0.000 n=8)
XORBytes/32768Bytes-4                  13.740µ ± 0%   5.614µ ± 0%  -59.14% (p=0.000 n=8)
XORBytesAlignment/8Bytes0Offset-4       38.98n ± 0%   26.53n ± 0%  -31.95% (p=0.000 n=8)
XORBytesAlignment/8Bytes1Offset-4       43.27n ± 0%   26.54n ± 0%  -38.68% (p=0.000 n=8)
XORBytesAlignment/8Bytes2Offset-4       43.28n ± 0%   26.54n ± 0%  -38.69% (p=0.000 n=8)
XORBytesAlignment/8Bytes3Offset-4       43.32n ± 0%   26.54n ± 0%  -38.74% (p=0.000 n=8)
XORBytesAlignment/8Bytes4Offset-4       43.49n ± 0%   26.53n ± 0%  -38.99% (p=0.000 n=8)
XORBytesAlignment/8Bytes5Offset-4       43.53n ± 0%   26.54n ± 0%  -39.03% (p=0.000 n=8)
XORBytesAlignment/8Bytes6Offset-4       43.48n ± 0%   26.53n ± 0%  -38.98% (p=0.000 n=8)
XORBytesAlignment/8Bytes7Offset-4       43.46n ± 1%   26.53n ± 0%  -38.96% (p=0.000 n=8)
XORBytesAlignment/128Bytes0Offset-4     71.84n ± 0%   47.70n ± 1%  -33.60% (p=0.000 n=8)
XORBytesAlignment/128Bytes1Offset-4    260.60n ± 0%   59.87n ± 0%  -77.03% (p=0.000 n=8)
XORBytesAlignment/128Bytes2Offset-4    260.60n ± 0%   59.81n ± 0%  -77.05% (p=0.000 n=8)
XORBytesAlignment/128Bytes3Offset-4    260.55n ± 0%   59.89n ± 0%  -77.01% (p=0.000 n=8)
XORBytesAlignment/128Bytes4Offset-4    260.60n ± 0%   59.84n ± 0%  -77.04% (p=0.000 n=8)
XORBytesAlignment/128Bytes5Offset-4    260.70n ± 0%   59.82n ± 0%  -77.05% (p=0.000 n=8)
XORBytesAlignment/128Bytes6Offset-4    260.60n ± 0%   59.89n ± 0%  -77.02% (p=0.000 n=8)
XORBytesAlignment/128Bytes7Offset-4    260.70n ± 0%   59.85n ± 0%  -77.04% (p=0.000 n=8)
XORBytesAlignment/2048Bytes0Offset-4    552.2n ± 1%   250.0n ± 0%  -54.73% (p=0.000 n=8)
XORBytesAlignment/2048Bytes1Offset-4   3603.0n ± 0%   548.6n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes2Offset-4   3602.0n ± 0%   548.6n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes3Offset-4   3604.0n ± 0%   548.6n ± 0%  -84.78% (p=0.000 n=8)
XORBytesAlignment/2048Bytes4Offset-4   3603.5n ± 0%   548.9n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes5Offset-4   3603.0n ± 0%   548.8n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes6Offset-4   3602.0n ± 0%   548.6n ± 0%  -84.77% (p=0.000 n=8)
XORBytesAlignment/2048Bytes7Offset-4   3601.5n ± 0%   548.5n ± 0%  -84.77% (p=0.000 n=8)
geomean                                 220.0n        81.91n       -62.77%

                                     │  oldsubtle   │               newsubtle               │
                                     │     B/s      │      B/s       vs base                │
XORBytes/8Bytes-4                      209.1Mi ± 0%    285.5Mi ± 0%   +36.52% (p=0.000 n=8)
XORBytes/128Bytes-4                    1.698Gi ± 0%    2.378Gi ± 0%   +40.04% (p=0.000 n=8)
XORBytes/2048Bytes-4                   3.369Gi ± 0%    7.418Gi ± 0%  +120.17% (p=0.000 n=8)
XORBytes/8192Bytes-4                   3.594Gi ± 0%    7.892Gi ± 0%  +119.59% (p=0.000 n=8)
XORBytes/32768Bytes-4                  2.221Gi ± 0%    5.436Gi ± 0%  +144.76% (p=0.000 n=8)
XORBytesAlignment/8Bytes0Offset-4      195.7Mi ± 0%    287.6Mi ± 0%   +46.96% (p=0.000 n=8)
XORBytesAlignment/8Bytes1Offset-4      176.3Mi ± 0%    287.5Mi ± 0%   +63.06% (p=0.000 n=8)
XORBytesAlignment/8Bytes2Offset-4      176.3Mi ± 0%    287.4Mi ± 0%   +63.07% (p=0.000 n=8)
XORBytesAlignment/8Bytes3Offset-4      176.1Mi ± 0%    287.5Mi ± 0%   +63.25% (p=0.000 n=8)
XORBytesAlignment/8Bytes4Offset-4      175.5Mi ± 0%    287.6Mi ± 0%   +63.90% (p=0.000 n=8)
XORBytesAlignment/8Bytes5Offset-4      175.3Mi ± 0%    287.5Mi ± 0%   +64.02% (p=0.000 n=8)
XORBytesAlignment/8Bytes6Offset-4      175.5Mi ± 0%    287.6Mi ± 0%   +63.86% (p=0.000 n=8)
XORBytesAlignment/8Bytes7Offset-4      175.5Mi ± 0%    287.6Mi ± 0%   +63.85% (p=0.000 n=8)
XORBytesAlignment/128Bytes0Offset-4    1.659Gi ± 0%    2.499Gi ± 1%   +50.61% (p=0.000 n=8)
XORBytesAlignment/128Bytes1Offset-4    468.4Mi ± 0%   2039.0Mi ± 0%  +335.30% (p=0.000 n=8)
XORBytesAlignment/128Bytes2Offset-4    468.4Mi ± 0%   2040.9Mi ± 0%  +335.73% (p=0.000 n=8)
XORBytesAlignment/128Bytes3Offset-4    468.5Mi ± 0%   2038.1Mi ± 0%  +335.02% (p=0.000 n=8)
XORBytesAlignment/128Bytes4Offset-4    468.4Mi ± 0%   2040.0Mi ± 0%  +335.52% (p=0.000 n=8)
XORBytesAlignment/128Bytes5Offset-4    468.2Mi ± 0%   2040.5Mi ± 0%  +335.82% (p=0.000 n=8)
XORBytesAlignment/128Bytes6Offset-4    468.4Mi ± 0%   2038.2Mi ± 0%  +335.13% (p=0.000 n=8)
XORBytesAlignment/128Bytes7Offset-4    468.2Mi ± 0%   2039.4Mi ± 0%  +335.58% (p=0.000 n=8)
XORBytesAlignment/2048Bytes0Offset-4   3.454Gi ± 1%    7.629Gi ± 0%  +120.90% (p=0.000 n=8)
XORBytesAlignment/2048Bytes1Offset-4   542.1Mi ± 0%   3560.1Mi ± 0%  +556.68% (p=0.000 n=8)
XORBytesAlignment/2048Bytes2Offset-4   542.3Mi ± 0%   3560.1Mi ± 0%  +556.48% (p=0.000 n=8)
XORBytesAlignment/2048Bytes3Offset-4   541.9Mi ± 0%   3560.0Mi ± 0%  +556.93% (p=0.000 n=8)
XORBytesAlignment/2048Bytes4Offset-4   542.0Mi ± 0%   3558.8Mi ± 0%  +556.67% (p=0.000 n=8)
XORBytesAlignment/2048Bytes5Offset-4   542.1Mi ± 3%   3558.8Mi ± 0%  +556.53% (p=0.000 n=8)
XORBytesAlignment/2048Bytes6Offset-4   542.2Mi ± 0%   3560.2Mi ± 0%  +556.57% (p=0.000 n=8)
XORBytesAlignment/2048Bytes7Offset-4   542.3Mi ± 0%   3560.5Mi ± 0%  +556.56% (p=0.000 n=8)
geomean                                514.9Mi         1.496Gi       +197.56%

Change-Id: I649fa6bfca31296d65cccdf5fceb3dcfa0c588a1
Reviewed-on: https://go-review.googlesource.com/c/go/+/666255
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 src/crypto/internal/fips140/subtle/xor_asm.go |   2 +-
 .../internal/fips140/subtle/xor_generic.go    |   2 +-
 .../internal/fips140/subtle/xor_mips64x.s     | 153 ++++++++++++++++++
 3 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 src/crypto/internal/fips140/subtle/xor_mips64x.s

diff --git a/src/crypto/internal/fips140/subtle/xor_asm.go b/src/crypto/internal/fips140/subtle/xor_asm.go
index 9a5da424aed..1ff120edef5 100644
--- a/src/crypto/internal/fips140/subtle/xor_asm.go
+++ b/src/crypto/internal/fips140/subtle/xor_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (amd64 || arm64 || loong64 || ppc64 || ppc64le || riscv64) && !purego
+//go:build (amd64 || arm64 || loong64 || mips64 || mips64le || ppc64 || ppc64le || riscv64) && !purego
 
 package subtle
 
diff --git a/src/crypto/internal/fips140/subtle/xor_generic.go b/src/crypto/internal/fips140/subtle/xor_generic.go
index 0b31eec6019..08af84de2a3 100644
--- a/src/crypto/internal/fips140/subtle/xor_generic.go
+++ b/src/crypto/internal/fips140/subtle/xor_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le && !riscv64) || purego
+//go:build (!amd64 && !arm64 && !loong64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !riscv64) || purego
 
 package subtle
 
diff --git a/src/crypto/internal/fips140/subtle/xor_mips64x.s b/src/crypto/internal/fips140/subtle/xor_mips64x.s
new file mode 100644
index 00000000000..e580235914a
--- /dev/null
+++ b/src/crypto/internal/fips140/subtle/xor_mips64x.s
@@ -0,0 +1,153 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (mips64 || mips64le) && !purego
+
+#include "textflag.h"
+
+// func xorBytes(dst, a, b *byte, n int)
+TEXT ·xorBytes(SB), NOSPLIT|NOFRAME, $0
+	MOVV	dst+0(FP), R1
+	MOVV	a+8(FP), R2
+	MOVV	b+16(FP), R3
+	MOVV	n+24(FP), R4
+
+xor_64_check:
+	SGTU	$64, R4, R5 // R5 = 1 if (64 > R4)
+	BNE	R5, xor_32_check
+xor_64:
+	MOVV	(R2), R6
+	MOVV	8(R2), R7
+	MOVV	16(R2), R8
+	MOVV	24(R2), R9
+	MOVV	(R3), R10
+	MOVV	8(R3), R11
+	MOVV	16(R3), R12
+	MOVV	24(R3), R13
+	XOR	R6, R10
+	XOR	R7, R11
+	XOR	R8, R12
+	XOR	R9, R13
+	MOVV	R10, (R1)
+	MOVV	R11, 8(R1)
+	MOVV	R12, 16(R1)
+	MOVV	R13, 24(R1)
+	MOVV	32(R2), R6
+	MOVV	40(R2), R7
+	MOVV	48(R2), R8
+	MOVV	56(R2), R9
+	MOVV	32(R3), R10
+	MOVV	40(R3), R11
+	MOVV	48(R3), R12
+	MOVV	56(R3), R13
+	XOR	R6, R10
+	XOR	R7, R11
+	XOR	R8, R12
+	XOR	R9, R13
+	MOVV	R10, 32(R1)
+	MOVV	R11, 40(R1)
+	MOVV	R12, 48(R1)
+	MOVV	R13, 56(R1)
+	ADDV	$64, R2
+	ADDV	$64, R3
+	ADDV	$64, R1
+	SUBV	$64, R4
+	SGTU	$64, R4, R5
+	BEQ	R0, R5, xor_64
+	BEQ	R0, R4, end
+
+xor_32_check:
+	SGTU	$32, R4, R5
+	BNE	R5, xor_16_check
+xor_32:
+	MOVV	(R2), R6
+	MOVV	8(R2), R7
+	MOVV	16(R2), R8
+	MOVV	24(R2), R9
+	MOVV	(R3), R10
+	MOVV	8(R3), R11
+	MOVV	16(R3), R12
+	MOVV	24(R3), R13
+	XOR	R6, R10
+	XOR	R7, R11
+	XOR	R8, R12
+	XOR	R9, R13
+	MOVV	R10, (R1)
+	MOVV	R11, 8(R1)
+	MOVV	R12, 16(R1)
+	MOVV	R13, 24(R1)
+	ADDV	$32, R2
+	ADDV	$32, R3
+	ADDV	$32, R1
+	SUBV	$32, R4
+	BEQ	R0, R4, end
+
+xor_16_check:
+	SGTU	$16, R4, R5
+	BNE	R5, xor_8_check
+xor_16:
+	MOVV	(R2), R6
+	MOVV	8(R2), R7
+	MOVV	(R3), R8
+	MOVV	8(R3), R9
+	XOR	R6, R8
+	XOR	R7, R9
+	MOVV	R8, (R1)
+	MOVV	R9, 8(R1)
+	ADDV	$16, R2
+	ADDV	$16, R3
+	ADDV	$16, R1
+	SUBV	$16, R4
+	BEQ	R0, R4, end
+
+xor_8_check:
+	SGTU	$8, R4, R5
+	BNE	R5, xor_4_check
+xor_8:
+	MOVV	(R2), R6
+	MOVV	(R3), R7
+	XOR	R6, R7
+	MOVV	R7, (R1)
+	ADDV	$8, R1
+	ADDV	$8, R2
+	ADDV	$8, R3
+	SUBV	$8, R4
+	BEQ	R0, R4, end
+
+xor_4_check:
+	SGTU	$4, R4, R5
+	BNE	R5, xor_2_check
+xor_4:
+	MOVW	(R2), R6
+	MOVW	(R3), R7
+	XOR	R6, R7
+	MOVW	R7, (R1)
+	ADDV	$4, R2
+	ADDV	$4, R3
+	ADDV	$4, R1
+	SUBV	$4, R4
+	BEQ	R0, R4, end
+
+xor_2_check:
+	SGTU	$2, R4, R5
+	BNE	R5, xor_1
+xor_2:
+	MOVH	(R2), R6
+	MOVH	(R3), R7
+	XOR	R6, R7
+	MOVH	R7, (R1)
+	ADDV	$2, R2
+	ADDV	$2, R3
+	ADDV	$2, R1
+	SUBV	$2, R4
+	BEQ	R0, R4, end
+
+xor_1:
+	MOVB	(R2), R6
+	MOVB	(R3), R7
+	XOR	R6, R7
+	MOVB	R7, (R1)
+
+end:
+	RET
-- 
GitLab