Skip to content
Snippets Groups Projects
Commit ecdd429a authored by limeidan's avatar limeidan Committed by abner chenc
Browse files

runtime: optimize the function memequal using SIMD on loong64

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                              │      old      │                 new                  │
                              │    sec/op     │    sec/op     vs base                │
Equal/0                          0.4012n ± 0%   0.4003n ± 0%   -0.21% (p=0.000 n=10)
Equal/same/1                      2.555n ± 1%    2.419n ± 0%   -5.32% (p=0.000 n=10)
Equal/same/6                      2.574n ± 1%    2.425n ± 1%   -5.79% (p=0.000 n=10)
Equal/same/9                      2.578n ± 0%    2.419n ± 1%   -6.19% (p=0.000 n=10)
Equal/same/15                     2.565n ± 1%    2.417n ± 0%   -5.73% (p=0.000 n=10)
Equal/same/16                     2.576n ± 1%    2.414n ± 0%   -6.31% (p=0.000 n=10)
Equal/same/20                     2.573n ± 1%    2.416n ± 0%   -6.10% (p=0.000 n=10)
Equal/same/32                     2.559n ± 0%    2.411n ± 0%   -5.80% (p=0.000 n=10)
Equal/same/4K                     2.579n ± 1%    2.410n ± 0%   -6.53% (p=0.000 n=10)
Equal/same/4M                     2.571n ± 0%    2.411n ± 0%   -6.22% (p=0.000 n=10)
Equal/same/64M                    2.568n ± 1%    2.413n ± 0%   -6.05% (p=0.000 n=10)
Equal/1                           5.215n ± 0%    6.404n ± 0%  +22.80% (p=0.000 n=10)
Equal/6                          11.630n ± 0%    6.404n ± 0%  -44.94% (p=0.000 n=10)
Equal/9                          15.240n ± 0%    6.404n ± 0%  -57.98% (p=0.000 n=10)
Equal/15                         22.925n ± 0%    6.404n ± 0%  -72.07% (p=0.000 n=10)
Equal/16                         24.070n ± 0%    5.203n ± 0%  -78.38% (p=0.000 n=10)
Equal/20                         28.880n ± 0%    6.404n ± 0%  -77.83% (p=0.000 n=10)
Equal/32                         43.320n ± 0%    6.404n ± 0%  -85.22% (p=0.000 n=10)
Equal/4K                        4938.50n ± 0%    55.43n ± 0%  -98.88% (p=0.000 n=10)
Equal/4M                         5048.8µ ± 0%    202.0µ ± 0%  -96.00% (p=0.000 n=10)
Equal/64M                        80.819m ± 0%    4.539m ± 0%  -94.38% (p=0.000 n=10)
EqualBothUnaligned/64_0          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_1          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_4          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_7          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/4096_0       4937.00n ± 0%    65.64n ± 0%  -98.67% (p=0.000 n=10)
EqualBothUnaligned/4096_1       4937.00n ± 0%    78.85n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_4       4937.00n ± 0%    78.87n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_7       4937.00n ± 0%    78.87n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4194304_0     5049.2µ ± 0%    204.2µ ± 0%  -95.96% (p=0.000 n=10)
EqualBothUnaligned/4194304_1     5049.2µ ± 0%    205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_4     5049.4µ ± 0%    205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_7     5049.2µ ± 0%    205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/67108864_0    80.796m ± 0%    3.863m ± 0%  -95.22% (p=0.000 n=10)
EqualBothUnaligned/67108864_1    80.801m ± 0%    3.706m ± 0%  -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_4    80.799m ± 0%    3.706m ± 0%  -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_7    80.781m ± 0%    3.706m ± 0%  -95.41% (p=0.000 n=10)
geomean                           1.040µ         149.6n       -85.63%

Change-Id: Id4c2bc0ca758337dd9759df83750c761814be488
Reviewed-on: https://go-review.googlesource.com/c/go/+/667255


Reviewed-by: default avatarabner chenc <chenguoqi@loongson.cn>
Reviewed-by: default avatarMichael Pratt <mpratt@google.com>
Reviewed-by: default avatarsophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: default avatarJunyang Shao <shaojunyang@google.com>
parent 93e4e26d
Branches
No related tags found
No related merge requests found
...@@ -8,37 +8,266 @@ ...@@ -8,37 +8,266 @@
#define REGCTXT R29 #define REGCTXT R29
// memequal(a, b unsafe.Pointer, size uintptr) bool // memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25 TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
BEQ R4, R5, eq // R4 = a_base
ADDV R4, R6, R7 // R5 = b_base
PCALIGN $16 // R6 = size
loop: JMP equalbody<>(SB)
BNE R4, R7, test
MOVV $1, R4
RET
test:
MOVBU (R4), R9
ADDV $1, R4
MOVBU (R5), R10
ADDV $1, R5
BEQ R9, R10, loop
MOVB R0, R4
RET
eq:
MOVV $1, R4
RET
// memequal_varlen(a, b unsafe.Pointer) bool // memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$40-17 TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0
BEQ R4, R5, eq // R4 = a_base
// R5 = b_base
MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure MOVV 8(REGCTXT), R6 // compiler stores size at offset 8 in the closure
MOVV R4, 8(R3) JMP equalbody<>(SB)
MOVV R5, 16(R3)
MOVV R6, 24(R3) // input:
JAL runtime·memequal(SB) // R4 = a_base
MOVBU 32(R3), R4 // R5 = b_base
RET // R6 = size
eq: TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
// a_base == b_base
BEQ R4, R5, equal
// 0 bytes
BEQ R6, equal
MOVV $64, R7
BGE R6, R7, lasx
// size < 64 bytes
tail:
MOVV $16, R7
BLT R6, R7, lt_16
generic16_loop:
ADDV $-16, R6
MOVV 0(R4), R8
MOVV 8(R4), R9
MOVV 0(R5), R10
MOVV 8(R5), R11
BNE R8, R10, not_equal
BNE R9, R11, not_equal
BEQ R6, equal
ADDV $16, R4
ADDV $16, R5
BGE R6, R7, generic16_loop
// size < 16 bytes
lt_16:
MOVV $8, R7
BLT R6, R7, lt_8
ADDV $-8, R6
MOVV 0(R4), R8
MOVV 0(R5), R9
BNE R8, R9, not_equal
BEQ R6, equal
ADDV $8, R4
ADDV $8, R5
// size < 8 bytes
lt_8:
MOVV $4, R7
BLT R6, R7, lt_4
ADDV $-4, R6
MOVW 0(R4), R8
MOVW 0(R5), R9
BNE R8, R9, not_equal
BEQ R6, equal
ADDV $4, R4
ADDV $4, R5
// size < 4 bytes
lt_4:
MOVV $2, R7
BLT R6, R7, lt_2
ADDV $-2, R6
MOVH 0(R4), R8
MOVH 0(R5), R9
BNE R8, R9, not_equal
BEQ R6, equal
ADDV $2, R4
ADDV $2, R5
// size < 2 bytes
lt_2:
MOVB 0(R4), R8
MOVB 0(R5), R9
BNE R8, R9, not_equal
equal:
MOVV $1, R4 MOVV $1, R4
RET RET
not_equal:
MOVV R0, R4
RET
// Implemented using 256-bit SIMD instructions
lasx:
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
BEQ R7, lsx
lasx256:
MOVV $256, R7
BLT R6, R7, lasx64
lasx256_loop:
ADDV $-256, R6
XVMOVQ 0(R4), X0
XVMOVQ 32(R4), X1
XVMOVQ 64(R4), X2
XVMOVQ 96(R4), X3
XVMOVQ 128(R4), X4
XVMOVQ 160(R4), X5
XVMOVQ 192(R4), X6
XVMOVQ 224(R4), X7
XVMOVQ 0(R5), X8
XVMOVQ 32(R5), X9
XVMOVQ 64(R5), X10
XVMOVQ 96(R5), X11
XVMOVQ 128(R5), X12
XVMOVQ 160(R5), X13
XVMOVQ 192(R5), X14
XVMOVQ 224(R5), X15
XVSEQV X0, X8, X0
XVSEQV X1, X9, X1
XVSEQV X2, X10, X2
XVSEQV X3, X11, X3
XVSEQV X4, X12, X4
XVSEQV X5, X13, X5
XVSEQV X6, X14, X6
XVSEQV X7, X15, X7
XVANDV X0, X1, X0
XVANDV X2, X3, X2
XVANDV X4, X5, X4
XVANDV X6, X7, X6
XVANDV X0, X2, X0
XVANDV X4, X6, X4
XVANDV X0, X4, X0
XVSETALLNEV X0, FCC0
BFPF not_equal
BEQ R6, equal
ADDV $256, R4
ADDV $256, R5
BGE R6, R7, lasx256_loop
lasx64:
MOVV $64, R7
BLT R6, R7, tail
lasx64_loop:
ADDV $-64, R6
XVMOVQ 0(R4), X0
XVMOVQ 32(R4), X1
XVMOVQ 0(R5), X2
XVMOVQ 32(R5), X3
XVSEQV X0, X2, X0
XVSEQV X1, X3, X1
XVANDV X0, X1, X0
XVSETALLNEV X0, FCC0
BFPF not_equal
BEQ R6, equal
ADDV $64, R4
ADDV $64, R5
BGE R6, R7, lasx64_loop
JMP tail
// Implemented using 128-bit SIMD instructions
lsx:
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
BEQ R7, generic64_loop
lsx128:
MOVV $128, R7
BLT R6, R7, lsx32
lsx128_loop:
ADDV $-128, R6
VMOVQ 0(R4), V0
VMOVQ 16(R4), V1
VMOVQ 32(R4), V2
VMOVQ 48(R4), V3
VMOVQ 64(R4), V4
VMOVQ 80(R4), V5
VMOVQ 96(R4), V6
VMOVQ 112(R4), V7
VMOVQ 0(R5), V8
VMOVQ 16(R5), V9
VMOVQ 32(R5), V10
VMOVQ 48(R5), V11
VMOVQ 64(R5), V12
VMOVQ 80(R5), V13
VMOVQ 96(R5), V14
VMOVQ 112(R5), V15
VSEQV V0, V8, V0
VSEQV V1, V9, V1
VSEQV V2, V10, V2
VSEQV V3, V11, V3
VSEQV V4, V12, V4
VSEQV V5, V13, V5
VSEQV V6, V14, V6
VSEQV V7, V15, V7
VANDV V0, V1, V0
VANDV V2, V3, V2
VANDV V4, V5, V4
VANDV V6, V7, V6
VANDV V0, V2, V0
VANDV V4, V6, V4
VANDV V0, V4, V0
VSETALLNEV V0, FCC0
BFPF not_equal
BEQ R6, equal
ADDV $128, R4
ADDV $128, R5
BGE R6, R7, lsx128_loop
lsx32:
MOVV $32, R7
BLT R6, R7, tail
lsx32_loop:
ADDV $-32, R6
VMOVQ 0(R4), V0
VMOVQ 16(R4), V1
VMOVQ 0(R5), V2
VMOVQ 16(R5), V3
VSEQV V0, V2, V0
VSEQV V1, V3, V1
VANDV V0, V1, V0
VSETALLNEV V0, FCC0
BFPF not_equal
BEQ R6, equal
ADDV $32, R4
ADDV $32, R5
BGE R6, R7, lsx32_loop
JMP tail
// Implemented using general instructions
generic64_loop:
ADDV $-64, R6
MOVV 0(R4), R7
MOVV 8(R4), R8
MOVV 16(R4), R9
MOVV 24(R4), R10
MOVV 0(R5), R15
MOVV 8(R5), R16
MOVV 16(R5), R17
MOVV 24(R5), R18
BNE R7, R15, not_equal
BNE R8, R16, not_equal
BNE R9, R17, not_equal
BNE R10, R18, not_equal
MOVV 32(R4), R11
MOVV 40(R4), R12
MOVV 48(R4), R13
MOVV 56(R4), R14
MOVV 32(R5), R19
MOVV 40(R5), R20
MOVV 48(R5), R21
MOVV 56(R5), R23
BNE R11, R19, not_equal
BNE R12, R20, not_equal
BNE R13, R21, not_equal
BNE R14, R23, not_equal
BEQ R6, equal
ADDV $64, R4
ADDV $64, R5
MOVV $64, R7
BGE R6, R7, generic64_loop
JMP tail
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment