Skip to content
Snippets Groups Projects
Commit 7bc714d6 authored by limeidan, committed by abner chenc
Browse files

internal/bytealg: optimize the function indexbyte using SIMD on loong64

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3C5000 @ 2200.00MHz
              │     old      │                 new                 │
              │    sec/op    │   sec/op     vs base                │
IndexByte/10     19.32n ± 0%   11.84n ± 0%  -38.72% (p=0.000 n=10)
IndexByte/32     49.34n ± 0%   14.11n ± 0%  -71.40% (p=0.000 n=10)
IndexByte/4K    5608.0n ± 0%   138.8n ± 0%  -97.52% (p=0.000 n=10)
IndexByte/4M    3822.8µ ± 0%   119.4µ ± 0%  -96.88% (p=0.000 n=10)
IndexByte/64M   61.826m ± 1%   3.812m ± 0%  -93.83% (p=0.000 n=10)
geomean          16.61µ        1.602µ       -90.35%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
              │      old      │                 new                  │
              │    sec/op     │    sec/op     vs base                │
IndexByte/10      6.809n ± 0%   5.804n ±  0%  -14.75% (p=0.000 n=10)
IndexByte/32     16.015n ± 0%   6.404n ±  0%  -60.01% (p=0.000 n=10)
IndexByte/4K    1651.00n ± 0%   52.83n ±  0%  -96.80% (p=0.000 n=10)
IndexByte/4M    1680.76µ ± 0%   91.10µ ±  0%  -94.58% (p=0.000 n=10)
IndexByte/64M    26.878m ± 0%   2.010m ± 27%  -92.52% (p=0.000 n=10)
geomean           6.054µ        815.0n        -86.54%

Change-Id: Ib75b997249708f921c6717eba43543c6650bf376
Reviewed-on: https://go-review.googlesource.com/c/go/+/668055


Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
parent d65c209b
No related branches found
No related tags found
No related merge requests found
......@@ -5,48 +5,288 @@
#include "go_asm.h"
#include "textflag.h"
// IndexByte entry point: normalizes the search byte and hands off to the
// shared search routine indexbytebody.
//
// input:
// R4 = b_base
// R5 = b_len
// R6 = b_cap (unused)
// R7 = byte to find
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
// Keep only the low 8 bits of R7 so the body can compare it directly
// against zero-extended bytes.
AND $0xff, R7
// Tail-jump: indexbytebody returns straight to our caller.
JMP indexbytebody<>(SB)
// IndexByteString entry point: moves the search byte into the register the
// shared search routine indexbytebody expects and jumps to it.
//
// input:
// R4 = s_base
// R5 = s_len
// R6 = byte to find
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
AND $0xff, R6, R7 // R7 = low 8 bits of R6 (byte to find), as indexbytebody expects
// Tail-jump: indexbytebody returns straight to our caller.
JMP indexbytebody<>(SB)
// indexbytebody scans len bytes starting at b_base for the first byte equal
// to R7 and leaves the result in R4: the zero-based index of the match, or
// -1 when there is none.
//
// input:
//   R4: b_base
//   R5: len
//   R7: byte to find (already reduced to its low 8 bits by the callers)
// output:
//   R4: index of the first match, or -1
// registers used here:
//   R6:  copy of b_base, subtracted at the end to turn a pointer into an index
//   R8:  end pointer (b_base + len)
//   R9:  current chunk-size threshold
//   R10: loaded data
//   R11: extracted byte
TEXT indexbytebody<>(SB),NOSPLIT,$0
	BEQ	R5, notfound	// len == 0

	MOVV	R4, R6		// store base for later
	ADDV	R4, R5, R8	// end

	// Inputs of at least 32 bytes try the vector paths first.
	MOVV	$32, R9
	BGE	R5, R9, lasx

	// Scalar path. Invariant: R4 = current pointer, R5 = bytes remaining,
	// R8 = end pointer.
tail:
	MOVV	$8, R9
	BLT	R5, R9, lt_8

	// Load 8 bytes at once and test each byte lane in turn; the lane held in
	// bits [8k+7:8k] is reported at offset k from R4.
generic8_loop:
	MOVV	(R4), R10

	AND	$0xff, R10, R11
	BEQ	R7, R11, found

	BSTRPICKV	$15, R10, $8, R11
	BEQ	R7, R11, byte_1th

	BSTRPICKV	$23, R10, $16, R11
	BEQ	R7, R11, byte_2th

	BSTRPICKV	$31, R10, $24, R11
	BEQ	R7, R11, byte_3th

	BSTRPICKV	$39, R10, $32, R11
	BEQ	R7, R11, byte_4th

	BSTRPICKV	$47, R10, $40, R11
	BEQ	R7, R11, byte_5th

	BSTRPICKV	$55, R10, $48, R11
	BEQ	R7, R11, byte_6th

	BSTRPICKV	$63, R10, $56, R11
	BEQ	R7, R11, byte_7th

	ADDV	$8, R4
	ADDV	$-8, R5
	BGE	R5, R9, generic8_loop

	// Fewer than 8 bytes left: walk byte by byte until the end pointer.
lt_8:
	BEQ	R4, R8, notfound
	MOVBU	(R4), R10
	BEQ	R7, R10, found
	ADDV	$1, R4
	JMP	lt_8

	// Match in lane k of the 8-byte word: advance the pointer by k, then
	// convert it to an index by removing the base.
byte_1th:
	ADDV	$1, R4
	SUBV	R6, R4
	RET

byte_2th:
	ADDV	$2, R4
	SUBV	R6, R4
	RET

byte_3th:
	ADDV	$3, R4
	SUBV	R6, R4
	RET

byte_4th:
	ADDV	$4, R4
	SUBV	R6, R4
	RET

byte_5th:
	ADDV	$5, R4
	SUBV	R6, R4
	RET

byte_6th:
	ADDV	$6, R4
	SUBV	R6, R4
	RET

byte_7th:
	ADDV	$7, R4
	// fall through to found

	// R4 points at the matching byte.
found:
	SUBV	R6, R4		// remove base
	RET

notfound:
	MOVV	$-1, R4
	RET

lasx:
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
BEQ R9, lsx
XVMOVQ R7, X0.B32
MOVV $128, R9
BLT R5, R9, lasx32_loop
lasx128_loop:
XVMOVQ 0(R4), X1
XVMOVQ 32(R4), X2
XVMOVQ 64(R4), X3
XVMOVQ 96(R4), X4
XVSEQB X1, X0, X1
XVSETNEV X1, FCC0
BFPT lasx_found_add_0
XVSEQB X2, X0, X1
XVSETNEV X1, FCC0
BFPT lasx_found_add_32
XVSEQB X3, X0, X1
XVSETNEV X1, FCC0
BFPT lasx_found_add_64
XVSEQB X4, X0, X1
XVSETNEV X1, FCC0
BFPT lasx_found_add_96
ADDV $128, R4
ADDV $-128, R5
BGE R5, R9, lasx128_loop
BEQ R5, notfound
MOVV $32, R9
BLT R5, R9, tail
lasx32_loop:
XVMOVQ 0(R4), X1
XVSEQB X1, X0, X1
XVSETNEV X1, FCC0
BFPT lasx_found_add_0
ADDV $32, R4
ADDV $-32, R5
BGE R5, R9, lasx32_loop
BEQ R5, notfound
JMP tail
lasx_found_add_0:
MOVV R0, R11
JMP lasx_index_cal
lasx_found_add_32:
MOVV $32, R11
JMP lasx_index_cal
lasx_found_add_64:
MOVV $64, R11
JMP lasx_index_cal
lasx_found_add_96:
MOVV $96, R11
JMP lasx_index_cal
lasx_index_cal:
MOVV $64, R9
XVMOVQ X1.V[0], R10
CTZV R10, R10
BNE R10, R9, index_cal
ADDV $8, R11
XVMOVQ X1.V[1], R10
CTZV R10, R10
BNE R10, R9, index_cal
ADDV $8, R11
XVMOVQ X1.V[2], R10
CTZV R10, R10
BNE R10, R9, index_cal
ADDV $8, R11
XVMOVQ X1.V[3], R10
CTZV R10, R10
JMP index_cal
lsx:
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
BEQ R9, tail
VMOVQ R7, V0.B16
MOVV $64, R9
BLT R5, R9, lsx16_loop
lsx64_loop:
VMOVQ 0(R4), V1
VMOVQ 16(R4), V2
VMOVQ 32(R4), V3
VMOVQ 48(R4), V4
VSEQB V1, V0, V1
VSETNEV V1, FCC0
BFPT lsx_found_add_0
VSEQB V2, V0, V1
VSETNEV V1, FCC0
BFPT lsx_found_add_16
VSEQB V3, V0, V1
VSETNEV V1, FCC0
BFPT lsx_found_add_32
VSEQB V4, V0, V1
VSETNEV V1, FCC0
BFPT lsx_found_add_48
ADDV $64, R4
ADDV $-64, R5
BGE R5, R9, lsx64_loop
BEQ R5, notfound
MOVV $16, R9
BLT R5, R9, tail
lsx16_loop:
VMOVQ 0(R4), V1
VSEQB V1, V0, V1
VSETNEV V1, FCC0
BFPT lsx_found_add_0
ADDV $16, R4
ADDV $-16, R5
BGE R5, R9, lsx16_loop
BEQ R5, notfound
JMP tail
lsx_found_add_0:
MOVV R0, R11
JMP lsx_index_cal
lsx_found_add_16:
MOVV $16, R11
JMP lsx_index_cal
lsx_found_add_32:
MOVV $32, R11
JMP lsx_index_cal
lsx_found_add_48:
MOVV $48, R11
JMP lsx_index_cal
lsx_index_cal:
MOVV $64, R9
VMOVQ V1.V[0], R10
CTZV R10, R10
BNE R10, R9, index_cal
ADDV $8, R11
VMOVQ V1.V[1], R10
CTZV R10, R10
JMP index_cal
// Shared exit for the vector paths.
// On entry: R10 = trailing-zero bit count of the matching 64-bit mask element,
// R11 = byte offset of that element relative to R4.
index_cal:
SRLV $3, R10 // bit position -> byte position within the element
ADDV R11, R10 // add the element's byte offset
ADDV R10, R4 // R4 now points at the matching byte
JMP found // found subtracts the saved base (R6) and returns
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment