diff --git a/src/math/big/arith_riscv64.s b/src/math/big/arith_riscv64.s
index 44580338b3547617c5254d040a71f25590f08edc..069a4080f475327cbb2f7686793e7d01942c4be0 100644
--- a/src/math/big/arith_riscv64.s
+++ b/src/math/big/arith_riscv64.s
@@ -375,5 +375,96 @@
 done:
 	RET
 
 TEXT ·addMulVVW(SB),NOSPLIT,$0
-	JMP ·addMulVVW_g(SB)
+	MOV x+24(FP), X5	// X5 = address of the current x word (advanced as words are consumed)
+	MOV y+48(FP), X6	// X6 = y, the multiplier applied to every x word
+	MOV z+0(FP), X7	// X7 = address of the current z word (read, then overwritten in place)
+	MOV z_len+8(FP), X30	// X30 = number of words still to process
+
+	MOV $4, X28	// X28 = 4, the unroll width that X30 is compared against
+	MOV $0, X29	// c = 0; X29 holds the running carry word throughout
+
+	BEQZ X30, done	// zero words: return c = 0 without touching memory
+	BLTU X30, X28, loop1	// fewer than 4 words: skip the unrolled loop
+
+loop4:	// unrolled loop: z[i] = x[i]*y + z[i] + c for four consecutive words
+	MOV 0(X5), X8	// x[0]
+	MOV 0(X7), X10	// z[0]
+	MOV 8(X5), X11	// x[1]
+	MOV 8(X7), X13	// z[1]
+	MOV 16(X5), X14	// x[2]
+	MOV 16(X7), X16	// z[2]
+	MOV 24(X5), X17	// x[3]
+	MOV 24(X7), X19	// z[3]
+
+	MULHU X8, X6, X9	// hi[0] = high half of x[0] * y (unsigned)
+	MUL X8, X6, X8	// lo[0] = low half of x[0] * y
+	ADD X8, X10, X21	// t = lo[0] + z[0]
+	SLTU X8, X21, X22	// carry out of lo[0] + z[0], i.e. t < lo[0]
+	ADD X9, X22, X9	// hi[0] += carry
+	ADD X21, X29, X10	// z[0] = t + c
+	SLTU X21, X10, X22	// carry out of t + c, i.e. z[0] < t
+	ADD X9, X22, X29	// next c = hi[0] + carry
+
+	MULHU X11, X6, X12	// hi[1] = high half of x[1] * y (unsigned)
+	MUL X11, X6, X11	// lo[1] = low half of x[1] * y
+	ADD X11, X13, X21	// t = lo[1] + z[1]
+	SLTU X11, X21, X22	// carry out of lo[1] + z[1], i.e. t < lo[1]
+	ADD X12, X22, X12	// hi[1] += carry
+	ADD X21, X29, X13	// z[1] = t + c
+	SLTU X21, X13, X22	// carry out of t + c, i.e. z[1] < t
+	ADD X12, X22, X29	// next c = hi[1] + carry
+
+	MULHU X14, X6, X15	// hi[2] = high half of x[2] * y (unsigned)
+	MUL X14, X6, X14	// lo[2] = low half of x[2] * y
+	ADD X14, X16, X21	// t = lo[2] + z[2]
+	SLTU X14, X21, X22	// carry out of lo[2] + z[2], i.e. t < lo[2]
+	ADD X15, X22, X15	// hi[2] += carry
+	ADD X21, X29, X16	// z[2] = t + c
+	SLTU X21, X16, X22	// carry out of t + c, i.e. z[2] < t
+	ADD X15, X22, X29	// next c = hi[2] + carry
+
+	MULHU X17, X6, X18	// hi[3] = high half of x[3] * y (unsigned)
+	MUL X17, X6, X17	// lo[3] = low half of x[3] * y
+	ADD X17, X19, X21	// t = lo[3] + z[3]
+	SLTU X17, X21, X22	// carry out of lo[3] + z[3], i.e. t < lo[3]
+	ADD X18, X22, X18	// hi[3] += carry
+	ADD X21, X29, X19	// z[3] = t + c
+	SLTU X21, X19, X22	// carry out of t + c, i.e. z[3] < t
+	ADD X18, X22, X29	// next c = hi[3] + carry; this is the carry into the next iteration
+	MOV X10, 0(X7)	// store z[0]
+	MOV X13, 8(X7)	// store z[1]
+	MOV X16, 16(X7)	// store z[2]
+	MOV X19, 24(X7)	// store z[3]
+
+	ADD $32, X5	// advance x by 4 words (4 * 8 bytes)
+	ADD $32, X7	// advance z by 4 words (4 * 8 bytes)
+	SUB $4, X30	// 4 fewer words remain
+
+	BGEU X30, X28, loop4	// at least 4 words remain: run another unrolled pass
+	BEQZ X30, done	// no tail words left
+
+loop1:	// one word per iteration; entered only with 1..3 words remaining
+	MOV 0(X5), X10	// x
+	MOV 0(X7), X11	// z
+
+	MULHU X10, X6, X12	// hi = high half of x * y (unsigned)
+	MUL X10, X6, X10	// lo = low half of x * y
+	ADD X10, X11, X13	// t = lo + z
+	SLTU X10, X13, X15	// carry out of lo + z, i.e. t < lo
+	ADD X12, X15, X12	// hi += carry
+	ADD X13, X29, X10	// z = t + c
+	SLTU X13, X10, X15	// carry out of t + c, i.e. z < t
+	ADD X12, X15, X29	// next c = hi + carry
+
+	MOV X10, 0(X7)	// store z
+
+	ADD $8, X5	// advance x by one word
+	ADD $8, X7	// advance z by one word
+	SUB $1, X30	// one fewer word remains
+
+	BNEZ X30, loop1	// repeat until every remaining word is processed
+
+done:
+	MOV X29, c+56(FP)	// return c, the final carry word
+	RET