diff --git a/src/crypto/aes/aes_gcm.go b/src/crypto/aes/aes_gcm.go
index 13ae2fcb82032b73a356a61cde04c119c2320900..49b78c3a8becf3c3c0f60c97fe4ec4ade8ca59c6 100644
--- a/src/crypto/aes/aes_gcm.go
+++ b/src/crypto/aes/aes_gcm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64
+// +build amd64 arm64
 
 package aes
 
@@ -13,10 +13,7 @@ import (
 	"errors"
 )
 
-// The following functions are defined in gcm_amd64.s.
-
-//go:noescape
-func aesEncBlock(dst, src *[16]byte, ks []uint32)
+// The following functions are defined in gcm_*.s.
 
 //go:noescape
 func gcmAesInit(productTable *[256]byte, ks []uint32)
@@ -118,7 +115,7 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
 		gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
 	}
 
-	aesEncBlock(&tagMask, &counter, g.ks)
+	encryptBlockAsm(len(g.ks)/4-1, &g.ks[0], &tagMask[0], &counter[0])
 
 	var tagOut [gcmTagSize]byte
 	gcmAesData(&g.productTable, data, &tagOut)
@@ -171,7 +168,7 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
 		gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
 	}
 
-	aesEncBlock(&tagMask, &counter, g.ks)
+	encryptBlockAsm(len(g.ks)/4-1, &g.ks[0], &tagMask[0], &counter[0])
 
 	var expectedTag [gcmTagSize]byte
 	gcmAesData(&g.productTable, data, &expectedTag)
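Note on the replacement call: an expanded AES key schedule holds 4*(rounds+1) uint32 words, so len(g.ks)/4-1 recovers the round count that encryptBlockAsm takes as its first argument. A minimal sketch of that relationship, with an illustrative helper name not present in the patch:

    // rounds maps the length of an expanded key schedule, in uint32 words,
    // to the AES round count: 44 -> 10 (AES-128), 52 -> 12 (AES-192),
    // 60 -> 14 (AES-256).
    func rounds(ksLen int) int { return ksLen/4 - 1 }
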
diff --git a/src/crypto/aes/asm_arm64.s b/src/crypto/aes/asm_arm64.s
index d2e8c8597f9c6323ba766763b7fe61e724b78eb4..13aee5ca299f53efd8b872130167a8060340d6ca 100644
--- a/src/crypto/aes/asm_arm64.s
+++ b/src/crypto/aes/asm_arm64.s
@@ -3,7 +3,12 @@
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
-
+DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
+DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
+GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
+DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
+DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
+GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
 // func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
 TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	MOVD	nr+0(FP), R9
@@ -105,3 +110,172 @@ dec128:
 	VEOR    V0.B16, V15.B16, V0.B16
 	VST1	[V0.B16], (R11)
 	RET
+
+// func expandKeyAsm(nr int, key *byte, enc, dec *uint32)
+// Note that round keys are stored in uint128 format, not uint32
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+	MOVD	nr+0(FP), R8
+	MOVD	key+8(FP), R9
+	MOVD	enc+16(FP), R10
+	MOVD	dec+24(FP), R11
+	LDP	rotInvSRows<>(SB), (R0, R1)
+	VMOV	R0, V3.D[0]
+	VMOV	R1, V3.D[1]
+	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
+	MOVW	$1, R13
+	TBZ	$1, R8, ks192
+	TBNZ	$2, R8, ks256
+	LDPW	(R9), (R4, R5)
+	LDPW	8(R9), (R6, R7)
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	MOVW	$0x1b, R14
+ks128Loop:
+		VMOV	R7, V2.S[0]
+		WORD	$0x4E030042       // TBL V3.B16, [V2.B16], V2.B16
+		AESE	V0.B16, V2.B16    // Use AES to compute the SBOX
+		EORW	R13, R4
+		LSLW	$1, R13           // Compute next Rcon
+		ANDSW	$0x100, R13, ZR
+		CSELW	NE, R14, R13, R13 // Fake modulo
+		SUBS	$1, R8
+		VMOV	V2.S[0], R0
+		EORW	R0, R4
+		EORW	R4, R5
+		EORW	R5, R6
+		EORW	R6, R7
+		STPW.P	(R4, R5), 8(R10)
+		STPW.P	(R6, R7), 8(R10)
+	BNE	ks128Loop
+	CBZ	R11, ksDone       // If dec is nil we are done
+	SUB	$176, R10
+	// Decryption keys are encryption keys with InverseMixColumns applied
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
+	AESIMC	V0.B16, V14.B16
+	AESIMC	V1.B16, V13.B16
+	VMOV	V2.B16, V12.B16
+	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+	B	ksDone
+ks192:
+	LDPW	(R9), (R2, R3)
+	LDPW	8(R9), (R4, R5)
+	LDPW	16(R9), (R6, R7)
+	STPW.P	(R2, R3), 8(R10)
+	STPW.P	(R4, R5), 8(R10)
+	SUB	$4, R8
+ks192Loop:
+		STPW.P	(R6, R7), 8(R10)
+		VMOV	R7, V2.S[0]
+		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
+		AESE	V0.B16, V2.B16
+		EORW	R13, R2
+		LSLW	$1, R13
+		SUBS	$1, R8
+		VMOV	V2.S[0], R0
+		EORW	R0, R2
+		EORW	R2, R3
+		EORW	R3, R4
+		EORW	R4, R5
+		EORW	R5, R6
+		EORW	R6, R7
+		STPW.P	(R2, R3), 8(R10)
+		STPW.P	(R4, R5), 8(R10)
+	BNE	ks192Loop
+	CBZ	R11, ksDone
+	SUB	$208, R10
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V15.B16
+	AESIMC	V1.B16, V14.B16
+	AESIMC	V2.B16, V13.B16
+	AESIMC	V3.B16, V12.B16
+	VLD1	(R10), [V0.B16]
+	VST1.P	[V0.B16], 16(R11)
+	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+	B	ksDone
+ks256:
+	LDP	invSRows<>(SB), (R0, R1)
+	VMOV	R0, V4.D[0]
+	VMOV	R1, V4.D[1]
+	LDPW	(R9), (R0, R1)
+	LDPW	8(R9), (R2, R3)
+	LDPW	16(R9), (R4, R5)
+	LDPW	24(R9), (R6, R7)
+	STPW.P	(R0, R1), 8(R10)
+	STPW.P	(R2, R3), 8(R10)
+	SUB	$7, R8
+ks256Loop:
+		STPW.P	(R4, R5), 8(R10)
+		STPW.P	(R6, R7), 8(R10)
+		VMOV	R7, V2.S[0]
+		WORD	$0x4E030042 //TBL	V3.B16, [V2.B16], V2.B16
+		AESE	V0.B16, V2.B16
+		EORW	R13, R0
+		LSLW	$1, R13
+		SUBS	$1, R8
+		VMOV	V2.S[0], R9
+		EORW	R9, R0
+		EORW	R0, R1
+		EORW	R1, R2
+		EORW	R2, R3
+		VMOV	R3, V2.S[0]
+		WORD	$0x4E040042 //TBL	V4.B16, [V2.B16], V2.B16
+		AESE	V0.B16, V2.B16
+		VMOV	V2.S[0], R9
+		EORW	R9, R4
+		EORW	R4, R5
+		EORW	R5, R6
+		EORW	R6, R7
+		STPW.P	(R0, R1), 8(R10)
+		STPW.P	(R2, R3), 8(R10)
+	BNE	ks256Loop
+	CBZ	R11, ksDone
+	SUB	$240, R10
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V15.B16
+	AESIMC	V1.B16, V14.B16
+	AESIMC	V2.B16, V13.B16
+	AESIMC	V3.B16, V12.B16
+	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
+	AESIMC	V0.B16, V18.B16
+	AESIMC	V1.B16, V17.B16
+	VMOV	V2.B16, V16.B16
+	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
+	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+ksDone:
+	RET
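The key-expansion loops above lean on two tricks worth spelling out: AESE with an all-zero state key reduces to SubBytes plus ShiftRows, and the preceding TBL through rotInvSRows applies RotWord composed with the inverse ShiftRows, so lane 0 comes out holding SubWord(RotWord(w)); the LSLW/ANDSW/CSELW triple doubles Rcon in GF(2^8). A scalar sketch of one AES-128 schedule step in the FIPS 197 big-endian word convention (the assembly works on little-endian words instead; names here are illustrative):

    var sbox [256]byte // the AES S-box, elided here

    func subWord(w uint32) uint32 { // apply the S-box to each byte
        return uint32(sbox[w>>24])<<24 | uint32(sbox[w>>16&0xff])<<16 |
            uint32(sbox[w>>8&0xff])<<8 | uint32(sbox[w&0xff])
    }

    // nextRoundKey128 is the plain-Go analogue of one ks128Loop iteration.
    func nextRoundKey128(w *[4]uint32, rcon *uint32) {
        t := w[3]
        t = subWord(t<<8 | t>>24) // SubWord(RotWord(t)), cf. the TBL + AESE pair
        w[0] ^= t ^ *rcon<<24
        w[1] ^= w[0]
        w[2] ^= w[1]
        w[3] ^= w[2]
        *rcon <<= 1
        if *rcon&0x100 != 0 {
            *rcon = 0x1b // GF(2^8) reduction, cf. the CSELW "fake modulo"
        }
    }
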
diff --git a/src/crypto/aes/cipher_arm64.go b/src/crypto/aes/cipher_arm64.go
deleted file mode 100644
index a03547841f26822cc31b8d2586451b901f2ea3f6..0000000000000000000000000000000000000000
--- a/src/crypto/aes/cipher_arm64.go
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package aes
-
-import (
-	"crypto/cipher"
-	"crypto/internal/subtle"
-	"internal/cpu"
-	"math/bits"
-)
-
-// defined in asm_arm64.s
-//go:noescape
-func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
-
-//go:noescape
-func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
-
-type aesCipherAsm struct {
-	aesCipher
-}
-
-func newCipher(key []byte) (cipher.Block, error) {
-	if !cpu.ARM64.HasAES {
-		return newCipherGeneric(key)
-	}
-	n := len(key) + 28
-	c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}}
-	arm64ExpandKey(key, c.enc, c.dec)
-	return &c, nil
-}
-
-func (c *aesCipherAsm) BlockSize() int { return BlockSize }
-
-func (c *aesCipherAsm) Encrypt(dst, src []byte) {
-	if len(src) < BlockSize {
-		panic("crypto/aes: input not full block")
-	}
-	if len(dst) < BlockSize {
-		panic("crypto/aes: output not full block")
-	}
-	if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
-		panic("crypto/aes: invalid buffer overlap")
-	}
-	encryptBlockAsm(len(c.enc)/4-1, &c.enc[0], &dst[0], &src[0])
-}
-
-func (c *aesCipherAsm) Decrypt(dst, src []byte) {
-	if len(src) < BlockSize {
-		panic("crypto/aes: input not full block")
-	}
-	if len(dst) < BlockSize {
-		panic("crypto/aes: output not full block")
-	}
-	if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
-		panic("crypto/aes: invalid buffer overlap")
-	}
-	decryptBlockAsm(len(c.dec)/4-1, &c.dec[0], &dst[0], &src[0])
-}
-
-func arm64ExpandKey(key []byte, enc, dec []uint32) {
-	expandKeyGo(key, enc, dec)
-	nk := len(enc)
-	for i := 0; i < nk; i++ {
-		enc[i] = bits.ReverseBytes32(enc[i])
-		dec[i] = bits.ReverseBytes32(dec[i])
-	}
-}
-
-// expandKey is used by BenchmarkExpand to ensure that the asm implementation
-// of key expansion is used for the benchmark when it is available.
-func expandKey(key []byte, enc, dec []uint32) {
-	if cpu.ARM64.HasAES {
-		arm64ExpandKey(key, enc, dec)
-	} else {
-		expandKeyGo(key, enc, dec)
-	}
-}
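The shim deleted here expanded keys with the generic Go code and then fixed up byte order, because the NEON AES instructions consume round keys as raw 16-byte blocks rather than big-endian uint32 words; expandKeyAsm now emits that layout directly (see the uint128 note in asm_arm64.s). In effect the deleted fix-up was (bits is math/bits; helper name illustrative):

    func fixupKeySchedule(enc, dec []uint32) {
        for i := range enc {
            enc[i] = bits.ReverseBytes32(enc[i])
            dec[i] = bits.ReverseBytes32(dec[i])
        }
    }
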
diff --git a/src/crypto/aes/cipher_amd64.go b/src/crypto/aes/cipher_asm.go
similarity index 87%
rename from src/crypto/aes/cipher_amd64.go
rename to src/crypto/aes/cipher_asm.go
index b12d9b46a2b7604ea4b126f6e91d40250bfc7eec..646bdfa5c0e6ce8867d90cd7fa3d28a660481836 100644
--- a/src/crypto/aes/cipher_amd64.go
+++ b/src/crypto/aes/cipher_asm.go
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build amd64 arm64
+
 package aes
 
 import (
@@ -10,23 +12,31 @@ import (
 	"internal/cpu"
 )
 
-// defined in asm_amd64.s
+// defined in asm_*.s
 
+//go:noescape
 func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+
+//go:noescape
 func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
+
+//go:noescape
 func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
 
 type aesCipherAsm struct {
 	aesCipher
 }
 
+var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
+var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
+
 func newCipher(key []byte) (cipher.Block, error) {
-	if !cpu.X86.HasAES {
+	if !supportsAES {
 		return newCipherGeneric(key)
 	}
 	n := len(key) + 28
 	c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}}
-	rounds := 10
+	var rounds int
 	switch len(key) {
 	case 128 / 8:
 		rounds = 10
@@ -37,10 +47,9 @@ func newCipher(key []byte) (cipher.Block, error) {
 	}
 
 	expandKeyAsm(rounds, &key[0], &c.enc[0], &c.dec[0])
-	if cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ {
+	if supportsAES && supportsGFMUL {
 		return &aesCipherGCM{c}, nil
 	}
-
 	return &c, nil
 }
 
@@ -75,7 +84,7 @@ func (c *aesCipherAsm) Decrypt(dst, src []byte) {
 // expandKey is used by BenchmarkExpand to ensure that the asm implementation
 // of key expansion is used for the benchmark when it is available.
 func expandKey(key []byte, enc, dec []uint32) {
-	if cpu.X86.HasAES {
+	if supportsAES {
 		rounds := 10 // rounds needed for AES128
 		switch len(key) {
 		case 192 / 8:
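The public API is untouched; callers pick up the asm-backed cipher, and on capable hardware the aesCipherGCM fast path, transparently. A minimal caller exercising the new code (key and nonce below are zero-valued placeholders):

    package main

    import (
        "crypto/aes"
        "crypto/cipher"
        "fmt"
    )

    func main() {
        key := make([]byte, 16)   // 16, 24 or 32 bytes all take the asm path
        nonce := make([]byte, 12) // standard GCM nonce size
        block, err := aes.NewCipher(key)
        if err != nil {
            panic(err)
        }
        aead, err := cipher.NewGCM(block)
        if err != nil {
            panic(err)
        }
        fmt.Printf("%x\n", aead.Seal(nil, nonce, []byte("hello"), nil))
    }
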
diff --git a/src/crypto/aes/gcm_amd64.s b/src/crypto/aes/gcm_amd64.s
index b651cc492500a9e65e0365ab63a67c0a1f95d145..e6eedf326400a59b5fb491767a77ce2696d08b3f 100644
--- a/src/crypto/aes/gcm_amd64.s
+++ b/src/crypto/aes/gcm_amd64.s
@@ -71,56 +71,6 @@ GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
 GLOBL andMask<>(SB), (NOPTR+RODATA), $240
 
-// func aesEncBlock(dst, src *[16]byte, ks []uint32)
-TEXT ·aesEncBlock(SB),NOSPLIT,$0
-	MOVQ dst+0(FP), DI
-	MOVQ src+8(FP), SI
-	MOVQ ks_base+16(FP), DX
-	MOVQ ks_len+24(FP), CX
-
-	SHRQ $2, CX
-	DECQ CX
-
-	MOVOU (SI), X0
-	MOVOU (16*0)(DX), X1
-	PXOR X1, X0
-	MOVOU (16*1)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*2)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*3)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*4)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*5)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*6)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*7)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*8)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*9)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*10)(DX), X1
-	CMPQ CX, $12
-	JB encLast
-	AESENC X1, X0
-	MOVOU (16*11)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*12)(DX), X1
-	JE encLast
-	AESENC X1, X0
-	MOVOU (16*13)(DX), X1
-	AESENC X1, X0
-	MOVOU (16*14)(DX), X1
-
-encLast:
-	AESENCLAST X1, X0
-	MOVOU X0, (DI)
-
-	RET
-
 // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
 TEXT ·gcmAesFinish(SB),NOSPLIT,$0
 #define pTbl DI
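The deleted aesEncBlock only ever encrypted a single block: in Seal and Open it turned the initial counter J0 into the tag mask, the E_K(J0) term in GCM's tag computation T = GHASH_H(A, C) xor E_K(J0). The shared encryptBlockAsm computes the same value on both amd64 and arm64, so the dedicated routine is redundant.
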
diff --git a/src/crypto/aes/gcm_arm64.s b/src/crypto/aes/gcm_arm64.s
new file mode 100644
index 0000000000000000000000000000000000000000..98e9f5bbe59dc518aa61c99d1e82cd4cd169a6db
--- /dev/null
+++ b/src/crypto/aes/gcm_arm64.s
@@ -0,0 +1,1021 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define B0 V0
+#define B1 V1
+#define B2 V2
+#define B3 V3
+#define B4 V4
+#define B5 V5
+#define B6 V6
+#define B7 V7
+
+#define ACC0 V8
+#define ACC1 V9
+#define ACCM V10
+
+#define T0 V11
+#define T1 V12
+#define T2 V13
+#define T3 V14
+
+#define POLY V15
+#define ZERO V16
+#define INC V17
+#define CTR V18
+
+#define K0 V19
+#define K1 V20
+#define K2 V21
+#define K3 V22
+#define K4 V23
+#define K5 V24
+#define K6 V25
+#define K7 V26
+#define K8 V27
+#define K9 V28
+#define K10 V29
+#define K11 V30
+#define KLAST V31
+
+#define reduce() \
+	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
+	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
+	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
+	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
+	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
+	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
+	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
+	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
+	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
+	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
+	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
+	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
+	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \
+
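+// For orientation: reduce() is the modular reduction that follows every
+// Karatsuba multiply in this file. ACC1 and ACC0 hold the products of the
+// low and high 64-bit halves respectively and ACCM the middle term, which
+// the first four instructions fold into the halves; the two multiplications
+// by POLY then reduce the 256-bit value modulo the GHASH polynomial
+// g(x) = x^128 + x^7 + x^2 + x + 1, whose bit-reflected form underlies the
+// 0xC2... / 1 constant each routine below loads into POLY.
+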
+// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
+TEXT ·gcmAesFinish(SB),NOSPLIT,$0
+#define pTbl R0
+#define tMsk R1
+#define tPtr R2
+#define plen R3
+#define dlen R4
+
+	MOVD	$0xC2, R1
+	LSL	$56, R1
+	MOVD	$1, R0
+	VMOV	R1, POLY.D[0]
+	VMOV	R0, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+
+	MOVD	productTable+0(FP), pTbl
+	MOVD	tagMask+8(FP), tMsk
+	MOVD	T+16(FP), tPtr
+	MOVD	pLen+24(FP), plen
+	MOVD	dLen+32(FP), dlen
+
+	VLD1	(tPtr), [ACC0.B16]
+	VLD1	(tMsk), [B1.B16]
+
+	LSL	$3, plen
+	LSL	$3, dlen
+
+	VMOV	dlen, B0.D[0]
+	VMOV	plen, B0.D[1]
+
+	ADD	$14*16, pTbl
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+
+	VEOR	ACC0.B16, B0.B16, B0.B16
+
+	VEXT	$8, B0.B16, B0.B16, T0.B16
+	VEOR	B0.B16, T0.B16, T0.B16
+	VPMULL	B0.D1, T1.D1, ACC1.Q1
+	VPMULL2	B0.D2, T1.D2, ACC0.Q1
+	VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+	reduce()
+
+	VREV64	ACC0.B16, ACC0.B16
+	VEOR	B1.B16, ACC0.B16, ACC0.B16
+
+	VST1	[ACC0.B16], (tPtr)
+	RET
+#undef pTbl
+#undef tMsk
+#undef tPtr
+#undef plen
+#undef dlen
+
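+// gcmAesFinish performs the closing GHASH step: the AAD and payload byte
+// counts are converted to bit counts (the two LSL $3 shifts), packed into
+// one final block, folded into the running hash and multiplied by H once
+// more, and the result is masked with the caller-supplied tag mask. In GCM
+// terms, S = (S' xor (len(A) || len(C))) * H with lengths in bits, then
+// T = S xor E_K(J0), where tagMask holds E_K(J0).
+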
+// func gcmAesInit(productTable *[256]byte, ks []uint32)
+TEXT ·gcmAesInit(SB),NOSPLIT,$0
+#define pTbl R0
+#define KS R1
+#define NR R2
+#define I R3
+	MOVD	productTable+0(FP), pTbl
+	MOVD	ks_base+8(FP), KS
+	MOVD	ks_len+16(FP), NR
+
+	MOVD	$0xC2, I
+	LSL	$56, I
+	VMOV	I, POLY.D[0]
+	MOVD	$1, I
+	VMOV	I, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+
+	// Encrypt block 0 with the AES key to generate the hash key H
+	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
+	VEOR	B0.B16, B0.B16, B0.B16
+	AESE	T0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T1.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T2.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T3.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
+	AESE	T0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T1.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T2.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T3.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	TBZ	$4, NR, initEncFinish
+	VLD1.P	32(KS), [T0.B16, T1.B16]
+	AESE	T0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T1.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	TBZ	$3, NR, initEncFinish
+	VLD1.P	32(KS), [T0.B16, T1.B16]
+	AESE	T0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T1.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+initEncFinish:
+	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
+	AESE	T0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	T1.B16, B0.B16
+	VEOR	T2.B16, B0.B16, B0.B16
+
+	VREV64	B0.B16, B0.B16
+
+	// Multiply by 2 modulo P
+	VMOV	B0.D[0], I
+	ASR	$63, I
+	VMOV	I, T1.D[0]
+	VMOV	I, T1.D[1]
+	VAND	POLY.B16, T1.B16, T1.B16
+	VUSHR	$63, B0.D2, T2.D2
+	VEXT	$8, ZERO.B16, T2.B16, T2.B16
+	VSHL	$1, B0.D2, B0.D2
+	VEOR	T1.B16, B0.B16, B0.B16
+	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available
+
+	// Karatsuba pre-computation
+	VEXT	$8, B0.B16, B0.B16, B1.B16
+	VEOR	B0.B16, B1.B16, B1.B16
+
+	ADD	$14*16, pTbl
+	VST1	[B0.B16, B1.B16], (pTbl)
+	SUB	$2*16, pTbl
+
+	VMOV	B0.B16, B2.B16
+	VMOV	B1.B16, B3.B16
+
+	MOVD	$7, I
+
+initLoop:
+	// Compute powers of H
+	SUBS	$1, I
+
+	VPMULL	B0.D1, B2.D1, T1.Q1
+	VPMULL2	B0.D2, B2.D2, T0.Q1
+	VPMULL	B1.D1, B3.D1, T2.Q1
+	VEOR	T0.B16, T2.B16, T2.B16
+	VEOR	T1.B16, T2.B16, T2.B16
+	VEXT	$8, ZERO.B16, T2.B16, T3.B16
+	VEXT	$8, T2.B16, ZERO.B16, T2.B16
+	VEOR	T2.B16, T0.B16, T0.B16
+	VEOR	T3.B16, T1.B16, T1.B16
+	VPMULL	POLY.D1, T0.D1, T2.Q1
+	VEXT	$8, T0.B16, T0.B16, T0.B16
+	VEOR	T2.B16, T0.B16, T0.B16
+	VPMULL	POLY.D1, T0.D1, T2.Q1
+	VEXT	$8, T0.B16, T0.B16, T0.B16
+	VEOR	T2.B16, T0.B16, T0.B16
+	VEOR	T1.B16, T0.B16, B2.B16
+	VMOV	B2.B16, B3.B16
+	VEXT	$8, B2.B16, B2.B16, B2.B16
+	VEOR	B2.B16, B3.B16, B3.B16
+
+	VST1	[B2.B16, B3.B16], (pTbl)
+	SUB	$2*16, pTbl
+
+	BNE	initLoop
+	RET
+#undef I
+#undef NR
+#undef KS
+#undef pTbl
+
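+// gcmAesInit encrypts the all-zero block to obtain the hash key H, doubles
+// it modulo the field polynomial (the "Multiply by 2 modulo P" step,
+// matching the bit ordering this file uses), then tabulates H^1 .. H^8,
+// each alongside the XOR of its two 64-bit halves. That stored half-sum is
+// the Karatsuba precomputation: for a = a1*x^64 + a0 and b = b1*x^64 + b0
+// over GF(2)[x],
+//   a*b = a1*b1*x^128 + ((a1 xor a0)*(b1 xor b0) xor a1*b1 xor a0*b0)*x^64 + a0*b0
+// so every 128-bit GHASH multiply costs just the three VPMULL results
+// (ACC0, ACC1, ACCM) plus one reduce().
+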
+// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
+TEXT ·gcmAesData(SB),NOSPLIT,$0
+#define pTbl R0
+#define aut R1
+#define tPtr R2
+#define autLen R3
+#define H0 R4
+#define pTblSave R5
+
+#define mulRound(X) \
+	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
+	VREV64	X.B16, X.B16               \
+	VEXT	$8, X.B16, X.B16, T0.B16   \
+	VEOR	X.B16, T0.B16, T0.B16      \
+	VPMULL	X.D1, T1.D1, T3.Q1         \
+	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
+	VPMULL2	X.D2, T1.D2, T3.Q1         \
+	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
+	VPMULL	T0.D1, T2.D1, T3.Q1        \
+	VEOR	T3.B16, ACCM.B16, ACCM.B16
+
+	MOVD	productTable+0(FP), pTbl
+	MOVD	data_base+8(FP), aut
+	MOVD	data_len+16(FP), autLen
+	MOVD	T+32(FP), tPtr
+
+	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
+	CBZ	autLen, dataBail
+
+	MOVD	$0xC2, H0
+	LSL	$56, H0
+	VMOV	H0, POLY.D[0]
+	MOVD	$1, H0
+	VMOV	H0, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+	MOVD	pTbl, pTblSave
+
+	CMP	$13, autLen
+	BEQ	dataTLS
+	CMP	$128, autLen
+	BLT	startSinglesLoop
+	B	octetsLoop
+
+dataTLS:
+	ADD	$14*16, pTbl
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+	VEOR	B0.B16, B0.B16, B0.B16
+
+	MOVD	(aut), H0
+	VMOV	H0, B0.D[0]
+	MOVW	8(aut), H0
+	VMOV	H0, B0.S[2]
+	MOVB	12(aut), H0
+	VMOV	H0, B0.B[12]
+
+	MOVD	$0, autLen
+	B	dataMul
+
+octetsLoop:
+		CMP	$128, autLen
+		BLT	startSinglesLoop
+		SUB	$128, autLen
+
+		VLD1.P	32(aut), [B0.B16, B1.B16]
+
+		VLD1.P	32(pTbl), [T1.B16, T2.B16]
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+		mulRound(B1)
+		VLD1.P  32(aut), [B2.B16, B3.B16]
+		mulRound(B2)
+		mulRound(B3)
+		VLD1.P  32(aut), [B4.B16, B5.B16]
+		mulRound(B4)
+		mulRound(B5)
+		VLD1.P  32(aut), [B6.B16, B7.B16]
+		mulRound(B6)
+		mulRound(B7)
+
+		MOVD	pTblSave, pTbl
+		reduce()
+	B	octetsLoop
+
+startSinglesLoop:
+
+	ADD	$14*16, pTbl
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+
+singlesLoop:
+
+		CMP	$16, autLen
+		BLT	dataEnd
+		SUB	$16, autLen
+
+		VLD1.P	16(aut), [B0.B16]
+dataMul:
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+		reduce()
+
+	B	singlesLoop
+
+dataEnd:
+
+	CBZ	autLen, dataBail
+	VEOR	B0.B16, B0.B16, B0.B16
+	ADD	autLen, aut
+
+dataLoadLoop:
+		MOVB.W	-1(aut), H0
+		VEXT	$15, B0.B16, ZERO.B16, B0.B16
+		VMOV	H0, B0.B[0]
+		SUBS	$1, autLen
+		BNE	dataLoadLoop
+	B	dataMul
+
+dataBail:
+	VST1	[ACC0.B16], (tPtr)
+	RET
+
+#undef pTbl
+#undef aut
+#undef tPtr
+#undef autLen
+#undef H0
+#undef pTblSave
+
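The CMP $13 shortcut in gcmAesData targets TLS 1.2 record processing, where the additional data is always exactly 13 bytes; that block is loaded with three scalar moves instead of the byte-at-a-time dataLoadLoop. For reference, the layout that path expects, sketched in Go (field values illustrative; binary is encoding/binary):

    // 13-byte TLS 1.2 AAD: seq(8) || record type(1) || version(2) || length(2)
    aad := make([]byte, 13)
    binary.BigEndian.PutUint64(aad[0:8], seqNum)
    aad[8] = recordType
    binary.BigEndian.PutUint16(aad[9:11], version)
    binary.BigEndian.PutUint16(aad[11:13], length)
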
+// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+TEXT ·gcmAesEnc(SB),NOSPLIT,$0
+#define pTbl R0
+#define dstPtr R1
+#define ctrPtr R2
+#define srcPtr R3
+#define ks R4
+#define tPtr R5
+#define srcPtrLen R6
+#define aluCTR R7
+#define aluTMP R8
+#define aluK R9
+#define NR R10
+#define H0 R11
+#define H1 R12
+#define curK R13
+#define pTblSave R14
+
+#define aesrndx8(K) \
+	AESE	K.B16, B0.B16    \
+	AESMC	B0.B16, B0.B16   \
+	AESE	K.B16, B1.B16    \
+	AESMC	B1.B16, B1.B16   \
+	AESE	K.B16, B2.B16    \
+	AESMC	B2.B16, B2.B16   \
+	AESE	K.B16, B3.B16    \
+	AESMC	B3.B16, B3.B16   \
+	AESE	K.B16, B4.B16    \
+	AESMC	B4.B16, B4.B16   \
+	AESE	K.B16, B5.B16    \
+	AESMC	B5.B16, B5.B16   \
+	AESE	K.B16, B6.B16    \
+	AESMC	B6.B16, B6.B16   \
+	AESE	K.B16, B7.B16    \
+	AESMC	B7.B16, B7.B16
+
+#define aesrndlastx8(K) \
+	AESE	K.B16, B0.B16    \
+	AESE	K.B16, B1.B16    \
+	AESE	K.B16, B2.B16    \
+	AESE	K.B16, B3.B16    \
+	AESE	K.B16, B4.B16    \
+	AESE	K.B16, B5.B16    \
+	AESE	K.B16, B6.B16    \
+	AESE	K.B16, B7.B16
+
+	MOVD	productTable+0(FP), pTbl
+	MOVD	dst+8(FP), dstPtr
+	MOVD	src_base+32(FP), srcPtr
+	MOVD	src_len+40(FP), srcPtrLen
+	MOVD	ctr+56(FP), ctrPtr
+	MOVD	T+64(FP), tPtr
+	MOVD	ks_base+72(FP), ks
+	MOVD	ks_len+80(FP), NR
+
+	MOVD	$0xC2, H1
+	LSL	$56, H1
+	MOVD	$1, H0
+	VMOV	H1, POLY.D[0]
+	VMOV	H0, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+	// NR holds len(ks) (44, 52 or 60 words), identifying the AES key size
+	MOVD	pTbl, pTblSave
+	// Current tag, after AAD
+	VLD1	(tPtr), [ACC0.B16]
+	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
+	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
+	// Prepare the initial counter and the increment vector
+	VLD1	(ctrPtr), [CTR.B16]
+	VEOR	INC.B16, INC.B16, INC.B16
+	MOVD	$1, H0
+	VMOV	H0, INC.S[3]
+	VREV32	CTR.B16, CTR.B16
+	VADD	CTR.S4, INC.S4, CTR.S4
+	// Skip to <8 blocks loop
+	CMP	$128, srcPtrLen
+
+	MOVD	ks, H0
+	// For AES-128 round keys are stored in: K0 .. K10, KLAST
+	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
+	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
+	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
+	VMOV	K10.B16, KLAST.B16
+
+	BLT	startSingles
+	// There are at least 8 blocks to encrypt
+	TBZ	$4, NR, octetsLoop
+
+	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
+	VMOV	K8.B16, K10.B16
+	VMOV	K9.B16, K11.B16
+	VMOV	KLAST.B16, K8.B16
+	VLD1.P	16(H0), [K9.B16]
+	VLD1.P  16(H0), [KLAST.B16]
+	TBZ	$3, NR, octetsLoop
+	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
+	VMOV	KLAST.B16, K8.B16
+	VLD1.P	16(H0), [K9.B16]
+	VLD1.P  16(H0), [KLAST.B16]
+	ADD	$10*16, ks, H0
+	MOVD	H0, curK
+
+octetsLoop:
+		SUB	$128, srcPtrLen
+
+		VMOV	CTR.B16, B0.B16
+		VADD	B0.S4, INC.S4, B1.S4
+		VREV32	B0.B16, B0.B16
+		VADD	B1.S4, INC.S4, B2.S4
+		VREV32	B1.B16, B1.B16
+		VADD	B2.S4, INC.S4, B3.S4
+		VREV32	B2.B16, B2.B16
+		VADD	B3.S4, INC.S4, B4.S4
+		VREV32	B3.B16, B3.B16
+		VADD	B4.S4, INC.S4, B5.S4
+		VREV32	B4.B16, B4.B16
+		VADD	B5.S4, INC.S4, B6.S4
+		VREV32	B5.B16, B5.B16
+		VADD	B6.S4, INC.S4, B7.S4
+		VREV32	B6.B16, B6.B16
+		VADD	B7.S4, INC.S4, CTR.S4
+		VREV32	B7.B16, B7.B16
+
+		aesrndx8(K0)
+		aesrndx8(K1)
+		aesrndx8(K2)
+		aesrndx8(K3)
+		aesrndx8(K4)
+		aesrndx8(K5)
+		aesrndx8(K6)
+		aesrndx8(K7)
+		TBZ	$4, NR, octetsFinish
+		aesrndx8(K10)
+		aesrndx8(K11)
+		TBZ	$3, NR, octetsFinish
+		VLD1.P	32(curK), [T1.B16, T2.B16]
+		aesrndx8(T1)
+		aesrndx8(T2)
+		MOVD	H0, curK
+octetsFinish:
+		aesrndx8(K8)
+		aesrndlastx8(K9)
+
+		VEOR	KLAST.B16, B0.B16, B0.B16
+		VEOR	KLAST.B16, B1.B16, B1.B16
+		VEOR	KLAST.B16, B2.B16, B2.B16
+		VEOR	KLAST.B16, B3.B16, B3.B16
+		VEOR	KLAST.B16, B4.B16, B4.B16
+		VEOR	KLAST.B16, B5.B16, B5.B16
+		VEOR	KLAST.B16, B6.B16, B6.B16
+		VEOR	KLAST.B16, B7.B16, B7.B16
+
+		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
+		VEOR	B0.B16, T1.B16, B0.B16
+		VEOR	B1.B16, T2.B16, B1.B16
+		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
+		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
+		VEOR	B2.B16, T1.B16, B2.B16
+		VEOR	B3.B16, T2.B16, B3.B16
+		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
+		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
+		VEOR	B4.B16, T1.B16, B4.B16
+		VEOR	B5.B16, T2.B16, B5.B16
+		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
+		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
+		VEOR	B6.B16, T1.B16, B6.B16
+		VEOR	B7.B16, T2.B16, B7.B16
+		VST1.P  [B6.B16, B7.B16], 32(dstPtr)
+
+		VLD1.P	32(pTbl), [T1.B16, T2.B16]
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+		mulRound(B1)
+		mulRound(B2)
+		mulRound(B3)
+		mulRound(B4)
+		mulRound(B5)
+		mulRound(B6)
+		mulRound(B7)
+		MOVD	pTblSave, pTbl
+		reduce()
+
+		CMP	$128, srcPtrLen
+		BGE	octetsLoop
+
+startSingles:
+	CBZ	srcPtrLen, done
+	ADD	$14*16, pTbl
+	// Preload H and its Karatsuba precomp
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+	// Preload AES round keys
+	ADD	$128, ks
+	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
+	VMOV	K10.B16, KLAST.B16
+	TBZ	$4, NR, singlesLoop
+	VLD1.P	32(ks), [B1.B16, B2.B16]
+	VMOV	B2.B16, KLAST.B16
+	TBZ	$3, NR, singlesLoop
+	VLD1.P	32(ks), [B3.B16, B4.B16]
+	VMOV	B4.B16, KLAST.B16
+
+singlesLoop:
+		CMP	$16, srcPtrLen
+		BLT	tail
+		SUB	$16, srcPtrLen
+
+		VLD1.P	16(srcPtr), [T0.B16]
+		VEOR	KLAST.B16, T0.B16, T0.B16
+
+		VREV32	CTR.B16, B0.B16
+		VADD	CTR.S4, INC.S4, CTR.S4
+
+		AESE	K0.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K1.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K2.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K3.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K4.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K5.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K6.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K7.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K8.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K9.B16, B0.B16
+		TBZ	$4, NR, singlesLast
+		AESMC	B0.B16, B0.B16
+		AESE	K10.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	B1.B16, B0.B16
+		TBZ	$3, NR, singlesLast
+		AESMC	B0.B16, B0.B16
+		AESE	B2.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	B3.B16, B0.B16
+singlesLast:
+		VEOR	T0.B16, B0.B16, B0.B16
+encReduce:
+		VST1.P	[B0.B16], 16(dstPtr)
+
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+
+		reduce()
+
+	B	singlesLoop
+tail:
+	CBZ	srcPtrLen, done
+
+	VEOR	T0.B16, T0.B16, T0.B16
+	VEOR	T3.B16, T3.B16, T3.B16
+	MOVD	$0, H1
+	SUB	$1, H1
+	ADD	srcPtrLen, srcPtr
+
+	TBZ	$3, srcPtrLen, ld4
+	MOVD.W	-8(srcPtr), H0
+	VMOV	H0, T0.D[0]
+	VMOV	H1, T3.D[0]
+ld4:
+	TBZ	$2, srcPtrLen, ld2
+	MOVW.W	-4(srcPtr), H0
+	VEXT	$12, T0.B16, ZERO.B16, T0.B16
+	VEXT	$12, T3.B16, ZERO.B16, T3.B16
+	VMOV	H0, T0.S[0]
+	VMOV	H1, T3.S[0]
+ld2:
+	TBZ	$1, srcPtrLen, ld1
+	MOVH.W	-2(srcPtr), H0
+	VEXT	$14, T0.B16, ZERO.B16, T0.B16
+	VEXT	$14, T3.B16, ZERO.B16, T3.B16
+	VMOV	H0, T0.H[0]
+	VMOV	H1, T3.H[0]
+ld1:
+	TBZ	$0, srcPtrLen, ld0
+	MOVB.W	-1(srcPtr), H0
+	VEXT	$15, T0.B16, ZERO.B16, T0.B16
+	VEXT	$15, T3.B16, ZERO.B16, T3.B16
+	VMOV	H0, T0.B[0]
+	VMOV	H1, T3.B[0]
+ld0:
+
+	MOVD	ZR, srcPtrLen
+	VEOR	KLAST.B16, T0.B16, T0.B16
+	VREV32	CTR.B16, B0.B16
+
+	AESE	K0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K1.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K2.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K3.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K4.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K5.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K6.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K7.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K8.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K9.B16, B0.B16
+	TBZ	$4, NR, tailLast
+	AESMC	B0.B16, B0.B16
+	AESE	K10.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	B1.B16, B0.B16
+	TBZ	$3, NR, tailLast
+	AESMC	B0.B16, B0.B16
+	AESE	B2.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	B3.B16, B0.B16
+
+tailLast:
+	VEOR	T0.B16, B0.B16, B0.B16
+	VAND	T3.B16, B0.B16, B0.B16
+	B	encReduce
+
+done:
+	VST1	[ACC0.B16], (tPtr)
+	RET
+
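A note on the counter handling in gcmAesEnc: the counter block is kept byte-reversed within each 32-bit lane (the VREV32s) so that GCM's inc32, which treats the last four bytes of the block as a big-endian integer, becomes a plain vector add against INC, eight counters per loop iteration. The scalar equivalent would be (binary is encoding/binary):

    // inc32 bumps the low 32 bits of the counter block, big-endian,
    // leaving the 96-bit nonce prefix untouched.
    func inc32(ctr *[16]byte) {
        n := binary.BigEndian.Uint32(ctr[12:])
        binary.BigEndian.PutUint32(ctr[12:], n+1)
    }
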
+// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
+TEXT ·gcmAesDec(SB),NOSPLIT,$0
+	MOVD	productTable+0(FP), pTbl
+	MOVD	dst+8(FP), dstPtr
+	MOVD	src_base+32(FP), srcPtr
+	MOVD	src_len+40(FP), srcPtrLen
+	MOVD	ctr+56(FP), ctrPtr
+	MOVD	T+64(FP), tPtr
+	MOVD	ks_base+72(FP), ks
+	MOVD	ks_len+80(FP), NR
+
+	MOVD	$0xC2, H1
+	LSL	$56, H1
+	MOVD	$1, H0
+	VMOV	H1, POLY.D[0]
+	VMOV	H0, POLY.D[1]
+	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
+	// NR holds len(ks) (44, 52 or 60 words), identifying the AES key size
+	MOVD	pTbl, pTblSave
+	// Current tag, after AAD
+	VLD1	(tPtr), [ACC0.B16]
+	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
+	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
+	// Prepare the initial counter and the increment vector
+	VLD1	(ctrPtr), [CTR.B16]
+	VEOR	INC.B16, INC.B16, INC.B16
+	MOVD	$1, H0
+	VMOV	H0, INC.S[3]
+	VREV32	CTR.B16, CTR.B16
+	VADD	CTR.S4, INC.S4, CTR.S4
+
+	MOVD	ks, H0
+	// For AES-128 round keys are stored in: K0 .. K10, KLAST
+	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
+	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
+	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
+	VMOV	K10.B16, KLAST.B16
+
+	// Skip to <8 blocks loop
+	CMP	$128, srcPtrLen
+	BLT	startSingles
+	// There are at least 8 blocks to decrypt
+	TBZ	$4, NR, octetsLoop
+
+	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
+	VMOV	K8.B16, K10.B16
+	VMOV	K9.B16, K11.B16
+	VMOV	KLAST.B16, K8.B16
+	VLD1.P	16(H0), [K9.B16]
+	VLD1.P  16(H0), [KLAST.B16]
+	TBZ	$3, NR, octetsLoop
+	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
+	VMOV	KLAST.B16, K8.B16
+	VLD1.P	16(H0), [K9.B16]
+	VLD1.P  16(H0), [KLAST.B16]
+	ADD	$10*16, ks, H0
+	MOVD	H0, curK
+
+octetsLoop:
+		SUB	$128, srcPtrLen
+
+		VMOV	CTR.B16, B0.B16
+		VADD	B0.S4, INC.S4, B1.S4
+		VREV32	B0.B16, B0.B16
+		VADD	B1.S4, INC.S4, B2.S4
+		VREV32	B1.B16, B1.B16
+		VADD	B2.S4, INC.S4, B3.S4
+		VREV32	B2.B16, B2.B16
+		VADD	B3.S4, INC.S4, B4.S4
+		VREV32	B3.B16, B3.B16
+		VADD	B4.S4, INC.S4, B5.S4
+		VREV32	B4.B16, B4.B16
+		VADD	B5.S4, INC.S4, B6.S4
+		VREV32	B5.B16, B5.B16
+		VADD	B6.S4, INC.S4, B7.S4
+		VREV32	B6.B16, B6.B16
+		VADD	B7.S4, INC.S4, CTR.S4
+		VREV32	B7.B16, B7.B16
+
+		aesrndx8(K0)
+		aesrndx8(K1)
+		aesrndx8(K2)
+		aesrndx8(K3)
+		aesrndx8(K4)
+		aesrndx8(K5)
+		aesrndx8(K6)
+		aesrndx8(K7)
+		TBZ	$4, NR, octetsFinish
+		aesrndx8(K10)
+		aesrndx8(K11)
+		TBZ	$3, NR, octetsFinish
+		VLD1.P	32(curK), [T1.B16, T2.B16]
+		aesrndx8(T1)
+		aesrndx8(T2)
+		MOVD	H0, curK
+octetsFinish:
+		aesrndx8(K8)
+		aesrndlastx8(K9)
+
+		VEOR	KLAST.B16, B0.B16, T1.B16
+		VEOR	KLAST.B16, B1.B16, T2.B16
+		VEOR	KLAST.B16, B2.B16, B2.B16
+		VEOR	KLAST.B16, B3.B16, B3.B16
+		VEOR	KLAST.B16, B4.B16, B4.B16
+		VEOR	KLAST.B16, B5.B16, B5.B16
+		VEOR	KLAST.B16, B6.B16, B6.B16
+		VEOR	KLAST.B16, B7.B16, B7.B16
+
+		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
+		VEOR	B0.B16, T1.B16, T1.B16
+		VEOR	B1.B16, T2.B16, T2.B16
+		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
+
+		VLD1.P	32(pTbl), [T1.B16, T2.B16]
+		VREV64	B0.B16, B0.B16
+		VEOR	ACC0.B16, B0.B16, B0.B16
+		VEXT	$8, B0.B16, B0.B16, T0.B16
+		VEOR	B0.B16, T0.B16, T0.B16
+		VPMULL	B0.D1, T1.D1, ACC1.Q1
+		VPMULL2	B0.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+		mulRound(B1)
+
+		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
+		VEOR	B2.B16, B0.B16, T1.B16
+		VEOR	B3.B16, B1.B16, T2.B16
+		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
+		mulRound(B0)
+		mulRound(B1)
+
+		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
+		VEOR	B4.B16, B0.B16, T1.B16
+		VEOR	B5.B16, B1.B16, T2.B16
+		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
+		mulRound(B0)
+		mulRound(B1)
+
+		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
+		VEOR	B6.B16, B0.B16, T1.B16
+		VEOR	B7.B16, B1.B16, T2.B16
+		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
+		mulRound(B0)
+		mulRound(B1)
+
+		MOVD	pTblSave, pTbl
+		reduce()
+
+		CMP	$128, srcPtrLen
+		BGE	octetsLoop
+
+startSingles:
+	CBZ	srcPtrLen, done
+	ADD	$14*16, pTbl
+	// Preload H and its Karatsuba precomp
+	VLD1.P	(pTbl), [T1.B16, T2.B16]
+	// Preload AES round keys
+	ADD	$128, ks
+	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
+	VMOV	K10.B16, KLAST.B16
+	TBZ	$4, NR, singlesLoop
+	VLD1.P	32(ks), [B1.B16, B2.B16]
+	VMOV	B2.B16, KLAST.B16
+	TBZ	$3, NR, singlesLoop
+	VLD1.P	32(ks), [B3.B16, B4.B16]
+	VMOV	B4.B16, KLAST.B16
+
+singlesLoop:
+		CMP	$16, srcPtrLen
+		BLT	tail
+		SUB	$16, srcPtrLen
+
+		VLD1.P	16(srcPtr), [T0.B16]
+		VREV64	T0.B16, B5.B16
+		VEOR	KLAST.B16, T0.B16, T0.B16
+
+		VREV32	CTR.B16, B0.B16
+		VADD	CTR.S4, INC.S4, CTR.S4
+
+		AESE	K0.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K1.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K2.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K3.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K4.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K5.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K6.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K7.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K8.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	K9.B16, B0.B16
+		TBZ	$4, NR, singlesLast
+		AESMC	B0.B16, B0.B16
+		AESE	K10.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	B1.B16, B0.B16
+		TBZ	$3, NR, singlesLast
+		AESMC	B0.B16, B0.B16
+		AESE	B2.B16, B0.B16
+		AESMC	B0.B16, B0.B16
+		AESE	B3.B16, B0.B16
+singlesLast:
+		VEOR	T0.B16, B0.B16, B0.B16
+
+		VST1.P	[B0.B16], 16(dstPtr)
+
+		VEOR	ACC0.B16, B5.B16, B5.B16
+		VEXT	$8, B5.B16, B5.B16, T0.B16
+		VEOR	B5.B16, T0.B16, T0.B16
+		VPMULL	B5.D1, T1.D1, ACC1.Q1
+		VPMULL2	B5.D2, T1.D2, ACC0.Q1
+		VPMULL	T0.D1, T2.D1, ACCM.Q1
+		reduce()
+
+	B	singlesLoop
+tail:
+	CBZ	srcPtrLen, done
+
+	VREV32	CTR.B16, B0.B16
+	VADD	CTR.S4, INC.S4, CTR.S4
+
+	AESE	K0.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K1.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K2.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K3.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K4.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K5.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K6.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K7.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K8.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	K9.B16, B0.B16
+	TBZ	$4, NR, tailLast
+	AESMC	B0.B16, B0.B16
+	AESE	K10.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	B1.B16, B0.B16
+	TBZ	$3, NR, tailLast
+	AESMC	B0.B16, B0.B16
+	AESE	B2.B16, B0.B16
+	AESMC	B0.B16, B0.B16
+	AESE	B3.B16, B0.B16
+tailLast:
+	VEOR	KLAST.B16, B0.B16, B0.B16
+
+	// Assuming it is safe to load past the end of src due to the presence of the tag
+	VLD1	(srcPtr), [B5.B16]
+
+	VEOR	B5.B16, B0.B16, B0.B16
+
+	VEOR	T3.B16, T3.B16, T3.B16
+	MOVD	$0, H1
+	SUB	$1, H1
+
+	TBZ	$3, srcPtrLen, ld4
+	VMOV	B0.D[0], H0
+	MOVD.P	H0, 8(dstPtr)
+	VMOV	H1, T3.D[0]
+	VEXT	$8, ZERO.B16, B0.B16, B0.B16
+ld4:
+	TBZ	$2, srcPtrLen, ld2
+	VMOV	B0.S[0], H0
+	MOVW.P	H0, 4(dstPtr)
+	VEXT	$12, T3.B16, ZERO.B16, T3.B16
+	VMOV	H1, T3.S[0]
+	VEXT	$4, ZERO.B16, B0.B16, B0.B16
+ld2:
+	TBZ	$1, srcPtrLen, ld1
+	VMOV	B0.H[0], H0
+	MOVH.P	H0, 2(dstPtr)
+	VEXT	$14, T3.B16, ZERO.B16, T3.B16
+	VMOV	H1, T3.H[0]
+	VEXT	$2, ZERO.B16, B0.B16, B0.B16
+ld1:
+	TBZ	$0, srcPtrLen, ld0
+	VMOV	B0.B[0], H0
+	MOVB.P	H0, 1(dstPtr)
+	VEXT	$15, T3.B16, ZERO.B16, T3.B16
+	VMOV	H1, T3.B[0]
+ld0:
+
+	VAND	T3.B16, B5.B16, B5.B16
+	VREV64	B5.B16, B5.B16
+
+	VEOR	ACC0.B16, B5.B16, B5.B16
+	VEXT	$8, B5.B16, B5.B16, T0.B16
+	VEOR	B5.B16, T0.B16, T0.B16
+	VPMULL	B5.D1, T1.D1, ACC1.Q1
+	VPMULL2	B5.D2, T1.D2, ACC0.Q1
+	VPMULL	T0.D1, T2.D1, ACCM.Q1
+	reduce()
+done:
+	VST1	[ACC0.B16], (tPtr)
+
+	RET
diff --git a/src/crypto/cipher/gcm_test.go b/src/crypto/cipher/gcm_test.go
index c48001db281e6d6db20388571ad92e0986696685..64d5cc0db4fd9ff5d88054038a2cf610403f015b 100644
--- a/src/crypto/cipher/gcm_test.go
+++ b/src/crypto/cipher/gcm_test.go
@@ -424,7 +424,7 @@ func TestGCMAsm(t *testing.T) {
 
 	// generate permutations
 	type pair struct{ align, length int }
-	lengths := []int{0, 8192, 8193, 8208}
+	lengths := []int{0, 156, 8192, 8193, 8208}
 	keySizes := []int{16, 24, 32}
 	alignments := []int{0, 1, 2, 3}
 	if testing.Short() {
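The added 156-byte case looks aimed at the new arm64 code: 156 = 9*16 + 12, so its 12-byte tail drives the assembly through both the 8-byte and 4-byte partial loads, paths the existing lengths miss (8193 leaves a 1-byte tail; 0, 8192 and 8208 are block-aligned).
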
diff --git a/src/crypto/tls/common.go b/src/crypto/tls/common.go
index 7c8f0de6e82faa8a51ca6dc79bd8011ed5b74d3c..729bce6d50c66e89be2c861441310910867e30dd 100644
--- a/src/crypto/tls/common.go
+++ b/src/crypto/tls/common.go
@@ -925,12 +925,7 @@ func initDefaultCipherSuites() {
 	// Worst case, these variables will just all be false
 	hasGCMAsmAMD64 := cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ
 
-	// TODO: enable the arm64 HasAES && HasPMULL feature check after the
-	// optimized AES-GCM implementation for arm64 is merged (CL 107298).
-	// This is explicitly set to false for now to prevent misprioritization
-	// of AES-GCM based cipher suites, which will be slower than chacha20-poly1305
-	hasGCMAsmARM64 := false
-	// hasGCMAsmARM64 := cpu.ARM64.HasAES && cpu.ARM64.HasPMULL
+	hasGCMAsmARM64 := cpu.ARM64.HasAES && cpu.ARM64.HasPMULL
 
 	// Keep in sync with crypto/aes/cipher_s390x.go.
 	hasGCMAsmS390X := cpu.S390X.HasAES && cpu.S390X.HasAESCBC && cpu.S390X.HasAESCTR && (cpu.S390X.HasGHASH || cpu.S390X.HasAESGCM)