From 25cbd534df4ff829dbcdb063330ce689bdbc48a9 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Thu, 21 Mar 2013 11:26:00 -0400
Subject: [PATCH] crypto/md5: faster amd64, 386 implementations

-- amd64 --

On a MacBookPro10,2 (Core i5):

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   471          524  +11.25%
BenchmarkHash1K                      3018         2220  -26.44%
BenchmarkHash8K                     20634        14604  -29.22%
BenchmarkHash8BytesUnaligned          468          523  +11.75%
BenchmarkHash1KUnaligned             3006         2212  -26.41%
BenchmarkHash8KUnaligned            20820        14652  -29.63%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 16.98        15.26    0.90x
BenchmarkHash1K                    339.26       461.19    1.36x
BenchmarkHash8K                    397.00       560.92    1.41x
BenchmarkHash8BytesUnaligned        17.08        15.27    0.89x
BenchmarkHash1KUnaligned           340.65       462.75    1.36x
BenchmarkHash8KUnaligned           393.45       559.08    1.42x

For comparison, on the same machine, openssl 0.9.8r reports
its md5 speed as 350 MB/s for 1K and 410 MB/s for 8K.

On an Intel Xeon E5520:

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   565          607   +7.43%
BenchmarkHash1K                      3753         2475  -34.05%
BenchmarkHash8K                     25945        16250  -37.37%
BenchmarkHash8BytesUnaligned          559          594   +6.26%
BenchmarkHash1KUnaligned             3754         2474  -34.10%
BenchmarkHash8KUnaligned            26011        16359  -37.11%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 14.15        13.17    0.93x
BenchmarkHash1K                    272.83       413.58    1.52x
BenchmarkHash8K                    315.74       504.11    1.60x
BenchmarkHash8BytesUnaligned        14.31        13.46    0.94x
BenchmarkHash1KUnaligned           272.73       413.78    1.52x
BenchmarkHash8KUnaligned           314.93       500.73    1.59x

For comparison, on the same machine, openssl 1.0.1 reports
its md5 speed as 443 MB/s for 1K and 513 MB/s for 8K.

-- 386 --

On a MacBookPro10,2 (Core i5):

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   602          670  +11.30%
BenchmarkHash1K                      4038         2549  -36.87%
BenchmarkHash8K                     27879        16690  -40.13%
BenchmarkHash8BytesUnaligned          602          670  +11.30%
BenchmarkHash1KUnaligned             4025         2546  -36.75%
BenchmarkHash8KUnaligned            27844        16692  -40.05%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 13.28        11.93    0.90x
BenchmarkHash1K                    253.58       401.69    1.58x
BenchmarkHash8K                    293.83       490.81    1.67x
BenchmarkHash8BytesUnaligned        13.27        11.94    0.90x
BenchmarkHash1KUnaligned           254.40       402.05    1.58x
BenchmarkHash8KUnaligned           294.21       490.77    1.67x

On an Intel Xeon E5520:

benchmark                       old ns/op    new ns/op    delta
BenchmarkHash8Bytes                   752          716   -4.79%
BenchmarkHash1K                      5307         2799  -47.26%
BenchmarkHash8K                     36993        18042  -51.23%
BenchmarkHash8BytesUnaligned          748          730   -2.41%
BenchmarkHash1KUnaligned             5301         2795  -47.27%
BenchmarkHash8KUnaligned            36983        18085  -51.10%

benchmark                        old MB/s     new MB/s  speedup
BenchmarkHash8Bytes                 10.64        11.16    1.05x
BenchmarkHash1K                    192.93       365.80    1.90x
BenchmarkHash8K                    221.44       454.03    2.05x
BenchmarkHash8BytesUnaligned        10.69        10.95    1.02x
BenchmarkHash1KUnaligned           193.15       366.36    1.90x
BenchmarkHash8KUnaligned           221.51       452.96    2.04x

R=agl
CC=golang-dev
https://golang.org/cl/7621049
---
 src/pkg/crypto/md5/gen.go           |  19 ++-
 src/pkg/crypto/md5/md5block.go      |   5 +
 src/pkg/crypto/md5/md5block_386.s   | 180 ++++++++++++++++++++++++++++
 src/pkg/crypto/md5/md5block_amd64.s | 177 +++++++++++++++++++++++++++
 src/pkg/crypto/md5/md5block_decl.go |   9 ++
 5 files changed, 388 insertions(+), 2 deletions(-)
 create mode 100644 src/pkg/crypto/md5/md5block_386.s
 create mode 100644 src/pkg/crypto/md5/md5block_amd64.s
 create mode 100644 src/pkg/crypto/md5/md5block_decl.go

diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go
index 966bdae267b..275b4aeea39 100644
--- a/src/pkg/crypto/md5/gen.go
+++ b/src/pkg/crypto/md5/gen.go
@@ -161,6 +161,11 @@ var data = Data{
 }
 
 var program = `
+// DO NOT EDIT.
+// Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go
+
+// +build !amd64
+
 package md5
 
 import (
@@ -186,6 +191,16 @@ import (
 	}
 {{end}}
 
+const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386"
+
+var littleEndian bool
+
+func init() {
+	x := uint32(0x04030201)
+	y := [4]byte{0x1, 0x2, 0x3, 0x4}
+	littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y
+}
+
 func block(dig *digest, p []byte) {
 	a := dig.s[0]
 	b := dig.s[1]
@@ -197,13 +212,13 @@ func block(dig *digest, p []byte) {
 		aa, bb, cc, dd := a, b, c, d
 
 		// This is a constant condition - it is not evaluated on each iteration.
-		if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" {
+		if x86 {
 			// MD5 was designed so that x86 processors can just iterate
 			// over the block data directly as uint32s, and we generate
 			// less code and run 1.3x faster if we take advantage of that.
 			// My apologies.
 			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
-		} else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
+		} else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 {
 			X = (*[16]uint32)(unsafe.Pointer(&p[0]))
 		} else {
 			X = &xbuf
diff --git a/src/pkg/crypto/md5/md5block.go b/src/pkg/crypto/md5/md5block.go
index 0ca42177403..a376fbee99b 100644
--- a/src/pkg/crypto/md5/md5block.go
+++ b/src/pkg/crypto/md5/md5block.go
@@ -1,3 +1,8 @@
+// DO NOT EDIT.
+// Generate with: go run gen.go -full | gofmt >md5block.go
+
+// +build !amd64,!386
+
 package md5
 
 import (
diff --git a/src/pkg/crypto/md5/md5block_386.s b/src/pkg/crypto/md5/md5block_386.s
new file mode 100644
index 00000000000..1083d83f359
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_386.s
@@ -0,0 +1,180 @@
+// Original source:
+//	http://www.zorinaq.com/papers/md5-amd64.html
+//	http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 8a assembly, and adjusted for 386,
+// by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+	XORL	c, BP; \
+	LEAL	const(a)(DI*1), a; \
+	ANDL	b, BP; \
+	XORL d, BP; \
+	MOVL (index*4)(SI), DI; \
+	ADDL BP, a; \
+	ROLL $shift, a; \
+	MOVL c, BP; \
+	ADDL b, a
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(DI*1),a; \
+	MOVL	d,		DI; \
+	ANDL	b,		DI; \
+	MOVL	d,		BP; \
+	NOTL	BP; \
+	ANDL	c,		BP; \
+	ORL	DI,		BP; \
+	MOVL	(index*4)(SI),DI; \
+	ADDL	BP,		a; \
+	ROLL	$shift,	a; \
+	ADDL	b,		a
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(DI*1),a; \
+	MOVL	(index*4)(SI),DI; \
+	XORL	d,		BP; \
+	XORL	b,		BP; \
+	ADDL	BP,		a; \
+	ROLL	$shift,		a; \
+	MOVL	b,		BP; \
+	ADDL	b,		a
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(DI*1),a; \
+	ORL	b,		BP; \
+	XORL	c,		BP; \
+	ADDL	BP,		a; \
+	MOVL	(index*4)(SI),DI; \
+	MOVL	$0xffffffff,	BP; \
+	ROLL	$shift,		a; \
+	XORL	c,		BP; \
+	ADDL	b,		a
+
+TEXT	·block(SB),7,$24-16
+	MOVL	dig+0(FP),	BP
+	MOVL	p+4(FP),	SI
+	MOVL	n+8(FP), DX
+	SHRL	$6,		DX
+	SHLL	$6,		DX
+
+	LEAL	(SI)(DX*1),	DI
+	MOVL	(0*4)(BP),	AX
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+
+	CMPL	SI,		DI
+	JEQ	end
+
+	MOVL	DI,		16(SP)
+
+loop:
+	MOVL	AX,		0(SP)
+	MOVL	BX,		4(SP)
+	MOVL	CX,		8(SP)
+	MOVL	DX,		12(SP)
+
+	MOVL	(0*4)(SI),	DI
+	MOVL	DX,		BP
+
+	ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+	ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+	ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+	ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+	ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+	ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+	ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+	ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+	ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+	ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+	ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+	ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+	ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+	ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+	ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+	ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+	MOVL	(1*4)(SI),	DI
+	MOVL	DX,		BP
+
+	ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+	ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+	ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+	ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+	ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+	ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+	ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+	ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+	ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+	ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+	ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+	ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+	ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+	ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+	ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+	ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+ 
+	MOVL	(5*4)(SI),	DI
+	MOVL	CX,		BP
+
+	ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+	ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+	ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+	ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+	ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+	ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+	ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+	ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+	ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+	ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+	ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+	ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+	ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+	ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+	ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+	ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+	MOVL	(0*4)(SI),	DI
+	MOVL	$0xffffffff,	BP
+	XORL	DX,		BP
+
+	ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+	ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+	ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+	ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+	ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+	ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+	ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+	ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+	ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+	ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+	ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+	ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+	ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+	ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+	ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+	ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+	ADDL	0(SP),	AX
+	ADDL	4(SP),	BX
+	ADDL	8(SP),	CX
+	ADDL	12(SP),	DX
+
+	ADDL	$64,		SI
+	CMPL	SI,		16(SP)
+	JB	loop
+
+end:
+	MOVL	dig+0(FP),	BP
+	MOVL	AX,		(0*4)(BP)
+	MOVL	BX,		(1*4)(BP)
+	MOVL	CX,		(2*4)(BP)
+	MOVL	DX,		(3*4)(BP)
+	RET
diff --git a/src/pkg/crypto/md5/md5block_amd64.s b/src/pkg/crypto/md5/md5block_amd64.s
new file mode 100644
index 00000000000..74a361e7758
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_amd64.s
@@ -0,0 +1,177 @@
+// Original source:
+//	http://www.zorinaq.com/papers/md5-amd64.html
+//	http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 6a assembly by the Go Authors.
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+TEXT	·block(SB),7,$0-32
+	MOVQ	dig+0(FP),	BP
+	MOVQ	p+8(FP),	SI
+	MOVQ	n+16(FP), DX
+	SHRQ	$6,		DX
+	SHLQ	$6,		DX
+
+	LEAQ	(SI)(DX*1),	DI
+	MOVL	(0*4)(BP),	AX
+	MOVL	(1*4)(BP),	BX
+	MOVL	(2*4)(BP),	CX
+	MOVL	(3*4)(BP),	DX
+
+	CMPQ	SI,		DI
+	JEQ	end
+
+loop:
+	MOVL	AX,		R12
+	MOVL	BX,		R13
+	MOVL	CX,		R14
+	MOVL	DX,		R15
+
+	MOVL	(0*4)(SI),	R8
+	MOVL	DX,		R9
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+	XORL	c, R9; \
+	LEAL	const(a)(R8*1), a; \
+	ANDL	b, R9; \
+	XORL d, R9; \
+	MOVL (index*4)(SI), R8; \
+	ADDL R9, a; \
+	ROLL $shift, a; \
+	MOVL c, R9; \
+	ADDL b, a
+
+	ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+	ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+	ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+	ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+	ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+	ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+	ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+	ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+	ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+	ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+	ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+	ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+	ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+	ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+	ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+	ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+	MOVL	(1*4)(SI),	R8
+	MOVL	DX,		R9
+	MOVL	DX,		R10
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+	NOTL	R9; \
+	LEAL	const(a)(R8*1),a; \
+	ANDL	b,		R10; \
+	ANDL	c,		R9; \
+	MOVL	(index*4)(SI),R8; \
+	ORL	R9,		R10; \
+	MOVL	c,		R9; \
+	ADDL	R10,		a; \
+	MOVL	c,		R10; \
+	ROLL	$shift,	a; \
+	ADDL	b,		a
+
+	ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+	ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+	ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+	ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+	ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+	ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+	ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+	ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+	ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+	ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+	ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+	ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+	ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+	ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+	ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+	ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+ 
+	MOVL	(5*4)(SI),	R8
+	MOVL	CX,		R9
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(R8*1),a; \
+	MOVL	(index*4)(SI),R8; \
+	XORL	d,		R9; \
+	XORL	b,		R9; \
+	ADDL	R9,		a; \
+	ROLL	$shift,		a; \
+	MOVL	b,		R9; \
+	ADDL	b,		a
+
+	ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+	ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+	ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+	ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+	ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+	ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+	ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+	ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+	ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+	ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+	ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+	ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+	ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+	ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+	ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+	ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+	MOVL	(0*4)(SI),	R8
+	MOVL	$0xffffffff,	R9
+	XORL	DX,		R9
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+	LEAL	const(a)(R8*1),a; \
+	ORL	b,		R9; \
+	XORL	c,		R9; \
+	ADDL	R9,		a; \
+	MOVL	(index*4)(SI),R8; \
+	MOVL	$0xffffffff,	R9; \
+	ROLL	$shift,		a; \
+	XORL	c,		R9; \
+	ADDL	b,		a
+	
+	ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+	ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+	ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+	ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+	ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+	ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+	ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+	ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+	ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+	ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+	ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+	ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+	ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+	ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+	ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+	ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+	ADDL	R12,	AX
+	ADDL	R13,	BX
+	ADDL	R14,	CX
+	ADDL	R15,	DX
+
+	ADDQ	$64,		SI
+	CMPQ	SI,		DI
+	JB	loop
+
+end:
+	MOVL	AX,		(0*4)(BP)
+	MOVL	BX,		(1*4)(BP)
+	MOVL	CX,		(2*4)(BP)
+	MOVL	DX,		(3*4)(BP)
+	RET
diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go
new file mode 100644
index 00000000000..14190c6ff29
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_decl.go
@@ -0,0 +1,9 @@
+// Copyright 2013 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 386
+
+package md5
+
+func block(dig *digest, p []byte)
-- 
GitLab