From 25cbd534df4ff829dbcdb063330ce689bdbc48a9 Mon Sep 17 00:00:00 2001 From: Russ Cox <rsc@golang.org> Date: Thu, 21 Mar 2013 11:26:00 -0400 Subject: [PATCH] crypto/md5: faster amd64, 386 implementations -- amd64 -- On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 471 524 +11.25% BenchmarkHash1K 3018 2220 -26.44% BenchmarkHash8K 20634 14604 -29.22% BenchmarkHash8BytesUnaligned 468 523 +11.75% BenchmarkHash1KUnaligned 3006 2212 -26.41% BenchmarkHash8KUnaligned 20820 14652 -29.63% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 16.98 15.26 0.90x BenchmarkHash1K 339.26 461.19 1.36x BenchmarkHash8K 397.00 560.92 1.41x BenchmarkHash8BytesUnaligned 17.08 15.27 0.89x BenchmarkHash1KUnaligned 340.65 462.75 1.36x BenchmarkHash8KUnaligned 393.45 559.08 1.42x For comparison, on the same machine, openssl 0.9.8r reports its md5 speed as 350 MB/s for 1K and 410 MB/s for 8K. On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 565 607 +7.43% BenchmarkHash1K 3753 2475 -34.05% BenchmarkHash8K 25945 16250 -37.37% BenchmarkHash8BytesUnaligned 559 594 +6.26% BenchmarkHash1KUnaligned 3754 2474 -34.10% BenchmarkHash8KUnaligned 26011 16359 -37.11% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 14.15 13.17 0.93x BenchmarkHash1K 272.83 413.58 1.52x BenchmarkHash8K 315.74 504.11 1.60x BenchmarkHash8BytesUnaligned 14.31 13.46 0.94x BenchmarkHash1KUnaligned 272.73 413.78 1.52x BenchmarkHash8KUnaligned 314.93 500.73 1.59x For comparison, on the same machine, openssl 1.0.1 reports its md5 speed as 443 MB/s for 1K and 513 MB/s for 8K. -- 386 -- On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 602 670 +11.30% BenchmarkHash1K 4038 2549 -36.87% BenchmarkHash8K 27879 16690 -40.13% BenchmarkHash8BytesUnaligned 602 670 +11.30% BenchmarkHash1KUnaligned 4025 2546 -36.75% BenchmarkHash8KUnaligned 27844 16692 -40.05% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 13.28 11.93 0.90x BenchmarkHash1K 253.58 401.69 1.58x BenchmarkHash8K 293.83 490.81 1.67x BenchmarkHash8BytesUnaligned 13.27 11.94 0.90x BenchmarkHash1KUnaligned 254.40 402.05 1.58x BenchmarkHash8KUnaligned 294.21 490.77 1.67x On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 752 716 -4.79% BenchmarkHash1K 5307 2799 -47.26% BenchmarkHash8K 36993 18042 -51.23% BenchmarkHash8BytesUnaligned 748 730 -2.41% BenchmarkHash1KUnaligned 5301 2795 -47.27% BenchmarkHash8KUnaligned 36983 18085 -51.10% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 10.64 11.16 1.05x BenchmarkHash1K 192.93 365.80 1.90x BenchmarkHash8K 221.44 454.03 2.05x BenchmarkHash8BytesUnaligned 10.69 10.95 1.02x BenchmarkHash1KUnaligned 193.15 366.36 1.90x BenchmarkHash8KUnaligned 221.51 452.96 2.04x R=agl CC=golang-dev https://golang.org/cl/7621049 --- src/pkg/crypto/md5/gen.go | 19 ++- src/pkg/crypto/md5/md5block.go | 5 + src/pkg/crypto/md5/md5block_386.s | 180 ++++++++++++++++++++++++++++ src/pkg/crypto/md5/md5block_amd64.s | 177 +++++++++++++++++++++++++++ src/pkg/crypto/md5/md5block_decl.go | 9 ++ 5 files changed, 388 insertions(+), 2 deletions(-) create mode 100644 src/pkg/crypto/md5/md5block_386.s create mode 100644 src/pkg/crypto/md5/md5block_amd64.s create mode 100644 src/pkg/crypto/md5/md5block_decl.go diff --git a/src/pkg/crypto/md5/gen.go b/src/pkg/crypto/md5/gen.go index 966bdae267b..275b4aeea39 100644 --- a/src/pkg/crypto/md5/gen.go +++ b/src/pkg/crypto/md5/gen.go @@ -161,6 +161,11 @@ var data = Data{ } var program = ` +// DO NOT EDIT. +// Generate with: go run gen.go{{if .Full}} -full{{end}} | gofmt >md5block.go + +// +build !amd64 + package md5 import ( @@ -186,6 +191,16 @@ import ( } {{end}} +const x86 = runtime.GOARCH == "amd64" || runtime.GOARCH == "386" + +var littleEndian bool + +func init() { + x := uint32(0x04030201) + y := [4]byte{0x1, 0x2, 0x3, 0x4} + littleEndian = *(*[4]byte)(unsafe.Pointer(&x)) == y +} + func block(dig *digest, p []byte) { a := dig.s[0] b := dig.s[1] @@ -197,13 +212,13 @@ func block(dig *digest, p []byte) { aa, bb, cc, dd := a, b, c, d // This is a constant condition - it is not evaluated on each iteration. - if runtime.GOARCH == "amd64" || runtime.GOARCH == "386" { + if x86 { // MD5 was designed so that x86 processors can just iterate // over the block data directly as uint32s, and we generate // less code and run 1.3x faster if we take advantage of that. // My apologies. X = (*[16]uint32)(unsafe.Pointer(&p[0])) - } else if uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 { + } else if littleEndian && uintptr(unsafe.Pointer(&p[0]))&(unsafe.Alignof(uint32(0))-1) == 0 { X = (*[16]uint32)(unsafe.Pointer(&p[0])) } else { X = &xbuf diff --git a/src/pkg/crypto/md5/md5block.go b/src/pkg/crypto/md5/md5block.go index 0ca42177403..a376fbee99b 100644 --- a/src/pkg/crypto/md5/md5block.go +++ b/src/pkg/crypto/md5/md5block.go @@ -1,3 +1,8 @@ +// DO NOT EDIT. +// Generate with: go run gen.go -full | gofmt >md5block.go + +// +build !amd64,!386 + package md5 import ( diff --git a/src/pkg/crypto/md5/md5block_386.s b/src/pkg/crypto/md5/md5block_386.s new file mode 100644 index 00000000000..1083d83f359 --- /dev/null +++ b/src/pkg/crypto/md5/md5block_386.s @@ -0,0 +1,180 @@ +// Original source: +// http://www.zorinaq.com/papers/md5-amd64.html +// http://www.zorinaq.com/papers/md5-amd64.tar.bz2 +// +// Translated from Perl generating GNU assembly into +// #defines generating 8a assembly, and adjusted for 386, +// by the Go Authors. + +// MD5 optimized for AMD64. +// +// Author: Marc Bevand <bevand_m (at) epita.fr> +// Licence: I hereby disclaim the copyright on this code and place it +// in the public domain. + +#define ROUND1(a, b, c, d, index, const, shift) \ + XORL c, BP; \ + LEAL const(a)(DI*1), a; \ + ANDL b, BP; \ + XORL d, BP; \ + MOVL (index*4)(SI), DI; \ + ADDL BP, a; \ + ROLL $shift, a; \ + MOVL c, BP; \ + ADDL b, a + +#define ROUND2(a, b, c, d, index, const, shift) \ + LEAL const(a)(DI*1),a; \ + MOVL d, DI; \ + ANDL b, DI; \ + MOVL d, BP; \ + NOTL BP; \ + ANDL c, BP; \ + ORL DI, BP; \ + MOVL (index*4)(SI),DI; \ + ADDL BP, a; \ + ROLL $shift, a; \ + ADDL b, a + +#define ROUND3(a, b, c, d, index, const, shift) \ + LEAL const(a)(DI*1),a; \ + MOVL (index*4)(SI),DI; \ + XORL d, BP; \ + XORL b, BP; \ + ADDL BP, a; \ + ROLL $shift, a; \ + MOVL b, BP; \ + ADDL b, a + +#define ROUND4(a, b, c, d, index, const, shift) \ + LEAL const(a)(DI*1),a; \ + ORL b, BP; \ + XORL c, BP; \ + ADDL BP, a; \ + MOVL (index*4)(SI),DI; \ + MOVL $0xffffffff, BP; \ + ROLL $shift, a; \ + XORL c, BP; \ + ADDL b, a + +TEXT ·block(SB),7,$24-16 + MOVL dig+0(FP), BP + MOVL p+4(FP), SI + MOVL n+8(FP), DX + SHRL $6, DX + SHLL $6, DX + + LEAL (SI)(DX*1), DI + MOVL (0*4)(BP), AX + MOVL (1*4)(BP), BX + MOVL (2*4)(BP), CX + MOVL (3*4)(BP), DX + + CMPL SI, DI + JEQ end + + MOVL DI, 16(SP) + +loop: + MOVL AX, 0(SP) + MOVL BX, 4(SP) + MOVL CX, 8(SP) + MOVL DX, 12(SP) + + MOVL (0*4)(SI), DI + MOVL DX, BP + + ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7); + ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12); + ROUND1(CX,DX,AX,BX, 3,0x242070db,17); + ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22); + ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7); + ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12); + ROUND1(CX,DX,AX,BX, 7,0xa8304613,17); + ROUND1(BX,CX,DX,AX, 8,0xfd469501,22); + ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7); + ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12); + ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17); + ROUND1(BX,CX,DX,AX,12,0x895cd7be,22); + ROUND1(AX,BX,CX,DX,13,0x6b901122, 7); + ROUND1(DX,AX,BX,CX,14,0xfd987193,12); + ROUND1(CX,DX,AX,BX,15,0xa679438e,17); + ROUND1(BX,CX,DX,AX, 0,0x49b40821,22); + + MOVL (1*4)(SI), DI + MOVL DX, BP + + ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5); + ROUND2(DX,AX,BX,CX,11,0xc040b340, 9); + ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14); + ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20); + ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5); + ROUND2(DX,AX,BX,CX,15, 0x2441453, 9); + ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14); + ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20); + ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5); + ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9); + ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14); + ROUND2(BX,CX,DX,AX,13,0x455a14ed,20); + ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5); + ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9); + ROUND2(CX,DX,AX,BX,12,0x676f02d9,14); + ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20); + + MOVL (5*4)(SI), DI + MOVL CX, BP + + ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4); + ROUND3(DX,AX,BX,CX,11,0x8771f681,11); + ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16); + ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23); + ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4); + ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11); + ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16); + ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23); + ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4); + ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11); + ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16); + ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23); + ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4); + ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11); + ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16); + ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23); + + MOVL (0*4)(SI), DI + MOVL $0xffffffff, BP + XORL DX, BP + + ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6); + ROUND4(DX,AX,BX,CX,14,0x432aff97,10); + ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15); + ROUND4(BX,CX,DX,AX,12,0xfc93a039,21); + ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6); + ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10); + ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15); + ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21); + ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6); + ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10); + ROUND4(CX,DX,AX,BX,13,0xa3014314,15); + ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21); + ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6); + ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10); + ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15); + ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21); + + ADDL 0(SP), AX + ADDL 4(SP), BX + ADDL 8(SP), CX + ADDL 12(SP), DX + + ADDL $64, SI + CMPL SI, 16(SP) + JB loop + +end: + MOVL dig+0(FP), BP + MOVL AX, (0*4)(BP) + MOVL BX, (1*4)(BP) + MOVL CX, (2*4)(BP) + MOVL DX, (3*4)(BP) + RET diff --git a/src/pkg/crypto/md5/md5block_amd64.s b/src/pkg/crypto/md5/md5block_amd64.s new file mode 100644 index 00000000000..74a361e7758 --- /dev/null +++ b/src/pkg/crypto/md5/md5block_amd64.s @@ -0,0 +1,177 @@ +// Original source: +// http://www.zorinaq.com/papers/md5-amd64.html +// http://www.zorinaq.com/papers/md5-amd64.tar.bz2 +// +// Translated from Perl generating GNU assembly into +// #defines generating 6a assembly by the Go Authors. + +// MD5 optimized for AMD64. +// +// Author: Marc Bevand <bevand_m (at) epita.fr> +// Licence: I hereby disclaim the copyright on this code and place it +// in the public domain. + +TEXT ·block(SB),7,$0-32 + MOVQ dig+0(FP), BP + MOVQ p+8(FP), SI + MOVQ n+16(FP), DX + SHRQ $6, DX + SHLQ $6, DX + + LEAQ (SI)(DX*1), DI + MOVL (0*4)(BP), AX + MOVL (1*4)(BP), BX + MOVL (2*4)(BP), CX + MOVL (3*4)(BP), DX + + CMPQ SI, DI + JEQ end + +loop: + MOVL AX, R12 + MOVL BX, R13 + MOVL CX, R14 + MOVL DX, R15 + + MOVL (0*4)(SI), R8 + MOVL DX, R9 + +#define ROUND1(a, b, c, d, index, const, shift) \ + XORL c, R9; \ + LEAL const(a)(R8*1), a; \ + ANDL b, R9; \ + XORL d, R9; \ + MOVL (index*4)(SI), R8; \ + ADDL R9, a; \ + ROLL $shift, a; \ + MOVL c, R9; \ + ADDL b, a + + ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7); + ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12); + ROUND1(CX,DX,AX,BX, 3,0x242070db,17); + ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22); + ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7); + ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12); + ROUND1(CX,DX,AX,BX, 7,0xa8304613,17); + ROUND1(BX,CX,DX,AX, 8,0xfd469501,22); + ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7); + ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12); + ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17); + ROUND1(BX,CX,DX,AX,12,0x895cd7be,22); + ROUND1(AX,BX,CX,DX,13,0x6b901122, 7); + ROUND1(DX,AX,BX,CX,14,0xfd987193,12); + ROUND1(CX,DX,AX,BX,15,0xa679438e,17); + ROUND1(BX,CX,DX,AX, 0,0x49b40821,22); + + MOVL (1*4)(SI), R8 + MOVL DX, R9 + MOVL DX, R10 + +#define ROUND2(a, b, c, d, index, const, shift) \ + NOTL R9; \ + LEAL const(a)(R8*1),a; \ + ANDL b, R10; \ + ANDL c, R9; \ + MOVL (index*4)(SI),R8; \ + ORL R9, R10; \ + MOVL c, R9; \ + ADDL R10, a; \ + MOVL c, R10; \ + ROLL $shift, a; \ + ADDL b, a + + ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5); + ROUND2(DX,AX,BX,CX,11,0xc040b340, 9); + ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14); + ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20); + ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5); + ROUND2(DX,AX,BX,CX,15, 0x2441453, 9); + ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14); + ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20); + ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5); + ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9); + ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14); + ROUND2(BX,CX,DX,AX,13,0x455a14ed,20); + ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5); + ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9); + ROUND2(CX,DX,AX,BX,12,0x676f02d9,14); + ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20); + + MOVL (5*4)(SI), R8 + MOVL CX, R9 + +#define ROUND3(a, b, c, d, index, const, shift) \ + LEAL const(a)(R8*1),a; \ + MOVL (index*4)(SI),R8; \ + XORL d, R9; \ + XORL b, R9; \ + ADDL R9, a; \ + ROLL $shift, a; \ + MOVL b, R9; \ + ADDL b, a + + ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4); + ROUND3(DX,AX,BX,CX,11,0x8771f681,11); + ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16); + ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23); + ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4); + ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11); + ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16); + ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23); + ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4); + ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11); + ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16); + ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23); + ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4); + ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11); + ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16); + ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23); + + MOVL (0*4)(SI), R8 + MOVL $0xffffffff, R9 + XORL DX, R9 + +#define ROUND4(a, b, c, d, index, const, shift) \ + LEAL const(a)(R8*1),a; \ + ORL b, R9; \ + XORL c, R9; \ + ADDL R9, a; \ + MOVL (index*4)(SI),R8; \ + MOVL $0xffffffff, R9; \ + ROLL $shift, a; \ + XORL c, R9; \ + ADDL b, a + + ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6); + ROUND4(DX,AX,BX,CX,14,0x432aff97,10); + ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15); + ROUND4(BX,CX,DX,AX,12,0xfc93a039,21); + ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6); + ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10); + ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15); + ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21); + ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6); + ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10); + ROUND4(CX,DX,AX,BX,13,0xa3014314,15); + ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21); + ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6); + ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10); + ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15); + ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21); + + ADDL R12, AX + ADDL R13, BX + ADDL R14, CX + ADDL R15, DX + + ADDQ $64, SI + CMPQ SI, DI + JB loop + +end: + MOVL AX, (0*4)(BP) + MOVL BX, (1*4)(BP) + MOVL CX, (2*4)(BP) + MOVL DX, (3*4)(BP) + RET diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go new file mode 100644 index 00000000000..14190c6ff29 --- /dev/null +++ b/src/pkg/crypto/md5/md5block_decl.go @@ -0,0 +1,9 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 386 + +package md5 + +func block(dig *digest, p []byte) -- GitLab