diff --git a/src/pkg/crypto/rc4/rc4_386.s b/src/pkg/crypto/rc4/rc4_386.s
new file mode 100644
index 0000000000000000000000000000000000000000..55b527bd8c9f03573aa53c7db98f76dd27d15360
--- /dev/null
+++ b/src/pkg/crypto/rc4/rc4_386.s
@@ -0,0 +1,51 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
+TEXT ·xorKeyStream(SB),7,$0
+	MOVL dst+0(FP), DI
+	MOVL src+4(FP), SI
+	MOVL state+12(FP), BP
+
+	MOVL xPtr+16(FP), AX
+	MOVBLZX (AX), AX
+	MOVL yPtr+20(FP), BX
+	MOVBLZX (BX), BX
+	CMPL n+8(FP), $0
+	JEQ done
+
+loop:
+	// i += 1
+	INCB AX
+
+	// j += c.s[i]
+	MOVBLZX (BP)(AX*1), DX
+	ADDB DX, BX
+	MOVBLZX BX, BX
+
+	// c.s[i], c.s[j] = c.s[j], c.s[i]
+	MOVBLZX (BP)(BX*1), CX
+	MOVB CX, (BP)(AX*1)
+	MOVB DX, (BP)(BX*1)
+
+	// *dst = *src ^ c.s[c.s[i]+c.s[j]]
+	ADDB DX, CX
+	MOVBLZX CX, CX
+	MOVB (BP)(CX*1), CX
+	XORB (SI), CX
+	MOVBLZX CX, CX
+	MOVB CX, (DI)
+
+	INCL SI
+	INCL DI
+	DECL n+8(FP)
+	JNE loop
+
+done:
+	MOVL xPtr+16(FP), CX
+	MOVB AX, (CX)
+	MOVL yPtr+20(FP), CX
+	MOVB BX, (CX)
+
+	RET
diff --git a/src/pkg/crypto/rc4/rc4_amd64.s b/src/pkg/crypto/rc4/rc4_amd64.s
index ffe9ada85bb3b5c473d79c2366809e7121683201..d6d4577a3834e575e1ad0e744f545ea18ee80e21 100644
--- a/src/pkg/crypto/rc4/rc4_amd64.s
+++ b/src/pkg/crypto/rc4/rc4_amd64.s
@@ -1,53 +1,106 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Original source:
+//	http://www.zorinaq.com/papers/rc4-amd64.html
+//	http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
+//
+// Transliterated from GNU to 6a assembly syntax by the Go authors.
+// The comments and spacing are from the original.
 
-// func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8)
-TEXT ·xorKeyStream(SB),7,$0
-	MOVQ dst+0(FP), DI
-	MOVQ src+8(FP), SI
-	MOVQ n+16(FP), CX
-	MOVQ state+24(FP), R8
-
-	MOVQ xPtr+32(FP), AX
-	MOVBQZX (AX), AX
-	MOVQ yPtr+40(FP), BX
-	MOVBQZX (BX), BX
+// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
 
-loop:
-	CMPQ CX, $0
-	JE done
+// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
+// but makes the code run 2.0x slower on Xeon.
+#define EXTEND(r) MOVBLZX r, r
 
-	// c.i += 1
-	INCB AX
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The code has been designed to be easily integrated into openssl:
+** the exported RC4() function can replace the actual implementations
+** openssl already contains. Please note that when linking with openssl,
+** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
+** with -DRC4_INT='unsigned long'.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+*/
 
-	// c.j += c.s[c.i]
-	MOVB (R8)(AX*1), R9
-	ADDB R9, BX
-
-	MOVBQZX (R8)(BX*1), R10
+TEXT ·xorKeyStream(SB),7,$0
+	MOVQ	len+16(FP),	BX		// rbx = ARG(len)
+	MOVQ	in+8(FP),	SI		// in = ARG(in)
+	MOVQ	out+0(FP),	DI		// out = ARG(out)
+	MOVQ	d+24(FP),	BP		// d = ARG(data)
+	MOVQ	xp+32(FP),	AX
+	MOVBQZX	0(AX),		CX		// x = *xp
+	MOVQ	yp+40(FP),	AX
+	MOVBQZX	0(AX),		DX		// y = *yp
 
-	MOVB R10, (R8)(AX*1)
-	MOVB R9, (R8)(BX*1)
+	INCQ	CX				// x++
+	ANDQ	$255,		CX		// x &= 0xff
+	LEAQ	-8(BX)(SI*1),	BX		// rbx = in+len-8
+	MOVQ	BX,		R9		// tmp = in+len-8
+	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x]
+	CMPQ	BX,		SI		// cmp in with in+len-8
+	JLT	end				// jump if (in+len-8 < in)
 
-	// R11 = c.s[c.i]+c.s[c.j]
-	MOVQ R10, R11
-	ADDB R9, R11
+start:
+	ADDQ	$8,		SI		// increment in
+	ADDQ	$8,		DI		// increment out
+	
+	// generate the next 8 bytes of the rc4 stream into R8
+	MOVQ	$8,		R11		// byte counter
+l1:	ADDB	AX,		DX
+	EXTEND(DX)
+	MOVBLZX	(BP)(DX*1),	BX		// ty = d[y]
+	MOVB	BX,		(BP)(CX*1)	// d[x] = ty
+	ADDB	AX,		BX		// val = ty + tx
+	EXTEND(BX)
+	MOVB	AX,		(BP)(DX*1)	// d[y] = tx
+	INCB	CX				// x++ (NEXT ROUND)
+	EXTEND(CX)
+	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x] (NEXT ROUND)
+	SHLQ	$8,		R8
+	MOVB	(BP)(BX*1),	R8		// val = d[val]
+	DECQ	R11
+	JNZ	l1
 
-	MOVB (R8)(R11*1), R11
-	MOVB (SI), R12
-	XORB R11, R12
-	MOVB R12, (DI)
+	// xor 8 bytes
+	BSWAPQ	R8
+	XORQ	-8(SI),		R8
+	CMPQ	SI,		R9		// cmp in+len-8 with in XXX
+	MOVQ	R8,		-8(DI)
+	JLE	start				// jump if (in <= in+len-8)
 
-	INCQ SI
-	INCQ DI
-	DECQ CX
+end:
+	ADDQ	$8,		R9		// tmp = in+len
 
-	JMP loop
-done:
-	MOVQ xPtr+32(FP), R8
-	MOVB AX, (R8)
-	MOVQ yPtr+40(FP), R8
-	MOVB BX, (R8)
+	// handle the last bytes, one by one
+l2:	CMPQ	R9,		SI		// cmp in with in+len
+	JLE	finished			// jump if (in+len <= in)
+	ADDB	AX,		DX		// y += tx
+	EXTEND(DX)
+	MOVBLZX	(BP)(DX*1),	BX		// ty = d[y]
+	MOVB	BX,		(BP)(CX*1)	// d[x] = ty
+	ADDB	AX,		BX		// val = ty+tx
+	EXTEND(BX)
+	MOVB	AX,		(BP)(DX*1)	// d[y] = tx
+	INCB	CX				// x++ (NEXT ROUND)
+	EXTEND(CX)
+	MOVBLZX	(BP)(CX*1),	AX		// tx = d[x] (NEXT ROUND)
+	MOVBLZX	(BP)(BX*1),	R8		// val = d[val]
+	XORB	(SI),		R8		// xor 1 byte
+	MOVB	R8,		(DI)
+	INCQ	SI				// in++
+	INCQ	DI				// out++
+	JMP l2
 
+finished:
+	DECQ	CX				// x--
+	MOVQ	yp+40(FP),	BX
+	MOVB	DX, 0(BX)
+	MOVQ	xp+32(FP),	AX
+	MOVB	CX, 0(AX)
 	RET
diff --git a/src/pkg/crypto/rc4/rc4_asm.go b/src/pkg/crypto/rc4/rc4_asm.go
index 0b66e4a9e21a6f0bb75f5346a910d33ea0c9f1e3..532768dff26b43cfd933c84bafa63c579e010966 100644
--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 arm
+// +build amd64 arm 386
 
 package rc4
 
diff --git a/src/pkg/crypto/rc4/rc4_ref.go b/src/pkg/crypto/rc4/rc4_ref.go
index 1018548c240a05087cc5020c76302ebbbceefe43..44d3804368ae3b6e127e538d05cc07a028c8d2fa 100644
--- a/src/pkg/crypto/rc4/rc4_ref.go
+++ b/src/pkg/crypto/rc4/rc4_ref.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64,!arm
+// +build !amd64,!arm,!386
 
 package rc4
 
diff --git a/src/pkg/crypto/rc4/rc4_test.go b/src/pkg/crypto/rc4/rc4_test.go
index 9e12789f7f38ec7e033acd85b7163e895f8d10b9..1ce03608ca26ac0f6269ff492b3d6f3ca5613375 100644
--- a/src/pkg/crypto/rc4/rc4_test.go
+++ b/src/pkg/crypto/rc4/rc4_test.go
@@ -5,6 +5,7 @@
 package rc4
 
 import (
+	"fmt"
 	"testing"
 )
 
@@ -72,20 +73,43 @@ var golden = []rc4Test{
 	},
 }
 
+func testEncrypt(t *testing.T, desc string, c *Cipher, src, expect []byte) {
+	dst := make([]byte, len(src))
+	c.XORKeyStream(dst, src)
+	for i, v := range dst {
+		if v != expect[i] {
+			t.Fatalf("%s: mismatch at byte %d:\nhave %x\nwant %x", desc, i, dst, expect)
+		}
+	}
+}
+
 func TestGolden(t *testing.T) {
-	for i := 0; i < len(golden); i++ {
-		g := golden[i]
-		c, err := NewCipher(g.key)
-		if err != nil {
-			t.Errorf("Failed to create cipher at golden index %d", i)
-			return
+	for gi, g := range golden {
+		data := make([]byte, len(g.keystream))
+		for i := range data {
+			data[i] = byte(i)
 		}
-		keystream := make([]byte, len(g.keystream))
-		c.XORKeyStream(keystream, keystream)
-		for j, v := range keystream {
-			if g.keystream[j] != v {
-				t.Errorf("Failed at golden index %d:\n%x\nvs\n%x", i, keystream, g.keystream)
-				break
+
+		expect := make([]byte, len(g.keystream))
+		for i := range expect {
+			expect[i] = byte(i) ^ g.keystream[i]
+		}
+
+		for size := 1; size <= len(g.keystream); size++ {
+			c, err := NewCipher(g.key)
+			if err != nil {
+				t.Fatalf("#%d: NewCipher: %v", gi, err)
+			}
+
+			off := 0
+			for off < len(g.keystream) {
+				n := len(g.keystream) - off
+				if n > size {
+					n = size
+				}
+				desc := fmt.Sprintf("#%d@[%d:%d]", gi, off, off+n)
+				testEncrypt(t, desc, c, data[off:off+n], expect[off:off+n])
+				off += n
 			}
 		}
 	}