diff --git a/sparc/arcfour-crypt.asm b/sparc/arcfour-crypt.asm
index 02575e1306125fe5f6b9f46c0ddbfda62826d51d..4d8dac948b4f172ffecc0b39a162c5483f3c9fb5 100644
--- a/sparc/arcfour-crypt.asm
+++ b/sparc/arcfour-crypt.asm
@@ -36,8 +36,9 @@ define(<J>,	<%g1>)
 define(<SI>,	<%g2>)
 define(<SJ>,	<%g3>)
 define(<TMP>,	<%o0>)
-define(<N>,	<%o1>)
-define(<DATA>,	<%o2>)
+define(<TMP2>,	<%o1>)
+define(<N>,	<%o2>)
+define(<DATA>,	<%o3>)
 
 C	Computes the next byte of the key stream. As input, i must
 C	already point to the index for the current access, the index
@@ -76,20 +77,22 @@ PROLOGUE(nettle_arcfour_crypt)
 	save	%sp, -FRAME_SIZE, %sp
 	cmp	LENGTH, 0
 	be	.Lend
+	nop
 	
 	C	Load both I and J
 	lduh	[CTX + ARCFOUR_I], I1
 	and	I1, 0xff, J
 	srl	I1, 8, I1
 
-	andcc	LENGTH, 1, %g0
-	beq	.Loop
-
+	C	We want an even address for DST
+	andcc	DST, 1, %g0
 	add	I1, 1 ,I1
+	beq	.Laligned2
 	and	I1, 0xff, I1
 
-	ARCFOUR_BYTE(I1, I2, TMP)
+	mov	I1, I2
 	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I2, I1, TMP)
 	subcc	LENGTH, 1, LENGTH
 	add	SRC, 1, SRC
 	xor	DATA, TMP, DATA
@@ -97,29 +100,107 @@ PROLOGUE(nettle_arcfour_crypt)
 	beq	.Ldone
 	add	DST, 1, DST
 
-	mov	I2, I1
-.Loop:
-	ARCFOUR_BYTE(I1, I2, TMP)
+.Laligned2:
+
+	cmp	LENGTH, 2
+	blu	.Lfinal1
+	C	Harmless delay slot instruction	
+	andcc	DST, 2, %g0
+	beq	.Laligned4
+	nop
+
 	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
 	add	SRC, 2, SRC
 	xor	DATA, TMP, DATA
-	stb	DATA, [DST]
+	sll	DATA, 8, DATA	
 
 	ARCFOUR_BYTE(I2, I1, TMP)
-	ldub	[SRC - 1], DATA
+	xor	TMP2, TMP, TMP
 	subcc	LENGTH, 2, LENGTH
+	or	DATA, TMP, DATA
+
+	sth	DATA, [DST]
+	beq	.Ldone
 	add	DST, 2, DST
-	xor	DATA, TMP, DATA
 	
+.Laligned4:
+	cmp	LENGTH, 4
+	blu	.Lfinal2
+	C	Harmless delay slot instruction
+	srl	LENGTH, 2, N
+	
+.Loop:
+	C	Main loop, with aligned writes
+	
+	C	FIXME: Could check if SRC is aligned, and
+	C	use 32-bit reads in that case.
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	xor	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	ldub	[SRC + 2], TMP2
+	or	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I1, I2, TMP)
+	xor	TMP2, TMP, TMP
+	ldub	[SRC + 3], TMP2
+	or	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	or	TMP, DATA, DATA
+	subcc	N, 1, N
+	add	SRC, 4, SRC
+	st	DATA, [DST]
 	bne	.Loop
-	stb	DATA, [DST - 1]
+	add	DST, 4, DST
+	
+	andcc	LENGTH, 3, LENGTH
+	beq	.Ldone
+	nop
+
+.Lfinal2:
+	C	DST address must be 2-aligned
+	cmp	LENGTH, 2
+	blu	.Lfinal1
+	nop
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	add	SRC, 2, SRC
+	xor	DATA, TMP, DATA
+	sll	DATA, 8, DATA	
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	or	DATA, TMP, DATA
+
+	sth	DATA, [DST]
+	beq	.Ldone
+	add	DST, 2, DST
+
+.Lfinal1:
+	mov	I1, I2
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
 
-	mov	I2, I1
 .Ldone:
 	C	Save back I and J
-	sll	I1, 8, I1
-	or	I1, J, I1
-	stuh	I1, [CTX + ARCFOUR_I]
+	sll	I2, 8, I2
+	or	I2, J, I2
+	stuh	I2, [CTX + ARCFOUR_I]
 
 .Lend:
 	ret
@@ -136,6 +217,7 @@ C 4:	Better instruction scheduling
 C 5:	Special case SRC and DST with compatible alignment
 C 6:	After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
 C 7:	Unrolled only twice, with byte-accesses
+C 8:	Unrolled, using 8-bit reads and aligned 32-bit writes.
 
 C	MB/s	cycles/byte	Code size (bytes)
 C 1:	6.6	12.4		132
@@ -145,3 +227,4 @@ C 4:	6.5	12.4		116
 C 5:	7.9	10.4		496
 C 6:	8.3	9.7		496
 C 7:	6.7	12.1		268
+C 8:	8.3	9.8		768