diff --git a/sparc/arcfour-crypt.asm b/sparc/arcfour-crypt.asm
index 0c24d25478df9dff21d6d74ba16750d8496ddec9..02575e1306125fe5f6b9f46c0ddbfda62826d51d 100644
--- a/sparc/arcfour-crypt.asm
+++ b/sparc/arcfour-crypt.asm
@@ -30,107 +30,34 @@ define(<LENGTH>,<%i1>)
 define(<DST>,	<%i2>)
 define(<SRC>,	<%i3>)
 
-define(<I>,	<%i4>)
-define(<J>,	<%i5>)
-define(<SI>,	<%g1>)
-define(<SJ>,	<%g2>)
-define(<TMP>,	<%g3>)
-define(<N>,	<%o0>)
-define(<WORD>,	<%o1>)
-
-C	Encrypts n bytes, one byte at a time.
-C	ARCFOUR_BYTE_LOOP(n, label)
-define(<ARCFOUR_BYTE_LOOP>, <
-$2:	
-	add	I, 1, I
-	and	I, 0xff, I
-	ldub	[CTX + I], SI
-	subcc	$1,1,$1
-	ldub	[SRC], TMP
+define(<I1>,	<%i4>)
+define(<I2>,	<%i5>)
+define(<J>,	<%g1>)
+define(<SI>,	<%g2>)
+define(<SJ>,	<%g3>)
+define(<TMP>,	<%o0>)
+define(<N>,	<%o1>)
+define(<DATA>,	<%o2>)
+
+C	Computes the next byte of the key stream. On entry, i must hold
+C	the index for the current access; (i + 1) & 0xff, the index for
+C	the next access, is stored in ni, and the key byte in res.
+C	Updates J, swaps the bytes at CTX + i and CTX + J, clobbers SI, SJ.
+C	ARCFOUR_BYTE(i, ni, res)
+define(<ARCFOUR_BYTE>, <
+	ldub	[CTX + $1], SI
+	add	$1, 1, $2
 	add	J, SI, J
 	and	J, 0xff, J
 	ldub	[CTX + J], SJ
-	add	SRC, 1, SRC
+	and	$2, 0xff, $2
 	stb	SI, [CTX + J]
 	add	SI, SJ, SI
 	and	SI, 0xff, SI
-	stb	SJ, [CTX + I]
-	ldub	[CTX + SI], SI
-	xor	TMP, SI, TMP
-	stb	TMP, [DST]
-	bne	$2
-	add	DST, 1, DST
+	stb	SJ, [CTX + $1]
+	ldub	[CTX + SI], $3
 >)dnl
-
-C	Encrypts 4n bytes, four at a time. Requires proper alignmentof
-C	SRC and DST.
-C	ARCFOUR_WORD_LOOP(n, label)
-define(<ARCFOUR_WORD_LOOP>, <
-$2:
-	add	I, 1, I
-	and	I, 0xff, I
-	ldub	[CTX + I], SI
-	ld	[SRC], WORD
-	add	J, SI, J
-	and	J, 0xff, J
-	ldub	[CTX + J], SJ
-	stb	SI, [CTX + J]
-	add	SI, SJ, SI
-	and	SI, 0xff, SI
-	stb	SJ, [CTX + I]
-	ldub	[CTX + SI], TMP
-
-	add	I, 1, I
-	and	I, 0xff, I
-	ldub	[CTX + I], SI
-	add	SRC, 4, SRC
-	add	J, SI, J
-	and	J, 0xff, J
-	ldub	[CTX + J], SJ
-	stb	SI, [CTX + J]
-	add	SI, SJ, SI
-	and	SI, 0xff, SI
-	stb	SJ, [CTX + I]
-	ldub	[CTX + SI], SI
-	sll	TMP, 8, TMP
-	or	TMP, SI, TMP
-	
-	add	I, 1, I
-	and	I, 0xff, I
-	ldub	[CTX + I], SI
-	subcc	$1, 1, $1
-	add	J, SI, J
-	and	J, 0xff, J
-	ldub	[CTX + J], SJ
-	stb	SI, [CTX + J]
-	add	SI, SJ, SI
-	and	SI, 0xff, SI
-	stb	SJ, [CTX + I]
-	ldub	[CTX + SI], SI
-	sll	TMP, 8, TMP
-	or	TMP, SI, TMP
-
-	add	I, 1, I
-	and	I, 0xff, I
-	ldub	[CTX + I], SI
-	C	empty slot
-	add	J, SI, J
-	and	J, 0xff, J
-	ldub	[CTX + J], SJ
-	stb	SI, [CTX + J]
-	add	SI, SJ, SI
-	and	SI, 0xff, SI
-	stb	SJ, [CTX + I]
-	ldub	[CTX + SI], SI
-	sll	TMP, 8, TMP
-	or	TMP, SI, TMP
-	xor	WORD, TMP, WORD
-	st	WORD, [DST]
-	
-	bne	$2
-	add	DST, 4, DST
->)dnl
-		
+			
 C	FIXME: Consider using the callers window
 define(<FRAME_SIZE>, 104)
 
@@ -151,52 +78,54 @@ PROLOGUE(nettle_arcfour_crypt)
 	be	.Lend
 	
 	C	Load both I and J
+	lduh	[CTX + ARCFOUR_I], I1
+	and	I1, 0xff, J
+	srl	I1, 8, I1
 
-ifelse(WITH_ALIGN, YES, <
-	C	Check if SRC and DST have compatible alignment
-	xor	SRC, DST, TMP
-	andcc	TMP, 3, TMP
+	C	An odd LENGTH is handled by processing a single byte first,
+	C	so that the loop below always processes two bytes.
+	andcc	LENGTH, 1, %g0
+	add	I1, 1, I1
 
-	bne	.Lrest
-	nop
-	
-	andcc	DST, 3, N
-	bz	.Laligned
-	nop
-	
-	sub	N, 4, N
-	neg	N
-	cmp	N, LENGTH
-	bgeu	.Lrest
-	nop
-	
-	sub	LENGTH, N, LENGTH
-	
-	ARCFOUR_BYTE_LOOP(N, .Lunalignedloop)
+	C	I1 must be masked also when the branch is taken, so the
+	C	masking belongs in the branch delay slot.
+	beq	.Loop
+	and	I1, 0xff, I1
 
-.Laligned:
-	srl	LENGTH, 2, N
-	cmp	N, 0
-	be	.Lrest
-	nop
-	
-	ARCFOUR_WORD_LOOP(N, .Lalignedloop)
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC], DATA
+	subcc	LENGTH, 1, LENGTH
+	add	SRC, 1, SRC
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
+	beq	.Ldone
+	add	DST, 1, DST
 
-	andcc	LENGTH, 3, LENGTH
-	bz	.Ldone
-	nop
->)
-.Lrest:
-	ARCFOUR_BYTE_LOOP(LENGTH, .Loop)
+	C	The next index was left in I2, but .Loop expects it in I1
+	mov	I2, I1
+.Loop:
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC], DATA
+	add	SRC, 2, SRC
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	ldub	[SRC - 1], DATA
+	subcc	LENGTH, 2, LENGTH
+	add	DST, 2, DST
+	xor	DATA, TMP, DATA
+	
+	bne	.Loop
+	stb	DATA, [DST - 1]
 
+	C	I2 is the index of the last access, which is what we save
+	mov	I2, I1
 .Ldone:
-	C	Save back I and J	
-	sll	I, 8, I
-	or	I, J, I
-	stuh	I, [CTX + ARCFOUR_I]
+	C	Save back I and J
+	sll	I1, 8, I1
+	or	I1, J, I1
+	stuh	I1, [CTX + ARCFOUR_I]
 
 .Lend:
 	ret
@@ -212,6 +141,7 @@ C 3:	Moved load of source byte
 C 4:	Better instruction scheduling
 C 5:	Special case SRC and DST with compatible alignment
 C 6:	After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
+C 7:	Unrolled only twice, with byte-accesses
 
 C	MB/s	cycles/byte	Code size (bytes)
 C 1:	6.6	12.4		132
@@ -220,3 +150,4 @@ C 3:	6.0	13.5		116
 C 4:	6.5	12.4		116
 C 5:	7.9	10.4		496
 C 6:	8.3	9.7		496
+C 7:	6.7	12.1		268