diff --git a/sparc/aes-encrypt-internal.asm b/sparc/aes-encrypt-internal.asm
index 8e06e03a42a940a1e33583a57a7c9e9404601ec8..e5951ad19857d7b18ef9669c70029beb97d3c8b3 100644
--- a/sparc/aes-encrypt-internal.asm
+++ b/sparc/aes-encrypt-internal.asm
@@ -133,10 +133,11 @@ C A:	nettle-1.13 C-code
 C B:	nettle-1.13 assembler
 C C:	New C-code
 C D:	New assembler, first correct version
-
+C E:	New assembler, with basic scheduling of AES_ROUND.
+	
 C	MB/s	cycles/block
 C A	1.2	1107
 C B	2.3	572
 C C	2.1	627
 C D	1.8	722
-	
+C E	2.6	496
diff --git a/sparc/machine.m4 b/sparc/machine.m4
index 5f3731a078e1abd4dd4bb7785076c2fa8d7c51ed..eb9e334b40ef4ec927f5408f628865d89598823d 100644
--- a/sparc/machine.m4
+++ b/sparc/machine.m4
@@ -33,34 +33,27 @@ C FIXME: Needs better instruction scheduling, and perhaps more temporaries
 C Alternatively, we can use a single table and some rotations
 define(<AES_ROUND>, <
 	and	$3, 0xff, TMP1		C  0
-	sll	TMP1, 2, TMP1		C  0
-	add	TMP1, AES_TABLE0, TMP1	C  0
-	ld	[$2 + TMP1], $8		C  0
-
 	srl	$4, 6, TMP2		C  1
+	sll	TMP1, 2, TMP1		C  0	masked byte times 4
 	and	TMP2, 0x3fc, TMP2	C  1
+	add	TMP1, AES_TABLE0, TMP1	C  0	interleaved with the lookup 1 offset chain in TMP2
 	add	TMP2, AES_TABLE1, TMP2	C  1
-	ld	[$2 + TMP2], TMP2	C  1
-	nop
-	xor	$8, TMP2, $8		C  1
-
+	ld	[$2 + TMP1], $8		C  0	E0	first lookup is loaded straight into the accumulator
 	srl	$5, 14, TMP1		C  2
+	ld	[$2 + TMP2], TMP2	C  1	issued between the lookup 2 offset instructions
 	and	TMP1, 0x3fc, TMP1	C  2
-	add	TMP1, AES_TABLE2, TMP1	C  2
-	ld	[$2 + TMP1], TMP1	C  2
-	nop
-	xor	$8, TMP1, $8		C  2
-
+	xor	$8, TMP2, $8		C  1	E1	one independent instruction separates this from its load
 	srl	$6, 22, TMP2		C  3
+	add	TMP1, AES_TABLE2, TMP1	C  2
 	and	TMP2, 0x3fc, TMP2	C  3
+	ld	[$2 + TMP1], TMP1	C  2	TMP1 is the offset going in and the table word coming out
 	add	TMP2, AES_TABLE3, TMP2	C  3
+	xor	$8, TMP1, $8		C  2	E2	last use of this TMP1 value
+	ld	[$7 + eval(4*$1)], TMP1	C  4	reuses TMP1 because TMP2 still holds the lookup 3 offset
 	ld	[$2 + TMP2], TMP2	C  3
-	nop
+	xor	$8, TMP1, $8		C  4	xor order is irrelevant and this sits between load 3 and its use
 	xor	$8, TMP2, $8		C  3
-
-	ld	[$7 + eval(4*$1)], TMP2	C  4
-	nop
-	xor	$8, TMP2, $8>)dnl	C  4
+>)dnl
 
 C AES_FINAL_ROUND(i, T, a, b, c, d, key, dst)
 C Compute one word in the final round function. Output is converted to