diff --git a/sparc/aes.asm b/sparc/aes.asm
index 6731c25993b5614081a6b377e14c532f7ffe373a..c2bd67644da99a8fdfebab27e9dc4a7b53ff8cd8 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -298,17 +298,16 @@ define(i, round)
 	! NOTE: First instruction duplicated in delay slot
 
 	C i = 0
-	ld	[IDX1+i], t1 	! 1
-C .Lfinal_loop:
-	! IDX2(j) = j XOR 2
-	xor	i, 8, t2
+	ld	[IDX1+0], t1 	! 1
+	C xor	i, 8, t2	(not needed in this copy: 0 XOR 8 = 8 is used directly as the lduh offset below)
 	add	wtxt, t1, t1	! 1
 	ldub	[t1+2], t1	! 1
 
-	ld	[wtxt+i], t0	! 0
-	lduh	[wtxt+t2], t2	! 2
+	ld	[wtxt+0], t0	! 0
+	! IDX2(j) = j XOR 2, a byte offset of 8 for the first word
+	lduh	[wtxt+8], t2	! 2
 	and	t0, 255, t0	! 0
-	ld	[IDX3 + i], t3	! 3
+	ld	[IDX3 + 0], t3	! 3
 	
 	and	t2, 255, t2	! 2
 	ldub	[T+t1], t1	! 1
@@ -322,13 +321,12 @@ C .Lfinal_loop:
 	
 	sll	t2, 16, t2	! 2
 	or	t0, t2, t0	! 0, 1, 2
-	ld	[key + i], t2
+	ld	[key + 0], t2	! first word, constant offset 0
 	sll	t3, 24, t3	! 3
 	
 	or	t0, t3, t0	! 0, 1, 2, 3
 	xor	t0, t2, t0
 	add	i, 4, i
-	cmp	i, 12
 	
 	srl	t0, 24, t3
 	srl	t0, 16, t2
@@ -340,12 +338,11 @@ C .Lfinal_loop:
 	stb	t0, [dst]
 	add	dst, 4, dst
 	
-	C bleu	.Lfinal_loop
 	ld	[IDX1+i], t1 	! 1
 
 	C i = 1
 	ld	[IDX1+i], t1 	! 1
-C .Lfinal_loop:
+	C NOTE(review): identical to the load just before the iteration marker (the copy that followed the old branch); one of the two looks redundant -- confirm.
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	add	wtxt, t1, t1	! 1
@@ -374,7 +371,6 @@ C .Lfinal_loop:
 	or	t0, t3, t0	! 0, 1, 2, 3
 	xor	t0, t2, t0
 	add	i, 4, i
-	cmp	i, 12
 	
 	srl	t0, 24, t3
 	srl	t0, 16, t2
@@ -386,12 +382,11 @@ C .Lfinal_loop:
 	stb	t0, [dst]
 	add	dst, 4, dst
 	
-	C bleu	.Lfinal_loop
 	ld	[IDX1+i], t1 	! 1
 
 	C i = 2
 	ld	[IDX1+i], t1 	! 1
-C .Lfinal_loop:
+	C NOTE(review): identical to the load just before the iteration marker (the copy that followed the old branch); one of the two looks redundant -- confirm.
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	add	wtxt, t1, t1	! 1
@@ -420,7 +415,6 @@ C .Lfinal_loop:
 	or	t0, t3, t0	! 0, 1, 2, 3
 	xor	t0, t2, t0
 	add	i, 4, i
-	cmp	i, 12
 	
 	srl	t0, 24, t3
 	srl	t0, 16, t2
@@ -432,12 +426,11 @@ C .Lfinal_loop:
 	stb	t0, [dst]
 	add	dst, 4, dst
 	
-	C bleu	.Lfinal_loop
 	ld	[IDX1+i], t1 	! 1
 
 	C i = 3
 	ld	[IDX1+i], t1 	! 1
-C .Lfinal_loop:
+	C NOTE(review): identical to the load just before the iteration marker (the copy that followed the old branch); one of the two looks redundant -- confirm.
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	add	wtxt, t1, t1	! 1
@@ -466,7 +459,6 @@ C .Lfinal_loop:
 	or	t0, t3, t0	! 0, 1, 2, 3
 	xor	t0, t2, t0
 	add	i, 4, i
-	cmp	i, 12
 	
 	srl	t0, 24, t3
 	srl	t0, 16, t2
@@ -478,7 +470,6 @@ C .Lfinal_loop:
 	stb	t0, [dst]
 	add	dst, 4, dst
 	
-	C bleu	.Lfinal_loop
 	ld	[IDX1+i], t1 	! 1
 			
 	C Unrolled final loop ends