diff --git a/sparc/aes.asm b/sparc/aes.asm
index d72ec883f21870eb53e30447e8da57cc7a22c382..6731c25993b5614081a6b377e14c532f7ffe373a 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -58,6 +58,7 @@ define(T1, %l7)
 define(T2, %g5)
 define(T3, %o7)
 define(IDX1, %i5)
+C define(IDX1, <T + AES_SIDX1 >)
 define(IDX3, %o5)
 
 ! Teporaries
@@ -141,12 +142,14 @@ _aes_crypt:
 	! The code uses the register %o[j], aka tj, as the primary 
 	! register for that sub-expression. True for j==1,3.
 
+	C Unrolled inner loop begins
+	
 	C i = 0
-	ld	[IDX1], t1		! 1
+	ld	[IDX1+0], t1		! 1
 	
 	add	wtxt, t1, t1		! 1
 	ldub	[t1+2], t1		! 1
-	ld	[IDX3], t3		! 3
+	ld	[IDX3+0], t3		! 3
 	
 	sll	t1, 2, t1		! 1
 	ld	[wtxt], t0		! 0
@@ -273,7 +276,7 @@ _aes_crypt:
 	xor	t0, t1, t0
 	st	t0, [tmp+12]
 			
-	C End of unrolled loop
+	C Unrolled inner loop ends
 	
 	! switch roles for tmp and wtxt
 	xor	wtxt, diff, wtxt
@@ -290,9 +293,151 @@ define(i, round)
 
 	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
 	! the instruction is part of
+
+	C Unrolled final loop begins
 	! NOTE: First instruction duplicated in delay slot
+
+	C i = 0
+	ld	[IDX1+i], t1 	! 1
+C .Lfinal_loop:
+	! IDX2(j) = j XOR 2
+	xor	i, 8, t2
+	add	wtxt, t1, t1	! 1
+	ldub	[t1+2], t1	! 1
+
+	ld	[wtxt+i], t0	! 0
+	lduh	[wtxt+t2], t2	! 2
+	and	t0, 255, t0	! 0
+	ld	[IDX3 + i], t3	! 3
+	
+	and	t2, 255, t2	! 2
+	ldub	[T+t1], t1	! 1
+	ldub	[T+t0], t0	! 0
+	sll	t1, 8, t1	! 1
+	
+	ldub	[wtxt+t3], t3	! 3
+	or	t0, t1, t0	! 0, 1
+	ldub	[T+t2], t2	! 2
+	ldub	[T+t3], t3	! 3
+	
+	sll	t2, 16, t2	! 2
+	or	t0, t2, t0	! 0, 1, 2
+	ld	[key + i], t2
+	sll	t3, 24, t3	! 3
+	
+	or	t0, t3, t0	! 0, 1, 2, 3
+	xor	t0, t2, t0
+	add	i, 4, i
+	cmp	i, 12
+	
+	srl	t0, 24, t3
+	srl	t0, 16, t2
+	srl	t0, 8, t1
+	stb	t1, [dst+1]
+	
+	stb	t3, [dst+3]
+	stb	t2, [dst+2]
+	stb	t0, [dst]
+	add	dst, 4, dst
+	
+	C bleu	.Lfinal_loop  (branch dropped by unrolling; the ld below was its delay slot)
+	ld	[IDX1+i], t1 	! 1
+
+	C i = 1
+	C ld	[IDX1+i], t1 	! 1  (redundant: the former delay-slot ld just above already loaded t1 with i unchanged)
+C .Lfinal_loop:
+	! IDX2(j) = j XOR 2
+	xor	i, 8, t2
+	add	wtxt, t1, t1	! 1
+	ldub	[t1+2], t1	! 1
+
+	ld	[wtxt+i], t0	! 0
+	lduh	[wtxt+t2], t2	! 2
+	and	t0, 255, t0	! 0
+	ld	[IDX3 + i], t3	! 3
+	
+	and	t2, 255, t2	! 2
+	ldub	[T+t1], t1	! 1
+	ldub	[T+t0], t0	! 0
+	sll	t1, 8, t1	! 1
+	
+	ldub	[wtxt+t3], t3	! 3
+	or	t0, t1, t0	! 0, 1
+	ldub	[T+t2], t2	! 2
+	ldub	[T+t3], t3	! 3
+	
+	sll	t2, 16, t2	! 2
+	or	t0, t2, t0	! 0, 1, 2
+	ld	[key + i], t2
+	sll	t3, 24, t3	! 3
+	
+	or	t0, t3, t0	! 0, 1, 2, 3
+	xor	t0, t2, t0
+	add	i, 4, i
+	cmp	i, 12
+	
+	srl	t0, 24, t3
+	srl	t0, 16, t2
+	srl	t0, 8, t1
+	stb	t1, [dst+1]
+	
+	stb	t3, [dst+3]
+	stb	t2, [dst+2]
+	stb	t0, [dst]
+	add	dst, 4, dst
+	
+	C bleu	.Lfinal_loop  (branch dropped by unrolling; the ld below was its delay slot)
+	ld	[IDX1+i], t1 	! 1
+
+	C i = 2
+	C ld	[IDX1+i], t1 	! 1  (redundant: the former delay-slot ld just above already loaded t1 with i unchanged)
+C .Lfinal_loop:
+	! IDX2(j) = j XOR 2
+	xor	i, 8, t2
+	add	wtxt, t1, t1	! 1
+	ldub	[t1+2], t1	! 1
+
+	ld	[wtxt+i], t0	! 0
+	lduh	[wtxt+t2], t2	! 2
+	and	t0, 255, t0	! 0
+	ld	[IDX3 + i], t3	! 3
+	
+	and	t2, 255, t2	! 2
+	ldub	[T+t1], t1	! 1
+	ldub	[T+t0], t0	! 0
+	sll	t1, 8, t1	! 1
+	
+	ldub	[wtxt+t3], t3	! 3
+	or	t0, t1, t0	! 0, 1
+	ldub	[T+t2], t2	! 2
+	ldub	[T+t3], t3	! 3
+	
+	sll	t2, 16, t2	! 2
+	or	t0, t2, t0	! 0, 1, 2
+	ld	[key + i], t2
+	sll	t3, 24, t3	! 3
+	
+	or	t0, t3, t0	! 0, 1, 2, 3
+	xor	t0, t2, t0
+	add	i, 4, i
+	cmp	i, 12
+	
+	srl	t0, 24, t3
+	srl	t0, 16, t2
+	srl	t0, 8, t1
+	stb	t1, [dst+1]
+	
+	stb	t3, [dst+3]
+	stb	t2, [dst+2]
+	stb	t0, [dst]
+	add	dst, 4, dst
+	
+	C bleu	.Lfinal_loop  (branch dropped by unrolling; the ld below was its delay slot)
 	ld	[IDX1+i], t1 	! 1
-.Lfinal_loop:
+
+	C i = 3
+	C ld	[IDX1+i], t1 	! 1  (redundant: the former delay-slot ld just above already loaded t1 with i unchanged)
+C .Lfinal_loop:
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	add	wtxt, t1, t1	! 1
@@ -333,8 +478,11 @@ define(i, round)
 	stb	t0, [dst]
 	add	dst, 4, dst
 	
-	bleu	.Lfinal_loop
+	C bleu	.Lfinal_loop  (branch dropped by unrolling; the ld below was its delay slot)
 	ld	[IDX1+i], t1 	! 1
+			
+	C Unrolled final loop ends
+	
 	addcc	length, -16, length
 	sub	ctx, src, %g2