diff --git a/sparc/aes.asm b/sparc/aes.asm
index 8256d7a8e070d2c5d3f1ad6344e3c6d6f1c5a09c..1ee518b05b74c42ed9e00b40ac61186a8b05104c 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -65,14 +65,17 @@ _aes_crypt:
 	add	T, AES_TABLE3, T3
 	! Read src, and add initial subkey
 	! Difference between ctx and src.
-	! NOTE: This isntruction is duplicated in the delay slot
+	! NOTE: This instruction is duplicated in the delay slot
 	sub	ctx, src, %g2
+
 .Lblock_loop:
 	! Difference between wtxt and src
 	sub	wtxt, src, %g3
 	! For stop condition. Note that src is incremented in the
 	! delay slot
 	add	src, 8, %g4
+	nop
+	nop
 	
 .Lsource_loop:
 	ldub	[src+3], t3
@@ -95,10 +98,9 @@ _aes_crypt:
 	st	t3, [src+%g3]
 	bleu	.Lsource_loop
 	add	src, 4, src
-
+	
 	sub	nrounds, 1, round
 	add	ctx, 16, key
-
 .Lround_loop:
 	! 4*i
 	mov	0, i
@@ -158,7 +160,8 @@ _aes_crypt:
 	xor	wtxt, diff, wtxt
 	subcc	round, 1, round
 	add	key, 16, key
-
+	nop
+	
 	bne	.Lround_loop
 	xor	tmp, diff, tmp
 
@@ -171,33 +174,33 @@ _aes_crypt:
 	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
 	! the instruction is part of
 	ld	[idx-32], t1 	! 1
-	ld	[idx-16], t2	! 2
-
+	xor	i, 8, t2
+	! 2: was "ld [idx-16], t2"; t2 is now computed as i ^ 8 instead
 	add	wtxt, t1, t1	! 1
 	ldub	[t1+2], t1	! 1
 
 	ld	[wtxt+i], t0	! 0
-
 	lduh	[wtxt+t2], t2	! 2
 	and	t0, 255, t0	! 0
 	ld	[idx], t3	! 3
+	
 	and	t2, 255, t2	! 2
 	ldub	[T+t1], t1	! 1
-
 	ldub	[T+t0], t0	! 0
 	sll	t1, 8, t1	! 1
+	
 	ldub	[wtxt+t3], t3	! 3
 	or	t0, t1, t0	! 0, 1
 	ldub	[T+t2], t2	! 2
-
 	ldub	[T+t3], t3	! 3
+	
 	sll	t2, 16, t2	! 2
 	or	t0, t2, t0	! 0, 1, 2
 	ld	[key + i], t2
 	sll	t3, 24, t3	! 3
+	
 	or	t0, t3, t0	! 0, 1, 2, 3
 	xor	t0, t2, t0
-	
 	add	i, 4, i
 	cmp	i, 12
 	
@@ -205,6 +208,7 @@ _aes_crypt:
 	srl	t0, 16, t2
 	srl	t0, 8, t1
 	stb	t1, [dst+1]
+	
 	stb	t3, [dst+3]
 	stb	t2, [dst+2]
 	stb	t0, [dst]
@@ -212,8 +216,9 @@ _aes_crypt:
 	
 	bleu	.Lfinal_loop
 	add	idx, 4, idx
-	
 	addcc	length, -16, length
+	nop
+	
 	bne	.Lblock_loop
 	sub	ctx, src, %g2
 
@@ -310,3 +315,21 @@ _aes_crypt:
 	! aes256 (ECB decrypt): 15.38s, 0.650MB/s
 	! aes256 (CBC encrypt): 17.49s, 0.572MB/s
 	! aes256 (CBC decrypt): 17.87s, 0.560MB/s
+
+	! After further optimizations of the initial and final loops
+	! (source_loop and final_loop):
+	! aes128 (ECB encrypt): 8.07s, 1.239MB/s
+	! aes128 (ECB decrypt): 9.48s, 1.055MB/s
+	! aes128 (CBC encrypt): 12.76s, 0.784MB/s
+	! aes128 (CBC decrypt): 12.15s, 0.823MB/s
+	! 
+	! aes192 (ECB encrypt): 9.43s, 1.060MB/s
+	! aes192 (ECB decrypt): 11.20s, 0.893MB/s
+	! aes192 (CBC encrypt): 14.19s, 0.705MB/s
+	! aes192 (CBC decrypt): 13.97s, 0.716MB/s
+	! 
+	! aes256 (ECB encrypt): 10.81s, 0.925MB/s
+	! aes256 (ECB decrypt): 12.92s, 0.774MB/s
+	! aes256 (CBC encrypt): 15.59s, 0.641MB/s
+	! aes256 (CBC decrypt): 15.76s, 0.635MB/s
+