diff --git a/sparc/aes.asm b/sparc/aes.asm
index 108c69d0e8b09be00bbd15bfc967d641b3042c7a..0e167db06ab275d4e8c8d7b6c18cce43ec4696ac 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -1,7 +1,7 @@
-	! Used registers:	%l0,1,2,3,4,5
+	! Used registers:	%l0,1,2,3,4,5,6,7
 	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
 	!			%o0,1,2,3,4,5,7 (%o6=%sp)
-	!			%g2,3,4
+	!			%g2,3,4,5,6
 include(`asm.m4')
 	
 	.file	"aes.asm"
@@ -26,9 +26,16 @@ define(diff, %l2)
 define(nrounds, %l3)
 
 ! Loop variables
-define(round, %l4) ! Should perhaps be 16 * round
+define(round, %l4) ! Really 16 * round
 define(i, %l5)
 
+! Further loop invariants
+define(T0, %l6)
+define(T1, %l7)
+define(T2, %g5)
+define(T3, %g6)
+define(key, %g7)
+
 ! Temporaries
 define(t0, %o0)
 define(t1, %o1)
@@ -39,45 +46,48 @@ define(idx, %o4)
 _aes_crypt:
 ! Why -136?
 	save	%sp, -136, %sp
-
 	cmp	length, 0
 	be	.Lend
-
 	! wtxt
 	add	%fp, -24, wtxt
+	
 	add	%fp, -40, tmp
-
 	ld	[ctx + AES_NROUNDS], nrounds
-
 	! Compute xor, so that we can swap efficiently.
 	xor	wtxt, tmp, diff
-
 	! The loop variable will be multiplied by 16.
 	sll	nrounds, 4, nrounds
-
+	
+	! More loop invariants
+	add	T, AES_TABLE0, T0
+	add	T, AES_TABLE1, T1
+	add	T, AES_TABLE2, T2
+	add	T, AES_TABLE3, T3
+		
 .Lblock_loop:
 	! Read src, and add initial subkey
 	mov	-4, i
 .Lsource_loop:
 	add	i, 4, i
-		
 	add	i, src, %o5
 	ldub	[%o5+3], %g2
-
 	ldub	[%o5+2], %g3
+	
 	sll	%g2, 24, %g2
 	ldub	[%o5+1], %o0
 	sll	%g3, 16, %g3
 	or	%g2, %g3, %g2
+	
 	ldub	[src+i], %o5
 	sll	%o0, 8, %o0
 	ld	[ctx+i], %g3
 	or	%g2, %o0, %g2
+	
 	or	%g2, %o5, %g2
 	xor	%g2, %g3, %g2
-
 	cmp	i, 12
 	bleu	.Lsource_loop
+	
 	st	%g2, [wtxt+i]
 
 	! ! Read a little-endian word
@@ -86,6 +96,7 @@ _aes_crypt:
 	! 
 	! ldub	[src+2], %g3
 	! or	%g3, %g2, %g2
+	
 	! sll	%g2, 8, %g2
 	! 
 	! ldub	[src+1], %g3
@@ -97,16 +108,19 @@ _aes_crypt:
 	! sll	%g2, 8, %g2
 	! 
 	! ld	[ctx+%o3], %g3
+	
 	! xor	%g3, %g2, %g2
 	! 
 	! add	src, 4, src
 	! st	%g2, [wtxt+%o4]
 	! 
 	! cmp	%o3, 8
+	
 	! bleu	.Lsource_loop
 	! add	%o3, 4, %o3
 
 	mov	16, round
+	! add	ctx, 16, key
 
 .Lround_loop:
 	! 4*i
@@ -121,12 +135,9 @@ _aes_crypt:
 	
 	! AES_SIDX1
 	ld	[idx-32], t1		! 1
-
 	! AES_SIDX2
-	! ld	[idx-16], t2		! 2
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
-	
 	! wtxt[IDX1...]
 	add	wtxt, t1, t1		! 1
 	ldub	[t1+2], t1		! 1
@@ -134,56 +145,44 @@ _aes_crypt:
 	! AES_SIDX3
 	ld	[idx], t3		! 3
 	sll	t1, 2, t1		! 1
-	
 	! wtxt[i]
 	ld	[wtxt+i], t0		! 0
-	
 	! wtxt[IDX2...]
 	lduh	[wtxt+t2], t2		! 2
 	
 	and	t0, 255, t0		! 0
-
 	! wtxt[IDX3...]
 	ldub	[wtxt+t3], t3		! 3
-	
 	sll	t0, 2, t0		! 0
-	add	t0, AES_TABLE0, t0	! 0
-	ld	[T+t0], t0		! 0
+	ld	[T0+t0], t0		! 0
 
-	add	t1, AES_TABLE1, t1	! 1
 	and	t2, 255, t2		! 2
-	ld	[T+t1], t1		! 1
+	ld	[T1+t1], t1		! 1
 	sll	t2, 2, t2		! 2
-	add	t2, AES_TABLE2, t2	! 2
-	ld	[T+t2], t2		! 2
+	ld	[T2+t2], t2		! 2
+
 	sll	t3, 2, t3		! 3
-	add	t3, AES_TABLE3, t3	! 3
-	ld	[T+t3], t3		! 3
+	ld	[T3+t3], t3		! 3
 	xor	t0, t1, t0		! 0, 1
 	xor	t0, t2, t0		! 0, 1, 2
 
 	add	idx, 4, idx		
-
 	! Fetch roundkey
-	! FIXME: We could save one instruction
-	! if we kept a pointer to the current subkey,
-	! indexed by i.
 	ld	[ctx+round], t1
 	add	round, 4, round
-	
+	! ld	[key+i], t1
 	xor	t0, t3, t0		! 0, 1, 2, 3
 
 	xor	t0, t1, t0
 	st	t0, [tmp+i]
-
 	cmp	i, 8
-
 	bleu	.Linner_loop
+	
 	add	i, 4, i
 	
 	! switch roles for tmp and wtxt
 	xor	wtxt, diff, wtxt
-
+	! add	key, 16, key
 	cmp	round, nrounds
 	blu	.Lround_loop
 	xor	tmp, diff, tmp
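
The point of the new T0, T1, T2, T3 invariants is that the table base
addresses T + AES_TABLEn are now computed once per block, so each
lookup in the round loop is a single ld instead of an add followed by
an ld.  The commented-out key lines sketch the same idea for the
subkeys: a pointer stepped by 16 every round would let the round key
be fetched as ld [key+i], with no separate round offset needed.  Per
column, the inner loop then computes something along the lines of the
C sketch below; the table layout and index arithmetic are the
conventional byte-rotated ones and are assumed here rather than taken
from this patch.

	#include <stdint.h>

	/* One column of the table-driven round, with the four table base
	   pointers hoisted out of the loop (the T0..T3 invariants) and the
	   subkey fetched through a moving key pointer.  Names are
	   illustrative. */
	static void
	round_column(uint32_t *dst, const uint32_t *w,
		     const uint32_t *T0, const uint32_t *T1,
		     const uint32_t *T2, const uint32_t *T3,
		     const uint32_t *key, unsigned i)
	{
	  dst[i] = T0[  w[i]                & 0xff]
		 ^ T1[ (w[(i+1) & 3] >>  8) & 0xff]
		 ^ T2[ (w[(i+2) & 3] >> 16) & 0xff]
		 ^ T3[ (w[(i+3) & 3] >> 24) & 0xff]
		 ^ key[i];
	}
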