diff --git a/sparc/aes.asm b/sparc/aes.asm
index 79a5d37fde36d13cbefd4a647d11c575b5c62ece..a861b4c18d8fe0587b85f2ed8c749d0535f3cae1 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -24,7 +24,7 @@
 	! Used registers:	%l0,1,2,3,4,5,6,7
 	!			%i0,1,2,3,4,5 (%i6=%fp, %i7 = return)
 	!			%o0,1,2,3,4,5,7 (%o6=%sp)
-	!			%g1,2,3
+	!			%g1,2,3,5
 	
 	.file	"aes.asm"
 	
@@ -42,9 +42,7 @@ define(dst, %i3)
 define(src, %i4)
 
 ! Loop invariants
-! NOTE: We overwrite %fp with the wtxt pointer,
-!       so it must be restored at the end of the function.
-define(wtxt, %fp)
+define(wtxt, %l0)
 define(tmp, %l1)
 define(diff, %l2)
 define(nrounds, %l3)
@@ -57,7 +55,7 @@ define(key, %o4)
 ! Further loop invariants
 define(T0, %l6)
 define(T1, %l7)
-define(T2, %l0)
+define(T2, %g5)
 define(T3, %o7)
 define(IDX1, %i5)
 define(IDX3, %o5)
@@ -81,10 +79,10 @@ _aes_crypt:
 	save	%sp, -FRAME_SIZE, %sp
 	cmp	length, 0
 	be	.Lend
-	add	%fp, -24, tmp
-
-	C NOTE: Over writes %fp
-	add	%fp, -40, wtxt
+	! wtxt
+	add	%fp, -24, wtxt
+	
+	add	%fp, -40, tmp
 	ld	[ctx + AES_NROUNDS], nrounds
 	! Compute xor, so that we can swap efficiently.
 	xor	wtxt, tmp, diff
@@ -101,7 +99,7 @@ _aes_crypt:
 	! Read src, and add initial subkey
 	! Difference between ctx and src.
 	! NOTE: These instructions are duplicated in the delay slot,
-	! and the instruction before the branch
+	! and the instruction before the branch.
 	sub	ctx, src, %g2
 	! Difference between wtxt and src
 	sub	wtxt, src, %g3
@@ -109,7 +107,7 @@ _aes_crypt:
 	! For stop condition. Note that src is incremented in the
 	! delay slot
 	add	src, 8, %g1
-		
+	
 .Lsource_loop:
 	ldub	[src+3], t3
 	ldub	[src+2], t2
@@ -146,53 +144,55 @@ _aes_crypt:
 	! register for that sub-expression. True for j==1,3.
 	
 	ld	[IDX1+i], t1		! 1
+	
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	add	wtxt, t1, t1		! 1
 	ldub	[t1+2], t1		! 1
-	
 	ld	[IDX3+i], t3		! 3
+	
 	sll	t1, 2, t1		! 1
 	ld	[wtxt+i], t0		! 0
 	lduh	[wtxt+t2], t2		! 2
-	
 	and	t0, 255, t0		! 0
+	
 	ldub	[wtxt+t3], t3		! 3
 	sll	t0, 2, t0		! 0
 	ld	[T0+t0], t0		! 0
-	
 	and	t2, 255, t2		! 2
+	
 	ld	[T1+t1], t1		! 1
 	sll	t2, 2, t2		! 2
 	ld	[T2+t2], t2		! 2
-	
 	sll	t3, 2, t3		! 3
+	
 	ld	[T3+t3], t3		! 3
 	xor	t0, t1, t0		! 0, 1
 	xor	t0, t2, t0		! 0, 1, 2
-	
 	! Fetch roundkey
 	ld	[key+i], t1
+	
 	xor	t0, t3, t0		! 0, 1, 2, 3
 	xor	t0, t1, t0
 	st	t0, [tmp+i]
-	
 	cmp	i, 8
+	
 	bleu	.Linner_loop
 	add	i, 4, i
 	! switch roles for tmp and wtxt
 	xor	wtxt, diff, wtxt
-	
 	xor	tmp, diff, tmp
+
 	subcc	round, 1, round
 	add	key, 16, key
 	bne	.Linner_loop
-	
 	mov	0, i
+
 	! final round
 	! Use round as the loop variable, as it's already zero
 undefine(<i>)
 define(i, round)
+
 	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
 	! the instruction is part of
 	! NOTE: First instruction duplicated in delay slot
@@ -246,7 +246,6 @@ define(i, round)
 	bne	.Lblock_loop
 	sub	wtxt, src, %g3
 
-	add	%sp, FRAME_SIZE, %fp
 .Lend:
 	ret
 	restore