diff --git a/sparc/aes.asm b/sparc/aes.asm
index 6e0f0b69c6206390e5ace66df7cf3c24236b4e2e..9fce564f473bf51dbc86865296155c70bb10baba 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -18,6 +18,13 @@
 ! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 ! MA 02111-1307, USA.
 
+! FIXME: For improved UltraSPARC performance, we should avoid ALU
+! instructions that use the result of an immediately preceding ALU
+! instruction. It is also a good idea to have a greater distance than
+! one instruction between a load and use of its value, as that reduces
+! the penalty for cache misses. Such instruction sequences are marked
+! with !U comments.
+
 ! NOTE: Some of the %g registers are reserved for operating system etc
 ! (see gcc/config/sparc.h). The only %g registers that seems safe to
 ! use are %g1-%g3.
@@ -28,7 +35,7 @@
 	!			
 	
 	.file	"aes.asm"
-	
+
 ! Arguments
 define(ctx, %i0)
 define(T, %i1)
@@ -71,7 +78,7 @@ define(<AES_LOAD>, <
 	ldub	[src+$1+2], t2
 	sll	t1, 8, t1
 	
-	or	t0, t1, t0	
+	or	t0, t1, t0	! U
 	ldub	[src+$1+3], t1
 	sll	t2, 16, t2
 	or	t0, t2, t0
@@ -98,7 +105,7 @@ define(<AES_ROUND>, <
 	
 	ld	[T0+t0], t0		! 0
 	sll	t1, 2, t1		! 1
-	ld	[T1+t1], t1		! 1
+	ld	[T1+t1], t1		! 1 !U
 	ld	[IDX3+$1], t2		! 3
 	
 	xor	t0, t1, t0		! 0, 1
@@ -107,15 +114,15 @@ define(<AES_ROUND>, <
 	ldub	[wtxt+t2], t2		! 3
 	sll	t1, 2, t1		! 2
 	
-	ld	[T2+t1], t1		! 2
+	ld	[T2+t1], t1		! 2	!U
 	sll	t2, 2, t2		! 3
-	ld	[T3+t2], t2		! 3
+	ld	[T3+t2], t2		! 3	!U
 	xor	t0, t1, t0		! 0, 1, 2
 	
 	! Fetch roundkey
 	ld	[key + $1], t1
 	xor	t0, t2, t0		! 0, 1, 2, 3
-	xor	t0, t1, t0
+	xor	t0, t1, t0		!U
 	st	t0, [tmp + $1]>)dnl
 
 C AES_FINAL_ROUND(i)
@@ -133,7 +140,7 @@ define(<AES_FINAL_ROUND>, <
 	ldub	[T+t1], t1		! 1
 	ld	[IDX3 + $1], t2		! 3
 	sll	t1, 8, t1		! 1
-	or	t0, t1, t0		! 0, 1
+	or	t0, t1, t0		! 0, 1 !U
 	
 	! IDX2(j) = j XOR 2
 	ldub	[wtxt+eval($1 ^ 8)+1], t1	! 2
@@ -142,19 +149,19 @@ define(<AES_FINAL_ROUND>, <
 	ldub	[T+t2], t2		! 3
 	
 	sll	t1, 16, t1		! 2
-	or	t0, t1, t0		! 0, 1, 2
+	or	t0, t1, t0		! 0, 1, 2 !U
 	sll	t2, 24, t2		! 3
 	ld	[key + $1], t1
 	
 	or	t0, t2, t0		! 0, 1, 2, 3
-	xor	t0, t1, t0
-	srl	t0, 24, t1
-	stb	t1, [dst+$1+3]
+	xor	t0, t1, t0		!U
+	srl	t0, 24, t1		!U
+	stb	t1, [dst+$1+3]		!U
 	
 	srl	t0, 16, t1
-	stb	t1, [dst+$1+2]
+	stb	t1, [dst+$1+2]		!U
 	srl	t0, 8, t1
-	stb	t1, [dst+$1+1]
+	stb	t1, [dst+$1+1]		!U
 	
 	stb	t0, [dst+$1]>)dnl