From 7a70cc7a7ee328fc8bcf4cca238d3ec422cb2517 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Mon, 25 Feb 2002 17:00:37 +0100
Subject: [PATCH] (_aes_crypt): Some peephole optimizations, duplicating some
 instructions to fill nops, and putting branch instructions on even word
 addresses.

Rev: src/nettle/sparc/aes.asm:1.66
---
 sparc/aes.asm | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/sparc/aes.asm b/sparc/aes.asm
index e49a195d..c2420ba7 100644
--- a/sparc/aes.asm
+++ b/sparc/aes.asm
@@ -69,12 +69,12 @@ _aes_crypt:
 	add	T, AES_SIDX3, IDX3
 	! Read src, and add initial subkey
 	! Difference between ctx and src.
-	! NOTE: This instruction is duplicated in the delay slot
+	! NOTE: These two instructions are duplicated before the loop branch:
+	! one just before the branch and one in its delay slot
 	sub	ctx, src, %g2
-
-.Lblock_loop:
 	! Difference between wtxt and src
 	sub	wtxt, src, %g3
+.Lblock_loop:
 	! For stop condition. Note that src is incremented in the
 	! delay slot
 	add	src, 8, %g4
@@ -105,6 +105,7 @@ _aes_crypt:
 	add	ctx, 16, key
 .Lround_loop:
 	! 4*i
+	! NOTE: Instruction duplicated in delay slot
 	mov	0, i
 .Linner_loop:
 	! The comments mark which j in T->table[j][ Bj(wtxt[IDXi(i)]) ]
@@ -151,21 +152,23 @@ _aes_crypt:
 	add	i, 4, i
 	! switch roles for tmp and wtxt
 	xor	wtxt, diff, wtxt
+	xor	tmp, diff, tmp
+
 	subcc	round, 1, round
-	
 	add	key, 16, key
-	nop
-	bne	.Lround_loop
-	xor	tmp, diff, tmp
+	bne	.Linner_loop
+	mov	0, i
 
 	! final round
-	! 4*i
-	mov	0, i
+	! Use round as the loop variable, as it's already zero
+undefine(<i>)
+define(i, round)
 
-.Lfinal_loop:
 	! Comments mark which j in T->sbox[Bj(wtxt[IDXj(i)])]
 	! the instruction is part of
+	! NOTE: First instruction duplicated in delay slot
 	ld	[IDX1+i], t1 	! 1
+.Lfinal_loop:
 	! IDX2(j) = j XOR 2
 	xor	i, 8, t2
 	! ld	[idx-16], t2	! 2
@@ -205,15 +208,15 @@ _aes_crypt:
 	stb	t3, [dst+3]
 	stb	t2, [dst+2]
 	stb	t0, [dst]
+	add	dst, 4, dst
 	
 	bleu	.Lfinal_loop
-	add	dst, 4, dst
-
+	ld	[IDX1+i], t1 	! 1
 	addcc	length, -16, length
-	nop
+	sub	ctx, src, %g2
 	
 	bne	.Lblock_loop
-	sub	ctx, src, %g2
+	sub	wtxt, src, %g3
 
 .Lend:
 	ret
-- 
GitLab