From 280e0fda9f47b51dbda08730bb4eee27be73f50f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Sun, 24 Oct 2004 22:54:38 +0200
Subject: [PATCH] Reverted the latest two changes; update both src and dst
 pointers in the loop, and use plain addb when updating j. The two reverted
 changes slowed the code down on AMD Duron.

Rev: src/nettle/x86/arcfour-crypt.asm:1.10
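
For reference, a minimal C sketch of the loop as this revision leaves
it. This is illustrative only: the function name and the loose S/i/j
parameters are assumptions, not Nettle's actual arcfour interface
(which passes a context struct). The uint8_t j mirrors the plain addb
(the 8-bit wrap-around comes for free), and src and dst advance
independently, matching the restored per-iteration pointer updates.

    #include <stdint.h>
    #include <stddef.h>

    static void
    arcfour_crypt_sketch(uint8_t S[256], uint8_t *ip, uint8_t *jp,
                         size_t length, uint8_t *dst, const uint8_t *src)
    {
      uint8_t i = *ip, j = *jp;
      while (length--)
        {
          uint8_t si, sj;
          i++;                  /* the incl + andl $0xff pair */
          si = S[i];
          j += si;              /* plain addb: byte arithmetic wraps */
          sj = S[j];
          S[i] = sj;            /* swap S[i] and S[j] */
          S[j] = si;
          /* key byte is S[(si + sj) mod 256]; both pointers advance */
          *dst++ = *src++ ^ S[(uint8_t)(si + sj)];
        }
      *ip = i;
      *jp = j;
    }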
---
 x86/arcfour-crypt.asm | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/x86/arcfour-crypt.asm b/x86/arcfour-crypt.asm
index 43e4f7d3..74dd7763 100644
--- a/x86/arcfour-crypt.asm
+++ b/x86/arcfour-crypt.asm
@@ -47,7 +47,7 @@ C Register usage:
 	C %ebx = j
 	C %cl  = si
 	C %ch  = sj
-	
+
 	movl	24(%esp), %edx		C  length
 	testl	%edx,%edx
 	jz	.Lend
@@ -59,15 +59,15 @@ C Register usage:
 	
 	movzbl  256(%ebp), %eax		C  i
 	movzbl  257(%ebp), %ebx		C  j
-	subl	%esi, %edi
 .Lloop:
 C	incb	%al
 	incl	%eax
 	andl	$0xff, %eax
 	movzbl  (%ebp, %eax), %ecx	C  si. Clears high bytes
-C	addb    %cl, %bl
-	addl	%ecx, %ebx
-	andl	$0xff, %ebx
+	addb    %cl, %bl
+C The addl/andl pair is preferable on PPro and PII, but slows things down on AMD Duron.
+C	addl	%ecx, %ebx
+C	andl	$0xff, %ebx
 	movb    (%ebp, %ebx), %ch	C  sj
 	movb    %ch, (%ebp, %eax)	C  S[i] = sj
 	movb	%cl, (%ebp, %ebx)	C  S[j] = si
@@ -76,8 +76,9 @@ C	addb    %cl, %bl
 					C  for indexing.
 	movb    (%ebp, %ecx), %cl
 	xorb    (%esi), %cl
-	movb    %cl, (%esi,%edi)
 	incl    %esi
+	movb    %cl, (%edi)
+	incl    %edi
 	cmpl	%esi, %edx
 	jne	.Lloop
 
-- 
GitLab