From e997f88b88d901ae99683b88465673d52b26706f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 26 Aug 2009 23:19:39 +0200
Subject: [PATCH] Work in progress checkin, replaced the third set of rounds.

Rev: nettle/x86/sha1-compress.asm:1.7
---
 x86/sha1-compress.asm | 126 +++++++++++++++++-------------------------
 1 file changed, 51 insertions(+), 75 deletions(-)

diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm
index 37140f9d..51bee9e0 100644
--- a/x86/sha1-compress.asm
+++ b/x86/sha1-compress.asm
@@ -31,7 +31,7 @@ define(<KVALUE>,<%esi>)				C  Used by rounds
 C Constants
 define(<K1VALUE>, <0x5A827999>)		C  Rounds  0-19
 define(<K2VALUE>, <0x6ED9EBA1>)		C  Rounds 20-39
-define(<K3VALUE>, <<$>0x8F1BBCDC>)		C  Rounds 40-59
+define(<K3VALUE>, <0x8F1BBCDC>)		C  Rounds 40-59
 define(<K4VALUE>, <0xCA62C1D6>)		C  Rounds 60-79
 	
 C Reads the input via T2 into register, byteswaps it, and stores it in the DATA array.
@@ -63,16 +63,11 @@ C The f functions,
 C
 C  f1(x,y,z) = z ^ (x & (y ^ z))
 C  f2(x,y,z) = x ^ y ^ z
-C  f3(x,y,z) = (x & y) | (z & (x | y))
+C  f3(x,y,z) = (x & (y ^ z)) + (y & z)
 C  f4 = f2
-C
-C The macro Fk(x,y,z) computes = fk(x,y,z). 
-C Result is left in T1.
-define(<F1>, <
-	movl	$3, T1
-	xorl	$2, T1
-	andl	$1, T1
-	xorl	$3, T1>)dnl
+
+C This form for f3 was suggested by George Spelvin. The two terms never have a
+C bit in common, so + equals |, and each is added to the result separately.
 
 C The form of one sha1 round is
 C
@@ -87,23 +82,6 @@ C instead get
 C
 C   e += a <<< 5 + f( b, c, d ) + k + w;
 C   b <<<= 30
-C
-C ROUND(a,b,c,d,e,f,w)
-define(<ROUND>, <
-	addl	KVALUE, $5
-	addl	ifelse($7,,T1,$7), $5
-	$6($2,$3,$4)
-	addl	T1, $5
-
-C Using the T1 register can be avoided, by rotating $1 in place,
-C adding, and then rotating back.
-	movl	$1, T1
-	roll	<$>5, T1
-	addl	T1, $5
-	C roll	<$>5, $1
-	C addl	$1, $5
-	C rorl	<$>5, $1
-	roll	<$>30, $2>)dnl
 
 dnl ROUND_F1(a, b, c, d, e, i)
 define(<ROUND_F1>, <
@@ -158,33 +136,32 @@ define(<ROUND_F2>, <
 	add	T2, $5
 >)
 
+dnl ROUND_F3(a, b, c, d, e, i)  Expands w[i] in DATA; e += a <<< 5 + f3(b,c,d) + K3VALUE + w[i]; b <<<= 30. Clobbers T1, T2.
+define(<ROUND_F3>, <
+	mov	OFFSET(eval($6 % 16)) (DATA), T1
+	xor	OFFSET(eval(($6 +  2) % 16)) (DATA), T1
+	xor	OFFSET(eval(($6 +  8) % 16)) (DATA), T1
+	xor	OFFSET(eval(($6 + 13) % 16)) (DATA), T1
+	rol	<$>1, T1
+	mov	T1, OFFSET(eval($6 % 16)) (DATA)
+	mov	$4, T2
+	and	$3, T2
+	lea	K3VALUE (T1, T2), T1
+	mov	$4, T2
+	xor	$3, T2
+	and	$2, T2
+	add	T1, $5
+	rol	<$>30, $2
+	mov	$1, T1
+	rol	<$>5, T1
+	add	T1, $5
+	add	T2, $5
+>)
+
+
 C As suggested by George Spelvin, write the F3 function as
 C (x&y) | (y&z) | (x&z) == (x & (y^z)) + (y&z). Then, we can compute
 C and add each term to e, using a single temporary.
-	
-C ROUND_F3(a,b,c,d,e,w)
-define(<ROUND_F3>, <
-	addl	KVALUE, $5
-	addl	T1, $5
-
-	movl	$3, T1
-	andl	$4, T1
-	addl	T1, $5
-	movl	$3, T1
-	xorl	$4, T1
-	andl	$2, T1
-	addl	T1, $5
-
-C Using the T1 register can be avoided, by rotating $1 in place,
-C adding, and then rotating back.
-	movl	$1, T1
-	roll	<$>5, T1
-	addl	T1, $5
-	C roll	<$>5, $1
-	C addl	$1, $5
-	C rorl	<$>5, $1
-	roll	<$>30, $2>)dnl
-
 
 	.file "sha1-compress.asm"
 
@@ -269,30 +246,29 @@ PROLOGUE(_nettle_sha1_compress)
 	ROUND_F2(SC, SD, SE, SA, SB, 38, K2VALUE)
 	ROUND_F2(SB, SC, SD, SE, SA, 39, K2VALUE)
 
-	movl	K3VALUE, KVALUE
-	EXPAND(40) ROUND_F3(SA, SB, SC, SD, SE)
-	EXPAND(41) ROUND_F3(SE, SA, SB, SC, SD)
-	EXPAND(42) ROUND_F3(SD, SE, SA, SB, SC)
-	EXPAND(43) ROUND_F3(SC, SD, SE, SA, SB)
-	EXPAND(44) ROUND_F3(SB, SC, SD, SE, SA)
-
-	EXPAND(45) ROUND_F3(SA, SB, SC, SD, SE)
-	EXPAND(46) ROUND_F3(SE, SA, SB, SC, SD)
-	EXPAND(47) ROUND_F3(SD, SE, SA, SB, SC)
-	EXPAND(48) ROUND_F3(SC, SD, SE, SA, SB)
-	EXPAND(49) ROUND_F3(SB, SC, SD, SE, SA)
-
-	EXPAND(50) ROUND_F3(SA, SB, SC, SD, SE)
-	EXPAND(51) ROUND_F3(SE, SA, SB, SC, SD)
-	EXPAND(52) ROUND_F3(SD, SE, SA, SB, SC)
-	EXPAND(53) ROUND_F3(SC, SD, SE, SA, SB)
-	EXPAND(54) ROUND_F3(SB, SC, SD, SE, SA)
-
-	EXPAND(55) ROUND_F3(SA, SB, SC, SD, SE)
-	EXPAND(56) ROUND_F3(SE, SA, SB, SC, SD)
-	EXPAND(57) ROUND_F3(SD, SE, SA, SB, SC)
-	EXPAND(58) ROUND_F3(SC, SD, SE, SA, SB)
-	EXPAND(59) ROUND_F3(SB, SC, SD, SE, SA)
+	ROUND_F3(SA, SB, SC, SD, SE, 40)
+	ROUND_F3(SE, SA, SB, SC, SD, 41)
+	ROUND_F3(SD, SE, SA, SB, SC, 42)
+	ROUND_F3(SC, SD, SE, SA, SB, 43)
+	ROUND_F3(SB, SC, SD, SE, SA, 44)
+
+	ROUND_F3(SA, SB, SC, SD, SE, 45)
+	ROUND_F3(SE, SA, SB, SC, SD, 46)
+	ROUND_F3(SD, SE, SA, SB, SC, 47)
+	ROUND_F3(SC, SD, SE, SA, SB, 48)
+	ROUND_F3(SB, SC, SD, SE, SA, 49)
+
+	ROUND_F3(SA, SB, SC, SD, SE, 50)
+	ROUND_F3(SE, SA, SB, SC, SD, 51)
+	ROUND_F3(SD, SE, SA, SB, SC, 52)
+	ROUND_F3(SC, SD, SE, SA, SB, 53)
+	ROUND_F3(SB, SC, SD, SE, SA, 54)
+
+	ROUND_F3(SA, SB, SC, SD, SE, 55)
+	ROUND_F3(SE, SA, SB, SC, SD, 56)
+	ROUND_F3(SD, SE, SA, SB, SC, 57)
+	ROUND_F3(SC, SD, SE, SA, SB, 58)
+	ROUND_F3(SB, SC, SD, SE, SA, 59)
 
  	ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
  	ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
-- 
GitLab