From 73176207ef291ffafc8f93a553c4a9225bf6a32b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Sun, 8 Feb 2004 00:59:26 +0100
Subject: [PATCH] Comment on further optimization.

Rev: src/nettle/x86/sha1-compress.asm:1.5
---
 x86/sha1-compress.asm | 55 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm
index dbf43861..b5b135cc 100644
--- a/x86/sha1-compress.asm
+++ b/x86/sha1-compress.asm
@@ -252,3 +252,58 @@ _nettle_sha1_compress:
 .Leord:
 
 .size _nettle_sha1_compress,.Leord-_nettle_sha1_compress
+
+C It's possible to shave off half of the stores to tmp in the evaluation of f3,
+C although it's probably not worth the effort. This is the trick:
+C
+C round(a,b,c,d,e,f,k) modifies only b,e.
+C
+C round(a,b,c,d,e,f3,k) load + store
+C round(e,a,b,c,d,f3,k) load + store
+C
+C ; f3(b,c,d) = (b & c) | (d & (b | c))
+C
+C movl b, tmp
+C andl c, tmp
+C movl tmp, tmp2
+C movl b, tmp
+C orl c, tmp
+C andl d, tmp
+C orl tmp2, tmp
+C
+C and corresponding code for f3(a,b,c)
+C
+C Use the register allocated for c as a temporary?
+C
+C movl c, tmp2
+C ; f3(b,c,d) = (b & c) | (d & (b | c))
+C movl b, tmp
+C orl c, tmp
+C andl b, c
+C andl d, tmp
+C orl c, tmp
+C
+C ; f3(a,b,c) = (a & b) | (c & (a | b))
+C movl b, tmp
+C andl a, tmp
+C movl a, c
+C orl b, c
+C andl tmp2, c
+C orl c, tmp
+C
+C movl tmp2, c
+C
+C Before: 14 instr, 2 store, 2 load
+C After: 13 instr, 1 store, 2 load
+C
+C The final load can be folded into the next round,
+C
+C round(d,e,a,b,c,f3,k)
+C
+C c += d <<< 5 + f(e, a, b) + k + w
+C
+C if we arrange to have w placed directly into the register
+C corresponding to c. That way we save one more instruction, a total
+C saving of two instructions, one of which is a store, per two rounds.
+C For the twenty rounds involving f3, that's 20 instructions, 10 of
+C which are stores, or about 1.5%.
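
As a sanity check of the comment above, here is a small standalone C sketch
(an editorial aid, not part of the patch): it verifies the truth table of
f3(x,y,z) = (x & y) | (z & (x | y)) against the majority function, then
simulates the two instruction sequences that reuse c as a temporary, and
finally the fold of the restoring load into the next round's additions. The
names tmp, tmp2 and w mirror the comment; the concrete values are arbitrary.

/* Sketch only: checks the f3 identity and the register-reuse trick
   from the comment above.  tmp2 plays the role of the memory slot. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* f3 as written in the comment. */
static uint32_t f3(uint32_t x, uint32_t y, uint32_t z)
{
  return (x & y) | (z & (x | y));
}

/* The majority function, for comparison. */
static uint32_t maj(uint32_t x, uint32_t y, uint32_t z)
{
  return (x & y) | (y & z) | (x & z);
}

int main(void)
{
  /* The bits of 0xF0, 0xCC, 0xAA enumerate all 8 input combinations,
     so this one check is an exhaustive truth-table test. */
  assert(f3(0xF0, 0xCC, 0xAA) == maj(0xF0, 0xCC, 0xAA));

  /* Arbitrary round state. */
  uint32_t a = 0x67452301, b = 0xefcdab89, c = 0x98badcfe, d = 0x10325476;
  uint32_t c0 = c;                 /* original value of c */
  uint32_t tmp, tmp2;

  tmp2 = c;                        /* movl c, tmp2   (the one store) */

  /* f3(b,c,d), clobbering c: */
  tmp = b;                         /* movl b, tmp    */
  tmp |= c;                        /* orl  c, tmp    ; b | c */
  c &= b;                          /* andl b, c      ; b & c */
  tmp &= d;                        /* andl d, tmp    ; d & (b | c) */
  tmp |= c;                        /* orl  c, tmp    ; f3(b,c,d) */
  assert(tmp == f3(b, c0, d));

  /* f3(a,b,c), reading the saved c from tmp2: */
  tmp = b;                         /* movl b, tmp    */
  tmp &= a;                        /* andl a, tmp    ; a & b */
  c = a;                           /* movl a, c      */
  c |= b;                          /* orl  b, c      ; a | b */
  c &= tmp2;                       /* andl tmp2, c   ; c & (a | b), one load */
  tmp |= c;                        /* orl  c, tmp    ; f3(a,b,c) */
  assert(tmp == f3(a, b, c0));

  /* The final load, movl tmp2, c, restores c ... */
  c = tmp2;
  assert(c == c0);

  /* ... and can be folded into the next round's sum
     c += d<<<5 + f(e,a,b) + k + w: if the message expansion leaves w
     in c's register, "movl tmp2, c" plus a later "addl w, c" becomes
     the single "addl tmp2, c".  (w's value here is arbitrary.) */
  uint32_t w = 0xdeadbeef;
  uint32_t restored_then_added = c0 + w;   /* movl tmp2, c; addl w, c */
  c = w;                                   /* w produced in c's register */
  c += tmp2;                               /* addl tmp2, c */
  assert(c == restored_then_added);

  printf("f3 trick verified\n");
  return 0;
}

Compiling this with any C99 compiler and running it exercises all the
asserts; the point is only that the dataflow in the comment is sound, not
that this is how the assembly would be scheduled.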