diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm
index dbf438616497635167ccca436847fe1fd89f824e..b5b135cc427685a77e27468942ff0ff0fc8c6f28 100644
--- a/x86/sha1-compress.asm
+++ b/x86/sha1-compress.asm
@@ -252,3 +252,58 @@ _nettle_sha1_compress:
 
 .Leord:
 	.size	_nettle_sha1_compress,.Leord-_nettle_sha1_compress
+
+C  It's possible to shave off half of the stores to tmp in the evaluation of f3,
+C  although it's probably not worth the effort. This is the trick:
+C  
+C  round(a,b,c,d,e,f,k) modifies only b,e.
+C  
+C  round(a,b,c,d,e,f3,k) load + store
+C  round(e,a,b,c,d,f3,k) load + store
+C  
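+C  As a rough C sketch (not Nettle's actual round macro; w denotes the
+C  expanded message word and ROTL32 a 32-bit left rotate), a round
+C  using f3 does
+C
+C    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+C
+C    e += ROTL32(a, 5) + ((b & c) | (d & (b | c))) + k + w;
+C    b  = ROTL32(b, 30);
+C
+C  so a, c and d are only read.
+C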
+C  ; f3(b,c,d) = (b & c) | (d & (b | c))
+C  
+C    movl b, tmp
+C    andl c, tmp
+C    movl tmp, tmp2
+C    movl b, tmp
+C    orl  c, tmp
+C    andl d, tmp
+C    orl tmp2, tmp
+C  
+C  and corresponding code for f3(a,b,c)
+C  
+C  Use the register allocated for c as a temporary?
+C  
+C    movl c, tmp2
+C  ; f3(b,c,d) = (b & c) | (d & (b | c))
+C    movl b, tmp
+C    orl  c, tmp
+C    andl b, c
+C    andl d, tmp
+C    orl  c, tmp
+C  
+C  ; f3(a,b,c) = (a & b) | (c & (a | b))
+C    movl b, tmp
+C    andl a, tmp
+C    movl a, c
+C    orl  b, c
+C    andl tmp2, c
+C    orl  c, tmp
+C  
+C    movl tmp2, c
+C  
+C  Before: 14 instr, 2 store, 2 load
+C  After: 13 instr, 1 store, 2 load
+C  
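+C  A self-contained C check of the two sequences above (hypothetical,
+C  not part of the assembly; it only verifies the boolean evaluations,
+C  ignoring the rest of each round):
+C
+C    #include <assert.h>
+C    #include <stdint.h>
+C
+C    static uint32_t f3(uint32_t x, uint32_t y, uint32_t z)
+C    { return (x & y) | (z & (x | y)); }
+C
+C    int main(void)
+C    {
+C      /* arbitrary sample values */
+C      uint32_t a = 0x67452301, b = 0xefcdab89,
+C               c = 0x98badcfe, d = 0x10325476;
+C      uint32_t tmp, tmp2;
+C
+C      /* f3(b,c,d), using c's register as scratch */
+C      tmp2 = c;            /* movl c, tmp2 */
+C      tmp = b; tmp |= c;   /* movl b, tmp; orl c, tmp */
+C      c &= b;              /* andl b, c */
+C      tmp &= d;            /* andl d, tmp */
+C      tmp |= c;            /* orl  c, tmp */
+C      assert(tmp == f3(b, tmp2, d));
+C
+C      /* f3(a,b,c), with the original c taken from tmp2 */
+C      tmp = b; tmp &= a;   /* movl b, tmp; andl a, tmp */
+C      c = a; c |= b;       /* movl a, c;   orl  b, c */
+C      c &= tmp2;           /* andl tmp2, c */
+C      tmp |= c;            /* orl  c, tmp */
+C      assert(tmp == f3(a, b, tmp2));
+C
+C      c = tmp2;            /* movl tmp2, c -- restore c */
+C      return 0;
+C    }
+C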
+C  Final load can be folded into the next round,
+C  
+C  round(d,e,a,b,c,f3,k)
+C  
+C    c += (d <<< 5) + f(e, a, b) + k + w
+C  
+C  if we arrange to have w placed directly into the register allocated
+C  for c. That way we save one more instruction, for a total saving of
+C  two instructions per two rounds, one of which is a store. For the
+C  twenty rounds involving f3, that's 20 instructions, 10 of which are
+C  stores, or about 1.5 %.
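+C
+C  In the same pseudo-code style (and assuming the saved c is still
+C  sitting in tmp2), the folded round would accumulate roughly as
+C
+C    c  = w              ; message expansion leaves w in c's register
+C    c += k
+C    c += f3(e, a, b)
+C    c += d <<< 5
+C    c += tmp2           ; adds back the saved c, replacing the restore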