diff --git a/ChangeLog b/ChangeLog
index 41633c25cc3a1bbf0a60567729bba7ef4f8955f0..a0ae4d6f6971b7546b042bc392f99f8cbc5f91df 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,13 @@
 2013-12-03  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/sha1-compress.asm: Reorganized, to get closer to the x86
+	version. No difference in running time (benchmarked on ADM E-350).
+
 	* configure.ac (dummy-dep-files): Don't overwrite any existing
 	dependency files.
 
 	* x86_64/md5-compress.asm: New file, similar to the x86 version.
-	35% speedup.
+	35% speedup (benchmarked on ADM E-350).
 
 2013-11-25  Niels Möller  <nisse@lysator.liu.se>
 
diff --git a/x86_64/sha1-compress.asm b/x86_64/sha1-compress.asm
index 5155683ca332f1016138cb88237f0e337b76c5f7..4bc551fe83c3323a8bb540e555cf1c911fb51b8f 100644
--- a/x86_64/sha1-compress.asm
+++ b/x86_64/sha1-compress.asm
@@ -1,6 +1,6 @@
 C nettle, low-level cryptographics library
 C 
-C Copyright (C) 2004, 2008 Niels Möller
+C Copyright (C) 2004, 2008, 2013 Niels Möller
 C  
 C The nettle library is free software; you can redistribute it and/or modify
 C it under the terms of the GNU Lesser General Public License as published by
@@ -24,8 +24,8 @@ define(<SC>,<%ecx>)dnl
 define(<SD>,<%edx>)dnl
 define(<SE>,<%r9d>)dnl
 define(<DATA>,<%rsp>)dnl
-define(<TMP>,<%r10d>)dnl
-define(<TMP2>,<%r11d>)dnl			C  Used by F3
+define(<T1>,<%r10d>)dnl
+define(<T2>,<%r11d>)dnl
 define(<KVALUE>, <%esi>)dnl			
 
 C Arguments
@@ -46,49 +46,22 @@ define(<SWAP>, <
 	movl	$2, OFFSET($1) (DATA)
 >)dnl
 
-C expand(i) is the expansion function
-C
-C   W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1
-C
-C where W[i] is stored in DATA[i mod 16].
-C
-C Result is stored back in W[i], and also left in TMP, the only
-C register that is used.
-define(<EXPAND>, <
-	movl	OFFSET(eval($1 % 16)) (DATA), TMP
-	xorl	OFFSET(eval(($1 +  2) % 16)) (DATA), TMP
-	xorl	OFFSET(eval(($1 +  8) % 16)) (DATA), TMP
-	xorl	OFFSET(eval(($1 + 13) % 16)) (DATA), TMP
-	roll	<$>1, TMP
-	movl	TMP, OFFSET(eval($1 % 16)) (DATA)>)dnl
-define(<NOEXPAND>, <OFFSET($1) (DATA)>)dnl
-
 C The f functions,
 C
 C  f1(x,y,z) = z ^ (x & (y ^ z))
 C  f2(x,y,z) = x ^ y ^ z
 C  f3(x,y,z) = (x & y) | (z & (x | y))
+C	     = (x & (y ^ z)) + (y & z)
 C  f4 = f2
+
+C This form for f3 was suggested by George Spelvin. The terms can be
+C added into the result one at a time, saving one temporary.
+
+C expand(i) is the expansion function
 C
-C The macro Fk(x,y,z) computes = fk(x,y,z). 
-C Result is left in TMP.
-define(<F1>, <
-	movl	$3, TMP
-	xorl	$2, TMP
-	andl	$1, TMP
-	xorl	$3, TMP>)dnl
-define(<F2>, <
-	movl	$1, TMP
-	xorl	$2, TMP
-	xorl	$3, TMP>)dnl
-C Uses TMP2
-define(<F3>, <
-	movl	$1, TMP2
-	andl	$2, TMP2
-	movl	$1, TMP
-	orl	$2, TMP
-	andl	$3, TMP
-	orl	TMP2, TMP>)dnl
+C   W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1
+C
+C where W[i] is stored in DATA[i mod 16].
 
 C The form of one sha1 round is
 C
@@ -103,20 +76,85 @@ C instead get
 C
 C   e += a <<< 5 + f( b, c, d ) + k + w;
 C   b <<<= 30
-C
-C ROUND(a,b,c,d,e,f,w)
-define(<ROUND>, <
+
+dnl ROUND_F1(a, b, c, d, e, i)
+define(<ROUND_F1>, <
+	movl	OFFSET(eval($6 % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 +  2) % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 +  8) % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 + 13) % 16)) (DATA), T1
+	roll	<$>1, T1
+	movl	T1, OFFSET(eval($6 % 16)) (DATA)
+	movl	$4, T2
+	xorl	$3, T2
+	andl	$2, T2
+	xorl	$4, T2
+	roll	<$>30, $2
+	addl	T1, $5
+	addl	KVALUE, $5
+	movl	$1, T1
+	roll	<$>5, T1
+	addl	T1, $5
+	addl	T2, $5
+>)
+
+dnl ROUND_F1_NOEXP(a, b, c, d, e, i)
+define(<ROUND_F1_NOEXP>, <
+	movl	$4, T2
+	xorl	$3, T2
+	movl	$1, T1
+	andl	$2, T2
+	addl	OFFSET($6) (DATA), $5
+	xorl	$4, T2
+	addl	T2, $5
+	roll	<$>30, $2
+	roll	<$>5, T1
+	addl	T1, $5
 	addl	KVALUE, $5
-	addl	ifelse($7,,TMP,$7), $5
-	$6($2,$3,$4)
-	addl	TMP, $5
+>)
 
-C Using the TMP register could be avoided, by rotating $1 in place,
-C adding, and then rotating back.
-	movl	$1, TMP
-	roll	<$>5, TMP
-	addl	TMP, $5
-	roll	<$>30, $2>)dnl
+dnl ROUND_F2(a, b, c, d, e, i)
+define(<ROUND_F2>, <
+	movl	OFFSET(eval($6 % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 +  2) % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 +  8) % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 + 13) % 16)) (DATA), T1
+	roll	<$>1, T1
+	movl	T1, OFFSET(eval($6 % 16)) (DATA)
+	movl	$4, T2
+	xorl	$3, T2
+	xorl	$2, T2
+	roll	<$>30, $2
+	addl	T1, $5
+	addl	KVALUE, $5
+	movl	$1, T1
+	roll	<$>5, T1
+	addl	T1, $5
+	addl	T2, $5
+>)
+
+dnl ROUND_F3(a, b, c, d, e, i)
+define(<ROUND_F3>, <
+	movl	OFFSET(eval($6 % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 +  2) % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 +  8) % 16)) (DATA), T1
+	xorl	OFFSET(eval(($6 + 13) % 16)) (DATA), T1
+	roll	<$>1, T1
+	movl	T1, OFFSET(eval($6 % 16)) (DATA)
+	movl	$4, T2
+	andl	$3, T2
+	addl	T1, $5
+ 	addl	KVALUE, $5
+	movl	$4, T1
+	xorl	$3, T1
+	andl	$2, T1
+	addl	T2, $5
+	roll	<$>30, $2
+	movl	$1, T2
+	roll	<$>5, T2
+	addl	T1, $5
+	addl	T2, $5
+>)
 
 	.file "sha1-compress.asm"
 
@@ -128,7 +166,7 @@ PROLOGUE(_nettle_sha1_compress)
 	C save all registers that need to be saved
 	W64_ENTRY(2, 0)
 	
-	sub	$68, %rsp	C  %rsp = W
+	sub	$64, %rsp	C  %rsp = W
 
 	C Load and byteswap data
 	SWAP( 0, SA) SWAP( 1, SB) SWAP( 2, SC) SWAP( 3, SD)
@@ -144,104 +182,104 @@ PROLOGUE(_nettle_sha1_compress)
 	movl	16(STATE), SE
 
 	movl	K1VALUE, KVALUE
-	ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND( 0))
-	ROUND(SE, SA, SB, SC, SD, <F1>, NOEXPAND( 1))
-	ROUND(SD, SE, SA, SB, SC, <F1>, NOEXPAND( 2))
-	ROUND(SC, SD, SE, SA, SB, <F1>, NOEXPAND( 3))
-	ROUND(SB, SC, SD, SE, SA, <F1>, NOEXPAND( 4))
+	ROUND_F1_NOEXP(SA, SB, SC, SD, SE,  0)
+	ROUND_F1_NOEXP(SE, SA, SB, SC, SD,  1)
+	ROUND_F1_NOEXP(SD, SE, SA, SB, SC,  2)
+	ROUND_F1_NOEXP(SC, SD, SE, SA, SB,  3)
+	ROUND_F1_NOEXP(SB, SC, SD, SE, SA,  4)
 
-	ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND( 5))
-	ROUND(SE, SA, SB, SC, SD, <F1>, NOEXPAND( 6))
-	ROUND(SD, SE, SA, SB, SC, <F1>, NOEXPAND( 7))
-	ROUND(SC, SD, SE, SA, SB, <F1>, NOEXPAND( 8))
-	ROUND(SB, SC, SD, SE, SA, <F1>, NOEXPAND( 9))
+	ROUND_F1_NOEXP(SA, SB, SC, SD, SE,  5)
+	ROUND_F1_NOEXP(SE, SA, SB, SC, SD,  6)
+	ROUND_F1_NOEXP(SD, SE, SA, SB, SC,  7)
+	ROUND_F1_NOEXP(SC, SD, SE, SA, SB,  8)
+	ROUND_F1_NOEXP(SB, SC, SD, SE, SA,  9)
 
-	ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND(10))
-	ROUND(SE, SA, SB, SC, SD, <F1>, NOEXPAND(11))
-	ROUND(SD, SE, SA, SB, SC, <F1>, NOEXPAND(12))
-	ROUND(SC, SD, SE, SA, SB, <F1>, NOEXPAND(13))
-	ROUND(SB, SC, SD, SE, SA, <F1>, NOEXPAND(14))
+	ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 10)
+	ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 11)
+	ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 12)
+	ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 13)
+	ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 14)
 
-	ROUND(SA, SB, SC, SD, SE, <F1>, NOEXPAND(15))
-	EXPAND(16) ROUND(SE, SA, SB, SC, SD, <F1>)
-	EXPAND(17) ROUND(SD, SE, SA, SB, SC, <F1>)
-	EXPAND(18) ROUND(SC, SD, SE, SA, SB, <F1>)
-	EXPAND(19) ROUND(SB, SC, SD, SE, SA, <F1>)
+	ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 15)
+	ROUND_F1(SE, SA, SB, SC, SD, 16)
+	ROUND_F1(SD, SE, SA, SB, SC, 17)
+	ROUND_F1(SC, SD, SE, SA, SB, 18)
+	ROUND_F1(SB, SC, SD, SE, SA, 19)
 
 	movl	K2VALUE, KVALUE
-	EXPAND(20) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(21) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(22) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(23) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(24) ROUND(SB, SC, SD, SE, SA, <F2>)
-
-	EXPAND(25) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(26) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(27) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(28) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(29) ROUND(SB, SC, SD, SE, SA, <F2>)
-
-	EXPAND(30) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(31) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(32) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(33) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(34) ROUND(SB, SC, SD, SE, SA, <F2>)
-
-	EXPAND(35) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(36) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(37) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(38) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(39) ROUND(SB, SC, SD, SE, SA, <F2>)
+	ROUND_F2(SA, SB, SC, SD, SE, 20)
+	ROUND_F2(SE, SA, SB, SC, SD, 21)
+	ROUND_F2(SD, SE, SA, SB, SC, 22)
+	ROUND_F2(SC, SD, SE, SA, SB, 23)
+	ROUND_F2(SB, SC, SD, SE, SA, 24)
+				     
+	ROUND_F2(SA, SB, SC, SD, SE, 25)
+	ROUND_F2(SE, SA, SB, SC, SD, 26)
+	ROUND_F2(SD, SE, SA, SB, SC, 27)
+	ROUND_F2(SC, SD, SE, SA, SB, 28)
+	ROUND_F2(SB, SC, SD, SE, SA, 29)
+				     
+	ROUND_F2(SA, SB, SC, SD, SE, 30)
+	ROUND_F2(SE, SA, SB, SC, SD, 31)
+	ROUND_F2(SD, SE, SA, SB, SC, 32)
+	ROUND_F2(SC, SD, SE, SA, SB, 33)
+	ROUND_F2(SB, SC, SD, SE, SA, 34)
+				     
+	ROUND_F2(SA, SB, SC, SD, SE, 35)
+	ROUND_F2(SE, SA, SB, SC, SD, 36)
+	ROUND_F2(SD, SE, SA, SB, SC, 37)
+	ROUND_F2(SC, SD, SE, SA, SB, 38)
+	ROUND_F2(SB, SC, SD, SE, SA, 39)
 
 	movl	K3VALUE, KVALUE
-	EXPAND(40) ROUND(SA, SB, SC, SD, SE, <F3>)
-	EXPAND(41) ROUND(SE, SA, SB, SC, SD, <F3>)
-	EXPAND(42) ROUND(SD, SE, SA, SB, SC, <F3>)
-	EXPAND(43) ROUND(SC, SD, SE, SA, SB, <F3>)
-	EXPAND(44) ROUND(SB, SC, SD, SE, SA, <F3>)
-
-	EXPAND(45) ROUND(SA, SB, SC, SD, SE, <F3>)
-	EXPAND(46) ROUND(SE, SA, SB, SC, SD, <F3>)
-	EXPAND(47) ROUND(SD, SE, SA, SB, SC, <F3>)
-	EXPAND(48) ROUND(SC, SD, SE, SA, SB, <F3>)
-	EXPAND(49) ROUND(SB, SC, SD, SE, SA, <F3>)
-
-	EXPAND(50) ROUND(SA, SB, SC, SD, SE, <F3>)
-	EXPAND(51) ROUND(SE, SA, SB, SC, SD, <F3>)
-	EXPAND(52) ROUND(SD, SE, SA, SB, SC, <F3>)
-	EXPAND(53) ROUND(SC, SD, SE, SA, SB, <F3>)
-	EXPAND(54) ROUND(SB, SC, SD, SE, SA, <F3>)
-
-	EXPAND(55) ROUND(SA, SB, SC, SD, SE, <F3>)
-	EXPAND(56) ROUND(SE, SA, SB, SC, SD, <F3>)
-	EXPAND(57) ROUND(SD, SE, SA, SB, SC, <F3>)
-	EXPAND(58) ROUND(SC, SD, SE, SA, SB, <F3>)
-	EXPAND(59) ROUND(SB, SC, SD, SE, SA, <F3>)
+	ROUND_F3(SA, SB, SC, SD, SE, 40)
+	ROUND_F3(SE, SA, SB, SC, SD, 41)
+	ROUND_F3(SD, SE, SA, SB, SC, 42)
+	ROUND_F3(SC, SD, SE, SA, SB, 43)
+	ROUND_F3(SB, SC, SD, SE, SA, 44)
+				     
+	ROUND_F3(SA, SB, SC, SD, SE, 45)
+	ROUND_F3(SE, SA, SB, SC, SD, 46)
+	ROUND_F3(SD, SE, SA, SB, SC, 47)
+	ROUND_F3(SC, SD, SE, SA, SB, 48)
+	ROUND_F3(SB, SC, SD, SE, SA, 49)
+				     
+	ROUND_F3(SA, SB, SC, SD, SE, 50)
+	ROUND_F3(SE, SA, SB, SC, SD, 51)
+	ROUND_F3(SD, SE, SA, SB, SC, 52)
+	ROUND_F3(SC, SD, SE, SA, SB, 53)
+	ROUND_F3(SB, SC, SD, SE, SA, 54)
+				     
+	ROUND_F3(SA, SB, SC, SD, SE, 55)
+	ROUND_F3(SE, SA, SB, SC, SD, 56)
+	ROUND_F3(SD, SE, SA, SB, SC, 57)
+	ROUND_F3(SC, SD, SE, SA, SB, 58)
+	ROUND_F3(SB, SC, SD, SE, SA, 59)
 
 	movl	K4VALUE, KVALUE
-	EXPAND(60) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(61) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(62) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(63) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(64) ROUND(SB, SC, SD, SE, SA, <F2>)
-
-	EXPAND(65) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(66) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(67) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(68) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(69) ROUND(SB, SC, SD, SE, SA, <F2>)
-
-	EXPAND(70) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(71) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(72) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(73) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(74) ROUND(SB, SC, SD, SE, SA, <F2>)
-
-	EXPAND(75) ROUND(SA, SB, SC, SD, SE, <F2>)
-	EXPAND(76) ROUND(SE, SA, SB, SC, SD, <F2>)
-	EXPAND(77) ROUND(SD, SE, SA, SB, SC, <F2>)
-	EXPAND(78) ROUND(SC, SD, SE, SA, SB, <F2>)
-	EXPAND(79) ROUND(SB, SC, SD, SE, SA, <F2>)
+	ROUND_F2(SA, SB, SC, SD, SE, 60)
+	ROUND_F2(SE, SA, SB, SC, SD, 61)
+	ROUND_F2(SD, SE, SA, SB, SC, 62)
+	ROUND_F2(SC, SD, SE, SA, SB, 63)
+	ROUND_F2(SB, SC, SD, SE, SA, 64)
+				     
+	ROUND_F2(SA, SB, SC, SD, SE, 65)
+	ROUND_F2(SE, SA, SB, SC, SD, 66)
+	ROUND_F2(SD, SE, SA, SB, SC, 67)
+	ROUND_F2(SC, SD, SE, SA, SB, 68)
+	ROUND_F2(SB, SC, SD, SE, SA, 69)
+				     
+	ROUND_F2(SA, SB, SC, SD, SE, 70)
+	ROUND_F2(SE, SA, SB, SC, SD, 71)
+	ROUND_F2(SD, SE, SA, SB, SC, 72)
+	ROUND_F2(SC, SD, SE, SA, SB, 73)
+	ROUND_F2(SB, SC, SD, SE, SA, 74)
+				     
+	ROUND_F2(SA, SB, SC, SD, SE, 75)
+	ROUND_F2(SE, SA, SB, SC, SD, 76)
+	ROUND_F2(SD, SE, SA, SB, SC, 77)
+	ROUND_F2(SC, SD, SE, SA, SB, 78)
+	ROUND_F2(SB, SC, SD, SE, SA, 79)
 
 	C Update the state vector
 	addl	SA,   (STATE) 
@@ -250,7 +288,7 @@ PROLOGUE(_nettle_sha1_compress)
 	addl	SD, 12(STATE) 
 	addl	SE, 16(STATE)
 
-	add	$68, %rsp
+	add	$64, %rsp
 	W64_EXIT(2, 0)
 	ret
 EPILOGUE(_nettle_sha1_compress)