diff --git a/x86/sha1-compress.asm b/x86/sha1-compress.asm
index 1e4a91107a42be47f3fb8b10f636867128ecf90b..f6fa347be92105fb24acfd2670ae0d9c0f513fae 100644
--- a/x86/sha1-compress.asm
+++ b/x86/sha1-compress.asm
@@ -25,7 +25,7 @@ define(<SD>,<%edx>)
 define(<SE>,<%ebp>)
 define(<DATA>,<%esp>)
 define(<T1>,<%edi>)
-define(<T2>,<%esi>)				C  Used by SWAP
+define(<T2>,<%esi>)				C  Round temporary, and the input data pointer while loading
 	
 C Constants
 define(<K1VALUE>, <0x5A827999>)		C  Rounds  0-19
@@ -77,8 +77,8 @@ define(<ROUND_F1>, <
 	xor	$3, T2
 	and	$2, T2
 	xor	$4, T2
-	lea	K1VALUE (T1, T2), T2
 	rol	<$>30, $2
+	lea	K1VALUE (T1, $5), $5
 	mov	$1, T1
 	rol	<$>5, T1
 	add	T1, $5
@@ -110,8 +110,8 @@ define(<ROUND_F2>, <
 	mov	$4, T2
 	xor	$3, T2
 	xor	$2, T2
-	lea	$7 (T1, T2), T2
 	rol	<$>30, $2
+	lea	$7 (T1, $5), $5
 	mov	$1, T1
 	rol	<$>5, T1
 	add	T1, $5
@@ -128,14 +128,14 @@ define(<ROUND_F3>, <
 	mov	T1, OFFSET(eval($6 % 16)) (DATA)
 	mov	$4, T2
 	and	$3, T2
-	lea	K3VALUE (T1, T2), T1
-	mov	$4, T2
-	xor	$3, T2
-	and	$2, T2
-	add	T1, $5
+	lea	K3VALUE (T1, $5), $5
+	mov	$4, T1
+	xor	$3, T1
+	and	$2, T1
+	add	T2, $5
 	rol	<$>30, $2
-	mov	$1, T1
-	rol	<$>5, T1
+	mov	$1, T2
+	rol	<$>5, T2
 	add	T1, $5
 	add	T2, $5
 >)
@@ -145,7 +145,7 @@ define(<ROUND_F3>, <
 	C _nettle_sha1_compress(uint32_t *state, uint8_t *data)
 	
 	.text
-	ALIGN(4)
+
 PROLOGUE(_nettle_sha1_compress)
 	C save all registers that need to be saved
 	C 			   88(%esp)  data
@@ -158,117 +158,1360 @@ PROLOGUE(_nettle_sha1_compress)
 
 	subl	$64, %esp	C  %esp = W
 
-	C Load and byteswap data
-	movl	88(%esp), T2
-
-	SWAP( 0, %eax) SWAP( 1, %ebx) SWAP( 2, %ecx) SWAP( 3, %edx)
-	SWAP( 4, %eax) SWAP( 5, %ebx) SWAP( 6, %ecx) SWAP( 7, %edx)
-	SWAP( 8, %eax) SWAP( 9, %ebx) SWAP(10, %ecx) SWAP(11, %edx)
-	SWAP(12, %eax) SWAP(13, %ebx) SWAP(14, %ecx) SWAP(15, %edx)
-
-	C load the state vector
-	movl	84(%esp),T1
-	movl	(T1),   SA
-	movl	4(T1),  SB
-	movl	8(T1),  SC
-	movl	12(T1), SD
-	movl	16(T1), SE
-
-	ROUND_F1_NOEXP(SA, SB, SC, SD, SE,  0)
-	ROUND_F1_NOEXP(SE, SA, SB, SC, SD,  1)
-	ROUND_F1_NOEXP(SD, SE, SA, SB, SC,  2)
-	ROUND_F1_NOEXP(SC, SD, SE, SA, SB,  3)
-	ROUND_F1_NOEXP(SB, SC, SD, SE, SA,  4)
-
-	ROUND_F1_NOEXP(SA, SB, SC, SD, SE,  5)
-	ROUND_F1_NOEXP(SE, SA, SB, SC, SD,  6)
-	ROUND_F1_NOEXP(SD, SE, SA, SB, SC,  7)
-	ROUND_F1_NOEXP(SC, SD, SE, SA, SB,  8)
-	ROUND_F1_NOEXP(SB, SC, SD, SE, SA,  9)
-
-	ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 10)
-	ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 11)
-	ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 12)
-	ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 13)
-	ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 14)
-
-	ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 15)
-	ROUND_F1(SE, SA, SB, SC, SD, 16)
-	ROUND_F1(SD, SE, SA, SB, SC, 17)
-	ROUND_F1(SC, SD, SE, SA, SB, 18)
-	ROUND_F1(SB, SC, SD, SE, SA, 19)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 20, K2VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 21, K2VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 22, K2VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 23, K2VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 24, K2VALUE)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 25, K2VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 26, K2VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 27, K2VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 28, K2VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 29, K2VALUE)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 30, K2VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 31, K2VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 32, K2VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 33, K2VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 34, K2VALUE)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 35, K2VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 36, K2VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 37, K2VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 38, K2VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 39, K2VALUE)
-
-	ROUND_F3(SA, SB, SC, SD, SE, 40)
-	ROUND_F3(SE, SA, SB, SC, SD, 41)
-	ROUND_F3(SD, SE, SA, SB, SC, 42)
-	ROUND_F3(SC, SD, SE, SA, SB, 43)
-	ROUND_F3(SB, SC, SD, SE, SA, 44)
-
-	ROUND_F3(SA, SB, SC, SD, SE, 45)
-	ROUND_F3(SE, SA, SB, SC, SD, 46)
-	ROUND_F3(SD, SE, SA, SB, SC, 47)
-	ROUND_F3(SC, SD, SE, SA, SB, 48)
-	ROUND_F3(SB, SC, SD, SE, SA, 49)
-
-	ROUND_F3(SA, SB, SC, SD, SE, 50)
-	ROUND_F3(SE, SA, SB, SC, SD, 51)
-	ROUND_F3(SD, SE, SA, SB, SC, 52)
-	ROUND_F3(SC, SD, SE, SA, SB, 53)
-	ROUND_F3(SB, SC, SD, SE, SA, 54)
-
-	ROUND_F3(SA, SB, SC, SD, SE, 55)
-	ROUND_F3(SE, SA, SB, SC, SD, 56)
-	ROUND_F3(SD, SE, SA, SB, SC, 57)
-	ROUND_F3(SC, SD, SE, SA, SB, 58)
-	ROUND_F3(SB, SC, SD, SE, SA, 59)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
-
-	ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
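+	C Input load/byteswap, state load and all 80 rounds, loop-mixed to 520
+	C cycles (for the complete function call) on AMD K7.  NOTE(review): the ALIGN(5) below sits after the prologue, inside the executed path - confirm intended.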
+ALIGN(5)
+	mov	88(%esp), T2
+	mov	OFFSET(2)(T2), %ecx
+	mov	OFFSET(0)(T2), %eax
+	bswap	%ecx
+	bswap	%eax
+	mov	%ecx, OFFSET(2) (DATA)
+	mov	%eax, OFFSET(0) (DATA)
+	mov	OFFSET(3)(T2), %edx
+	mov	OFFSET(6)(T2), %ecx
+	mov	OFFSET(4)(T2), %eax
+	mov	OFFSET(1)(T2), %ebx
+	bswap	%ebx
+	bswap	%eax
+	bswap	%ecx
+	mov	%ecx, OFFSET(6) (DATA)
+	mov	%eax, OFFSET(4) (DATA)
+	bswap	%edx
+	mov	%edx, OFFSET(3) (DATA)
+	mov	%ebx, OFFSET(1) (DATA)
+	mov	OFFSET(10)(T2), %ecx
+	mov	OFFSET(8)(T2), %eax
+	mov	OFFSET(7)(T2), %edx
+	bswap	%eax
+	bswap	%edx
+	mov	%edx, OFFSET(7) (DATA)
+	mov	OFFSET(5)(T2), %ebx
+	mov	%eax, OFFSET(8) (DATA)
+	mov	OFFSET(11)(T2), %edx
+	bswap	%ecx
+	bswap	%edx
+	mov	OFFSET(12)(T2), %eax
+	bswap	%ebx
+	mov	%ecx, OFFSET(10) (DATA)
+	mov	%ebx, OFFSET(5) (DATA)
+	mov	%edx, OFFSET(11) (DATA)
+	mov	OFFSET(15)(T2), %edx
+	mov	84(%esp),T1
+	mov	OFFSET(9)(T2), %ebx
+	bswap	%edx
+	bswap	%ebx
+	bswap	%eax
+	mov	OFFSET(14)(T2), %ecx
+	mov	%edx, OFFSET(15) (DATA)
+	bswap	%ecx
+	mov	%ecx, OFFSET(14) (DATA)
+	mov	%ebx, OFFSET(9) (DATA)
+	mov	OFFSET(13)(T2), %ebx
+	mov	12(T1), SD
+	bswap	%ebx
+	mov	%ebx, OFFSET(13) (DATA)
+	mov	8(T1),  SC
+	mov	16(T1), SE
+	mov	4(T1),  SB
+	mov	SD, T2
+	add	OFFSET(0) (DATA), SE
+	xor	SC, T2
+	mov	%eax, OFFSET(12) (DATA)
+	mov	(T1),   SA
+	and	SB, T2
+	xor	SD, T2
+	rol	$30, SB
+	add	T2, SE
+	mov	SA, T1
+	mov	SC, T2
+	add	OFFSET(1) (DATA), SD
+	rol	$5, T1
+	xor	SB, T2
+	and	SA, T2
+	xor	SC, T2
+	lea	K1VALUE (T1, SE), SE
+	add	T2, SD
+	mov	SB, T2
+	rol	$30, SA
+	xor	SA, T2
+	and	SE, T2
+	mov	SE, T1
+	add	OFFSET(2) (DATA), SC
+	rol	$30, SE
+	xor	SB, T2
+	rol	$5, T1
+	lea	K1VALUE (T1, SD), SD
+	mov	SD, T1
+	rol	$5, T1
+	add	T2, SC
+	mov	SA, T2
+	xor	SE, T2
+	lea	K1VALUE (T1, SC), SC
+	and	SD, T2
+	xor	SA, T2
+	add	OFFSET(3) (DATA), SB
+	mov	SC, T1
+	add	T2, SB
+	mov	SE, T2
+	rol	$30, SD
+	xor	SD, T2
+	and	SC, T2
+	rol	$5, T1
+	xor	SE, T2
+	add	OFFSET(4) (DATA), SA
+	lea	K1VALUE (T1, SB), SB
+	add	T2, SA
+	rol	$30, SC
+	mov	SD, T2
+	xor	SC, T2
+	and	SB, T2
+	mov	SB, T1
+	rol	$5, T1
+	add	OFFSET(5) (DATA), SE
+	rol	$30, SB
+	xor	SD, T2
+	add	T2, SE
+	mov	SC, T2
+	xor	SB, T2
+	lea	K1VALUE (T1, SA), SA
+	mov	SA, T1
+	add	OFFSET(6) (DATA), SD
+	and	SA, T2
+	rol	$5, T1
+	xor	SC, T2
+	lea	K1VALUE (T1, SE), SE
+	rol	$30, SA
+	add	T2, SD
+	mov	SB, T2
+	mov	SE, T1
+	xor	SA, T2
+	and	SE, T2
+	rol	$5, T1
+	lea	K1VALUE (T1, SD), SD
+	xor	SB, T2
+	add	OFFSET(7) (DATA), SC
+	rol	$30, SE
+	add	OFFSET(8) (DATA), SB
+	mov	SD, T1
+	add	T2, SC
+	mov	SA, T2
+	xor	SE, T2
+	rol	$5, T1
+	and	SD, T2
+	lea	K1VALUE (T1, SC), SC
+	xor	SA, T2
+	add	T2, SB
+	mov	SE, T2
+	mov	SC, T1
+	rol	$30, SD
+	xor	SD, T2
+	rol	$5, T1
+	lea	K1VALUE (T1, SB), SB
+	and	SC, T2
+	xor	SE, T2
+	add	OFFSET(10) (DATA), SE
+	add	OFFSET(9) (DATA), SA
+	mov	SB, T1
+	add	T2, SA
+	rol	$5, T1
+	lea	K1VALUE (T1, SA), SA
+	mov	SD, T2
+	rol	$30, SC
+	xor	SC, T2
+	and	SB, T2
+	xor	SD, T2
+	rol	$30, SB
+	add	T2, SE
+	mov	SC, T2
+	mov	SA, T1
+	xor	SB, T2
+	add	OFFSET(11) (DATA), SD
+	and	SA, T2
+	rol	$30, SA
+	rol	$5, T1
+	xor	SC, T2
+	lea	K1VALUE (T1, SE), SE
+	add	T2, SD
+	mov	SB, T2
+	xor	SA, T2
+	mov	SE, T1
+	rol	$5, T1
+	and	SE, T2
+	lea	K1VALUE (T1, SD), SD
+	xor	SB, T2
+	add	OFFSET(12) (DATA), SC
+	add	T2, SC
+	rol	$30, SE
+	mov	SA, T2
+	xor	SE, T2
+	mov	SD, T1
+	rol	$5, T1
+	and	SD, T2
+	add	OFFSET(13) (DATA), SB
+	lea	K1VALUE (T1, SC), SC
+	xor	SA, T2
+	add	T2, SB
+	mov	SE, T2
+	rol	$30, SD
+	xor	SD, T2
+	and	SC, T2
+	mov	SC, T1
+	rol	$5, T1
+	rol	$30, SC
+	add	OFFSET(14) (DATA), SA
+	xor	SE, T2
+	add	T2, SA
+	mov	SD, T2
+	xor	SC, T2
+	lea	K1VALUE (T1, SB), SB
+	and	SB, T2
+	mov	SB, T1
+	rol	$5, T1
+	lea	K1VALUE (T1, SA), SA
+	mov	SA, T1
+	xor	SD, T2
+	add	OFFSET(15) (DATA), SE
+	add	T2, SE
+	rol	$5, T1
+	lea	K1VALUE (T1, SE), SE
+	mov	OFFSET(0) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	mov	SC, T2
+	xor	OFFSET(8) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	rol	$30, SB
+	xor	SB, T2
+	and	SA, T2
+	xor	SC, T2
+	rol	$1, T1
+	lea	K1VALUE (T1, T2), T2
+	mov	T1, OFFSET(0) (DATA)
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(1) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	rol	$30, SA
+	add	T2, SD
+	mov	SB, T2
+	xor	SA, T2
+	and	SE, T2
+	xor	OFFSET(9) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	xor	SB, T2
+	rol	$1, T1
+	mov	T1, OFFSET(1) (DATA)
+	lea	K1VALUE (T1, T2), T2
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(2) (DATA), T1
+	xor	OFFSET(4) (DATA), T1
+	rol	$30, SE
+	add	T2, SC
+	mov	SA, T2
+	xor	SE, T2
+	xor	OFFSET(10) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	and	SD, T2
+	rol	$1, T1
+	xor	SA, T2
+	mov	T1, OFFSET(2) (DATA)
+	lea	K1VALUE (T1, T2), T2
+	mov	SC, T1
+	rol	$30, SD
+	rol	$5, T1
+	add	T1, SB
+	add	T2, SB
+	mov	SE, T2
+	mov	OFFSET(3) (DATA), T1
+	xor	SD, T2
+	xor	OFFSET(5) (DATA), T1
+	and	SC, T2
+	xor	SE, T2
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(3) (DATA)
+	lea	K1VALUE (T1, T2), T2
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(4) (DATA), T1
+	xor	OFFSET(6) (DATA), T1
+	rol	$30, SC
+	xor	OFFSET(12) (DATA), T1
+	add	T2, SA
+	xor	OFFSET(1) (DATA), T1
+	mov	SD, T2
+	xor	SC, T2
+	rol	$1, T1
+	xor	SB, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(4) (DATA)
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(5) (DATA), T1
+	add	T2, SE
+	mov	SC, T2
+	xor	OFFSET(7) (DATA), T1
+	rol	$30, SB
+	xor	OFFSET(13) (DATA), T1
+	xor	SB, T2
+	xor	OFFSET(2) (DATA), T1
+	xor	SA, T2
+	rol	$1, T1
+	mov	T1, OFFSET(5) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(6) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	add	T2, SD
+	rol	$30, SA
+	xor	OFFSET(14) (DATA), T1
+	mov	SB, T2
+	xor	OFFSET(3) (DATA), T1
+	xor	SA, T2
+	rol	$1, T1
+	xor	SE, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(6) (DATA)
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	add	T2, SC
+	mov	SA, T2
+	rol	$30, SE
+	mov	OFFSET(7) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	xor	SE, T2
+	xor	OFFSET(15) (DATA), T1
+	xor	OFFSET(4) (DATA), T1
+	xor	SD, T2
+	rol	$1, T1
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(7) (DATA)
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(8) (DATA), T1
+	xor	OFFSET(10) (DATA), T1
+	add	T2, SB
+	rol	$30, SD
+	mov	SE, T2
+	xor	OFFSET(0) (DATA), T1
+	xor	OFFSET(5) (DATA), T1
+	xor	SD, T2
+	xor	SC, T2
+	rol	$1, T1
+	mov	T1, OFFSET(8) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(9) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	add	T2, SA
+	xor	OFFSET(6) (DATA), T1
+	mov	SD, T2
+	rol	$1, T1
+	rol	$30, SC
+	xor	SC, T2
+	mov	T1, OFFSET(9) (DATA)
+	xor	SB, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(10) (DATA), T1
+	xor	OFFSET(12) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	add	T2, SE
+	mov	SC, T2
+	rol	$30, SB
+	xor	OFFSET(7) (DATA), T1
+	xor	SB, T2
+	rol	$1, T1
+	xor	SA, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(10) (DATA)
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(11) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	rol	$30, SA
+	xor	OFFSET(3) (DATA), T1
+	add	T2, SD
+	xor	OFFSET(8) (DATA), T1
+	mov	SB, T2
+	xor	SA, T2
+	rol	$1, T1
+	mov	T1, OFFSET(11) (DATA)
+	xor	SE, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(12) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	rol	$30, SE
+	add	T2, SC
+	xor	OFFSET(4) (DATA), T1
+	mov	SA, T2
+	xor	OFFSET(9) (DATA), T1
+	xor	SE, T2
+	rol	$1, T1
+	xor	SD, T2
+	mov	T1, OFFSET(12) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	rol	$30, SD
+	mov	OFFSET(13) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	add	T2, SB
+	mov	SE, T2
+	xor	OFFSET(5) (DATA), T1
+	xor	SD, T2
+	xor	OFFSET(10) (DATA), T1
+	xor	SC, T2
+	rol	$1, T1
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(13) (DATA)
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	add	T2, SA
+	mov	SD, T2
+	mov	OFFSET(14) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	rol	$30, SC
+	xor	OFFSET(6) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	xor	SC, T2
+	xor	SB, T2
+	rol	$1, T1
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(14) (DATA)
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(15) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	add	T2, SE
+	mov	SC, T2
+	rol	$30, SB
+	xor	SB, T2
+	xor	OFFSET(7) (DATA), T1
+	xor	OFFSET(12) (DATA), T1
+	xor	SA, T2
+	rol	$1, T1
+	mov	T1, OFFSET(15) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(0) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	add	T2, SD
+	mov	SB, T2
+	rol	$30, SA
+	xor	SA, T2
+	xor	OFFSET(13) (DATA), T1
+	rol	$1, T1
+	xor	SE, T2
+	mov	T1, OFFSET(0) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(1) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	add	T2, SC
+	mov	SA, T2
+	rol	$30, SE
+	xor	SE, T2
+	xor	OFFSET(9) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	rol	$1, T1
+	xor	SD, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(1) (DATA)
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(2) (DATA), T1
+	rol	$30, SD
+	xor	OFFSET(4) (DATA), T1
+	add	T2, SB
+	mov	SE, T2
+	xor	OFFSET(10) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	xor	SD, T2
+	xor	SC, T2
+	rol	$1, T1
+	mov	T1, OFFSET(2) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(3) (DATA), T1
+	xor	OFFSET(5) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	add	T2, SA
+	rol	$30, SC
+	mov	SD, T2
+	xor	SC, T2
+	rol	$1, T1
+	xor	SB, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(3) (DATA)
+	mov	SA, T1
+	rol	$5, T1
+	rol	$30, SB
+	add	T1, SE
+	mov	OFFSET(4) (DATA), T1
+	add	T2, SE
+	xor	OFFSET(6) (DATA), T1
+	xor	OFFSET(12) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	mov	SC, T2
+	xor	SB, T2
+	rol	$1, T1
+	xor	SA, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	T1, OFFSET(4) (DATA)
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	add	T2, SD
+	mov	OFFSET(5) (DATA), T1
+	mov	SB, T2
+	rol	$30, SA
+	xor	SA, T2
+	xor	SE, T2
+	xor	OFFSET(7) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(5) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(6) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	add	T2, SC
+	xor	OFFSET(14) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(6) (DATA)
+	mov	SA, T2
+	rol	$30, SE
+	xor	SE, T2
+	xor	SD, T2
+	lea	K2VALUE (T1, T2), T2
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	add	T2, SB
+	mov	OFFSET(7) (DATA), T1
+	mov	SE, T2
+	rol	$30, SD
+	xor	OFFSET(9) (DATA), T1
+	xor	SD, T2
+	xor	SC, T2
+	xor	OFFSET(15) (DATA), T1
+	xor	OFFSET(4) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(7) (DATA)
+	lea	K2VALUE (T1, T2), T2
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(8) (DATA), T1
+	xor	OFFSET(10) (DATA), T1
+	rol	$30, SC
+	xor	OFFSET(0) (DATA), T1
+	add	T2, SA
+	mov	SD, T2
+	xor	OFFSET(5) (DATA), T1
+	rol	$1, T1
+	and	SC, T2
+	mov	T1, OFFSET(8) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SE
+	mov	SA, T1
+	mov	SD, T2
+	xor	SC, T2
+	and	SB, T2
+	rol	$30, SB
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(9) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	add	T2, SE
+	mov	SC, T2
+	xor	OFFSET(6) (DATA), T1
+	rol	$1, T1
+	and	SB, T2
+	mov	T1, OFFSET(9) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SD
+	mov	SC, T2
+	xor	SB, T2
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(10) (DATA), T1
+	and	SA, T2
+	add	T2, SD
+	xor	OFFSET(12) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	rol	$30, SA
+	mov	SB, T2
+	and	SA, T2
+	xor	OFFSET(7) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(10) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SC
+	mov	SD, T1
+	rol	$5, T1
+	mov	SB, T2
+	add	T1, SC
+	mov	OFFSET(11) (DATA), T1
+	xor	SA, T2
+	xor	OFFSET(13) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	and	SE, T2
+	xor	OFFSET(8) (DATA), T1
+	add	T2, SC
+	rol	$1, T1
+	mov	SA, T2
+	mov	T1, OFFSET(11) (DATA)
+	rol	$30, SE
+	and	SE, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SB
+	mov	SA, T2
+	mov	SC, T1
+	xor	SE, T2
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(12) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	xor	OFFSET(4) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	and	SD, T2
+	rol	$30, SD
+	add	T2, SB
+	rol	$1, T1
+	mov	T1, OFFSET(12) (DATA)
+	mov	SE, T2
+	and	SD, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SA
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(13) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	mov	SE, T2
+	xor	OFFSET(5) (DATA), T1
+	xor	SD, T2
+	and	SC, T2
+	xor	OFFSET(10) (DATA), T1
+	add	T2, SA
+	rol	$1, T1
+	rol	$30, SC
+	mov	T1, OFFSET(13) (DATA)
+	mov	SD, T2
+	and	SC, T2
+	lea	K3VALUE (T1, T2), T1
+	mov	SD, T2
+	add	T1, SE
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(14) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	xor	SC, T2
+	and	SB, T2
+	xor	OFFSET(6) (DATA), T1
+	rol	$30, SB
+	xor	OFFSET(11) (DATA), T1
+	rol	$1, T1
+	add	T2, SE
+	mov	SC, T2
+	mov	T1, OFFSET(14) (DATA)
+	and	SB, T2
+	lea	K3VALUE (T1, T2), T1
+	mov	SC, T2
+	add	T1, SD
+	mov	SE, T1
+	xor	SB, T2
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(15) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	and	SA, T2
+	xor	OFFSET(7) (DATA), T1
+	xor	OFFSET(12) (DATA), T1
+	add	T2, SD
+	rol	$30, SA
+	mov	SB, T2
+	rol	$1, T1
+	mov	T1, OFFSET(15) (DATA)
+	and	SA, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SC
+	mov	SD, T1
+	mov	SB, T2
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(0) (DATA), T1
+	xor	SA, T2
+	xor	OFFSET(2) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	and	SE, T2
+	add	T2, SC
+	rol	$30, SE
+	rol	$1, T1
+	mov	T1, OFFSET(0) (DATA)
+	mov	SA, T2
+	and	SE, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SB
+	mov	SC, T1
+	mov	SA, T2
+	xor	SE, T2
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(1) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	and	SD, T2
+	xor	OFFSET(14) (DATA), T1
+	add	T2, SB
+	rol	$30, SD
+	mov	SE, T2
+	rol	$1, T1
+	and	SD, T2
+	mov	T1, OFFSET(1) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SA
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	SE, T2
+	mov	OFFSET(2) (DATA), T1
+	xor	SD, T2
+	xor	OFFSET(4) (DATA), T1
+	xor	OFFSET(10) (DATA), T1
+	and	SC, T2
+	add	T2, SA
+	xor	OFFSET(15) (DATA), T1
+	rol	$30, SC
+	mov	SD, T2
+	rol	$1, T1
+	mov	T1, OFFSET(2) (DATA)
+	and	SC, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SE
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(3) (DATA), T1
+	xor	OFFSET(5) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	mov	SD, T2
+	rol	$1, T1
+	xor	SC, T2
+	and	SB, T2
+	mov	T1, OFFSET(3) (DATA)
+	rol	$30, SB
+	add	T2, SE
+	mov	SC, T2
+	and	SB, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SD
+	mov	SE, T1
+	mov	SC, T2
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(4) (DATA), T1
+	xor	OFFSET(6) (DATA), T1
+	xor	SB, T2
+	and	SA, T2
+	add	T2, SD
+	mov	SB, T2
+	xor	OFFSET(12) (DATA), T1
+	rol	$30, SA
+	and	SA, T2
+	xor	OFFSET(1) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(4) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SC
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(5) (DATA), T1
+	xor	OFFSET(7) (DATA), T1
+	mov	SB, T2
+	xor	OFFSET(13) (DATA), T1
+	xor	SA, T2
+	xor	OFFSET(2) (DATA), T1
+	and	SE, T2
+	rol	$30, SE
+	add	T2, SC
+	rol	$1, T1
+	mov	SA, T2
+	mov	T1, OFFSET(5) (DATA)
+	and	SE, T2
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SB
+	mov	SA, T2
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	xor	SE, T2
+	and	SD, T2
+	mov	OFFSET(6) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	rol	$1, T1
+	add	T2, SB
+	rol	$30, SD
+	mov	SE, T2
+	and	SD, T2
+	mov	T1, OFFSET(6) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SA
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(7) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	mov	SE, T2
+	xor	SD, T2
+	xor	OFFSET(15) (DATA), T1
+	and	SC, T2
+	rol	$30, SC
+	add	T2, SA
+	mov	SD, T2
+	xor	OFFSET(4) (DATA), T1
+	rol	$1, T1
+	and	SC, T2
+	mov	T1, OFFSET(7) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SE
+	mov	SA, T1
+	rol	$5, T1
+	mov	SD, T2
+	add	T1, SE
+	mov	OFFSET(8) (DATA), T1
+	xor	OFFSET(10) (DATA), T1
+	xor	SC, T2
+	xor	OFFSET(0) (DATA), T1
+	and	SB, T2
+	add	T2, SE
+	xor	OFFSET(5) (DATA), T1
+	rol	$30, SB
+	mov	SC, T2
+	and	SB, T2
+	rol	$1, T1
+	mov	T1, OFFSET(8) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SD
+	mov	SE, T1
+	rol	$5, T1
+	mov	SC, T2
+	xor	SB, T2
+	add	T1, SD
+	and	SA, T2
+	mov	OFFSET(9) (DATA), T1
+	rol	$30, SA
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	add	T2, SD
+	mov	SB, T2
+	and	SA, T2
+	xor	OFFSET(6) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(9) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	add	T1, SC
+	mov	SD, T1
+	rol	$5, T1
+	mov	SB, T2
+	xor	SA, T2
+	and	SE, T2
+	add	T1, SC
+	mov	OFFSET(10) (DATA), T1
+	xor	OFFSET(12) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	add	T2, SC
+	mov	SA, T2
+	rol	$30, SE
+	xor	OFFSET(7) (DATA), T1
+	rol	$1, T1
+	and	SE, T2
+	mov	T1, OFFSET(10) (DATA)
+	lea	K3VALUE (T1, T2), T1
+	mov	SA, T2
+	xor	SE, T2
+	add	T1, SB
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(11) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	and	SD, T2
+	add	T2, SB
+	mov	SE, T2
+	rol	$1, T1
+	mov	T1, OFFSET(11) (DATA)
+	rol	$30, SD
+	and	SD, T2
+	lea	K3VALUE (T1, T2), T1
+	mov	SE, T2
+	add	T1, SA
+	xor	SD, T2
+	mov	SB, T1
+	and	SC, T2
+	rol	$30, SC
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(12) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	add	T2, SA
+	mov	SD, T2
+	xor	OFFSET(4) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(12) (DATA)
+	xor	SC, T2
+	xor	SB, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(13) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	add	T2, SE
+	rol	$30, SB
+	mov	SC, T2
+	xor	OFFSET(5) (DATA), T1
+	xor	SB, T2
+	xor	OFFSET(10) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(13) (DATA)
+	xor	SA, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(14) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	rol	$30, SA
+	add	T2, SD
+	mov	SB, T2
+	xor	SA, T2
+	xor	SE, T2
+	xor	OFFSET(6) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(14) (DATA)
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	add	T2, SC
+	mov	OFFSET(15) (DATA), T1
+	mov	SA, T2
+	rol	$30, SE
+	xor	OFFSET(1) (DATA), T1
+	xor	OFFSET(7) (DATA), T1
+	xor	SE, T2
+	xor	SD, T2
+	xor	OFFSET(12) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(15) (DATA)
+	lea	K4VALUE (T1, T2), T2
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(0) (DATA), T1
+	add	T2, SB
+	xor	OFFSET(2) (DATA), T1
+	mov	SE, T2
+	rol	$30, SD
+	xor	OFFSET(8) (DATA), T1
+	xor	SD, T2
+	xor	OFFSET(13) (DATA), T1
+	xor	SC, T2
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(0) (DATA)
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(1) (DATA), T1
+	rol	$30, SC
+	xor	OFFSET(3) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	add	T2, SA
+	mov	SD, T2
+	xor	SC, T2
+	rol	$1, T1
+	xor	SB, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(1) (DATA)
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(2) (DATA), T1
+	rol	$30, SB
+	xor	OFFSET(4) (DATA), T1
+	add	T2, SE
+	mov	SC, T2
+	xor	SB, T2
+	xor	OFFSET(10) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	xor	SA, T2
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(2) (DATA)
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	mov	OFFSET(3) (DATA), T1
+	xor	OFFSET(5) (DATA), T1
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	rol	$30, SA
+	add	T2, SD
+	mov	SB, T2
+	rol	$1, T1
+	mov	T1, OFFSET(3) (DATA)
+	xor	SA, T2
+	xor	SE, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(4) (DATA), T1
+	add	T2, SC
+	rol	$30, SE
+	xor	OFFSET(6) (DATA), T1
+	mov	SA, T2
+	xor	OFFSET(12) (DATA), T1
+	xor	SE, T2
+	xor	OFFSET(1) (DATA), T1
+	rol	$1, T1
+	xor	SD, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(4) (DATA)
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	rol	$30, SD
+	mov	OFFSET(5) (DATA), T1
+	add	T2, SB
+	xor	OFFSET(7) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	mov	SE, T2
+	xor	SD, T2
+	xor	OFFSET(2) (DATA), T1
+	xor	SC, T2
+	rol	$1, T1
+	mov	T1, OFFSET(5) (DATA)
+	lea	K4VALUE (T1, T2), T2
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(6) (DATA), T1
+	xor	OFFSET(8) (DATA), T1
+	xor	OFFSET(14) (DATA), T1
+	add	T2, SA
+	xor	OFFSET(3) (DATA), T1
+	mov	SD, T2
+	rol	$30, SC
+	rol	$1, T1
+	xor	SC, T2
+	mov	T1, OFFSET(6) (DATA)
+	xor	SB, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	add	T2, SE
+	mov	OFFSET(7) (DATA), T1
+	xor	OFFSET(9) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	rol	$30, SB
+	xor	OFFSET(4) (DATA), T1
+	mov	SC, T2
+	rol	$1, T1
+	mov	T1, OFFSET(7) (DATA)
+	xor	SB, T2
+	xor	SA, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	rol	$30, SA
+	mov	OFFSET(8) (DATA), T1
+	xor	OFFSET(10) (DATA), T1
+	add	T2, SD
+	xor	OFFSET(0) (DATA), T1
+	xor	OFFSET(5) (DATA), T1
+	rol	$1, T1
+	mov	SB, T2
+	mov	T1, OFFSET(8) (DATA)
+	xor	SA, T2
+	xor	SE, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	add	T2, SC
+	mov	SA, T2
+	mov	OFFSET(9) (DATA), T1
+	rol	$30, SE
+	xor	OFFSET(11) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	xor	OFFSET(6) (DATA), T1
+	xor	SE, T2
+	xor	SD, T2
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(9) (DATA)
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	rol	$30, SD
+	mov	OFFSET(10) (DATA), T1
+	xor	OFFSET(12) (DATA), T1
+	xor	OFFSET(2) (DATA), T1
+	add	T2, SB
+	mov	SE, T2
+	xor	SD, T2
+	xor	SC, T2
+	xor	OFFSET(7) (DATA), T1
+	rol	$1, T1
+	mov	T1, OFFSET(10) (DATA)
+	lea	K4VALUE (T1, T2), T2
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	mov	OFFSET(11) (DATA), T1
+	xor	OFFSET(13) (DATA), T1
+	xor	OFFSET(3) (DATA), T1
+	add	T2, SA
+	mov	SD, T2
+	rol	$30, SC
+	xor	SC, T2
+	xor	OFFSET(8) (DATA), T1
+	rol	$1, T1
+	xor	SB, T2
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(11) (DATA)
+	mov	SA, T1
+	rol	$5, T1
+	add	T1, SE
+	mov	OFFSET(12) (DATA), T1
+	add	T2, SE
+	xor	OFFSET(14) (DATA), T1
+	rol	$30, SB
+	mov	SC, T2
+	xor	OFFSET(4) (DATA), T1
+	xor	SB, T2
+	xor	SA, T2
+	xor	OFFSET(9) (DATA), T1
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(12) (DATA)
+	mov	SE, T1
+	rol	$5, T1
+	add	T1, SD
+	add	T2, SD
+	rol	$30, SA
+	mov	OFFSET(13) (DATA), T1
+	xor	OFFSET(15) (DATA), T1
+	mov	SB, T2
+	xor	OFFSET(5) (DATA), T1
+	xor	SA, T2
+	xor	OFFSET(10) (DATA), T1
+	xor	SE, T2
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(13) (DATA)
+	mov	SD, T1
+	rol	$5, T1
+	add	T1, SC
+	mov	OFFSET(14) (DATA), T1
+	xor	OFFSET(0) (DATA), T1
+	xor	OFFSET(6) (DATA), T1
+	add	T2, SC
+	rol	$30, SE
+	mov	SA, T2
+	xor	SE, T2
+	xor	OFFSET(11) (DATA), T1
+	xor	SD, T2
+	rol	$1, T1
+	lea	K4VALUE (T1, T2), T2
+	mov	T1, OFFSET(14) (DATA)
+	mov	SC, T1
+	rol	$5, T1
+	add	T1, SB
+	mov	OFFSET(15) (DATA), T1
+	xor	OFFSET(1) (DATA), T1
+	xor	OFFSET(7) (DATA), T1
+	rol	$30, SD
+	add	T2, SB
+	xor	OFFSET(12) (DATA), T1
+	mov	SE, T2
+	xor	SD, T2
+	rol	$1, T1
+	xor	SC, T2
+	lea	K4VALUE (T1, T2), T2
+	rol	$30, SC
+	mov	T1, OFFSET(15) (DATA)
+	mov	SB, T1
+	rol	$5, T1
+	add	T1, SA
+	add	T2, SA
 
-	ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
-	ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
-	ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
-	ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
-	ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
+C 	C Load and byteswap data
+C 	movl	88(%esp), T2
+C 
+C 	SWAP( 0, %eax) SWAP( 1, %ebx) SWAP( 2, %ecx) SWAP( 3, %edx)
+C 	SWAP( 4, %eax) SWAP( 5, %ebx) SWAP( 6, %ecx) SWAP( 7, %edx)
+C 	SWAP( 8, %eax) SWAP( 9, %ebx) SWAP(10, %ecx) SWAP(11, %edx)
+C 	SWAP(12, %eax) SWAP(13, %ebx) SWAP(14, %ecx) SWAP(15, %edx)
+C 
+C 	C load the state vector
+C 	movl	84(%esp),T1
+C 	movl	(T1),   SA
+C 	movl	4(T1),  SB
+C 	movl	8(T1),  SC
+C 	movl	12(T1), SD
+C 	movl	16(T1), SE
+C 
+C 	ROUND_F1_NOEXP(SA, SB, SC, SD, SE,  0)
+C 	ROUND_F1_NOEXP(SE, SA, SB, SC, SD,  1)
+C 	ROUND_F1_NOEXP(SD, SE, SA, SB, SC,  2)
+C 	ROUND_F1_NOEXP(SC, SD, SE, SA, SB,  3)
+C 	ROUND_F1_NOEXP(SB, SC, SD, SE, SA,  4)
+C 
+C 	ROUND_F1_NOEXP(SA, SB, SC, SD, SE,  5)
+C 	ROUND_F1_NOEXP(SE, SA, SB, SC, SD,  6)
+C 	ROUND_F1_NOEXP(SD, SE, SA, SB, SC,  7)
+C 	ROUND_F1_NOEXP(SC, SD, SE, SA, SB,  8)
+C 	ROUND_F1_NOEXP(SB, SC, SD, SE, SA,  9)
+C 
+C 	ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 10)
+C 	ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 11)
+C 	ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 12)
+C 	ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 13)
+C 	ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 14)
+C 
+C 	ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 15)
+C 	ROUND_F1(SE, SA, SB, SC, SD, 16)
+C 	ROUND_F1(SD, SE, SA, SB, SC, 17)
+C 	ROUND_F1(SC, SD, SE, SA, SB, 18)
+C 	ROUND_F1(SB, SC, SD, SE, SA, 19)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 20, K2VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 21, K2VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 22, K2VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 23, K2VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 24, K2VALUE)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 25, K2VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 26, K2VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 27, K2VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 28, K2VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 29, K2VALUE)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 30, K2VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 31, K2VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 32, K2VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 33, K2VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 34, K2VALUE)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 35, K2VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 36, K2VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 37, K2VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 38, K2VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 39, K2VALUE)
+C 
+C 	ROUND_F3(SA, SB, SC, SD, SE, 40)
+C 	ROUND_F3(SE, SA, SB, SC, SD, 41)
+C 	ROUND_F3(SD, SE, SA, SB, SC, 42)
+C 	ROUND_F3(SC, SD, SE, SA, SB, 43)
+C 	ROUND_F3(SB, SC, SD, SE, SA, 44)
+C 
+C 	ROUND_F3(SA, SB, SC, SD, SE, 45)
+C 	ROUND_F3(SE, SA, SB, SC, SD, 46)
+C 	ROUND_F3(SD, SE, SA, SB, SC, 47)
+C 	ROUND_F3(SC, SD, SE, SA, SB, 48)
+C 	ROUND_F3(SB, SC, SD, SE, SA, 49)
+C 
+C 	ROUND_F3(SA, SB, SC, SD, SE, 50)
+C 	ROUND_F3(SE, SA, SB, SC, SD, 51)
+C 	ROUND_F3(SD, SE, SA, SB, SC, 52)
+C 	ROUND_F3(SC, SD, SE, SA, SB, 53)
+C 	ROUND_F3(SB, SC, SD, SE, SA, 54)
+C 
+C 	ROUND_F3(SA, SB, SC, SD, SE, 55)
+C 	ROUND_F3(SE, SA, SB, SC, SD, 56)
+C 	ROUND_F3(SD, SE, SA, SB, SC, 57)
+C 	ROUND_F3(SC, SD, SE, SA, SB, 58)
+C 	ROUND_F3(SB, SC, SD, SE, SA, 59)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
+C 
+C 	ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
+C 	ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
+C 	ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
+C 	ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
+C 	ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
 
 	C Update the state vector
 	movl	84(%esp),T1
@@ -285,3 +1528,14 @@ PROLOGUE(_nettle_sha1_compress)
 	popl	%ebx
 	ret
 EPILOGUE(_nettle_sha1_compress)
+
+C TODO:
+
+C * Extend loopmixer so that it can exploit the associativity of
+C   addition and, for example, reorder a pair of adds such as
+C
+C	add	%eax, %ebx
+C	add	%ecx, %ebx
+
+C * Use MMX instructions for the data expansion, doing two words at a
+C   time.