Commit 8581e507 authored by Niels Möller
Browse files

Improved performance by 17% on AMD K7 by letting loopmix scramble the
instruction order.

Rev: nettle/x86/sha1-compress.asm:1.9
parent 19e6b152
......@@ -25,7 +25,7 @@ define(<SD>,<%edx>)
define(<SE>,<%ebp>)
define(<DATA>,<%esp>)
define(<T1>,<%edi>)
define(<T2>,<%esi>) C Used by SWAP
define(<T2>,<%esi>)
C Constants
define(<K1VALUE>, <0x5A827999>) C Rounds 0-19
......@@ -77,8 +77,8 @@ define(<ROUND_F1>, <
xor $3, T2
and $2, T2
xor $4, T2
lea K1VALUE (T1, T2), T2
rol <$>30, $2
lea K1VALUE (T1, $5), $5
mov $1, T1
rol <$>5, T1
add T1, $5
......@@ -110,8 +110,8 @@ define(<ROUND_F2>, <
mov $4, T2
xor $3, T2
xor $2, T2
lea $7 (T1, T2), T2
rol <$>30, $2
lea $7 (T1, $5), $5
mov $1, T1
rol <$>5, T1
add T1, $5
......@@ -128,14 +128,14 @@ define(<ROUND_F3>, <
mov T1, OFFSET(eval($6 % 16)) (DATA)
mov $4, T2
and $3, T2
lea K3VALUE (T1, T2), T1
mov $4, T2
xor $3, T2
and $2, T2
add T1, $5
lea K3VALUE (T1, $5), $5
mov $4, T1
xor $3, T1
and $2, T1
add T2, $5
rol <$>30, $2
mov $1, T1
rol <$>5, T1
mov $1, T2
rol <$>5, T2
add T1, $5
add T2, $5
>)
......@@ -145,7 +145,7 @@ define(<ROUND_F3>, <
C _nettle_sha1_compress(uint32_t *state, uint8_t *data)
.text
ALIGN(4)
PROLOGUE(_nettle_sha1_compress)
C save all registers that need to be saved
C 88(%esp) data
......@@ -158,117 +158,1360 @@ PROLOGUE(_nettle_sha1_compress)
subl $64, %esp C %esp = W
C Load and byteswap data
movl 88(%esp), T2
SWAP( 0, %eax) SWAP( 1, %ebx) SWAP( 2, %ecx) SWAP( 3, %edx)
SWAP( 4, %eax) SWAP( 5, %ebx) SWAP( 6, %ecx) SWAP( 7, %edx)
SWAP( 8, %eax) SWAP( 9, %ebx) SWAP(10, %ecx) SWAP(11, %edx)
SWAP(12, %eax) SWAP(13, %ebx) SWAP(14, %ecx) SWAP(15, %edx)
C load the state vector
movl 84(%esp),T1
movl (T1), SA
movl 4(T1), SB
movl 8(T1), SC
movl 12(T1), SD
movl 16(T1), SE
ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 0)
ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 1)
ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 2)
ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 3)
ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 4)
ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 5)
ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 6)
ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 7)
ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 8)
ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 9)
ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 10)
ROUND_F1_NOEXP(SE, SA, SB, SC, SD, 11)
ROUND_F1_NOEXP(SD, SE, SA, SB, SC, 12)
ROUND_F1_NOEXP(SC, SD, SE, SA, SB, 13)
ROUND_F1_NOEXP(SB, SC, SD, SE, SA, 14)
ROUND_F1_NOEXP(SA, SB, SC, SD, SE, 15)
ROUND_F1(SE, SA, SB, SC, SD, 16)
ROUND_F1(SD, SE, SA, SB, SC, 17)
ROUND_F1(SC, SD, SE, SA, SB, 18)
ROUND_F1(SB, SC, SD, SE, SA, 19)
ROUND_F2(SA, SB, SC, SD, SE, 20, K2VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 21, K2VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 22, K2VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 23, K2VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 24, K2VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 25, K2VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 26, K2VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 27, K2VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 28, K2VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 29, K2VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 30, K2VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 31, K2VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 32, K2VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 33, K2VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 34, K2VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 35, K2VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 36, K2VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 37, K2VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 38, K2VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 39, K2VALUE)
ROUND_F3(SA, SB, SC, SD, SE, 40)
ROUND_F3(SE, SA, SB, SC, SD, 41)
ROUND_F3(SD, SE, SA, SB, SC, 42)
ROUND_F3(SC, SD, SE, SA, SB, 43)
ROUND_F3(SB, SC, SD, SE, SA, 44)
ROUND_F3(SA, SB, SC, SD, SE, 45)
ROUND_F3(SE, SA, SB, SC, SD, 46)
ROUND_F3(SD, SE, SA, SB, SC, 47)
ROUND_F3(SC, SD, SE, SA, SB, 48)
ROUND_F3(SB, SC, SD, SE, SA, 49)
ROUND_F3(SA, SB, SC, SD, SE, 50)
ROUND_F3(SE, SA, SB, SC, SD, 51)
ROUND_F3(SD, SE, SA, SB, SC, 52)
ROUND_F3(SC, SD, SE, SA, SB, 53)
ROUND_F3(SB, SC, SD, SE, SA, 54)
ROUND_F3(SA, SB, SC, SD, SE, 55)
ROUND_F3(SE, SA, SB, SC, SD, 56)
ROUND_F3(SD, SE, SA, SB, SC, 57)
ROUND_F3(SC, SD, SE, SA, SB, 58)
ROUND_F3(SB, SC, SD, SE, SA, 59)
ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
C Loop-mixed to 520 cycles (for the complete function call) on
C AMD K7.
ALIGN(5)
mov 88(%esp), T2
mov OFFSET(2)(T2), %ecx
mov OFFSET(0)(T2), %eax
bswap %ecx
bswap %eax
mov %ecx, OFFSET(2) (DATA)
mov %eax, OFFSET(0) (DATA)
mov OFFSET(3)(T2), %edx
mov OFFSET(6)(T2), %ecx
mov OFFSET(4)(T2), %eax
mov OFFSET(1)(T2), %ebx
bswap %ebx
bswap %eax
bswap %ecx
mov %ecx, OFFSET(6) (DATA)
mov %eax, OFFSET(4) (DATA)
bswap %edx
mov %edx, OFFSET(3) (DATA)
mov %ebx, OFFSET(1) (DATA)
mov OFFSET(10)(T2), %ecx
mov OFFSET(8)(T2), %eax
mov OFFSET(7)(T2), %edx
bswap %eax
bswap %edx
mov %edx, OFFSET(7) (DATA)
mov OFFSET(5)(T2), %ebx
mov %eax, OFFSET(8) (DATA)
mov OFFSET(11)(T2), %edx
bswap %ecx
bswap %edx
mov OFFSET(12)(T2), %eax
bswap %ebx
mov %ecx, OFFSET(10) (DATA)
mov %ebx, OFFSET(5) (DATA)
mov %edx, OFFSET(11) (DATA)
mov OFFSET(15)(T2), %edx
mov 84(%esp),T1
mov OFFSET(9)(T2), %ebx
bswap %edx
bswap %ebx
bswap %eax
mov OFFSET(14)(T2), %ecx
mov %edx, OFFSET(15) (DATA)
bswap %ecx
mov %ecx, OFFSET(14) (DATA)
mov %ebx, OFFSET(9) (DATA)
mov OFFSET(13)(T2), %ebx
mov 12(T1), SD
bswap %ebx
mov %ebx, OFFSET(13) (DATA)
mov 8(T1), SC
mov 16(T1), SE
mov 4(T1), SB
mov SD, T2
add OFFSET(0) (DATA), SE
xor SC, T2
mov %eax, OFFSET(12) (DATA)
mov (T1), SA
and SB, T2
xor SD, T2
rol $30, SB
add T2, SE
mov SA, T1
mov SC, T2
add OFFSET(1) (DATA), SD
rol $5, T1
xor SB, T2
and SA, T2
xor SC, T2
lea K1VALUE (T1, SE), SE
add T2, SD
mov SB, T2
rol $30, SA
xor SA, T2
and SE, T2
mov SE, T1
add OFFSET(2) (DATA), SC
rol $30, SE
xor SB, T2
rol $5, T1
lea K1VALUE (T1, SD), SD
mov SD, T1
rol $5, T1
add T2, SC
mov SA, T2
xor SE, T2
lea K1VALUE (T1, SC), SC
and SD, T2
xor SA, T2
add OFFSET(3) (DATA), SB
mov SC, T1
add T2, SB
mov SE, T2
rol $30, SD
xor SD, T2
and SC, T2
rol $5, T1
xor SE, T2
add OFFSET(4) (DATA), SA
lea K1VALUE (T1, SB), SB
add T2, SA
rol $30, SC
mov SD, T2
xor SC, T2
and SB, T2
mov SB, T1
rol $5, T1
add OFFSET(5) (DATA), SE
rol $30, SB
xor SD, T2
add T2, SE
mov SC, T2
xor SB, T2
lea K1VALUE (T1, SA), SA
mov SA, T1
add OFFSET(6) (DATA), SD
and SA, T2
rol $5, T1
xor SC, T2
lea K1VALUE (T1, SE), SE
rol $30, SA
add T2, SD
mov SB, T2
mov SE, T1
xor SA, T2
and SE, T2
rol $5, T1
lea K1VALUE (T1, SD), SD
xor SB, T2
add OFFSET(7) (DATA), SC
rol $30, SE
add OFFSET(8) (DATA), SB
mov SD, T1
add T2, SC
mov SA, T2
xor SE, T2
rol $5, T1
and SD, T2
lea K1VALUE (T1, SC), SC
xor SA, T2
add T2, SB
mov SE, T2
mov SC, T1
rol $30, SD
xor SD, T2
rol $5, T1
lea K1VALUE (T1, SB), SB
and SC, T2
xor SE, T2
add OFFSET(10) (DATA), SE
add OFFSET(9) (DATA), SA
mov SB, T1
add T2, SA
rol $5, T1
lea K1VALUE (T1, SA), SA
mov SD, T2
rol $30, SC
xor SC, T2
and SB, T2
xor SD, T2
rol $30, SB
add T2, SE
mov SC, T2
mov SA, T1
xor SB, T2
add OFFSET(11) (DATA), SD
and SA, T2
rol $30, SA
rol $5, T1
xor SC, T2
lea K1VALUE (T1, SE), SE
add T2, SD
mov SB, T2
xor SA, T2
mov SE, T1
rol $5, T1
and SE, T2
lea K1VALUE (T1, SD), SD
xor SB, T2
add OFFSET(12) (DATA), SC
add T2, SC
rol $30, SE
mov SA, T2
xor SE, T2
mov SD, T1
rol $5, T1
and SD, T2
add OFFSET(13) (DATA), SB
lea K1VALUE (T1, SC), SC
xor SA, T2
add T2, SB
mov SE, T2
rol $30, SD
xor SD, T2
and SC, T2
mov SC, T1
rol $5, T1
rol $30, SC
add OFFSET(14) (DATA), SA
xor SE, T2
add T2, SA
mov SD, T2
xor SC, T2
lea K1VALUE (T1, SB), SB
and SB, T2
mov SB, T1
rol $5, T1
lea K1VALUE (T1, SA), SA
mov SA, T1
xor SD, T2
add OFFSET(15) (DATA), SE
add T2, SE
rol $5, T1
lea K1VALUE (T1, SE), SE
mov OFFSET(0) (DATA), T1
xor OFFSET(2) (DATA), T1
mov SC, T2
xor OFFSET(8) (DATA), T1
xor OFFSET(13) (DATA), T1
rol $30, SB
xor SB, T2
and SA, T2
xor SC, T2
rol $1, T1
lea K1VALUE (T1, T2), T2
mov T1, OFFSET(0) (DATA)
mov SE, T1
rol $5, T1
add T1, SD
mov OFFSET(1) (DATA), T1
xor OFFSET(3) (DATA), T1
rol $30, SA
add T2, SD
mov SB, T2
xor SA, T2
and SE, T2
xor OFFSET(9) (DATA), T1
xor OFFSET(14) (DATA), T1
xor SB, T2
rol $1, T1
mov T1, OFFSET(1) (DATA)
lea K1VALUE (T1, T2), T2
mov SD, T1
rol $5, T1
add T1, SC
mov OFFSET(2) (DATA), T1
xor OFFSET(4) (DATA), T1
rol $30, SE
add T2, SC
mov SA, T2
xor SE, T2
xor OFFSET(10) (DATA), T1
xor OFFSET(15) (DATA), T1
and SD, T2
rol $1, T1
xor SA, T2
mov T1, OFFSET(2) (DATA)
lea K1VALUE (T1, T2), T2
mov SC, T1
rol $30, SD
rol $5, T1
add T1, SB
add T2, SB
mov SE, T2
mov OFFSET(3) (DATA), T1
xor SD, T2
xor OFFSET(5) (DATA), T1
and SC, T2
xor SE, T2
xor OFFSET(11) (DATA), T1
xor OFFSET(0) (DATA), T1
rol $1, T1
mov T1, OFFSET(3) (DATA)
lea K1VALUE (T1, T2), T2
mov SB, T1
rol $5, T1
add T1, SA
mov OFFSET(4) (DATA), T1
xor OFFSET(6) (DATA), T1
rol $30, SC
xor OFFSET(12) (DATA), T1
add T2, SA
xor OFFSET(1) (DATA), T1
mov SD, T2
xor SC, T2
rol $1, T1
xor SB, T2
lea K2VALUE (T1, T2), T2
mov T1, OFFSET(4) (DATA)
mov SA, T1
rol $5, T1
add T1, SE
mov OFFSET(5) (DATA), T1
add T2, SE
mov SC, T2
xor OFFSET(7) (DATA), T1
rol $30, SB
xor OFFSET(13) (DATA), T1
xor SB, T2
xor OFFSET(2) (DATA), T1
xor SA, T2
rol $1, T1
mov T1, OFFSET(5) (DATA)
lea K2VALUE (T1, T2), T2
mov SE, T1
rol $5, T1
add T1, SD
mov OFFSET(6) (DATA), T1
xor OFFSET(8) (DATA), T1
add T2, SD
rol $30, SA
xor OFFSET(14) (DATA), T1
mov SB, T2
xor OFFSET(3) (DATA), T1
xor SA, T2
rol $1, T1
xor SE, T2
lea K2VALUE (T1, T2), T2
mov T1, OFFSET(6) (DATA)
mov SD, T1
rol $5, T1
add T1, SC
add T2, SC
mov SA, T2
rol $30, SE
mov OFFSET(7) (DATA), T1
xor OFFSET(9) (DATA), T1
xor SE, T2
xor OFFSET(15) (DATA), T1
xor OFFSET(4) (DATA), T1
xor SD, T2
rol $1, T1
lea K2VALUE (T1, T2), T2
mov T1, OFFSET(7) (DATA)
mov SC, T1
rol $5, T1
add T1, SB
mov OFFSET(8) (DATA), T1
xor OFFSET(10) (DATA), T1
add T2, SB
rol $30, SD
mov SE, T2
xor OFFSET(0) (DATA), T1
xor OFFSET(5) (DATA), T1
xor SD, T2
xor SC, T2
rol $1, T1
mov T1, OFFSET(8) (DATA)
lea K2VALUE (T1, T2), T2
mov SB, T1
rol $5, T1
add T1, SA
mov OFFSET(9) (DATA), T1
xor OFFSET(11) (DATA), T1
xor OFFSET(1) (DATA), T1
add T2, SA
xor OFFSET(6) (DATA), T1
mov SD, T2
rol $1, T1
rol $30, SC
xor SC, T2
mov T1, OFFSET(9) (DATA)
xor SB, T2
lea K2VALUE (T1, T2), T2
mov SA, T1
rol $5, T1
add T1, SE
mov OFFSET(10) (DATA), T1
xor OFFSET(12) (DATA), T1
xor OFFSET(2) (DATA), T1
add T2, SE
mov SC, T2
rol $30, SB
xor OFFSET(7) (DATA), T1
xor SB, T2
rol $1, T1
xor SA, T2
lea K2VALUE (T1, T2), T2
mov T1, OFFSET(10) (DATA)
mov SE, T1
rol $5, T1
add T1, SD
mov OFFSET(11) (DATA), T1
xor OFFSET(13) (DATA), T1
rol $30, SA
xor OFFSET(3) (DATA), T1
add T2, SD
xor OFFSET(8) (DATA), T1
mov SB, T2
xor SA, T2
rol $1, T1
mov T1, OFFSET(11) (DATA)
xor SE, T2
lea K2VALUE (T1, T2), T2
mov SD, T1
rol $5, T1
add T1, SC
mov OFFSET(12) (DATA), T1
xor OFFSET(14) (DATA), T1
rol $30, SE
add T2, SC
xor OFFSET(4) (DATA), T1
mov SA, T2
xor OFFSET(9) (DATA), T1
xor SE, T2
rol $1, T1
xor SD, T2
mov T1, OFFSET(12) (DATA)
lea K2VALUE (T1, T2), T2
mov SC, T1
rol $5, T1
add T1, SB
rol $30, SD
mov OFFSET(13) (DATA), T1
xor OFFSET(15) (DATA), T1
add T2, SB
mov SE, T2
xor OFFSET(5) (DATA), T1
xor SD, T2
xor OFFSET(10) (DATA), T1
xor SC, T2
rol $1, T1
lea K2VALUE (T1, T2), T2
mov T1, OFFSET(13) (DATA)
mov SB, T1