Commit a6f09920 authored by Niels Möller's avatar Niels Möller

Cleanup, removing old cruft. Slight improvement to ROUND_F1_NOEXP.

Rev: nettle/x86/sha1-compress.asm:1.8
parent ae6d18bd
......@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
define(<DATA>,<%esp>)
define(<T1>,<%edi>)
define(<T2>,<%esi>) C Used by SWAP
define(<KVALUE>,<%esi>) C Used by rounds
C Constants
define(<K1VALUE>, <0x5A827999>) C Rounds 0-19
......@@ -42,23 +41,6 @@ define(<SWAP>, <
movl $2, OFFSET($1) (DATA)
>)dnl
C expand(i) is the expansion function
C
C W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1
C
C where W[i] is stored in DATA[i mod 16].
C
C Result is stored back in W[i], and also left in T1, the only
C register that is used.
define(<EXPAND>, <
movl OFFSET(eval($1 % 16)) (DATA), T1
xorl OFFSET(eval(($1 + 2) % 16)) (DATA), T1
xorl OFFSET(eval(($1 + 8) % 16)) (DATA), T1
xorl OFFSET(eval(($1 + 13) % 16)) (DATA), T1
roll <$>1, T1
movl T1, OFFSET(eval($1 % 16)) (DATA)>)dnl
define(<NOEXPAND>, <OFFSET($1) (DATA)>)dnl
C The f functions,
C
C f1(x,y,z) = z ^ (x & (y ^ z))
......@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
add T2, $5
>)
C FIXME: Seems to be a slow sequence.
dnl ROUND_F1_NOEXP(a, b, c, d, e, i)
define(<ROUND_F1_NOEXP>, <
mov $4, T2
xor $3, T2
mov $1, T1
and $2, T2
add OFFSET($6) (DATA), $5
xor $4, T2
add OFFSET($6) (DATA), T2
add T2, $5
rol <$>30, $2
mov $1, T1
rol <$>5, T1
lea K1VALUE (T1, $5), $5
add T2, $5
>)
dnl ROUND_F2(a, b, c, d, e, i, k)
......@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
add T2, $5
>)
C As suggested by George Spelvin, write the F3 function as
C (x&y) | (y&z) | (x&z) == (x & (y^z)) + (y&z). Then, we can compute
C and add each term to e, using a single temporary.
.file "sha1-compress.asm"
C _nettle_sha1_compress(uint32_t *state, uint8_t *data)
......@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
pushl %esi C 68(%esp)
pushl %edi C 64(%esp)
C FIXME: Trim to 64
subl $64, %esp C %esp = W
C Load and byteswap data
......@@ -270,29 +246,29 @@ PROLOGUE(_nettle_sha1_compress)
ROUND_F3(SC, SD, SE, SA, SB, 58)
ROUND_F3(SB, SC, SD, SE, SA, 59)
ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 60, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 61, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 62, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 63, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 64, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 65, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 66, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 67, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 68, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 69, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 70, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 71, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 72, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 73, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 74, K4VALUE)
ROUND_F2(SA, SB, SC, SD, SE, 75, K4VALUE)
ROUND_F2(SE, SA, SB, SC, SD, 76, K4VALUE)
ROUND_F2(SD, SE, SA, SB, SC, 77, K4VALUE)
ROUND_F2(SC, SD, SE, SA, SB, 78, K4VALUE)
ROUND_F2(SB, SC, SD, SE, SA, 79, K4VALUE)
C Update the state vector
movl 84(%esp),T1
......@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
popl %ebx
ret
EPILOGUE(_nettle_sha1_compress)
C George Spelvin also suggested using lea, with an immediate offset
C for the magic constants. This frees one register, which can be used
C for loosen up dependencies and to more operations in parallel. For
C example, take the rounds involving f2, the simplest round function.
C Currently, we have
C
C movl 16(%esp), T1
C xorl 24(%esp), T1
C xorl 48(%esp), T1
C xorl 4(%esp), T1
C roll $1, T1
C movl T1, 16(%esp)
C addl KVALUE, SE C 0
C addl T1, SE C 1
C movl SB, T1 C 0
C xorl SC, T1 C 1
C xorl SD, T1 C 2
C addl T1, SE C 3
C movl SA, T1 C 0
C roll $5, T1 C 1
C addl T1, SE C 4
C roll $30, SB C 0
C These 16 instructions could be executed in 5.33 cycles if there were
C no dependencies. The crucial dependencies are from (previous) SE to
C use SA, and (previous) result SB to use SC. (What does this say
C about recurrency chain? Ought to unroll 5 times to see it).
C It would be preferable to accumulate the terms in two or more
C registers, to make dependencies shallower. Something like
C ...expand, put data in W
C movl SD, T1 C 0
C leal K1VALUE(SE, W), SE C 0
C movl SA, T2 C 0
C xorl SC, T1 C 1
C roll $5, T2 C 1
C xorl SB, T1 C 2
C addl T2, T1 C 3
C addl T1, SE C 4
C a + b + c + d + e = ((((a + b) + c) + d) + e), latency 4
C a + b + c + d + e = ((a + b) + c) + (d + e)
C the out-of-order execution. Next iteration
C
C ...expand...
C roll $1, T1 C 4
C movl T1, 16(%esp) C 5
C addl KVALUE, SD C 0
C addl T1, SD C 5
C movl SA, T1 C 0
C xorl SB, T1 C 1
C xorl SC, T1 C 2
C addl T1, SD C 6
C movl SE, T1 C 8
C roll $5, T1 C 9
C addl T1, SD C 7
C roll $30, SA C 0
C
C Lets look at the latency. Next iteration will operate on (E, A, B, C, D), so we have recurrencies:
C from result SA to use of SE (none, SA not modified)
C from result of SB to use of SA, result of SC to use of SB
C It's possible to shave of half of the stores to tmp in the evaluation of f3,
C although it's probably not worth the effort. This is the trick:
C
C round(a,b,c,d,e,f,k) modifies only b,e.
C
C round(a,b,c,d,e,f3,k)
C round(e,a,b,c,d,f3,k)
C
C ; f3(b,c,d) = (b & c) | (d & (b | c))
C
C movl b, tmp
C andl c, tmp
C movl tmp, tmp2
C movl b, tmp
C orl c, tmp
C andl d, tmp
C orl tmp2, tmp
C
C and corresponding code for f3(a,b,c)
C
C Use the register allocated for c as a temporary?
C
C movl c, tmp2
C ; f3(b,c,d) = (b & c) | (d & (b | c))
C movl b, tmp
C orl c, tmp
C andl b, c
C andl d, tmp
C orl c, tmp
C
C ; f3(a,b,c) = (a & b) | (c & (a | b))
C movl b, tmp
C andl a, tmp
C movl a, c
C orl b, c
C andl tmp2, c
C orl c, tmp
C
C movl tmp2, c
C
C Before: 14 instr, 2 store, 2 load
C After: 13 instr, 1 store, 2 load
C
C Final load can be folded into the next round,
C
C round(d,e,a,b,c,f3,k)
C
C c += d <<< 5 + f(e, a, b) + k + w
C
C if we arrange to have w placed directly into the register
C corresponding to w. That way we save one more instruction, total save
C of two instructions, one of which is a store, per two rounds. For the
C twenty rounds involving f3, that's 20 instructions, 10 of which are
C stores, or about 1.5 %.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment