Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Marcus Hoffmann
nettle
Commits
60dfd8d4
Commit
60dfd8d4
authored
Sep 15, 2009
by
Niels Möller
Browse files
Cleanup, removing old cruft. Slight improvement to ROUND_F1_NOEXP.
Rev: nettle/x86/sha1-compress.asm:1.8
parent
d8e65e43
Changes
1
Hide whitespace changes
Inline
Side-by-side
x86/sha1-compress.asm
View file @
60dfd8d4
...
@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
...
@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
define
(
<
DATA
>
,
<%
esp
>
)
define
(
<
DATA
>
,
<%
esp
>
)
define
(
<
T1
>
,
<%
edi
>
)
define
(
<
T1
>
,
<%
edi
>
)
define
(
<
T2
>
,
<%
esi
>
)
C
Used
by
SWAP
define
(
<
T2
>
,
<%
esi
>
)
C
Used
by
SWAP
define
(
<
KVALUE
>
,
<%
esi
>
)
C
Used
by
rounds
C
Constants
C
Constants
define
(
<
K1VALUE
>
,
<
0x5A827999
>
)
C
Rounds
0
-
19
define
(
<
K1VALUE
>
,
<
0x5A827999
>
)
C
Rounds
0
-
19
...
@@ -42,23 +41,6 @@ define(<SWAP>, <
...
@@ -42,23 +41,6 @@ define(<SWAP>, <
movl
$
2
,
OFFSET
(
$
1
)
(
DATA
)
movl
$
2
,
OFFSET
(
$
1
)
(
DATA
)
>)
dnl
>)
dnl
C
expand
(
i
)
is
the
expansion
function
C
C
W
[
i
]
=
(
W
[
i
-
16
]
^
W
[
i
-
14
]
^
W
[
i
-
8
]
^
W
[
i
-
3
])
<<<
1
C
C
where
W
[
i
]
is
stored
in
DATA
[
i
mod
16
]
.
C
C
Result
is
stored
back
in
W
[
i
],
and
also
left
in
T1
,
the
only
C
register
that
is
used.
define
(
<
EXPAND
>
,
<
movl
OFFSET
(
eval
(
$
1
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
2
)
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
8
)
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
13
)
%
16
))
(
DATA
),
T1
roll
<
$
>
1
,
T1
movl
T1
,
OFFSET
(
eval
(
$
1
%
16
))
(
DATA
)
>
)
dnl
define
(
<
NOEXPAND
>
,
<
OFFSET
(
$
1
)
(
DATA
)
>
)
dnl
C
The
f
functions
,
C
The
f
functions
,
C
C
C
f1
(
x
,
y
,
z
)
=
z
^
(
x
&
(
y
^
z
))
C
f1
(
x
,
y
,
z
)
=
z
^
(
x
&
(
y
^
z
))
...
@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
...
@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
add
T2
,
$
5
add
T2
,
$
5
>)
>)
C
FIXME
:
Seems
to
be
a
slow
sequence.
dnl
ROUND_F1_NOEXP
(
a
,
b
,
c
,
d
,
e
,
i
)
define
(
<
ROUND_F1_NOEXP
>
,
<
define
(
<
ROUND_F1_NOEXP
>
,
<
mov
$
4
,
T2
mov
$
4
,
T2
xor
$
3
,
T2
xor
$
3
,
T2
mov
$
1
,
T1
and
$
2
,
T2
and
$
2
,
T2
add
OFFSET
(
$
6
)
(
DATA
),
$
5
xor
$
4
,
T2
xor
$
4
,
T2
add
OFFSET
(
$
6
)
(
DATA
),
T2
add
T2
,
$
5
rol
<
$
>
30
,
$
2
rol
<
$
>
30
,
$
2
mov
$
1
,
T1
rol
<
$
>
5
,
T1
rol
<
$
>
5
,
T1
lea
K1VALUE
(
T1
,
$
5
),
$
5
lea
K1VALUE
(
T1
,
$
5
),
$
5
add
T2
,
$
5
>)
>)
dnl
ROUND_F2
(
a
,
b
,
c
,
d
,
e
,
i
,
k
)
dnl
ROUND_F2
(
a
,
b
,
c
,
d
,
e
,
i
,
k
)
...
@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
...
@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
add
T2
,
$
5
add
T2
,
$
5
>)
>)
C
As
suggested
by
George
Spelvin
,
write
the
F3
function
as
C
(
x
&
y
)
|
(
y
&
z
)
|
(
x
&
z
)
==
(
x
&
(
y
^
z
))
+
(
y
&
z
)
.
Then
,
we
can
compute
C
and
add
each
term
to
e
,
using
a
single
temporary.
.file
"sha1-compress.asm"
.file
"sha1-compress.asm"
C
_nettle_sha1_compress
(
uint32_t
*
state
,
uint8_t
*
data
)
C
_nettle_sha1_compress
(
uint32_t
*
state
,
uint8_t
*
data
)
...
@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
...
@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
pushl
%
esi
C
68
(
%
esp
)
pushl
%
esi
C
68
(
%
esp
)
pushl
%
edi
C
64
(
%
esp
)
pushl
%
edi
C
64
(
%
esp
)
C
FIXME
:
Trim
to
64
subl
$
64
,
%
esp
C
%
esp
=
W
subl
$
64
,
%
esp
C
%
esp
=
W
C
Load
and
byte
swap
data
C
Load
and
byte
swap
data
...
@@ -270,29 +246,29 @@ PROLOGUE(_nettle_sha1_compress)
...
@@ -270,29 +246,29 @@ PROLOGUE(_nettle_sha1_compress)
ROUND_F3
(
SC
,
SD
,
SE
,
SA
,
SB
,
58
)
ROUND_F3
(
SC
,
SD
,
SE
,
SA
,
SB
,
58
)
ROUND_F3
(
SB
,
SC
,
SD
,
SE
,
SA
,
59
)
ROUND_F3
(
SB
,
SC
,
SD
,
SE
,
SA
,
59
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
60
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
60
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
61
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
61
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
62
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
62
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
63
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
63
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
64
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
64
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
65
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
65
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
66
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
66
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
67
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
67
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
68
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
68
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
69
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
69
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
70
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
70
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
71
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
71
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
72
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
72
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
73
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
73
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
74
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
74
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
75
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
75
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
76
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
76
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
77
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
77
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
78
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
78
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
79
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
79
,
K4VALUE
)
C
Update
the
state
vector
C
Update
the
state
vector
movl
84
(
%
esp
),
T1
movl
84
(
%
esp
),
T1
...
@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
...
@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
popl
%
ebx
popl
%
ebx
ret
ret
EPILOGUE
(
_nettle_sha1_compress
)
EPILOGUE
(
_nettle_sha1_compress
)
C
George
Spelvin
also
suggested
using
lea
,
with
an
immediate
offset
C
for
the
magic
constants.
This
frees
one
register
,
which
can
be
used
C
to
loosen
up
dependencies
and
do
more
operations
in
parallel.
For
C
example
,
take
the
rounds
involving
f2
,
the
simplest
round
function.
C
Currently
,
we
have
C
C
movl
16
(
%
esp
),
T1
C
xorl
24
(
%
esp
),
T1
C
xorl
48
(
%
esp
),
T1
C
xorl
4
(
%
esp
),
T1
C
roll
$
1
,
T1
C
movl
T1
,
16
(
%
esp
)
C
addl
KVALUE
,
SE
C
0
C
addl
T1
,
SE
C
1
C
movl
SB
,
T1
C
0
C
xorl
SC
,
T1
C
1
C
xorl
SD
,
T1
C
2
C
addl
T1
,
SE
C
3
C
movl
SA
,
T1
C
0
C
roll
$
5
,
T1
C
1
C
addl
T1
,
SE
C
4
C
roll
$
30
,
SB
C
0
C
These
16
instructions
could
be
executed
in
5.33
cycles
if
there
were
C
no
dependencies.
The
crucial
dependencies
are
from
(
previous
)
SE
to
C
use
SA
,
and
(
previous
)
result
SB
to
use
SC.
(
What
does
this
say
C
about
recurrence
chain?
Ought
to
unroll
5
times
to
see
it
)
.
C
It
would
be
preferable
to
accumulate
the
terms
in
two
or
more
C
registers
,
to
make
dependencies
shallower.
Something
like
C
...expand
,
put
data
in
W
C
movl
SD
,
T1
C
0
C
leal
K1VALUE
(
SE
,
W
),
SE
C
0
C
movl
SA
,
T2
C
0
C
xorl
SC
,
T1
C
1
C
roll
$
5
,
T2
C
1
C
xorl
SB
,
T1
C
2
C
addl
T2
,
T1
C
3
C
addl
T1
,
SE
C
4
C
a
+
b
+
c
+
d
+
e
=
((((
a
+
b
)
+
c
)
+
d
)
+
e
),
latency
4
C
a
+
b
+
c
+
d
+
e
=
((
a
+
b
)
+
c
)
+
(
d
+
e
)
C
the
out
-
of
-
order
execution.
Next
iteration
C
C
...expand...
C
roll
$
1
,
T1
C
4
C
movl
T1
,
16
(
%
esp
)
C
5
C
addl
KVALUE
,
SD
C
0
C
addl
T1
,
SD
C
5
C
movl
SA
,
T1
C
0
C
xorl
SB
,
T1
C
1
C
xorl
SC
,
T1
C
2
C
addl
T1
,
SD
C
6
C
movl
SE
,
T1
C
8
C
roll
$
5
,
T1
C
9
C
addl
T1
,
SD
C
7
C
roll
$
30
,
SA
C
0
C
C
Let's
look
at
the
latency.
Next
iteration
will
operate
on
(
E
,
A
,
B
,
C
,
D
),
so
we
have
recurrences
:
C
from
result
SA
to
use
of
SE
(
none
,
SA
not
modified
)
C
from
result
of
SB
to
use
of
SA
,
result
of
SC
to
use
of
SB
C
It
'
s
possible
to
shave
off
half
of
the
stores
to
tmp
in
the
evaluation
of
f3
,
C
although
it
'
s
probably
not
worth
the
effort.
This
is
the
trick
:
C
C
round
(
a
,
b
,
c
,
d
,
e
,
f
,
k
)
modifies
only
b
,
e.
C
C
round
(
a
,
b
,
c
,
d
,
e
,
f3
,
k
)
C
round
(
e
,
a
,
b
,
c
,
d
,
f3
,
k
)
C
C
; f3(b,c,d) = (b & c) | (d & (b | c))
C
C
movl
b
,
tmp
C
andl
c
,
tmp
C
movl
tmp
,
tmp2
C
movl
b
,
tmp
C
orl
c
,
tmp
C
andl
d
,
tmp
C
orl
tmp2
,
tmp
C
C
and
corresponding
code
for
f3
(
a
,
b
,
c
)
C
C
Use
the
register
allocated
for
c
as
a
temporary?
C
C
movl
c
,
tmp2
C
; f3(b,c,d) = (b & c) | (d & (b | c))
C
movl
b
,
tmp
C
orl
c
,
tmp
C
andl
b
,
c
C
andl
d
,
tmp
C
orl
c
,
tmp
C
C
; f3(a,b,c) = (a & b) | (c & (a | b))
C
movl
b
,
tmp
C
andl
a
,
tmp
C
movl
a
,
c
C
orl
b
,
c
C
andl
tmp2
,
c
C
orl
c
,
tmp
C
C
movl
tmp2
,
c
C
C
Before
:
14
instr
,
2
store
,
2
load
C
After
:
13
instr
,
1
store
,
2
load
C
C
Final
load
can
be
folded
into
the
next
round
,
C
C
round
(
d
,
e
,
a
,
b
,
c
,
f3
,
k
)
C
C
c
+
=
d
<<<
5
+
f
(
e
,
a
,
b
)
+
k
+
w
C
C
if
we
arrange
to
have
w
placed
directly
into
the
register
C
corresponding
to
w.
That
way
we
save
one
more
instruction
,
total
save
C
of
two
instructions
,
one
of
which
is
a
store
,
per
two
rounds.
For
the
C
twenty
rounds
involving
f3
,
that
'
s
20
instructions
,
10
of
which
are
C
stores
,
or
about
1.5
%
.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment