Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
nettle
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
5
Merge Requests
5
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Commits
Open sidebar
Nettle
nettle
Commits
60dfd8d4
Commit
60dfd8d4
authored
Sep 15, 2009
by
Niels Möller
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Cleanup, removing old cruft. Slight improvement to ROUND_F1_NOEXP.
Rev: nettle/x86/sha1-compress.asm:1.8
parent
d8e65e43
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
169 deletions
+27
-169
x86/sha1-compress.asm
x86/sha1-compress.asm
+27
-169
No files found.
x86/sha1-compress.asm
View file @
60dfd8d4
...
...
@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
define
(
<
DATA
>
,
<%
esp
>
)
define
(
<
T1
>
,
<%
edi
>
)
define
(
<
T2
>
,
<%
esi
>
)
C
Used
by
SWAP
define
(
<
KVALUE
>
,
<%
esi
>
)
C
Used
by
rounds
C
Constants
define
(
<
K1VALUE
>
,
<
0x5A827999
>
)
C
Rounds
0
-
19
...
...
@@ -42,23 +41,6 @@ define(<SWAP>, <
movl
$
2
,
OFFSET
(
$
1
)
(
DATA
)
>)
dnl
C
expand
(
i
)
is
the
expansion
function
C
C
W
[
i
]
=
(
W
[
i
-
16
]
^
W
[
i
-
14
]
^
W
[
i
-
8
]
^
W
[
i
-
3
])
<<<
1
C
C
where
W
[
i
]
is
stored
in
DATA
[
i
mod
16
]
.
C
C
Result
is
stored
back
in
W
[
i
],
and
al
so
left
in
T1
,
the
only
C
register
that
is
used.
define
(
<
EXPAND
>
,
<
movl
OFFSET
(
eval
(
$
1
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
2
)
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
8
)
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
13
)
%
16
))
(
DATA
),
T1
roll
<
$
>
1
,
T1
movl
T1
,
OFFSET
(
eval
(
$
1
%
16
))
(
DATA
)
>
)
dnl
define
(
<
NOEXPAND
>
,
<
OFFSET
(
$
1
)
(
DATA
)
>
)
dnl
C
The
f
functions
,
C
C
f1
(
x
,
y
,
z
)
=
z
^
(
x
&
(
y
^
z
))
...
...
@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
add
T2
,
$
5
>)
C
FIXME
:
Seems
to
be
a
slow
sequence.
dnl
ROUND_F1_NOEXP
(
a
,
b
,
c
,
d
,
e
,
i
)
define
(
<
ROUND_F1_NOEXP
>
,
<
mov
$
4
,
T2
xor
$
3
,
T2
mov
$
1
,
T1
and
$
2
,
T2
add
OFFSET
(
$
6
)
(
DATA
),
$
5
xor
$
4
,
T2
add
OFFSET
(
$
6
)
(
DATA
),
T2
add
T2
,
$
5
rol
<
$
>
30
,
$
2
mov
$
1
,
T1
rol
<
$
>
5
,
T1
lea
K1VALUE
(
T1
,
$
5
),
$
5
add
T2
,
$
5
>)
dnl
ROUND_F2
(
a
,
b
,
c
,
d
,
e
,
i
,
k
)
...
...
@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
add
T2
,
$
5
>)
C
As
suggested
by
George
Sp
elvin
,
write
the
F3
function
as
C
(
x
&
y
)
|
(
y
&
z
)
|
(
x
&
z
)
==
(
x
&
(
y
^
z
))
+
(
y
&
z
)
.
Then
,
we
can
compute
C
and
add
each
term
to
e
,
using
a
si
ngle
temporary.
.file
"sha1-compress.asm"
C
_nettle_sha1_compress
(
uint32_t
*
state
,
uint8_t
*
data
)
...
...
@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
pushl
%
esi
C
68
(
%
esp
)
pushl
%
edi
C
64
(
%
esp
)
C
FIXME
:
Trim
to
64
subl
$
64
,
%
esp
C
%
esp
=
W
C
Load
and
byte
swap
data
...
...
@@ -270,29 +246,29 @@ PROLOGUE(_nettle_sha1_compress)
ROUND_F3
(
SC
,
SD
,
SE
,
SA
,
SB
,
58
)
ROUND_F3
(
SB
,
SC
,
SD
,
SE
,
SA
,
59
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
60
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
61
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
62
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
63
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
64
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
65
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
66
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
67
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
68
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
69
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
70
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
71
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
72
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
73
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
74
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
75
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
76
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
77
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
78
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
79
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
60
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
61
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
62
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
63
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
64
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
65
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
66
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
67
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
68
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
69
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
70
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
71
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
72
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
73
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
74
,
K4VALUE
)
ROUND_F2
(
SA
,
SB
,
SC
,
SD
,
SE
,
75
,
K4VALUE
)
ROUND_F2
(
SE
,
SA
,
SB
,
SC
,
SD
,
76
,
K4VALUE
)
ROUND_F2
(
SD
,
SE
,
SA
,
SB
,
SC
,
77
,
K4VALUE
)
ROUND_F2
(
SC
,
SD
,
SE
,
SA
,
SB
,
78
,
K4VALUE
)
ROUND_F2
(
SB
,
SC
,
SD
,
SE
,
SA
,
79
,
K4VALUE
)
C
Update
the
state
vector
movl
84
(
%
esp
),
T1
...
...
@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
popl
%
ebx
ret
EPILOGUE
(
_nettle_sha1_compress
)
C
George
Sp
elvin
al
so
suggested
using
lea
,
with
an
immediate
offset
C
for
the
magic
constants.
This
frees
one
register
,
which
can
be
used
C
for
loosen
up
dependencies
and
to
more
operations
in
parallel.
For
C
example
,
take
the
rounds
involving
f2
,
the
si
mplest
round
function.
C
Currently
,
we
have
C
C
movl
16
(
%
esp
),
T1
C
xorl
24
(
%
esp
),
T1
C
xorl
48
(
%
esp
),
T1
C
xorl
4
(
%
esp
),
T1
C
roll
$
1
,
T1
C
movl
T1
,
16
(
%
esp
)
C
addl
KVALUE
,
SE
C
0
C
addl
T1
,
SE
C
1
C
movl
SB
,
T1
C
0
C
xorl
SC
,
T1
C
1
C
xorl
SD
,
T1
C
2
C
addl
T1
,
SE
C
3
C
movl
SA
,
T1
C
0
C
roll
$
5
,
T1
C
1
C
addl
T1
,
SE
C
4
C
roll
$
30
,
SB
C
0
C
These
16
instructions
could
be
executed
in
5.33
cycles
if
there
were
C
no
dependencies.
The
crucial
dependencies
are
from
(
previous
)
SE
to
C
use
SA
,
and
(
previous
)
result
SB
to
use
SC.
(
What
does
this
say
C
about
recurrency
ch
ain?
Ought
to
unroll
5
times
to
see
it
)
.
C
It
would
be
preferable
to
accumulate
the
terms
in
two
or
more
C
registers
,
to
make
dependencies
shallower.
Something
like
C
...expand
,
put
data
in
W
C
movl
SD
,
T1
C
0
C
leal
K1VALUE
(
SE
,
W
),
SE
C
0
C
movl
SA
,
T2
C
0
C
xorl
SC
,
T1
C
1
C
roll
$
5
,
T2
C
1
C
xorl
SB
,
T1
C
2
C
addl
T2
,
T1
C
3
C
addl
T1
,
SE
C
4
C
a
+
b
+
c
+
d
+
e
=
((((
a
+
b
)
+
c
)
+
d
)
+
e
),
latency
4
C
a
+
b
+
c
+
d
+
e
=
((
a
+
b
)
+
c
)
+
(
d
+
e
)
C
the
out
-
of
-
order
execution.
Next
iteration
C
C
...expand...
C
roll
$
1
,
T1
C
4
C
movl
T1
,
16
(
%
esp
)
C
5
C
addl
KVALUE
,
SD
C
0
C
addl
T1
,
SD
C
5
C
movl
SA
,
T1
C
0
C
xorl
SB
,
T1
C
1
C
xorl
SC
,
T1
C
2
C
addl
T1
,
SD
C
6
C
movl
SE
,
T1
C
8
C
roll
$
5
,
T1
C
9
C
addl
T1
,
SD
C
7
C
roll
$
30
,
SA
C
0
C
C
Lets
look
at
the
latency.
Next
iteration
will
operate
on
(
E
,
A
,
B
,
C
,
D
),
so
we
have
recurrencies
:
C
from
result
SA
to
use
of
SE
(
none
,
SA
not
modified
)
C
from
result
of
SB
to
use
of
SA
,
result
of
SC
to
use
of
SB
C
It
'
s
possible
to
shave
of
half
of
the
stores
to
tmp
in
the
evaluation
of
f3
,
C
al
though
it
'
s
probably
not
worth
the
effort.
This
is
the
trick
:
C
C
round
(
a
,
b
,
c
,
d
,
e
,
f
,
k
)
modifies
only
b
,
e.
C
C
round
(
a
,
b
,
c
,
d
,
e
,
f3
,
k
)
C
round
(
e
,
a
,
b
,
c
,
d
,
f3
,
k
)
C
C
; f3(b,c,d) = (b & c) | (d & (b | c))
C
C
movl
b
,
tmp
C
andl
c
,
tmp
C
movl
tmp
,
tmp2
C
movl
b
,
tmp
C
orl
c
,
tmp
C
andl
d
,
tmp
C
orl
tmp2
,
tmp
C
C
and
corresponding
code
for
f3
(
a
,
b
,
c
)
C
C
Use
the
register
al
located
for
c
as
a
temporary?
C
C
movl
c
,
tmp2
C
; f3(b,c,d) = (b & c) | (d & (b | c))
C
movl
b
,
tmp
C
orl
c
,
tmp
C
andl
b
,
c
C
andl
d
,
tmp
C
orl
c
,
tmp
C
C
; f3(a,b,c) = (a & b) | (c & (a | b))
C
movl
b
,
tmp
C
andl
a
,
tmp
C
movl
a
,
c
C
orl
b
,
c
C
andl
tmp2
,
c
C
orl
c
,
tmp
C
C
movl
tmp2
,
c
C
C
Before
:
14
instr
,
2
store
,
2
load
C
After
:
13
instr
,
1
store
,
2
load
C
C
Final
load
can
be
folded
into
the
next
round
,
C
C
round
(
d
,
e
,
a
,
b
,
c
,
f3
,
k
)
C
C
c
+
=
d
<<<
5
+
f
(
e
,
a
,
b
)
+
k
+
w
C
C
if
we
arrange
to
have
w
placed
di
rectly
into
the
register
C
corresponding
to
w.
That
way
we
save
one
more
instruction
,
total
save
C
of
two
instructions
,
one
of
which
is
a
store
,
per
two
rounds.
For
the
C
twenty
rounds
involving
f3
,
that
'
s
20
instructions
,
10
of
which
are
C
stores
,
or
about
1.5
%
.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment