Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
N
nettle
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Container registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Wim Lewis
nettle
Commits
60dfd8d4
Commit
60dfd8d4
authored
Sep 15, 2009
by
Niels Möller
Browse files
Options
Downloads
Patches
Plain Diff
Cleanup, removing old cruft. Slight improvement to ROUND_F1_NOEXP.
Rev: nettle/x86/sha1-compress.asm:1.8
parent
d8e65e43
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
x86/sha1-compress.asm
+27
-169
27 additions, 169 deletions
x86/sha1-compress.asm
with
27 additions
and
169 deletions
x86/sha1-compress.asm
+
27
−
169
View file @
60dfd8d4
...
@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
...
@@ -26,7 +26,6 @@ define(<SE>,<%ebp>)
define
(
<
DATA
>
,
<%
esp
>
)
define
(
<
DATA
>
,
<%
esp
>
)
define
(
<
T1
>
,
<%
edi
>
)
define
(
<
T1
>
,
<%
edi
>
)
define
(
<
T2
>
,
<%
esi
>
)
C
Used
by
SWAP
define
(
<
T2
>
,
<%
esi
>
)
C
Used
by
SWAP
define
(
<
KVALUE
>
,
<%
esi
>
)
C
Used
by
rounds
C
Constants
C
Constants
define
(
<
K1VALUE
>
,
<
0x5A827999
>
)
C
Rounds
0
-
19
define
(
<
K1VALUE
>
,
<
0x5A827999
>
)
C
Rounds
0
-
19
...
@@ -42,23 +41,6 @@ define(<SWAP>, <
...
@@ -42,23 +41,6 @@ define(<SWAP>, <
movl
$
2
,
OFFSET
(
$
1
)
(
DATA
)
movl
$
2
,
OFFSET
(
$
1
)
(
DATA
)
>
)
dnl
>
)
dnl
C
expand
(
i
)
is
the
expansion
function
C
C
W
[
i
]
=
(
W
[
i
-
16
]
^
W
[
i
-
14
]
^
W
[
i
-
8
]
^
W
[
i
-
3
])
<<<
1
C
C
where
W
[
i
]
is
stored
in
DATA
[
i
mod
16
]
.
C
C
Result
is
stored
back
in
W
[
i
],
and
also
left
in
T1
,
the
only
C
register
that
is
used.
define
(
<
EXPAND
>
,
<
movl
OFFSET
(
eval
(
$
1
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
2
)
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
8
)
%
16
))
(
DATA
),
T1
xorl
OFFSET
(
eval
((
$
1
+
13
)
%
16
))
(
DATA
),
T1
roll
<
$
>
1
,
T1
movl
T1
,
OFFSET
(
eval
(
$
1
%
16
))
(
DATA
)
>
)
dnl
define
(
<
NOEXPAND
>
,
<
OFFSET
(
$
1
)
(
DATA
)
>
)
dnl
C
The
f
functions
,
C
The
f
functions
,
C
C
C
f1
(
x
,
y
,
z
)
=
z
^
(
x
&
(
y
^
z
))
C
f1
(
x
,
y
,
z
)
=
z
^
(
x
&
(
y
^
z
))
...
@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
...
@@ -103,18 +85,18 @@ define(<ROUND_F1>, <
add
T2
,
$
5
add
T2
,
$
5
>
)
>
)
C
FIXME
:
Seems
to
be
a
slow
sequence.
dnl
ROUND_F1_NOEXP
(
a
,
b
,
c
,
d
,
e
,
i
)
define
(
<
ROUND_F1_NOEXP
>
,
<
define
(
<
ROUND_F1_NOEXP
>
,
<
mov
$
4
,
T2
mov
$
4
,
T2
xor
$
3
,
T2
xor
$
3
,
T2
mov
$
1
,
T1
and
$
2
,
T2
and
$
2
,
T2
add
OFFSET
(
$
6
)
(
DATA
),
$
5
xor
$
4
,
T2
xor
$
4
,
T2
add
OFFSET
(
$
6
)
(
DATA
),
T2
add
T2
,
$
5
rol
<
$
>
30
,
$
2
rol
<
$
>
30
,
$
2
mov
$
1
,
T1
rol
<
$
>
5
,
T1
rol
<
$
>
5
,
T1
lea
K1VALUE
(
T1
,
$
5
),
$
5
lea
K1VALUE
(
T1
,
$
5
),
$
5
add
T2
,
$
5
>
)
>
)
dnl
ROUND_F2
(
a
,
b
,
c
,
d
,
e
,
i
,
k
)
dnl
ROUND_F2
(
a
,
b
,
c
,
d
,
e
,
i
,
k
)
...
@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
...
@@ -158,11 +140,6 @@ define(<ROUND_F3>, <
add
T2
,
$
5
add
T2
,
$
5
>
)
>
)
C
As
suggested
by
George
Spelvin
,
write
the
F3
function
as
C
(
x
&
y
)
|
(
y
&
z
)
|
(
x
&
z
)
==
(
x
&
(
y
^
z
))
+
(
y
&
z
)
.
Then
,
we
can
compute
C
and
add
each
term
to
e
,
using
a
single
temporary.
.file
"sha1-compress.asm"
.file
"sha1-compress.asm"
C
_nettle_sha1_compress
(
uint32_t
*
state
,
uint8_t
*
data
)
C
_nettle_sha1_compress
(
uint32_t
*
state
,
uint8_t
*
data
)
...
@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
...
@@ -179,7 +156,6 @@ PROLOGUE(_nettle_sha1_compress)
pushl
%
esi
C
68
(
%
esp
)
pushl
%
esi
C
68
(
%
esp
)
pushl
%
edi
C
64
(
%
esp
)
pushl
%
edi
C
64
(
%
esp
)
C
FIXME
:
Trim
to
64
subl
$
64
,
%
esp
C
%
esp
=
W
subl
$
64
,
%
esp
C
%
esp
=
W
C
Load
and
byte
swap
data
C
Load
and
byte
swap
data
...
@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
...
@@ -309,121 +285,3 @@ PROLOGUE(_nettle_sha1_compress)
popl
%
ebx
popl
%
ebx
ret
ret
EPILOGUE
(
_nettle_sha1_compress
)
EPILOGUE
(
_nettle_sha1_compress
)
C
George
Spelvin
also
suggested
using
lea
,
with
an
immediate
offset
C
for
the
magic
constants.
This
frees
one
register
,
which
can
be
used
C
to
loosen
up
dependencies
and
to
do
more
operations
in
parallel.
For
C
example
,
take
the
rounds
involving
f2
,
the
simplest
round
function.
C
Currently
,
we
have
C
C
movl
16
(
%
esp
),
T1
C
xorl
24
(
%
esp
),
T1
C
xorl
48
(
%
esp
),
T1
C
xorl
4
(
%
esp
),
T1
C
roll
$
1
,
T1
C
movl
T1
,
16
(
%
esp
)
C
addl
KVALUE
,
SE
C
0
C
addl
T1
,
SE
C
1
C
movl
SB
,
T1
C
0
C
xorl
SC
,
T1
C
1
C
xorl
SD
,
T1
C
2
C
addl
T1
,
SE
C
3
C
movl
SA
,
T1
C
0
C
roll
$
5
,
T1
C
1
C
addl
T1
,
SE
C
4
C
roll
$
30
,
SB
C
0
C
These
16
instructions
could
be
executed
in
5.33
cycles
if
there
were
C
no
dependencies.
The
crucial
dependencies
are
from
(
previous
)
SE
to
C
use
SA
,
and
(
previous
)
result
SB
to
use
SC.
(
What
does
this
say
C
about
recurrency
chain?
Ought
to
unroll
5
times
to
see
it
)
.
C
It
would
be
preferable
to
accumulate
the
terms
in
two
or
more
C
registers
,
to
make
dependencies
shallower.
Something
like
C
...expand
,
put
data
in
W
C
movl
SD
,
T1
C
0
C
leal
K1VALUE
(
SE
,
W
),
SE
C
0
C
movl
SA
,
T2
C
0
C
xorl
SC
,
T1
C
1
C
roll
$
5
,
T2
C
1
C
xorl
SB
,
T1
C
2
C
addl
T2
,
T1
C
3
C
addl
T1
,
SE
C
4
C
a
+
b
+
c
+
d
+
e
=
((((
a
+
b
)
+
c
)
+
d
)
+
e
),
latency
4
C
a
+
b
+
c
+
d
+
e
=
((
a
+
b
)
+
c
)
+
(
d
+
e
)
C
the
out
-
of
-
order
execution.
Next
iteration
C
C
...expand...
C
roll
$
1
,
T1
C
4
C
movl
T1
,
16
(
%
esp
)
C
5
C
addl
KVALUE
,
SD
C
0
C
addl
T1
,
SD
C
5
C
movl
SA
,
T1
C
0
C
xorl
SB
,
T1
C
1
C
xorl
SC
,
T1
C
2
C
addl
T1
,
SD
C
6
C
movl
SE
,
T1
C
8
C
roll
$
5
,
T1
C
9
C
addl
T1
,
SD
C
7
C
roll
$
30
,
SA
C
0
C
C
Let's
look
at
the
latency.
Next
iteration
will
operate
on
(
E
,
A
,
B
,
C
,
D
),
so
we
have
recurrencies
:
C
from
result
SA
to
use
of
SE
(
none
,
SA
not
modified
)
C
from
result
of
SB
to
use
of
SA
,
result
of
SC
to
use
of
SB
C
It's
possible
to
shave
off
half
of
the
stores
to
tmp
in
the
evaluation
of
f3
,
C
although
it's
probably
not
worth
the
effort.
This
is
the
trick
:
C
C
round
(
a
,
b
,
c
,
d
,
e
,
f
,
k
)
modifies
only
b
,
e.
C
C
round
(
a
,
b
,
c
,
d
,
e
,
f3
,
k
)
C
round
(
e
,
a
,
b
,
c
,
d
,
f3
,
k
)
C
C
; f3(b,c,d) = (b & c) | (d & (b | c))
C
C
movl
b
,
tmp
C
andl
c
,
tmp
C
movl
tmp
,
tmp2
C
movl
b
,
tmp
C
orl
c
,
tmp
C
andl
d
,
tmp
C
orl
tmp2
,
tmp
C
C
and
corresponding
code
for
f3
(
a
,
b
,
c
)
C
C
Use
the
register
allocated
for
c
as
a
temporary?
C
C
movl
c
,
tmp2
C
; f3(b,c,d) = (b & c) | (d & (b | c))
C
movl
b
,
tmp
C
orl
c
,
tmp
C
andl
b
,
c
C
andl
d
,
tmp
C
orl
c
,
tmp
C
C
; f3(a,b,c) = (a & b) | (c & (a | b))
C
movl
b
,
tmp
C
andl
a
,
tmp
C
movl
a
,
c
C
orl
b
,
c
C
andl
tmp2
,
c
C
orl
c
,
tmp
C
C
movl
tmp2
,
c
C
C
Before
:
14
instr
,
2
store
,
2
load
C
After
:
13
instr
,
1
store
,
2
load
C
C
Final
load
can
be
folded
into
the
next
round
,
C
C
round
(
d
,
e
,
a
,
b
,
c
,
f3
,
k
)
C
C
c
+=
d
<<<
5
+
f
(
e
,
a
,
b
)
+
k
+
w
C
C
if
we
arrange
to
have
w
placed
directly
into
the
register
C
corresponding
to
w.
That
way
we
save
one
more
instruction
,
total
save
C
of
two
instructions
,
one
of
which
is
a
store
,
per
two
rounds.
For
the
C
twenty
rounds
involving
f3
,
that's
20
instructions
,
10
of
which
are
C
stores
,
or
about
1.5
%
.
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment