Commit f2f7f56c authored Dec 03, 2012 by Niels Möller
Basic x86_64 sha3-permute.asm.
parent d56962a0

Showing 3 changed files
ChangeLog
2012-12-03  Niels Möller  <nisse@lysator.liu.se>

        * configure.ac: Added sha3-permute.asm.

        * x86_64/sha3-permute.asm: New file. 30% speedup over current C
        code, 4300 cycles.

        * nettle.texinfo (Hash functions): Split into several sections,
        separating recommended hash functions and legacy hash functions.
        Document sha3-256.
...
...
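For orientation, the permutation the new assembly implements is Keccak-f[1600]: 24 rounds of the theta, rho, pi, chi and iota steps over a 5x5 state of 64-bit words. The C sketch below mirrors that round structure. The rotation counts, permutation targets and round constants are the same values that appear in the "rot r, perm p" comments and the .quad table of x86_64/sha3-permute.asm; the function name keccak_f1600 and the helper tables are illustrative only, not nettle's own code.

#include <stdint.h>

#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))

/* Keccak round constants, identical to the .quad table in the asm file. */
static const uint64_t rc[24] = {
  0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
  0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
  0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
  0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
  0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
  0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
  0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
  0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL,
};

/* Rho rotation counts and pi destination indices for A[0..24], in the
   same order as the "rot r, perm p" comments in the assembly. */
static const unsigned rho[25] = {
   0,  1, 62, 28, 27, 36, 44,  6, 55, 20,  3, 10, 43,
  25, 39, 41, 45, 15, 21,  8, 18,  2, 61, 56, 14
};
static const unsigned pi[25] = {
   0, 10, 20,  5, 15, 16,  1, 11, 21,  6,  7, 17,  2,
  12, 22, 23,  8, 18,  3, 13, 14, 24,  9, 19,  4
};

void keccak_f1600(uint64_t a[25])
{
  uint64_t b[25], c[5], d[5];
  for (unsigned round = 0; round < 24; round++)
    {
      /* theta: column parities, combined and xored back onto the state */
      for (unsigned x = 0; x < 5; x++)
        c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
      for (unsigned x = 0; x < 5; x++)
        d[x] = c[(x + 4) % 5] ^ ROTL64(c[(x + 1) % 5], 1);
      for (unsigned i = 0; i < 25; i++)
        a[i] ^= d[i % 5];

      /* rho and pi: rotate each word, store it at its permuted position */
      for (unsigned i = 0; i < 25; i++)
        b[pi[i]] = rho[i] ? ROTL64(a[i], rho[i]) : a[i];

      /* chi: within each row, a = b ^ (~b_next & b_next2) */
      for (unsigned y = 0; y < 25; y += 5)
        for (unsigned x = 0; x < 5; x++)
          a[y + x] = b[y + x] ^ (~b[y + (x + 1) % 5] & b[y + (x + 2) % 5]);

      /* iota: xor the round constant into a[0] */
      a[0] ^= rc[round];
    }
}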
configure.ac
...
...
@@ -240,7 +240,7 @@ if test "x$enable_assembler" = xyes ; then
md5-compress.asm memxor.asm \
salsa20-crypt.asm salsa20-core-internal.asm \
serpent-encrypt.asm serpent-decrypt.asm \
- sha1-compress.asm machine.m4; do
+ sha1-compress.asm sha3-permute.asm machine.m4; do
# echo "Looking for $srcdir/$asm_path/$tmp_f"
if test -f "$srcdir/$asm_path/$tmp_f"; then
# echo found
...
...
x86_64/sha3-permute.asm
0 → 100644
C nettle, low-level cryptographics library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB.  If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
define(<CTX>,   <%rdi>)         C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>)          C Avoid clobbering %rsi, for W64.

define(<C01>,   <%xmm0>)
define(<C23>,   <%xmm1>)
define(<C4>,    <%rdx>)
define(<T01>,   <%xmm2>)
define(<T23>,   <%xmm3>)
define(<T4>,    <%r9>)

define(<D12>,   <%xmm4>)
define(<D34>,   <%xmm5>)
define(<D0>,    <%r10>)

define(<T40>,   <%xmm6>)
define(<D43>,   <%xmm7>)

define(<RC_END>, <%r11>)

define(<FRAME_SIZE>, <200>)

define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<A>, <OFFSET($1)(CTX)>)
define(<B>, <OFFSET($1)(%rsp)>)
C FIXME: Possible optimizations.
C * Compute the parity vector C at the end of the chi step.
C   This avoids one pass over the data.
C * Micro optimizations with register use and scheduling.
C * Try different order during the permutation step, maybe
C   doing sequential writes rather than sequential reads.
C * Try to do the permutation and the chi step, without
C   storing intermediate values? That would reduce the
C   number of passes over the data. We still need a copy, but
C   we would let the theta step produce that copy.
        .file "sha3-permute.asm"

        C sha3_permute(struct sha3_state *ctx)
        .text
        ALIGN(4)
PROLOGUE(nettle_sha3_permute)
        W64_ENTRY(1, 8)
        subq    $FRAME_SIZE, %rsp
        movl    $24, XREG(COUNT)
        negq    COUNT
        lea     .rc_end(%rip), RC_END
        ALIGN(4)

.Loop:
        C theta step
        C Compute parity vector C[0,...,4].
        movups  A(0), C01
        movups  A(2), C23
        movq    A(4), C4

        movups  A(5), T01
        movups  A(7), T23
        xorq    A(9), C4                C C[4] ^= A[9]
        pxor    T01, C01                C C[0,1] ^= A[5,6]
        movups  A(10), T01
        pxor    T23, C23                C C[2,3] ^= A[7,8]

        movups  A(12), T23
        xorq    A(14), C4               C C[4] ^= A[14]
        pxor    T01, C01                C C[0,1] ^= A[10,11]
        movups  A(15), T01
        pxor    T23, C23                C C[2,3] ^= A[12,13]

        movups  A(17), T23
        xorq    A(19), C4               C C[4] ^= A[19]
        pxor    T01, C01                C C[0,1] ^= A[15,16]
        movups  A(20), T01
        pxor    T23, C23                C C[2,3] ^= A[17,18]

        movups  A(22), T23
        xorq    A(24), C4               C C[4] ^= A[24]
        pxor    T01, C01                C C[0,1] ^= A[20,21]
        pxor    T23, C23                C C[2,3] ^= A[22,23]
        C Combine parity bits:
        C   D[0] = C[4] ^ ROTL64(1, C[1])
        C   D[1,2] = C[0,1] ^ ROTL64(1, C[2,3])
        C   D[3,4] = C[2,3] ^ ROTL64(1, C[4,0])
        C Copy to D0, D12, D34, rotate original
        movdqa  C01, D12
        movdqa  C23, D34
        movdqa  C01, T01
        movdqa  C23, T23
        psllq   $1, T01
        psllq   $1, T23
        psrlq   $63, C01
        psrlq   $63, C23
        movq    C4, D0
        rolq    $1, C4
        por     T01, C01
        por     T23, C23

        C Move around, putting
        C   T4 <-- ROTL(1, C1), T40 <-- ROTL(1, C[4,0])
        movq    C4, T40
        punpcklqdq      C01, T40
        psrldq  $8, C01
        movd    C01, T4                 C Really a movq!

        pxor    C23, D12
        xorq    T4, D0
        pxor    T40, D34
        C xor D on top of state
        xorq    D0, A(0)
        movups  A(1), T01
        movups  A(3), T23
        pxor    D12, T01
        pxor    D34, T23
        movups  T01, A(1)
        movups  T23, A(3)

        xorq    D0, A(5)
        movups  A(6), T01
        movups  A(8), T23
        pxor    D12, T01
        pxor    D34, T23
        movups  T01, A(6)
        movups  T23, A(8)

        xorq    D0, A(10)
        movups  A(11), T01
        movups  A(13), T23
        pxor    D12, T01
        pxor    D34, T23
        movups  T01, A(11)
        movups  T23, A(13)

        xorq    D0, A(15)
        movups  A(16), T01
        movups  A(18), T23
        pxor    D12, T01
        pxor    D34, T23
        movups  T01, A(16)
        movups  T23, A(18)

        xorq    D0, A(20)
        movups  A(21), T01
        movups  A(23), T23
        pxor    D12, T01
        pxor    D34, T23
        movups  T01, A(21)
        movups  T23, A(23)
        C rho and pi steps: Rotate and permute
        movq    A(0), C4                C rot  0, perm  0
        movq    A(1), T4                C rot  1, perm 10
        movq    C4, B(0)
        rolq    $1, T4
        movq    A(2), C4                C rot 62, perm 20
        movq    T4, B(10)
        rolq    $62, C4
        movq    A(3), T4                C rot 28, perm  5
        movq    C4, B(20)
        rolq    $28, T4
        movq    A(4), C4                C rot 27, perm 15
        movq    T4, B(5)
        rolq    $27, C4
        movq    A(5), T4                C rot 36, perm 16
        movq    C4, B(15)
        rolq    $36, T4
        movq    A(6), C4                C rot 44, perm  1
        movq    T4, B(16)
        rolq    $44, C4
        movq    A(7), T4                C rot  6, perm 11
        movq    C4, B(1)
        rolq    $6, T4
        movq    A(8), C4                C rot 55, perm 21
        movq    T4, B(11)
        rolq    $55, C4
        movq    A(9), T4                C rot 20, perm  6
        movq    C4, B(21)
        rolq    $20, T4
        movq    A(10), C4               C rot  3, perm  7
        movq    T4, B(6)
        rolq    $3, C4
        movq    A(11), T4               C rot 10, perm 17
        movq    C4, B(7)
        rolq    $10, T4
        movq    A(12), C4               C rot 43, perm  2
        movq    T4, B(17)
        rolq    $43, C4
        movq    A(13), T4               C rot 25, perm 12
        movq    C4, B(2)
        rolq    $25, T4
        movq    A(14), C4               C rot 39, perm 22
        movq    T4, B(12)
        rolq    $39, C4
        movq    A(15), T4               C rot 41, perm 23
        movq    C4, B(22)
        rolq    $41, T4
        movq    A(16), C4               C rot 45, perm  8
        movq    T4, B(23)
        rolq    $45, C4
        movq    A(17), T4               C rot 15, perm 18
        movq    C4, B(8)
        rolq    $15, T4
        movq    A(18), C4               C rot 21, perm  3
        movq    T4, B(18)
        rolq    $21, C4
        movq    A(19), T4               C rot  8, perm 13
        movq    C4, B(3)
        rolq    $8, T4
        movq    A(20), C4               C rot 18, perm 14
        movq    T4, B(13)
        rolq    $18, C4
        movq    A(21), T4               C rot  2, perm 24
        movq    C4, B(14)
        rolq    $2, T4
        movq    A(22), C4               C rot 61, perm  9
        movq    T4, B(24)
        rolq    $61, C4
        movq    A(23), T4               C rot 56, perm 19
        movq    C4, B(9)
        rolq    $56, T4
        movq    A(24), C4               C rot 14, perm  4
        movq    T4, B(19)
        rolq    $14, C4
        movq    C4, B(4)
        C chi step
        C Read with some overlap, pairs C01, D12, D34
        C Then also construct pairs C23 and T40.
        C We do the operations as
        C   A01 = B01 ^ (~B12 & B23)
        C   A12 = B12 ^ (~B23 & B34)
        C   A34 = B34 ^ (~B40 & B01)
        C Where we store only the low 64 bits of A01, and add in the
        C round key if applicable.
        movups  B(0), C01
        movups  B(1), D12
        movups  B(3), D34
        pshufd  $0x4e, D34, D43
        movdqa  D43, T40
        punpcklqdq      C01, T40        C Get 40
        movdqa  D12, C23
        punpckhqdq      D43, C23        C Get 23
        pandn   C01, T40
        pxor    D34, T40
        movups  T40, A(3)

        movdqa  D12, T40
        pandn   C23, T40
        pxor    C01, T40
        movd    T40, T4                 C Really movq!
        xorq    (RC_END, COUNT, 8), T4
        movq    T4, A(0)

        pandn   D34, C23
        pxor    D12, C23
        movups  C23, A(1)
        movups  B(5), C01
        movups  B(6), D12
        movups  B(8), D34
        pshufd  $0x4e, D34, D43
        movdqa  D43, T40
        punpcklqdq      C01, T40        C Get 40
        movdqa  D12, C23
        punpckhqdq      D43, C23        C Get 23
        pandn   C01, T40
        pxor    D34, T40
        movups  T40, A(8)

        movdqa  D12, T40
        pandn   C23, T40
        pxor    C01, T40
        movq    T40, A(5)

        pandn   D34, C23
        pxor    D12, C23
        movups  C23, A(6)
        movups  B(10), C01
        movups  B(11), D12
        movups  B(13), D34
        pshufd  $0x4e, D34, D43
        movdqa  D43, T40
        punpcklqdq      C01, T40        C Get 40
        movdqa  D12, C23
        punpckhqdq      D43, C23        C Get 23
        pandn   C01, T40
        pxor    D34, T40
        movups  T40, A(13)

        movdqa  D12, T40
        pandn   C23, T40
        pxor    C01, T40
        movq    T40, A(10)

        pandn   D34, C23
        pxor    D12, C23
        movups  C23, A(11)
        movups  B(15), C01
        movups  B(16), D12
        movups  B(18), D34
        pshufd  $0x4e, D34, D43
        movdqa  D43, T40
        punpcklqdq      C01, T40        C Get 40
        movdqa  D12, C23
        punpckhqdq      D43, C23        C Get 23
        pandn   C01, T40
        pxor    D34, T40
        movups  T40, A(18)

        movdqa  D12, T40
        pandn   C23, T40
        pxor    C01, T40
        movq    T40, A(15)

        pandn   D34, C23
        pxor    D12, C23
        movups  C23, A(16)
        movups  B(20), C01
        movups  B(21), D12
        movups  B(23), D34
        pshufd  $0x4e, D34, D43
        movdqa  D43, T40
        punpcklqdq      C01, T40        C Get 40
        movdqa  D12, C23
        punpckhqdq      D43, C23        C Get 23
        pandn   C01, T40
        pxor    D34, T40
        movups  T40, A(23)

        movdqa  D12, T40
        pandn   C23, T40
        pxor    C01, T40
        movq    T40, A(20)

        pandn   D34, C23
        pxor    D12, C23
        movups  C23, A(21)
        incq    COUNT
        jnz     .Loop

        addq    $FRAME_SIZE, %rsp
        W64_EXIT(1, 8)
        ret
EPILOGUE(nettle_sha3_permute)
        ALIGN(4)
        .quad   0x0000000000000001, 0x0000000000008082
        .quad   0x800000000000808A, 0x8000000080008000
        .quad   0x000000000000808B, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000000008009
        .quad   0x000000000000008A, 0x0000000000000088
        .quad   0x0000000080008009, 0x000000008000000A
        .quad   0x000000008000808B, 0x800000000000008B
        .quad   0x8000000000008089, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000000080
        .quad   0x000000000000800A, 0x800000008000000A
        .quad   0x8000000080008081, 0x8000000000008080
        .quad   0x0000000080000001, 0x8000000080008008
.rc_end:
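Applications do not call the permutation directly; it is driven through nettle's SHA3-256 hash interface, and the assembly version is picked up in place of the C sha3_permute() when the library is configured with assembler support, as the configure.ac hunk above shows. A minimal caller-side sketch, assuming the sha3.h names of released nettle (sha3_256_init/update/digest; verify them against the tree at this commit):

#include <stdio.h>
#include <nettle/sha3.h>

int main(void)
{
  static const unsigned char msg[] = "abc";
  unsigned char digest[SHA3_256_DIGEST_SIZE];
  struct sha3_256_ctx ctx;

  sha3_256_init(&ctx);
  sha3_256_update(&ctx, sizeof(msg) - 1, msg);   /* nettle passes length before data */
  sha3_256_digest(&ctx, SHA3_256_DIGEST_SIZE, digest);

  for (unsigned i = 0; i < SHA3_256_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  putchar('\n');
  return 0;
}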