Nettle / nettle · Commit 588017df
authored Dec 13, 2012 by Niels Möller

Rewrote x86_64 sha3-permute.asm.

parent a7457dfa
Changes 2
ChangeLog @ 588017df
2012-12-13  Niels Möller  <nisse@lysator.liu.se>

	* x86_64/sha3-permute.asm: Rewrote, to keep all state in
	registers. 2400 cycles on x86_64, only slightly faster than the
	current C code.
2012-12-09  Niels Möller  <nisse@lysator.liu.se>

	* sha3-permute.c (sha3_permute): Rewrote to do permutation in
	...
x86_64/sha3-permute.asm @ 588017df
...
...
@@ -20,41 +20,59 @@ C MA 02111-1301, USA.
define(<CTX>, <%rdi>)	C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>)	C Avoid clobbering %rsi, for W64.
define(<C01>, <%xmm0>)
define(<C23>, <%xmm1>)
define(<C4>, <%rdx>)
define(<A00>, <%rax>)
define(<A0102>, <%xmm0>)
define(<A0304>, <%xmm1>)
define(<T01>, <%xmm2>)
define(<T23>, <%xmm3>)
define(<T4>, <%r9>)
define(<D12>, <%xmm4>)
define(<D34>, <%xmm5>)
define(<D0>, <%r10>)
define(<T40>, <%xmm6>)
define(<D43>, <%xmm7>)
define(<A05>, <%rcx>)
define(<A0607>, <%xmm2>)
define(<A0809>, <%xmm3>)
define(<A10>, <%rdx>)
define(<A1112>, <%xmm4>)
define(<A1314>, <%xmm5>)
define(<RC_END>, <%r11>)
define(<A15>, <%rbp>)
define(<A1617>, <%xmm6>)
define(<A1819>, <%xmm7>)
define(<A20>, <%r9>)
define(<A2122>, <%xmm8>)
define(<A2324>, <%xmm9>)
define(<FRAME_SIZE>, <200>)
define(<C0>, <%r10>)
define(<C12>, <%xmm10>)
define(<C34>, <%xmm11>)
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<A>, <OFFSET($1)(CTX)>)
define(<B>, <OFFSET($1)(%rsp)>)
define(<D0>, <%r11>)
define(<D12>, <%xmm12>)
define(<D34>, <%xmm13>)
C FIXME: Possible optimizations.
C Wide temporaries
define(<W0>, <%xmm14>)
define(<W1>, <%xmm15>)
define(<W2>, <%xmm12>)	C Overlap D12
define(<W3>, <%xmm13>)	C Overlap D34
C * Compute the parity vector
C   at the end of the chi step.
C   This avoids one pass over the data.
C * Micro optimizations with register use and scheduling.
define(<T0>, <%r12>)
define(<T1>, <%r13>)
define(<T2>, <%r11>)	C Overlap D0
define(<T3>, <%r10>)	C Overlap C0
define(<RC>, <%r14>)
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<STATE>, <OFFSET($1)(CTX)>)
C * Try different order during the permutation step, maybe
C   doing sequential writes rather than sequential reads.
define(<SWAP64>, <pshufd	<$>0x4e,>)
C * Try to do the permutation and the chi step, without
C   storing intermediate values? That would reducing the
C   number of passes over the data. We still need a copy, but
C   we would let the theta step produce that copy.

C ROTL64(rot, register, temp)
C Caller needs to or together the result.
define(<ROTL64>, <
	movdqa	$2, $3
	psllq	<$>$1, $2
	psrlq	<$>eval(64-$1), $3
>)
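For reference, ROTL64 splits a 64-bit left rotate into a left shift and a complementary right shift, leaving the final OR to the caller so it can be scheduled freely. A minimal C sketch of the same technique (the helper name is illustrative, not from the source):

#include <stdint.h>

/* Split rotate, mirroring ROTL64(rot, register, temp): the macro
   produces the two shifted halves and the caller ORs them together. */
static uint64_t
rotl64(uint64_t x, unsigned rot)   /* 0 < rot < 64 */
{
  uint64_t hi = x << rot;          /* psllq $rot      */
  uint64_t lo = x >> (64 - rot);   /* psrlq $(64-rot) */
  return hi | lo;                  /* por, done by the caller in the asm */
}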
	.file "sha3-permute.asm"
...
...
@@ -62,351 +80,416 @@ define(<B>, <OFFSET($1)(%rsp)>)
	.text
	ALIGN(4)
PROLOGUE(nettle_sha3_permute)
	W64_ENTRY(1, 8)
	subq	$FRAME_SIZE, %rsp
	movl	$24, XREG(COUNT)
	negq	COUNT
	W64_ENTRY(1, 16)
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	lea	.rc_end(%rip), RC_END
	movl	$24, XREG(COUNT)
	lea	.rc-8(%rip), RC
	movq	STATE(0), A00
	movups	STATE(1), A0102
	movups	STATE(3), A0304
	movq	A00, C0
	movq	STATE(5), A05
	movdqa	A0102, C12
	movups	STATE(6), A0607
	movdqa	A0304, C34
	movups	STATE(8), A0809
	xorq	A05, C0
	movq	STATE(10), A10
	pxor	A0607, C12
	movups	STATE(11), A1112
	pxor	A0809, C34
	movups	STATE(13), A1314
	xorq	A10, C0
	movq	STATE(15), A15
	pxor	A1112, C12
	movups	STATE(16), A1617
	pxor	A1314, C34
	movups	STATE(18), A1819
	xorq	A15, C0
	movq	STATE(20), A20
	pxor	A1617, C12
	movups	STATE(21), A2122
	pxor	A1819, C34
	movups	STATE(23), A2324
	xorq	A20, C0
	pxor	A2122, C12
	pxor	A2324, C34

	ALIGN(4)
.Loop:
	C theta step
	C Compute parity vector C[0,...,4].
	movups	A(0), C01
	movups	A(2), C23
	movq	A(4), C4
	movups	A(5), T01
	movups	A(7), T23
	xorq	A(9), C4	C C[4] ^= A[9]
	pxor	T01, C01	C C[0,1] ^= A[5,6]
	movups	A(10), T01
	pxor	T23, C23	C C[2,3] ^= A[7,8]
	movups	A(12), T23
	xorq	A(14), C4	C C[4] ^= A[14]
	pxor	T01, C01	C C[0,1] ^= A[10,11]
	movups	A(15), T01
	pxor	T23, C23	C C[2,3] ^= A[12,13]
	movups	A(17), T23
	xorq	A(19), C4	C C[4] ^= A[19]
	pxor	T01, C01	C C[0,1] ^= A[15,16]
	movups	A(20), T01
	pxor	T23, C23	C C[2,3] ^= A[17,18]
	movups	A(22), T23
	xorq	A(24), C4	C C[4] ^= A[24]
	pxor	T01, C01	C C[0,1] ^= A[20,21]
	pxor	T23, C23	C C[2,3] ^= A[22,23]
	C Combine parity bits:
	C D[0]   = C[4]   ^ ROTL64(1, C[1])
	C D[1,2] = C[0,1] ^ ROTL64(1, C[2,3])
	C D[3,4] = C[2,3] ^ ROTL64(1, C[4,0])
	C The theta step. Combine parity bits, then xor to state.
	C D0 = C4 ^ (C1 <<< 1)
	C D1 = C0 ^ (C2 <<< 1)
	C D2 = C1 ^ (C3 <<< 1)
	C D3 = C2 ^ (C4 <<< 1)
	C D4 = C3 ^ (C0 <<< 1)
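The formulas above are the whole theta step: column parities C, the combined vector D, and an xor of D over every column. A scalar C sketch of the same computation on the 25-word state indexed row by row (array and macro names are illustrative, not from the source):

#include <stdint.h>

#define ROTL64(n, x) (((x) << (n)) | ((x) >> (64 - (n))))

/* theta: C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20]
          D[x] = C[(x+4) % 5] ^ ROTL64(1, C[(x+1) % 5])
          A[y + x] ^= D[x] for every row y. */
static void
sha3_theta(uint64_t A[25])
{
  uint64_t C[5], D[5];
  unsigned x, y;

  for (x = 0; x < 5; x++)
    C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20];

  for (x = 0; x < 5; x++)
    D[x] = C[(x+4) % 5] ^ ROTL64(1, C[(x+1) % 5]);

  for (y = 0; y < 25; y += 5)
    for (x = 0; x < 5; x++)
      A[y + x] ^= D[x];
}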
	C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
	C D34, and (C4, C0) in C34.
	C Copy to D0, D12, D34, rotate original
	movdqa	C01, D12
	movdqa	C23, D34
	movdqa	C01, T01
	movdqa	C23, T23
	psllq	$1, T01
	psllq	$1, T23
	psrlq	$63, C01
	psrlq	$63, C23
	movq	C4, D0
	rolq	$1, C4
	por	T01, C01
	por	T23, C23

	C Move around, putting
	C T4 <-- ROTL(1, C1), T40 <-- ROTL(1, C[4,0])
	movq	C4, T40
	punpcklqdq	C01, T40
	psrldq	$8, C01
	movd	C01, T4		C Really a movq!
	pxor	C23, D12
	xorq	T4, D0
	pxor	T40, D34

	C xor D on top of state
	xorq	D0, A(0)
	movups	A(1), T01
	movups	A(3), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(1)
	movups	T23, A(3)

	C Notes on "unpack" instructions:
	C   punpckhqdq 01, 23 gives 31
	C   punpcklqdq 01, 23 gives 20
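The same note, restated with SSE2 intrinsics: a register written as "01" holds word 0 in its low qword and word 1 in its high qword, and in the AT&T syntax used here the first operand is the source. A sketch, not from the source file:

#include <stdint.h>
#include <emmintrin.h>

/* Lane naming as in the comment: "01" = word 0 in the low qword,
   word 1 in the high qword. */
static __m128i
pair(uint64_t lo, uint64_t hi)
{
  return _mm_set_epi64x(hi, lo);   /* high lane first */
}

static void
unpack_demo(void)
{
  __m128i v01 = pair(0, 1);
  __m128i v23 = pair(2, 3);

  /* punpckhqdq 01, 23  ->  "31": low lane = 3, high lane = 1 */
  __m128i r_hi = _mm_unpackhi_epi64(v23, v01);
  /* punpcklqdq 01, 23  ->  "20": low lane = 2, high lane = 0 */
  __m128i r_lo = _mm_unpacklo_epi64(v23, v01);
  (void) r_hi; (void) r_lo;
}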
	SWAP64	C34, C34	C Holds C4, C3
	movdqa	C12, D34
	movq	C0, D12
	punpcklqdq	C12, D12	C Holds C0, C1
	punpckhqdq	C34, D34	C Holds C2, C3
	punpcklqdq	D12, C34	C Holds C4, C0
	movq	C34, D0
	movq	C12, T0
	rolq	$1, T0
	xorq	T0, D0

	C Can use C12 as temporary
	movdqa	D34, W0
	movdqa	D34, W1
	psllq	$1, W0
	psrlq	$63, W1
	pxor	W0, D12
	pxor	W1, D12		C Done D12

	xorq	D0, A(5)
	movups	A(6), T01
	movups	A(8), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(6)
	movups	T23, A(8)
	xorq	D0, A(10)
	movups	A(11), T01
	movups	A(13), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(11)
	movups	T23, A(13)
	xorq	D0, A(15)
	movups	A(16), T01
	movups	A(18), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(16)
	movups	T23, A(18)
	xorq	D0, A(20)
	movups	A(21), T01
	movups	A(23), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(21)
	movups	T23, A(23)
	C rho and pi steps: Rotate and permute
	movq	A(0), C4	C rot 0, perm 0
	movq	A(1), T4	C rot 1, perm 10
	movq	C4, B(0)
	rolq	$1, T4
	movq	A(2), C4	C rot 62, perm 20
	movq	T4, B(10)
	rolq	$62, C4
	movq	A(3), T4	C rot 28, perm 5
	movq	C4, B(20)
	rolq	$28, T4
	movq	A(4), C4	C rot 27, perm 15
	movq	T4, B(5)
	rolq	$27, C4
	movq	A(5), T4	C rot 36, perm 16
	movq	C4, B(15)
	rolq	$36, T4
	movq	A(6), C4	C rot 44, perm 1
	movq	T4, B(16)
	rolq	$44, C4
	movq	A(7), T4	C rot 6, perm 11
	movq	C4, B(1)
	rolq	$6, T4
	movq	A(8), C4	C rot 55, perm 21
	movq	T4, B(11)
	rolq	$55, C4
	movq	A(9), T4	C rot 20, perm 6
	movq	C4, B(21)
	rolq	$20, T4
	movq	A(10), C4	C rot 3, perm 7
	movq	T4, B(6)
	rolq	$3, C4
	movq	A(11), T4	C rot 10, perm 17
	movq	C4, B(7)
	rolq	$10, T4
	movq	A(12), C4	C rot 43, perm 2
	movq	T4, B(17)
	rolq	$43, C4
	movq	A(13), T4	C rot 25, perm 12
	movq	C4, B(2)
	rolq	$25, T4
	movq	A(14), C4	C rot 39, perm 22
	movq	T4, B(12)
	rolq	$39, C4
	movq	A(15), T4	C rot 41, perm 23
	movq	C4, B(22)
	rolq	$41, T4
	movq	A(16), C4	C rot 45, perm 8
	movq	T4, B(23)
	rolq	$45, C4
	movq	A(17), T4	C rot 15, perm 18
	movq	C4, B(8)
	rolq	$15, T4
	movq	A(18), C4	C rot 21, perm 3
	movq	T4, B(18)
	rolq	$21, C4
	movq	A(19), T4	C rot 8, perm 13
	movq	C4, B(3)
	rolq	$8, T4
	movq	A(20), C4	C rot 18, perm 14
	movq	T4, B(13)
	rolq	$18, C4
	movq	A(21), T4	C rot 2, perm 24
	movq	C4, B(14)
	rolq	$2, T4
	movq	A(22), C4	C rot 61, perm 9
	movq	T4, B(24)
	rolq	$61, C4
	movq	A(23), T4	C rot 56, perm 19
	movq	C4, B(9)
	rolq	$56, T4
	movq	A(24), C4	C rot 14, perm 4
	movq	T4, B(19)
	rolq	$14, C4
	movq	C4, B(4)
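Each "rot r, perm p" annotation above says that source word i is rotated left by r and stored at slot p of the B array. Collected into tables, the whole rho-and-pi pass reads as the following C sketch (the table values are copied from the comments; the function name is illustrative):

#include <stdint.h>

#define ROTL64(n, x) (((x) << (n)) | ((x) >> (64 - (n))))

/* rho[i] and pi[i] follow the "C rot r, perm p" comments:
   B[pi[i]] = ROTL64(rho[i], A[i]). */
static const unsigned char rho[25] = {
   0,  1, 62, 28, 27,
  36, 44,  6, 55, 20,
   3, 10, 43, 25, 39,
  41, 45, 15, 21,  8,
  18,  2, 61, 56, 14,
};
static const unsigned char pi[25] = {
   0, 10, 20,  5, 15,
  16,  1, 11, 21,  6,
   7, 17,  2, 12, 22,
  23,  8, 18,  3, 13,
  14, 24,  9, 19,  4,
};

static void
sha3_rho_pi(const uint64_t A[25], uint64_t B[25])
{
  unsigned i;
  B[0] = A[0];                       /* rot 0, perm 0 */
  for (i = 1; i < 25; i++)
    B[pi[i]] = ROTL64(rho[i], A[i]);
}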
	C chi step
	C Read with some overlap, pairs C01, D12, D34
	C Then also construct pairs C23 and T40.
	C We do the operations as
	C A01 = B01 ^ (~B12 & B23)
	C A12 = B12 ^ (~B23 & B34)
	C A34 = B34 ^ (~B40 & B01)
	C Where we store only the low 64 bits of A01, and add in the
	C round key if applicable.
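Per 64-bit word, the chi formulas above are A[x] = B[x] ^ (~B[x+1] & B[x+2]) within each row of five, with the round constant xored into word 0 afterwards. A scalar C sketch (names are illustrative, not from the source):

#include <stdint.h>

/* chi on each 5-word row, plus the round-constant xor on word 0
   (the "round key" mentioned in the comment). */
static void
sha3_chi_iota(uint64_t A[25], const uint64_t B[25], uint64_t rc)
{
  unsigned y, x;
  for (y = 0; y < 25; y += 5)
    for (x = 0; x < 5; x++)
      A[y + x] = B[y + x] ^ (~B[y + (x+1) % 5] & B[y + (x+2) % 5]);
  A[0] ^= rc;   /* folded into the store of word 0, as in the asm */
}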
	movdqa	C34, C12
	psrlq	$63, C34
	psllq	$1, C12
	pxor	C34, D34
	pxor	C12, D34	C Done D34

	xorq	D0, A00
	xorq	D0, A05
	xorq	D0, A10
	xorq	D0, A15
	xorq	D0, A20
	pxor	D12, A0102
	pxor	D12, A0607
	pxor	D12, A1112
	pxor	D12, A1617
	pxor	D12, A2122
	pxor	D34, A0304
	pxor	D34, A0809
	pxor	D34, A1314
	pxor	D34, A1819
	pxor	D34, A2324
	C theta step done, no C, D or W temporaries alive.

	C rho and pi steps. When doing the permutations, also
	C transpose the matrix.
	movups	B(0), C01
	movups	B(1), D12
	movups	B(3), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(3)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movd	T40, T4		C Really movq!
	xorq	(RC_END, COUNT, 8), T4
	movq	T4, A(0)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(1)

	movups	B(5), C01
	movups	B(6), D12
	movups	B(8), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(8)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(5)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(6)

	movups	B(10), C01
	movups	B(11), D12
	movups	B(13), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(13)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(10)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(11)

	movups	B(15), C01
	movups	B(16), D12
	movups	B(18), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(18)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(15)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(16)

	movups	B(20), C01
	movups	B(21), D12
	movups	B(23), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(23)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(20)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(21)

	incq	COUNT
	C The combined permutation + transpose gives the following
	C cycles (rotation counts in parenthesis)
	C 0  <- 0(0)
	C 1  <- 3(28)  <- 4(27)  <- 2(62)  <- 1(1)
	C 5  <- 6(44)  <- 9(20)  <- 8(55)  <- 5(36)
	C 7  <- 7(6)
	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C 14 <- 14(39)
	C 15 <- 18(21) <- 17(15) <- 19(8)  <- 15(41)
	C 16 <- 16(45)
	C 20 <- 24(14) <- 21(2)  <- 22(61) <- 20(18)
	C 23 <- 23(56)
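Each cycle above can be applied in place with a single temporary: every position receives the word named to its right, rotated by the count in parentheses. A C sketch for the second cycle, "1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)" (cycle and rotation counts are copied from the comment; the register code below does the same work on SSE2 pairs):

#include <stdint.h>

#define ROTL64(n, x) (((x) << (n)) | ((x) >> (64 - (n))))

/* Apply one permutation cycle in place; only the head of the
   cycle needs to be saved in a temporary. */
static void
cycle_1_3_4_2(uint64_t A[25])
{
  uint64_t t = A[1];
  A[1] = ROTL64(28, A[3]);
  A[3] = ROTL64(27, A[4]);
  A[4] = ROTL64(62, A[2]);
  A[2] = ROTL64( 1, t);
}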
	C Do the 1,2,3,4 row. First rotate, then permute.
	movdqa	A0102, W0
	movdqa	A0102, W1
	movdqa	A0102, W2
	psllq	$1, A0102
	psrlq	$63, W0
	psllq	$62, W1
	por	A0102, W0	C rotl 1 (A01)
	psrlq	$2, W2
	por	W1, W2		C rotl 62 (A02)
	movdqa	A0304, A0102
	movdqa	A0304, W1
	psllq	$28, A0102
	psrlq	$36, W1
	por	W1, A0102	C rotl 28 (A03)
	movdqa	A0304, W1
	psllq	$27, A0304
	psrlq	$37, W1
	por	W1, A0304	C rotl 27 (A04)
	punpcklqdq	W0, A0102
	punpckhqdq	W2, A0304
	C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
	C 7 <- 7(6)
	C (diagram: regrouping of words 5, 6, 7, 8, 9)
	rolq	$36, A05
	movq	A05, W0
	movq	A0607, A05
	rolq	$44, A05	C Done A05
	ROTL64(6, A0607, W1)
	por	A0607, W1
	movdqa	A0809, A0607
	ROTL64(20, A0607, W2)
	por	W2, A0607
	punpckhqdq	W1, A0607	C Done A0607
	ROTL64(55, A0809, W1)
	por	A0809, W1
	movdqa	W0, A0809
	punpcklqdq	W1, A0809	C Done 0809
	C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
	C 14 <- 14(39)
	C (diagram: regrouping of words 10, 11, 12, 13, 14)
	rolq	$42, A10	C 42 + 25 = 3 (mod 64)
	SWAP64	A1112, W0
	movq	A10, A1112
	movq	W0, A10
	rolq	$43, A10	C Done A10
	punpcklqdq	A1314, A1112
	ROTL64(25, A1112, W1)
	por	W1, A1112	C Done A1112
	ROTL64(39, A1314, W2)
	por	A1314, W2
	ROTL64(10, W0, A1314)