Commit 588017df authored Dec 13, 2012 by Niels Möller
Rewrote x86_64 sha3-permute.asm.
parent a7457dfa

Showing 2 changed files with 451 additions and 362 deletions
ChangeLog                  +6    -0
x86_64/sha3-permute.asm    +445  -362
ChangeLog  View file @ 588017df
2012-12-13  Niels Möller  <nisse@lysator.liu.se>

	* x86_64/sha3-permute.asm: Rewrote, to keep all state in
	registers. 2400 cycles on x86_64, only slightly faster than the
	current C code.
2012-12-09  Niels Möller  <nisse@lysator.liu.se>

	* sha3-permute.c (sha3_permute): Rewrote to do permutation in
	...
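For context on the cycle count quoted above, a rough harness like the following can reproduce the measurement. A minimal sketch, assuming nettle's <nettle/sha3.h> declares sha3_permute() and struct sha3_state (25 64-bit lanes, as the assembly expects), and that __rdtsc() is available; the number printed is only approximate.

    /* Rough cycles-per-permutation estimate; editorial sketch, not part
       of the commit. Build: gcc -O2 bench.c -lnettle */
    #include <stdio.h>
    #include <string.h>
    #include <x86intrin.h>      /* __rdtsc() */
    #include <nettle/sha3.h>

    int main(void)
    {
      struct sha3_state state;
      memset(&state, 0, sizeof(state));

      /* Warm up caches and branch predictors. */
      for (int i = 0; i < 1000; i++)
        sha3_permute(&state);

      unsigned long long start = __rdtsc();
      for (int i = 0; i < 10000; i++)
        sha3_permute(&state);
      unsigned long long end = __rdtsc();

      printf("~%llu cycles per permutation\n", (end - start) / 10000);
      return 0;
    }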
x86_64/sha3-permute.asm  View file @ 588017df

...
@@ -20,41 +20,59 @@ C MA 02111-1301, USA.

define(<CTX>, <%rdi>)	C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>)	C Avoid clobbering %rsi, for W64.
define(<C01>, <%xmm0>)
define(<C23>, <%xmm1>)
define(<C4>, <%rdx>)
define(<A00>, <%rax>)
define(<A0102>, <%xmm0>)
define(<A0304>, <%xmm1>)
define(<T01>, <%xmm2>)
define(<T23>, <%xmm3>)
define(<T4>, <%r9>)
define(<D12>, <%xmm4>)
define(<D34>, <%xmm5>)
define(<D0>, <%r10>)
define(<T40>, <%xmm6>)
define(<D43>, <%xmm7>)
define(<A05>, <%rcx>)
define(<A0607>, <%xmm2>)
define(<A0809>, <%xmm3>)
define(<A10>, <%rdx>)
define(<A1112>, <%xmm4>)
define(<A1314>, <%xmm5>)
define(<RC_END>, <%r11>)
define(<A15>, <%rbp>)
define(<A1617>, <%xmm6>)
define(<A1819>, <%xmm7>)
define(<A20>, <%r9>)
define(<A2122>, <%xmm8>)
define(<A2324>, <%xmm9>)
define(<FRAME_SIZE>, <200>)
define(<C0>, <%r10>)
define(<C12>, <%xmm10>)
define(<C34>, <%xmm11>)
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<A>, <OFFSET($1)(CTX)>)
define(<B>, <OFFSET($1)(%rsp)>)
define(<D0>, <%r11>)
define(<D12>, <%xmm12>)
define(<D34>, <%xmm13>)
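As a reading aid for the addressing macros above: OFFSET(n) expands to 8*n (and to nothing for n = 0, so the assembler emits the shorter (%rdi) form), A(n) and STATE(n) address lane n of the 200-byte state that CTX points to, and B(n) addresses the copy kept on the stack. A hypothetical C rendering of that layout (the struct name is illustrative, not nettle's):

    /* How A()/B()/STATE() address lanes; editorial sketch only. */
    #include <stdint.h>

    struct sha3_ctx_layout {    /* hypothetical name; the asm just sees CTX */
      uint64_t a[25];           /* 25 64-bit lanes, 200 bytes */
    };

    /* OFFSET(n) in the m4 source expands to 8*n bytes (empty for n == 0). */
    #define LANE_OFFSET(n) ((n) * sizeof(uint64_t))

    static inline uint64_t load_lane(const struct sha3_ctx_layout *ctx, int n)
    {
      return ctx->a[n];         /* A(n): OFFSET(n)(CTX) */
    }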
C FIXME: Possible optimizations.

C Wide temporaries
define(<W0>, <%xmm14>)
define(<W1>, <%xmm15>)
define(<W2>, <%xmm12>)	C Overlap D12
define(<W3>, <%xmm13>)	C Overlap D34

C * Compute the parity vector
C   at the end of the chi step.
C   This avoids one pass over the data.

C * Micro optimizations with register use and scheduling.
define(<T0>, <%r12>)
define(<T1>, <%r13>)
define(<T2>, <%r11>)	C Overlap D0
define(<T3>, <%r10>)	C Overlap C0
define(<RC>, <%r14>)
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<STATE>, <OFFSET($1)(CTX)>)

C * Try different order during the permutation step, maybe
C   doing sequential writes rather than sequential reads.
define(<SWAP64>, <pshufd	<$>0x4e,>)

C * Try to do the permutation and the chi step, without
C   storing intermediate values? That would reduce the
C   number of passes over the data. We still need a copy, but
C   we would let the theta step produce that copy.

C ROTL64(rot, register, temp)
C Caller needs to or together the result.
define(<ROTL64>, <
	movdqa	$2, $3
	psllq	<$>$1, $2
	psrlq	<$>eval(64 - $1), $3
>)
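SSE2 has no 64-bit rotate instruction, which is why ROTL64 emits a shift pair and leaves the final por to the caller. Per lane, the operation it builds is the ordinary rotate; a scalar C sketch:

    /* Rotate left by n (0 < n < 64): what psllq/psrlq + por compute
       per 64-bit lane. Editorial sketch. */
    #include <stdint.h>

    static inline uint64_t rotl64(unsigned n, uint64_t x)
    {
      return (x << n) | (x >> (64 - n));
    }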
	.file "sha3-permute.asm"

...
@@ -62,351 +80,416 @@ define(<B>, <OFFSET($1)(%rsp)>)

	.text
	ALIGN(4)
PROLOGUE(nettle_sha3_permute)
	W64_ENTRY(1, 8)
	subq	$FRAME_SIZE, %rsp
	movl	$24, XREG(COUNT)
	negq	COUNT
	W64_ENTRY(1, 16)
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	lea	.rc_end(%rip), RC_END
	movl	$24, XREG(COUNT)
	lea	.rc-8(%rip), RC

	movq	STATE(0), A00
	movups	STATE(1), A0102
	movups	STATE(3), A0304
	movq	A00, C0
	movq	STATE(5), A05
	movdqa	A0102, C12
	movups	STATE(6), A0607
	movdqa	A0304, C34
	movups	STATE(8), A0809
	xorq	A05, C0
	movq	STATE(10), A10
	pxor	A0607, C12
	movups	STATE(11), A1112
	pxor	A0809, C34
	movups	STATE(13), A1314
	xorq	A10, C0
	movq	STATE(15), A15
	pxor	A1112, C12
	movups	STATE(16), A1617
	pxor	A1314, C34
	movups	STATE(18), A1819
	xorq	A15, C0
	movq	STATE(20), A20
	pxor	A1617, C12
	movups	STATE(21), A2122
	pxor	A1819, C34
	movups	STATE(23), A2324
	xorq	A20, C0
	pxor	A2122, C12
	pxor	A2324, C34

	ALIGN(4)
.Loop:
	C theta step
	C Compute parity vector C[0,...,4].
	movups	A(0), C01
	movups	A(2), C23
	movq	A(4), C4

	movups	A(5), T01
	movups	A(7), T23
	xorq	A(9), C4	C C[4] ^= A[9]
	pxor	T01, C01	C C[0,1] ^= A[5,6]
	movups	A(10), T01
	pxor	T23, C23	C C[2,3] ^= A[7,8]
	movups	A(12), T23
	xorq	A(14), C4	C C[4] ^= A[14]
	pxor	T01, C01	C C[0,1] ^= A[10,11]
	movups	A(15), T01
	pxor	T23, C23	C C[2,3] ^= A[12,13]
	movups	A(17), T23
	xorq	A(19), C4	C C[4] ^= A[19]
	pxor	T01, C01	C C[0,1] ^= A[15,16]
	movups	A(20), T01
	pxor	T23, C23	C C[2,3] ^= A[17,18]
	movups	A(22), T23
	xorq	A(24), C4	C C[4] ^= A[24]
	pxor	T01, C01	C C[0,1] ^= A[20,21]
	pxor	T23, C23	C C[2,3] ^= A[22,23]
	C Combine parity bits:
	C   D[0]   = C[4]   ^ ROTL64(1, C[1])
	C   D[1,2] = C[0,1] ^ ROTL64(1, C[2,3])
	C   D[3,4] = C[2,3] ^ ROTL64(1, C[4,0])

	C The theta step. Combine parity bits, then xor to state.
	C D0 = C4 ^ (C1 <<< 1)
	C D1 = C0 ^ (C2 <<< 1)
	C D2 = C1 ^ (C3 <<< 1)
	C D3 = C2 ^ (C4 <<< 1)
	C D4 = C3 ^ (C0 <<< 1)
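Taken together, the parity and combine comments above amount to the standard theta step. A scalar C sketch of the whole step (the assembly interleaves it with the SSE register pairing described next):

    /* theta: parity of each column, then xor a rotated parity pair
       into every lane. Editorial sketch over a flat 25-lane state. */
    #include <stdint.h>

    static inline uint64_t rotl64(unsigned n, uint64_t x)
    {
      return (x << n) | (x >> (64 - n));
    }

    void theta(uint64_t A[25])
    {
      uint64_t C[5], D[5];
      for (int i = 0; i < 5; i++)
        C[i] = A[i] ^ A[i+5] ^ A[i+10] ^ A[i+15] ^ A[i+20];
      for (int i = 0; i < 5; i++)
        D[i] = C[(i+4) % 5] ^ rotl64(1, C[(i+1) % 5]);
      for (int i = 0; i < 25; i++)
        A[i] ^= D[i % 5];
    }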
	C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
	C D34, and (C4, C0) in C34.
	C Copy to D0, D12, D34, rotate original
	movdqa	C01, D12
	movdqa	C23, D34
	movdqa	C01, T01
	movdqa	C23, T23
	psllq	$1, T01
	psllq	$1, T23
	psrlq	$63, C01
	psrlq	$63, C23
	movq	C4, D0
	rolq	$1, C4
	por	T01, C01
	por	T23, C23

	C Move around, putting
	C T4 <-- ROTL(1, C1), T40 <-- ROTL(1, C[4,0])
	movq	C4, T40
	punpcklqdq	C01, T40
	psrldq	$8, C01
	movd	C01, T4		C Really a movq!
	pxor	C23, D12
	xorq	T4, D0
	pxor	T40, D34

	C xor D on top of state
	xorq	D0, A(0)
	movups	A(1), T01
	movups	A(3), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(1)
	movups	T23, A(3)

	C Notes on "unpack" instructions:
	C   punpckhqdq 01, 23 gives 31
	C   punpcklqdq 01, 23 gives 20
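Those two unpack notes can be checked with the corresponding SSE2 intrinsics; AT&T operand order puts the destination last, so punpcklqdq src, dst corresponds to _mm_unpacklo_epi64(dst, src). A small verification sketch:

    /* Check of the unpack notes above. Lanes are named low-to-high, so
       a register printed as "01" holds 0 in its low qword, 1 in its high. */
    #include <stdio.h>
    #include <stdint.h>
    #include <emmintrin.h>  /* SSE2 */

    int main(void)
    {
      __m128i r01 = _mm_set_epi64x(1, 0);  /* (high, low) */
      __m128i r23 = _mm_set_epi64x(3, 2);
      uint64_t out[2];

      /* AT&T "punpckhqdq 01, 23": dest 23 becomes (3, 1), printed "31". */
      _mm_storeu_si128((__m128i *) out, _mm_unpackhi_epi64(r23, r01));
      printf("high: %llu%llu\n", (unsigned long long) out[0],
             (unsigned long long) out[1]);

      /* AT&T "punpcklqdq 01, 23": dest 23 becomes (2, 0), printed "20". */
      _mm_storeu_si128((__m128i *) out, _mm_unpacklo_epi64(r23, r01));
      printf("low:  %llu%llu\n", (unsigned long long) out[0],
             (unsigned long long) out[1]);
      return 0;
    }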
	SWAP64	C34, C34	C Holds C4, C3
	movdqa	C12, D34
	movq	C0, D12
	punpcklqdq	C12, D12	C Holds C0, C1
	punpckhqdq	C34, D34	C Holds C2, C3
	punpcklqdq	D12, C34	C Holds C4, C0
	movq	C34, D0
	movq	C12, T0
	rolq	$1, T0
	xorq	T0, D0

	C Can use C12 as temporary
	movdqa	D34, W0
	movdqa	D34, W1
	psllq	$1, W0
	psrlq	$63, W1
	pxor	W0, D12
	pxor	W1, D12		C Done D12

	xorq	D0, A(5)
	movups	A(6), T01
	movups	A(8), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(6)
	movups	T23, A(8)

	xorq	D0, A(10)
	movups	A(11), T01
	movups	A(13), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(11)
	movups	T23, A(13)

	xorq	D0, A(15)
	movups	A(16), T01
	movups	A(18), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(16)
	movups	T23, A(18)

	xorq	D0, A(20)
	movups	A(21), T01
	movups	A(23), T23
	pxor	D12, T01
	pxor	D34, T23
	movups	T01, A(21)
	movups	T23, A(23)
	C rho and pi steps: Rotate and permute
	movq	A(0), C4	C rot  0, perm  0
	movq	A(1), T4	C rot  1, perm 10
	movq	C4, B(0)
	rolq	$1, T4
	movq	A(2), C4	C rot 62, perm 20
	movq	T4, B(10)
	rolq	$62, C4
	movq	A(3), T4	C rot 28, perm  5
	movq	C4, B(20)
	rolq	$28, T4
	movq	A(4), C4	C rot 27, perm 15
	movq	T4, B(5)
	rolq	$27, C4
	movq	A(5), T4	C rot 36, perm 16
	movq	C4, B(15)
	rolq	$36, T4
	movq	A(6), C4	C rot 44, perm  1
	movq	T4, B(16)
	rolq	$44, C4
	movq	A(7), T4	C rot  6, perm 11
	movq	C4, B(1)
	rolq	$6, T4
	movq	A(8), C4	C rot 55, perm 21
	movq	T4, B(11)
	rolq	$55, C4
	movq	A(9), T4	C rot 20, perm  6
	movq	C4, B(21)
	rolq	$20, T4
	movq	A(10), C4	C rot  3, perm  7
	movq	T4, B(6)
	rolq	$3, C4
	movq	A(11), T4	C rot 10, perm 17
	movq	C4, B(7)
	rolq	$10, T4
	movq	A(12), C4	C rot 43, perm  2
	movq	T4, B(17)
	rolq	$43, C4
	movq	A(13), T4	C rot 25, perm 12
	movq	C4, B(2)
	rolq	$25, T4
	movq	A(14), C4	C rot 39, perm 22
	movq	T4, B(12)
	rolq	$39, C4
	movq	A(15), T4	C rot 41, perm 23
	movq	C4, B(22)
	rolq	$41, T4
	movq	A(16), C4	C rot 45, perm  8
	movq	T4, B(23)
	rolq	$45, C4
	movq	A(17), T4	C rot 15, perm 18
	movq	C4, B(8)
	rolq	$15, T4
	movq	A(18), C4	C rot 21, perm  3
	movq	T4, B(18)
	rolq	$21, C4
	movq	A(19), T4	C rot  8, perm 13
	movq	C4, B(3)
	rolq	$8, T4
	movq	A(20), C4	C rot 18, perm 14
	movq	T4, B(13)
	rolq	$18, C4
	movq	A(21), T4	C rot  2, perm 24
	movq	C4, B(14)
	rolq	$2, T4
	movq	A(22), C4	C rot 61, perm  9
	movq	T4, B(24)
	rolq	$61, C4
	movq	A(23), T4	C rot 56, perm 19
	movq	C4, B(9)
	rolq	$56, T4
	movq	A(24), C4	C rot 14, perm  4
	movq	T4, B(19)
	rolq	$14, C4
	movq	C4, B(4)
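The rot/perm annotations in the block above are the Keccak rho rotation counts and the pi lane permutation as used by this implementation. Collected into tables (contents transcribed directly from the annotations), the scalar loop does the equivalent of this sketch:

    /* B[perm[i]] = rotl64(rot[i], A[i]): the rho+pi data movement the
       movq/rolq pairs above perform. Editorial sketch. */
    #include <stdint.h>

    static const unsigned rot[25] = {
       0,  1, 62, 28, 27, 36, 44,  6, 55, 20,
       3, 10, 43, 25, 39, 41, 45, 15, 21,  8,
      18,  2, 61, 56, 14 };
    static const unsigned perm[25] = {
       0, 10, 20,  5, 15, 16,  1, 11, 21,  6,
       7, 17,  2, 12, 22, 23,  8, 18,  3, 13,
      14, 24,  9, 19,  4 };

    void rho_pi(uint64_t B[25], const uint64_t A[25])
    {
      for (int i = 0; i < 25; i++)
        /* Guard rot == 0: shifting a uint64_t by 64 is undefined. */
        B[perm[i]] = rot[i] ? (A[i] << rot[i]) | (A[i] >> (64 - rot[i]))
                            : A[i];
    }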
	C chi step
	C Read with some overlap, pairs C01, D12, D34
	C Then also construct pairs C23 and T40.
	C We do the operations as
	C   A01 = B01 ^ (~B12 & B23)
	C   A12 = B12 ^ (~B23 & B34)
	C   A34 = B34 ^ (~B40 & B01)
	C Where we store only the low 64 bits of A01, and add in the
	C round key if applicable.
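In scalar terms, the pairwise operations described above are the usual chi step. A C sketch (B is the rotated and permuted copy, A the state being rebuilt; the round constant is xored into lane 0 separately, as the code does via RC_END):

    /* chi: within each 5-lane row, a[x] = b[x] ^ (~b[x+1] & b[x+2]),
       indices mod 5 inside the row. Editorial sketch. */
    #include <stdint.h>

    void chi(uint64_t A[25], const uint64_t B[25])
    {
      for (int y = 0; y < 25; y += 5)
        for (int x = 0; x < 5; x++)
          A[y + x] = B[y + x] ^ (~B[y + (x+1) % 5] & B[y + (x+2) % 5]);
    }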
	movdqa	C34, C12
	psrlq	$63, C34
	psllq	$1, C12
	pxor	C34, D34
	pxor	C12, D34	C Done D34

	xorq	D0, A00
	xorq	D0, A05
	xorq	D0, A10
	xorq	D0, A15
	xorq	D0, A20

	pxor	D12, A0102
	pxor	D12, A0607
	pxor	D12, A1112
	pxor	D12, A1617
	pxor	D12, A2122

	pxor	D34, A0304
	pxor	D34, A0809
	pxor	D34, A1314
	pxor	D34, A1819
	pxor	D34, A2324

	C theta step done, no C, D or W temporaries alive.
	C rho and pi steps. When doing the permutations, also
	C transpose the matrix.

	movups	B(0), C01
	movups	B(1), D12
	movups	B(3), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(3)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movd	T40, T4		C Really movq!
	xorq	(RC_END, COUNT, 8), T4
	movq	T4, A(0)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(1)

	movups	B(5), C01
	movups	B(6), D12
	movups	B(8), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(8)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(5)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(6)

	movups	B(10), C01
	movups	B(11), D12
	movups	B(13), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(13)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(10)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(11)

	movups	B(15), C01
	movups	B(16), D12
	movups	B(18), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(18)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(15)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(16)

	movups	B(20), C01
	movups	B(21), D12
	movups	B(23), D34
	pshufd	$0x4e, D34, D43
	movdqa	D43, T40
	punpcklqdq	C01, T40	C Get 40
	movdqa	D12, C23
	punpckhqdq	D43, C23	C Get 23
	pandn	C01, T40
	pxor	D34, T40
	movups	T40, A(23)
	movdqa	D12, T40
	pandn	C23, T40
	pxor	C01, T40
	movq	T40, A(20)
	pandn	D34, C23
	pxor	D12, C23
	movups	C23, A(21)

	incq	COUNT
	C The combined permutation + transpose gives the following
	C cycles (rotation counts in parenthesis)
	C  0 <- 0 (0)
	C  1 <- 3 (28) <- 4 (27) <- 2 (62) <- 1 (1)
	C  5 <- 6 (44) <- 9 (20) <- 8 (55) <- 5 (36)
	C  7 <- 7 (6)
	C 10 <- 12 (43) <- 13 (25) <- 11 (10) <- 10 (3)
	C 14 <- 14 (39)
	C 15 <- 18 (21) <- 17 (15) <- 19 (8) <- 15 (41)
	C 16 <- 16 (45)
	C 20 <- 24 (14) <- 21 (2) <- 22 (61) <- 20 (18)
	C 23 <- 23 (56)
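Each cycle above is applied in place by saving the first lane and walking the chain, every lane taking a rotated copy of the lane that feeds it. A scalar sketch for the 1 <- 3 <- 4 <- 2 cycle:

    /* Walk the cycle 1 <- 3 (28) <- 4 (27) <- 2 (62) <- 1 (1) in place.
       Editorial sketch of the in-register cycle application. */
    #include <stdint.h>

    static inline uint64_t rotl64(unsigned n, uint64_t x)
    {
      return (x << n) | (x >> (64 - n));
    }

    void cycle_1_3_4_2(uint64_t A[25])
    {
      uint64_t saved = A[1];        /* the lane overwritten first */
      A[1] = rotl64(28, A[3]);
      A[3] = rotl64(27, A[4]);
      A[4] = rotl64(62, A[2]);
      A[2] = rotl64(1, saved);
    }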
	C Do the 1,2,3,4 row. First rotate, then permute.
	movdqa	A0102, W0
	movdqa	A0102, W1
	movdqa	A0102, W2
	psllq	$1, A0102
	psrlq	$63, W0
	psllq	$62, W1
	por	A0102, W0	C rotl 1 (A01)
	psrlq	$2, W2
	por	W1, W2		C rotl 62 (A02)
	movdqa	A0304, A0102
	movdqa	A0304, W1
	psllq	$28, A0102
	psrlq	$36, W1
	por	W1, A0102	C rotl 28 (A03)
	movdqa	A0304, W1
	psllq	$27, A0304
	psrlq	$37, W1
	por	W1, A0304	C rotl 27 (A04)
	punpcklqdq	W0, A0102
	punpckhqdq	W2, A0304
	C 5 <- 6 (44) <- 9 (20) <- 8 (55) <- 5 (36)
	C 7 <- 7 (6)
	C [lane-shuffle diagram for the | 5 | | 6 | 7 | | 8 | 9 | row;
	C  the original ASCII art did not survive extraction]
	rolq	$36, A05
	movq	A05, W0
	movq	A0607, A05
	rolq	$44, A05	C Done A05
	ROTL64(6, A0607, W1)
	por	A0607, W1
	movdqa	A0809, A0607
	ROTL64(20, A0607, W2)
	por	W2, A0607
	punpckhqdq	W1, A0607	C Done A0607
	ROTL64(55, A0809, W1)
	por	A0809, W1
	movdqa	W0, A0809
	punpcklqdq	W1, A0809	C Done 0809
	C 10 <- 12 (43) <- 13 (25) <- 11 (10) <- 10 (3)
	C 14 <- 14 (39)
	C [lane-shuffle diagram for the | 10 | | 11 | 12 | | 13 | 14 | row;
	C  the original ASCII art did not survive extraction]
	rolq	$42, A10	C 42 + 25 = 3 (mod 64)
	SWAP64	A1112, W0
	movq	A10, A1112
	movq	W0, A10
	rolq	$43, A10	C Done A10
	punpcklqdq	A1314, A1112
	ROTL64(25, A1112, W1)
	por	W1, A1112	C Done A1112
	ROTL64(39, A1314, W2)
	por	A1314, W2
	ROTL64(10, W0, A1314)
	por	W0, A1314
	punpckhqdq	W2, A1314	C Done A1314