x86_64 sha3: Go via memory for moves between general registers and xmm registers.

2013-04-11 Niels Möller <>
* x86_64/sha3-permute.asm: Go via memory for moves between general
registers and xmm registers.
2013-04-06 Niels Möller <>
From Edgar E. Iglesias:
@@ -72,9 +72,21 @@ define(<STATE>, <OFFSET($1)(CTX)>)
define(<SWAP64>, <pshufd <$>0x4e,>)
define(<DIRECT_MOVQ>, <no>)
C MOVQ(src, dst), for moves between a general register and an xmm
C register.
ifelse(DIRECT_MOVQ, yes, <
C movq calls that are equal to the corresponding movd,
C where the Apple assembler requires them to be written as movd.
define(<MOVQ>, <movd>)
define(<MOVQ>, <movd $1, $2>)
>, <
C Moving via (cached) memory is generally faster.
define(<MOVQ>, <
movq $1, (CTX)
movq (CTX), $2
C ROTL64(rot, register, temp)
C Caller needs to or together the result.
@@ -151,12 +163,12 @@ PROLOGUE(nettle_sha3_permute)
SWAP64 C34, C34 C Holds C4, C3
movdqa C12, D34
MOVQ C0, D12
MOVQ(C0, D12)
punpcklqdq C12, D12 C Holds C0, C1
punpckhqdq C34, D34 C Holds C2, C3
punpcklqdq D12, C34 C Holds C4, C0
MOVQ C34, D0
MOVQ C12, T0
MOVQ(C34, D0)
MOVQ(C12, T0)
rolq $1, T0
xorq T0, D0
@@ -240,8 +252,8 @@ PROLOGUE(nettle_sha3_permute)
C `-_________-^`-^
rolq $36, A05
MOVQ A05, W0
MOVQ A0607, A05
MOVQ(A05, W0)
MOVQ(A0607, A05)
rolq $44, A05 C Done A05
ROTL64(6, A0607, W1)
por A0607, W1
@@ -264,8 +276,8 @@ PROLOGUE(nettle_sha3_permute)
rolq $42, A10 C 42 + 25 = 3 (mod 64)
SWAP64 A1112, W0
MOVQ A10, A1112
MOVQ W0, A10
MOVQ(A10, A1112)
MOVQ(W0, A10)
rolq $43, A10 C Done A10
punpcklqdq A1314, A1112
@@ -289,8 +301,8 @@ PROLOGUE(nettle_sha3_permute)
SWAP64 A1819, W0
rolq $41, A15
MOVQ A15, W1
MOVQ A1819, A15
MOVQ(A15, W1)
MOVQ(A1819, A15)
rolq $21, A15 C Done A15
SWAP64 A1617, A1819
ROTL64(45, A1617, W2)
@@ -312,7 +324,7 @@ PROLOGUE(nettle_sha3_permute)
C \_______/
rolq $18, A20
MOVQ A20, W0
MOVQ(A20, W0)
SWAP64 A2324, W1
movd W1, A20
rolq $14, A20 C Done A20
@@ -390,21 +402,21 @@ PROLOGUE(nettle_sha3_permute)
C Swap (A05, A10) <-> A0102, and (A15, A20) <-> A0304,
C and also copy to C12 and C34 while at it.
MOVQ A05, C12
MOVQ A15, C34
MOVQ A10, W0
MOVQ A20, W1
MOVQ(A05, C12)
MOVQ(A15, C34)
MOVQ(A10, W0)
MOVQ(A20, W1)
movq A00, C0
punpcklqdq W0, C12
punpcklqdq W1, C34
MOVQ A0102, A05
MOVQ A0304, A15
MOVQ(A0102, A05)
MOVQ(A0304, A15)
psrldq $8, A0102
psrldq $8, A0304
xorq A05, C0
xorq A15, C0
MOVQ A0102, A10
MOVQ A0304, A20
MOVQ(A0102, A10)
MOVQ(A0304, A20)
movdqa C12, A0102
movdqa C34, A0304
