Commit 588017df authored by Niels Möller's avatar Niels Möller

Rewrote x86_64 sha3-permute.asm.

parent a7457dfa
2012-12-13 Niels Möller <nisse@lysator.liu.se>
* x86_64/sha3-permute.asm: Rewrote, to keep all state in
registers. 2400 cycles on x86_64, only slightly faster than the
current C code.
2012-12-09 Niels Möller <nisse@lysator.liu.se>
* sha3-permute.c (sha3_permute): Rewrote to do permutation in
@@ -20,41 +20,59 @@ C MA 02111-1301, USA.
define(<CTX>, <%rdi>) C 25 64-bit values, 200 bytes.
define(<COUNT>, <%r8>) C Avoid clobbering %rsi, for W64.
define(<C01>, <%xmm0>)
define(<C23>, <%xmm1>)
define(<C4>, <%rdx>)
define(<A00>, <%rax>)
define(<A0102>, <%xmm0>)
define(<A0304>, <%xmm1>)
define(<T01>, <%xmm2>)
define(<T23>, <%xmm3>)
define(<T4>, <%r9>)
define(<D12>, <%xmm4>)
define(<D34>, <%xmm5>)
define(<D0>, <%r10>)
define(<T40>, <%xmm6>)
define(<D43>, <%xmm7>)
define(<A05>, <%rcx>)
define(<A0607>, <%xmm2>)
define(<A0809>, <%xmm3>)
define(<A10>, <%rdx>)
define(<A1112>, <%xmm4>)
define(<A1314>, <%xmm5>)
define(<RC_END>, <%r11>)
define(<A15>, <%rbp>)
define(<A1617>, <%xmm6>)
define(<A1819>, <%xmm7>)
define(<A20>, <%r9>)
define(<A2122>, <%xmm8>)
define(<A2324>, <%xmm9>)
define(<FRAME_SIZE>, <200>)
define(<C0>, <%r10>)
define(<C12>, <%xmm10>)
define(<C34>, <%xmm11>)
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<A>, <OFFSET($1)(CTX)>)
define(<B>, <OFFSET($1)(%rsp)>)
define(<D0>, <%r11>)
define(<D12>, <%xmm12>)
define(<D34>, <%xmm13>)
C FIXME: Possible optimizations.
C Wide temporaries
define(<W0>, <%xmm14>)
define(<W1>, <%xmm15>)
define(<W2>, <%xmm12>) C Overlap D12
define(<W3>, <%xmm13>) C Overlap D34
C * Compute the parity vector C at the end of the chi step.
C This avoids one pass over the data.
C * Micro optimizations with register use and scheduling.
define(<T0>, <%r12>)
define(<T1>, <%r13>)
define(<T2>, <%r11>) C Overlap D0
define(<T3>, <%r10>) C Overlap C0
define(<RC>, <%r14>)
define(<OFFSET>, <ifelse($1,0,,eval(8*$1))>)
define(<STATE>, <OFFSET($1)(CTX)>)
C * Try different order during the permutation step, maybe
C doing sequential writes rather than sequential reads.
define(<SWAP64>, <pshufd <$>0x4e,>)
C * Try to do the permutation and the chi step, without
C storing intermediate values? That would reduce the
C number of passes over the data. We still need a copy, but
C we would let the theta step produce that copy.
C ROTL64(rot, register, temp)
C Caller needs to OR together the result.
define(<ROTL64>, <
movdqa $2, $3
psllq <$>$1, $2
psrlq <$>eval(64-$1), $3
>)
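
For reference, a C equivalent of one ROTL64 expansion plus the caller's por (the function name rotl64 is only illustrative, and it is valid only for rotation counts 1 through 63, since a shift by 64 is undefined in C):

#include <stdint.h>

/* Rotate left by shifting both ways and OR:ing the halves together,
   the same idea as the ROTL64 macro above; the asm leaves the final
   por to the caller so it can pick the destination register. */
static inline uint64_t
rotl64(unsigned rot, uint64_t x)
{
  uint64_t hi = x << rot;          /* psllq $rot, reg */
  uint64_t lo = x >> (64 - rot);   /* psrlq $(64-rot), temp */
  return hi | lo;                  /* por temp, reg (done by the caller) */
}
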
.file "sha3-permute.asm"
@@ -62,351 +80,416 @@ define(<B>, <OFFSET($1)(%rsp)>)
.text
ALIGN(4)
PROLOGUE(nettle_sha3_permute)
W64_ENTRY(1, 8)
subq $FRAME_SIZE, %rsp
movl $24, XREG(COUNT)
negq COUNT
W64_ENTRY(1, 16)
push %rbp
push %r12
push %r13
push %r14
lea .rc_end(%rip), RC_END
movl $24, XREG(COUNT)
lea .rc-8(%rip), RC
movq STATE(0), A00
movups STATE(1), A0102
movups STATE(3), A0304
movq A00, C0
movq STATE(5), A05
movdqa A0102, C12
movups STATE(6), A0607
movdqa A0304, C34
movups STATE(8), A0809
xorq A05, C0
movq STATE(10), A10
pxor A0607, C12
movups STATE(11), A1112
pxor A0809, C34
movups STATE(13), A1314
xorq A10, C0
movq STATE(15), A15
pxor A1112, C12
movups STATE(16), A1617
pxor A1314, C34
movups STATE(18), A1819
xorq A15, C0
movq STATE(20), A20
pxor A1617, C12
movups STATE(21), A2122
pxor A1819, C34
movups STATE(23), A2324
xorq A20, C0
pxor A2122, C12
pxor A2324, C34
ALIGN(4)
.Loop:
C theta step
C Compute parity vector C[0,...,4].
movups A(0), C01
movups A(2), C23
movq A(4), C4
movups A(5), T01
movups A(7), T23
xorq A(9), C4 C C[4] ^= A[9]
pxor T01, C01 C C[0,1] ^= A[5,6]
movups A(10), T01
pxor T23, C23 C C[2,3] ^= A[7,8]
movups A(12), T23
xorq A(14), C4 C C[4] ^= A[14]
pxor T01, C01 C C[0,1] ^= A[10,11]
movups A(15), T01
pxor T23, C23 C C[2,3] ^= A[12,13]
movups A(17), T23
xorq A(19), C4 C C[4] ^= A[19]
pxor T01, C01 C C[0,1] ^= A[15,16]
movups A(20), T01
pxor T23, C23 C C[2,3] ^= A[17,18]
movups A(22), T23
xorq A(24), C4 C C[4] ^= A[24]
pxor T01, C01 C C[0,1] ^= A[20,21]
pxor T23, C23 C C[2,3] ^= A[22,23]
C Combine parity bits:
C D[0] = C[4] ^ ROTL64(1, C[1])
C D[1,2] = C[0,1] ^ ROTL64(1, C[2,3])
C D[3,4] = C[2,3] ^ ROTL64(1, C[4,0])
C The theta step. Combine parity bits, then xor to state.
C D0 = C4 ^ (C1 <<< 1)
C D1 = C0 ^ (C2 <<< 1)
C D2 = C1 ^ (C3 <<< 1)
C D3 = C2 ^ (C4 <<< 1)
C D4 = C3 ^ (C0 <<< 1)
C Shift the words around, putting (C0, C1) in D12, (C2, C3) in
C D34, and (C4, C0) in C34.
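
As a plain C reference for the theta step these comments describe (the name sha3_theta and the flat 25-lane indexing are just for illustration; the asm keeps the state split across the registers defined above):

#include <stdint.h>

static void
sha3_theta(uint64_t A[25])
{
  uint64_t C[5], D[5];
  unsigned x, y;

  /* Column parities: C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20]. */
  for (x = 0; x < 5; x++)
    C[x] = A[x] ^ A[x+5] ^ A[x+10] ^ A[x+15] ^ A[x+20];

  /* D[x] = C[x-1] ^ (C[x+1] <<< 1), indices mod 5; exactly the
     D0..D4 equations listed above. */
  for (x = 0; x < 5; x++)
    D[x] = C[(x+4) % 5] ^ ((C[(x+1) % 5] << 1) | (C[(x+1) % 5] >> 63));

  /* Xor D over the whole state, row by row. */
  for (y = 0; y < 25; y += 5)
    for (x = 0; x < 5; x++)
      A[y + x] ^= D[x];
}
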
C Copy to D0, D12, D34, rotate original
movdqa C01, D12
movdqa C23, D34
movdqa C01, T01
movdqa C23, T23
psllq $1, T01
psllq $1, T23
psrlq $63, C01
psrlq $63, C23
movq C4, D0
rolq $1, C4
por T01, C01
por T23, C23
C Move around, putting
C T4 <-- ROTL(1,C1), T40 <-- ROTL(1,C[4,0])
movq C4, T40
punpcklqdq C01, T40
psrldq $8, C01
movd C01, T4 C Really a movq!
pxor C23, D12
xorq T4, D0
pxor T40, D34
C xor D on top of state
xorq D0, A(0)
movups A(1), T01
movups A(3), T23
pxor D12, T01
pxor D34, T23
movups T01, A(1)
movups T23, A(3)
C Notes on "unpack" instructions:
C punpckhqdq 01, 23 gives 31
C punpcklqdq 01, 23 gives 20
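
In SSE2 intrinsic terms, these two notes correspond to the following small C check (illustration only, not part of the source); lanes are named low qword first, matching the register naming above:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
  /* c01 holds word 0 in the low qword and word 1 in the high qword,
     c23 holds words 2 and 3, matching the C01/C23 naming. */
  __m128i c01 = _mm_set_epi64x(1, 0);
  __m128i c23 = _mm_set_epi64x(3, 2);
  uint64_t out[2];

  /* punpckhqdq 01, 23 (AT&T: src 01, dst 23) gives (lo = 3, hi = 1). */
  _mm_storeu_si128((__m128i *) out, _mm_unpackhi_epi64(c23, c01));
  printf("high unpack: %u%u\n", (unsigned) out[0], (unsigned) out[1]);

  /* punpcklqdq 01, 23 gives (lo = 2, hi = 0). */
  _mm_storeu_si128((__m128i *) out, _mm_unpacklo_epi64(c23, c01));
  printf("low unpack:  %u%u\n", (unsigned) out[0], (unsigned) out[1]);
  return 0;
}
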
SWAP64 C34, C34 C Holds C4, C3
movdqa C12, D34
movq C0, D12
punpcklqdq C12, D12 C Holds C0, C1
punpckhqdq C34, D34 C Holds C2, C3
punpcklqdq D12, C34 C Holds C4, C0
movq C34, D0
movq C12, T0
rolq $1, T0
xorq T0, D0
C Can use C12 as temporary
movdqa D34, W0
movdqa D34, W1
psllq $1, W0
psrlq $63, W1
pxor W0, D12
pxor W1, D12 C Done D12
xorq D0, A(5)
movups A(6), T01
movups A(8), T23
pxor D12, T01
pxor D34, T23
movups T01, A(6)
movups T23, A(8)
xorq D0, A(10)
movups A(11), T01
movups A(13), T23
pxor D12, T01
pxor D34, T23
movups T01, A(11)
movups T23, A(13)
xorq D0, A(15)
movups A(16), T01
movups A(18), T23
pxor D12, T01
pxor D34, T23
movups T01, A(16)
movups T23, A(18)
xorq D0, A(20)
movups A(21), T01
movups A(23), T23
pxor D12, T01
pxor D34, T23
movups T01, A(21)
movups T23, A(23)
C rho and pi steps: Rotate and permute
movq A(0), C4 C rot 0, perm 0
movq A(1), T4 C rot 1, perm 10
movq C4, B(0)
rolq $1, T4
movq A(2), C4 C rot 62, perm 20
movq T4, B(10)
rolq $62, C4
movq A(3), T4 C rot 28, perm 5
movq C4, B(20)
rolq $28, T4
movq A(4), C4 C rot 27, perm 15
movq T4, B(5)
rolq $27, C4
movq A(5), T4 C rot 36, perm 16
movq C4, B(15)
rolq $36, T4
movq A(6), C4 C rot 44, perm 1
movq T4, B(16)
rolq $44, C4
movq A(7), T4 C rot 6, perm 11
movq C4, B(1)
rolq $6, T4
movq A(8), C4 C rot 55, perm 21
movq T4, B(11)
rolq $55, C4
movq A(9), T4 C rot 20, perm 6
movq C4, B(21)
rolq $20, T4
movq A(10), C4 C rot 3, perm 7
movq T4, B(6)
rolq $3, C4
movq A(11), T4 C rot 10, perm 17
movq C4, B(7)
rolq $10, T4
movq A(12), C4 C rot 43, perm 2
movq T4, B(17)
rolq $43, C4
movq A(13), T4 C rot 25, perm 12
movq C4, B(2)
rolq $25, T4
movq A(14), C4 C rot 39, perm 22
movq T4, B(12)
rolq $39, C4
movq A(15), T4 C rot 41, perm 23
movq C4, B(22)
rolq $41, T4
movq A(16), C4 C rot 45, perm 8
movq T4, B(23)
rolq $45, C4
movq A(17), T4 C rot 15, perm 18
movq C4, B(8)
rolq $15, T4
movq A(18), C4 C rot 21, perm 3
movq T4, B(18)
rolq $21, C4
movq A(19), T4 C rot 8, perm 13
movq C4, B(3)
rolq $8, T4
movq A(20), C4 C rot 18, perm 14
movq T4, B(13)
rolq $18, C4
movq A(21), T4 C rot 2, perm 24
movq C4, B(14)
rolq $2, T4
movq A(22), C4 C rot 61, perm 9
movq T4, B(24)
rolq $61, C4
movq A(23), T4 C rot 56, perm 19
movq C4, B(9)
rolq $56, T4
movq A(24), C4 C rot 14, perm 4
movq T4, B(19)
rolq $14, C4
movq C4, B(4)
C chi step
C Read with some overlap, pairs C01, D12, D34
C Then also construct pairs C23 and T40.
C We do the operations as
C A01 = B01 ^ (~B12 & B23)
C A12 = B12 ^ (~B23 & B34)
C A34 = B34 ^ (~B40 & B01)
C Where we store only the low 64 bits of A01, and add in the
C round constant if applicable.
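
In C, the chi step sketched in these comments amounts to the following per-row operation (sha3_chi_row is an illustrative name; the pairing of lanes into xmm registers, and the final xor of the round constant into lane 0 of row 0, the iota step, are handled inline in the asm):

#include <stdint.h>

/* One row of the chi step: A[x] = B[x] ^ (~B[x+1] & B[x+2]), indices
   mod 5; the three pair equations above are this loop unrolled. */
static void
sha3_chi_row(uint64_t A[5], const uint64_t B[5])
{
  unsigned x;
  for (x = 0; x < 5; x++)
    A[x] = B[x] ^ (~B[(x + 1) % 5] & B[(x + 2) % 5]);
}
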
movdqa C34, C12
psrlq $63, C34
psllq $1, C12
pxor C34, D34
pxor C12, D34 C Done D34
xorq D0, A00
xorq D0, A05
xorq D0, A10
xorq D0, A15
xorq D0, A20
pxor D12, A0102
pxor D12, A0607
pxor D12, A1112
pxor D12, A1617
pxor D12, A2122
pxor D34, A0304
pxor D34, A0809
pxor D34, A1314
pxor D34, A1819
pxor D34, A2324
C theta step done, no C, D or W temporaries alive.
C rho and pi steps. When doing the permutations, also
C transpose the matrix.
movups B(0), C01
movups B(1), D12
movups B(3), D34
pshufd $0x4e, D34, D43
movdqa D43, T40
punpcklqdq C01, T40 C Get 40
movdqa D12, C23
punpckhqdq D43, C23 C Get 23
pandn C01, T40
pxor D34, T40
movups T40, A(3)
movdqa D12, T40
pandn C23, T40
pxor C01, T40
movd T40, T4 C Really movq!
xorq (RC_END, COUNT, 8), T4
movq T4, A(0)
pandn D34, C23
pxor D12, C23
movups C23, A(1)
movups B(5), C01
movups B(6), D12
movups B(8), D34
pshufd $0x4e, D34, D43
movdqa D43, T40
punpcklqdq C01, T40 C Get 40
movdqa D12, C23
punpckhqdq D43, C23 C Get 23
pandn C01, T40
pxor D34, T40
movups T40, A(8)
movdqa D12, T40
pandn C23, T40
pxor C01, T40
movq T40, A(5)
pandn D34, C23
pxor D12, C23
movups C23, A(6)
movups B(10), C01
movups B(11), D12
movups B(13), D34
pshufd $0x4e, D34, D43
movdqa D43, T40
punpcklqdq C01, T40 C Get 40
movdqa D12, C23
punpckhqdq D43, C23 C Get 23
pandn C01, T40
pxor D34, T40
movups T40, A(13)
movdqa D12, T40
pandn C23, T40
pxor C01, T40
movq T40, A(10)
pandn D34, C23
pxor D12, C23
movups C23, A(11)
movups B(15), C01
movups B(16), D12
movups B(18), D34
pshufd $0x4e, D34, D43
movdqa D43, T40
punpcklqdq C01, T40 C Get 40
movdqa D12, C23
punpckhqdq D43, C23 C Get 23
pandn C01, T40
pxor D34, T40
movups T40, A(18)
movdqa D12, T40
pandn C23, T40
pxor C01, T40
movq T40, A(15)
pandn D34, C23
pxor D12, C23
movups C23, A(16)
movups B(20), C01
movups B(21), D12
movups B(23), D34
pshufd $0x4e, D34, D43
movdqa D43, T40
punpcklqdq C01, T40 C Get 40
movdqa D12, C23
punpckhqdq D43, C23 C Get 23
pandn C01, T40
pxor D34, T40
movups T40, A(23)
movdqa D12, T40
pandn C23, T40
pxor C01, T40
movq T40, A(20)
pandn D34, C23
pxor D12, C23
movups C23, A(21)
incq COUNT
C The combined permutation + transpose gives the following
C cycles (rotation counts in parentheses)
C 0 <- 0(0)
C 1 <- 3(28) <- 4(27) <- 2(62) <- 1(1)
C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
C 7 <- 7(6)
C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
C 14 <- 14(39)
C 15 <- 18(21) <- 17(15) <- 19(8) <- 15(41)
C 16 <- 16(45)
C 20 <- 24(14) <- 21(2) <- 22(61) <- 20(18)
C 23 <- 23(56)
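
For comparison, the textbook rho and pi steps on a flat 25-lane state look like the sketch below (sha3_rho_pi and the table layout are illustrative only; the asm fuses pi with a transpose of the 5x5 state matrix, which is what produces the short cycles listed above):

#include <stdint.h>

/* Textbook rho + pi on a state indexed as x + 5*y:
   B[y + 5*((2*x + 3*y) % 5)] = A[x + 5*y] <<< rho[x + 5*y]. */
static const unsigned rho[25] = {
   0,  1, 62, 28, 27,
  36, 44,  6, 55, 20,
   3, 10, 43, 25, 39,
  41, 45, 15, 21,  8,
  18,  2, 61, 56, 14,
};

static void
sha3_rho_pi(uint64_t B[25], const uint64_t A[25])
{
  unsigned x, y;
  for (y = 0; y < 5; y++)
    for (x = 0; x < 5; x++)
      {
        unsigned i = x + 5*y;
        unsigned r = rho[i];
        uint64_t w = A[i];
        if (r)  /* Avoid shifting by 64 when the rotation count is 0. */
          w = (w << r) | (w >> (64 - r));
        B[y + 5*((2*x + 3*y) % 5)] = w;
      }
}
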
C Do the 1,2,3,4 row. First rotate, then permute.
movdqa A0102, W0
movdqa A0102, W1
movdqa A0102, W2
psllq $1, A0102
psrlq $63, W0
psllq $62, W1
por A0102, W0 C rotl 1 (A01)
psrlq $2, W2
por W1, W2 C rotl 62 (A02)
movdqa A0304, A0102
movdqa A0304, W1
psllq $28, A0102
psrlq $36, W1
por W1, A0102 C rotl 28 (A03)
movdqa A0304, W1
psllq $27, A0304
psrlq $37, W1
por W1, A0304 C rotl 27 (A04)
punpcklqdq W0, A0102
punpckhqdq W2, A0304
C 5 <- 6(44) <- 9(20) <- 8(55) <- 5(36)
C 7 <- 7(6)
C __ _______
C _ L' ` L_ __`
C |5| |6|7| |8|9|
C `-_________-^`-^
rolq $36, A05
movq A05, W0
movq A0607, A05
rolq $44, A05 C Done A05
ROTL64(6, A0607, W1)
por A0607, W1
movdqa A0809, A0607
ROTL64(20, A0607, W2)
por W2, A0607
punpckhqdq W1, A0607 C Done A0607
ROTL64(55, A0809, W1)
por A0809, W1
movdqa W0, A0809
punpcklqdq W1, A0809 C Done 0809
C 10 <- 12(43) <- 13(25) <- 11(10) <- 10(3)
C 14 <- 14(39)
C _____ ___
C __L' __`_L_ `_____
C |10| |11|12| |13|14|
C `-___-^`-______-^
C
rolq $42, A10 C 42 + 25 = 3 (mod 64)
SWAP64 A1112, W0
movq A10, A1112
movq W0, A10
rolq $43, A10 C Done A10
punpcklqdq A1314, A1112
ROTL64(25, A1112, W1)
por W1, A1112 C Done A1112
ROTL64(39, A1314, W2)
por A1314, W2
ROTL64(10, W0, A1314)
por W0, A1314