diff --git a/ChangeLog b/ChangeLog
index d14403fad15f0b9d201cdaffdec1e3c25a80965b..39c9a5e61118b63e23d7b70a56f55a8325d21462 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2012-12-03  Niels Möller  <nisse@lysator.liu.se>
 
+	* configure.ac: Added sha3-permute.asm.
+
+	* x86_64/sha3-permute.asm: New file. 30% speedup over current C
+	code, 4300 cycles.
+
 	* nettle.texinfo (Hash functions): Split into several sections,
 	separating recommended hash functions and legacy hash functions.
 	Document sha3-256.
diff --git a/configure.ac b/configure.ac
index f550f21663010c6b979e10d1377e53c159715d9c..e2ed52b89c91500839199410ba8fa6706c019d86 100644
--- a/configure.ac
+++ b/configure.ac
@@ -240,7 +240,7 @@ if test "x$enable_assembler" = xyes ; then
 	  md5-compress.asm memxor.asm \
 	  salsa20-crypt.asm salsa20-core-internal.asm \
 	  serpent-encrypt.asm serpent-decrypt.asm \
-	  sha1-compress.asm machine.m4; do
+	  sha1-compress.asm sha3-permute.asm machine.m4; do
 	# echo "Looking for $srcdir/$asm_path/$tmp_f"
 	if test -f "$srcdir/$asm_path/$tmp_f"; then
 	  # echo found
diff --git a/x86_64/sha3-permute.asm b/x86_64/sha3-permute.asm
new file mode 100644
index 0000000000000000000000000000000000000000..2f742331288c8720f54983f51674acef2b5f6569
--- /dev/null
+++ b/x86_64/sha3-permute.asm
@@ -0,0 +1,412 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2012 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+define(<CTX>, <%rdi>)	C 25 64-bit values, 200 bytes.
+define(<COUNT>, <%r8>)	C Avoid clobbering %rsi, for W64.
+
+define(<C01>, <%xmm0>)
+define(<C23>, <%xmm1>)
+define(<C4>, <%rdx>)
+
+define(<D12>, <%xmm2>)
+define(<D34>, <%xmm3>)
+define(<D0>, <%r9>)
+define(<T01>, <%xmm4>)
+define(<T23>, <%xmm5>)
+define(<T4>, <%r10>)
+define(<T40>, <%xmm6>)
+define(<D43>, <%xmm7>)
+
+define(<RC_END>, <%r11>)
+
+define(<FRAME_SIZE>, <200>)
+
+C A(i) is the i:th 64-bit word of the state, B(i) the i:th word of the
+C temporary copy kept in the stack frame.
+define(<OFFSET>, <eval(8*$1)>)
+define(<A>, <OFFSET($1)(CTX)>)
+define(<B>, <OFFSET($1)(%rsp)>)
+
+	C FIXME: Possible optimizations.
+
+	C * Compute the parity vector C at the end of the chi step.
+	C   This avoids one pass over the data.
+
+	C * Micro optimizations with register use and scheduling.
+
+	C * Try different order during the permutation step, maybe
+	C   doing sequential writes rather than sequential reads.
+
+	C * Try to do the permutation and the chi step, without
+	C   storing intermediate values? That would reduce the
+	C   number of passes over the data. We still need a copy, but
+	C   we would let the theta step produce that copy.
+
+	.file "sha3-permute.asm"
+
+	C sha3_permute(struct sha3_state *ctx)
+	.text
+	ALIGN(4)
+PROLOGUE(nettle_sha3_permute)
+	W64_ENTRY(1, 8)
+	subq	$FRAME_SIZE, %rsp
+	movl	$24, XREG(COUNT)
+	negq	COUNT
+
+	lea	.rc_end(%rip), RC_END
+	ALIGN(4)
+.Loop:
+	C theta step
+	C Compute parity vector C[0,...,4].
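+	C In C terms, the parity computation that follows is (sketch):
+	C   C[i] = A[i] ^ A[i+5] ^ A[i+10] ^ A[i+15] ^ A[i+20], i = 0..4
+	C with C[0,1] and C[2,3] packed as 128-bit pairs in C01 and C23,
+	C and C[4] kept in the general register C4.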
+ movups A(0), C01 + movups A(2), C23 + movq A(4), C4 + + movups A(5), T01 + movups A(7), T23 + xorq A(9), C4 C C[4] ^= A[9] + + pxor T01, C01 C C[0,1] ^= A[5,6] + movups A(10), T01 + pxor T23, C23 C C[2,3] ^= A[7,8] + movups A(12), T23 + xorq A(14), C4 C C[4] ^= A[14] + + pxor T01, C01 C C[0,1] ^= A[10,11] + movups A(15), T01 + pxor T23, C23 C C[2,3] ^= A[12,13] + movups A(17), T23 + xorq A(19), C4 C C[4] ^= A[19] + + pxor T01, C01 C C[0,1] ^= A[15,16] + movups A(20), T01 + pxor T23, C23 C C[2,3] ^= A[17,18] + movups A(22), T23 + xorq A(24), C4 C C[4] ^= A[24] + + pxor T01, C01 C C[0,1] ^= A[20,21] + pxor T23, C23 C C[2,3] ^= A[22,23] + + C Combine parity bits: + C D[0] = C[4] ^ ROTL64(1, C[1]) + C D[1,2] = C[0,1] ^ ROTL64(1, C[2,3]) + C D[3,4] = C[2,3] ^ ROTL64(1, C[4,0]) + + C Copy to D0, D12, D34, rotate original + movdqa C01, D12 + movdqa C23, D34 + movdqa C01, T01 + movdqa C23, T23 + psllq $1, T01 + psllq $1, T23 + psrlq $63, C01 + psrlq $63, C23 + movq C4, D0 + rolq $1, C4 + por T01, C01 + por T23, C23 + + C Move around, putting + C T4 <-- ROTL(1,C1), T40 <-- ROTL(1,C[4,0]) + movq C4, T40 + punpcklqdq C01, T40 + psrldq $8, C01 + movd C01, T4 C Really a movq! + + pxor C23, D12 + xorq T4, D0 + pxor T40, D34 + + C xor D on top of state + xorq D0, A(0) + movups A(1), T01 + movups A(3), T23 + pxor D12, T01 + pxor D34, T23 + movups T01, A(1) + movups T23, A(3) + + xorq D0, A(5) + movups A(6), T01 + movups A(8), T23 + pxor D12, T01 + pxor D34, T23 + movups T01, A(6) + movups T23, A(8) + + xorq D0, A(10) + movups A(11), T01 + movups A(13), T23 + pxor D12, T01 + pxor D34, T23 + movups T01, A(11) + movups T23, A(13) + + xorq D0, A(15) + movups A(16), T01 + movups A(18), T23 + pxor D12, T01 + pxor D34, T23 + movups T01, A(16) + movups T23, A(18) + + xorq D0, A(20) + movups A(21), T01 + movups A(23), T23 + pxor D12, T01 + pxor D34, T23 + movups T01, A(21) + movups T23, A(23) + + C rho and pi steps: Rotate and permute + movq A(0), C4 C rot 0, perm 0 + movq A(1), T4 C rot 1, perm 10 + movq C4, B(0) + rolq $1, T4 + movq A(2), C4 C rot 62, perm 20 + movq T4, B(10) + rolq $62, C4 + movq A(3), T4 C rot 28, perm 5 + movq C4, B(20) + rolq $28, T4 + movq A(4), C4 C rot 27, perm 15 + movq T4, B(5) + rolq $27, C4 + movq A(5), T4 C rot 36, perm 16 + movq C4, B(15) + rolq $36, T4 + movq A(6), C4 C rot 44, perm 1 + movq T4, B(16) + rolq $44, C4 + movq A(7), T4 C rot 6, perm 11 + movq C4, B(1) + rolq $6, T4 + movq A(8), C4 C rot 55, perm 21 + movq T4, B(11) + rolq $55, C4 + movq A(9), T4 C rot 20, perm 6 + movq C4, B(21) + rolq $20, T4 + movq A(10), C4 C rot 3, perm 7 + movq T4, B(6) + rolq $3, C4 + movq A(11), T4 C rot 10, perm 17 + movq C4, B(7) + rolq $10, T4 + movq A(12), C4 C rot 43, perm 2 + movq T4, B(17) + rolq $43, C4 + movq A(13), T4 C rot 25, perm 12 + movq C4, B(2) + rolq $25, T4 + movq A(14), C4 C rot 39, perm 22 + movq T4, B(12) + rolq $39, C4 + movq A(15), T4 C rot 41, perm 23 + movq C4, B(22) + rolq $41, T4 + movq A(16), C4 C rot 45, perm 8 + movq T4, B(23) + rolq $45, C4 + movq A(17), T4 C rot 15, perm 18 + movq C4, B(8) + rolq $15, T4 + movq A(18), C4 C rot 21, perm 3 + movq T4, B(18) + rolq $21, C4 + movq A(19), T4 C rot 8, perm 13 + movq C4, B(3) + rolq $8, T4 + movq A(20), C4 C rot 18, perm 14 + movq T4, B(13) + rolq $18, C4 + movq A(21), T4 C rot 2, perm 24 + movq C4, B(14) + rolq $2, T4 + movq A(22), C4 C rot 61, perm 9 + movq T4, B(24) + rolq $61, C4 + movq A(23), T4 C rot 56, perm 19 + movq C4, B(9) + rolq $56, T4 + movq A(24), C4 C rot 14, perm 4 + movq T4, B(19) + rolq $14, C4 + 
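+	C Together with the final store of C4 into B(4) below, the
+	C sequence above implements B[pi(i)] = ROTL64(rho(i), A[i]),
+	C i.e. the combined rho rotations and pi permutation, writing
+	C the rotated copy of the state to the stack area B.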
movq C4, B(4) + + C chi step + C Read with some overlap, pairs C01, D12, D34 + C Then also construct pairs C23 and T40. + + C We do the operations as + C A01 = B01 ^ (~B12 & B23) + C A12 = B12 ^ (~B23 & B34) + C A34 = B34 ^ (~B40 & B01) + + C Where we store only the low 64 bits of A01, and add in the + C round key if applicable. + + movups B(0), C01 + movups B(1), D12 + movups B(3), D34 + + pshufd $0x4e, D34, D43 + movdqa D43, T40 + punpcklqdq C01, T40 C Get 40 + movdqa D12, C23 + punpckhqdq D43, C23 C Get 23 + + pandn C01, T40 + pxor D34, T40 + movups T40, A(3) + + movdqa D12, T40 + pandn C23, T40 + pxor C01, T40 + + movd T40, T4 C Really movq! + xorq (RC_END, COUNT, 8), T4 + movq T4, A(0) + + pandn D34, C23 + pxor D12, C23 + movups C23, A(1) + + + movups B(5), C01 + movups B(6), D12 + movups B(8), D34 + + pshufd $0x4e, D34, D43 + movdqa D43, T40 + punpcklqdq C01, T40 C Get 40 + movdqa D12, C23 + punpckhqdq D43, C23 C Get 23 + + pandn C01, T40 + pxor D34, T40 + movups T40, A(8) + + movdqa D12, T40 + pandn C23, T40 + pxor C01, T40 + + movq T40, A(5) + + pandn D34, C23 + pxor D12, C23 + movups C23, A(6) + + + movups B(10), C01 + movups B(11), D12 + movups B(13), D34 + + pshufd $0x4e, D34, D43 + movdqa D43, T40 + punpcklqdq C01, T40 C Get 40 + movdqa D12, C23 + punpckhqdq D43, C23 C Get 23 + + pandn C01, T40 + pxor D34, T40 + movups T40, A(13) + + movdqa D12, T40 + pandn C23, T40 + pxor C01, T40 + + movq T40, A(10) + + pandn D34, C23 + pxor D12, C23 + movups C23, A(11) + + + movups B(15), C01 + movups B(16), D12 + movups B(18), D34 + + pshufd $0x4e, D34, D43 + movdqa D43, T40 + punpcklqdq C01, T40 C Get 40 + movdqa D12, C23 + punpckhqdq D43, C23 C Get 23 + + pandn C01, T40 + pxor D34, T40 + movups T40, A(18) + + movdqa D12, T40 + pandn C23, T40 + pxor C01, T40 + + movq T40, A(15) + + pandn D34, C23 + pxor D12, C23 + movups C23, A(16) + + + movups B(20), C01 + movups B(21), D12 + movups B(23), D34 + + pshufd $0x4e, D34, D43 + movdqa D43, T40 + punpcklqdq C01, T40 C Get 40 + movdqa D12, C23 + punpckhqdq D43, C23 C Get 23 + + pandn C01, T40 + pxor D34, T40 + movups T40, A(23) + + movdqa D12, T40 + pandn C23, T40 + pxor C01, T40 + + movq T40, A(20) + + pandn D34, C23 + pxor D12, C23 + movups C23, A(21) + + + incq COUNT + jnz .Loop + + addq $FRAME_SIZE, %rsp + W64_EXIT(1, 8) + ret + +EPILOGUE(nettle_sha3_permute) + +ALIGN(4) + .quad 0x0000000000000001, 0X0000000000008082 + .quad 0X800000000000808A, 0X8000000080008000 + .quad 0X000000000000808B, 0X0000000080000001 + .quad 0X8000000080008081, 0X8000000000008009 + .quad 0X000000000000008A, 0X0000000000000088 + .quad 0X0000000080008009, 0X000000008000000A + .quad 0X000000008000808B, 0X800000000000008B + .quad 0X8000000000008089, 0X8000000000008003 + .quad 0X8000000000008002, 0X8000000000000080 + .quad 0X000000000000800A, 0X800000008000000A + .quad 0X8000000080008081, 0X8000000000008080 + .quad 0X0000000080000001, 0X8000000080008008 +.rc_end:
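For reference, here is a minimal C sketch of one round of the underlying
Keccak-f[1600] permutation, with the same theta, rho+pi, chi and iota
structure that the assembly above follows. The helper names (ROTL64,
rho_offset, pi_index, keccak_round) are chosen for illustration only and
are not taken from nettle's sources; a full sha3_permute applies such a
round 24 times, using the round constants from the .quad table above.

/* One round of Keccak-f[1600]; A is the 25-word state, rc the round
   constant for this round. */
#include <stdint.h>

#define ROTL64(n, x) (((x) << (n)) | ((x) >> (64 - (n))))

/* Keccak rotation offsets, indexed by source word i = x + 5*y. */
static const unsigned rho_offset[25] = {
   0,  1, 62, 28, 27,
  36, 44,  6, 55, 20,
   3, 10, 43, 25, 39,
  41, 45, 15, 21,  8,
  18,  2, 61, 56, 14
};

/* pi sends word (x, y) to (y, 2x + 3y mod 5). */
static unsigned
pi_index(unsigned i)
{
  unsigned x = i % 5, y = i / 5;
  return y + 5 * ((2 * x + 3 * y) % 5);
}

static void
keccak_round(uint64_t A[25], uint64_t rc)
{
  uint64_t C[5], D[5], B[25];
  unsigned i, x;

  /* theta: column parities, then xor D onto every word. */
  for (x = 0; x < 5; x++)
    C[x] = A[x] ^ A[x + 5] ^ A[x + 10] ^ A[x + 15] ^ A[x + 20];
  for (x = 0; x < 5; x++)
    D[x] = C[(x + 4) % 5] ^ ROTL64(1, C[(x + 1) % 5]);
  for (i = 0; i < 25; i++)
    A[i] ^= D[i % 5];

  /* rho and pi: rotate each word and move it to its new position. */
  for (i = 0; i < 25; i++)
    B[pi_index(i)] = rho_offset[i] ? ROTL64(rho_offset[i], A[i]) : A[i];

  /* chi: A[i] = B[i] ^ (~B[i+1] & B[i+2]), row-wise. */
  for (i = 0; i < 25; i += 5)
    for (x = 0; x < 5; x++)
      A[i + x] = B[i + x] ^ (~B[i + (x + 1) % 5] & B[i + (x + 2) % 5]);

  /* iota: xor the round constant into A[0]. */
  A[0] ^= rc;
}

The assembly gains most of its speedup by processing the state in 128-bit
pairs (the C01/C23, D12/D34 and T01/T23 registers) rather than one 64-bit
word at a time as in this sketch.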