diff --git a/ChangeLog b/ChangeLog
index 49f223123b06978a01b0e39425b2c50c5c1b6e4a..0f0a0ae98abe9a1cc7b33e379f8f63bd7be4ac34 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-10-29  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/salsa20-core-internal.asm: New file.
+	* configure.ac: Added salsa20-core-internal.asm.
+	* examples/nettle-benchmark.c (bench_salsa20_core): New function.
+
 2012-10-27  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/Makefile.in (TS_SOURCES, CXX_SOURCES): Include sources
diff --git a/configure.ac b/configure.ac
index 998ed5570cd499c2d4679102295d3115a28f8485..b5f3571b002629416b54dee6ab0eec79e123ba6e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -237,7 +237,8 @@ if test "x$enable_assembler" = xyes ; then
     found=no
     for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		 arcfour-crypt.asm camellia-crypt-internal.asm \
-		 md5-compress.asm memxor.asm salsa20-crypt.asm \
+		 md5-compress.asm memxor.asm \
+		 salsa20-crypt.asm salsa20-core-internal.asm \
 		 serpent-encrypt.asm serpent-decrypt.asm \
 		 sha1-compress.asm machine.m4; do
 #       echo "Looking for $srcdir/$asm_path/$tmp_f"
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index 3d0786862ec8cbb1b7edf0c4801b56c0cadd93ec..7446fbcec2c73a177bfd84e833a27e77055d230d 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -47,6 +47,7 @@
 #include "des.h"
 #include "gcm.h"
 #include "memxor.h"
+#include "salsa20.h"
 #include "serpent.h"
 #include "sha.h"
 #include "twofish.h"
@@ -563,10 +564,10 @@ compare_double(const void *ap, const void *bp)
 }
 
 /* Try to get accurate cycle times for assembler functions. */
+#if WITH_CYCLE_COUNTER
 static void
 bench_sha1_compress(void)
 {
-#if WITH_CYCLE_COUNTER
   uint32_t state[_SHA1_DIGEST_LENGTH];
   uint8_t data[BENCH_ITERATIONS * SHA1_DATA_SIZE];
   uint32_t start_lo, start_hi, end_lo, end_hi;
@@ -594,9 +595,47 @@ bench_sha1_compress(void)
 
   qsort(count, 5, sizeof(double), compare_double);
   printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);
-#endif
 }
 
+/* Time _nettle_salsa20_core with 20 rounds.  Five runs of
+   BENCH_ITERATIONS calls are measured, and the median run is
+   reported as cycles per call. */
+static void
+bench_salsa20_core(void)
+{
+  /* Zero the block; the core reads its input, which would otherwise
+     be indeterminate on the first call. */
+  uint32_t state[_SALSA20_INPUT_LENGTH] = {0};
+  uint32_t start_lo, start_hi, end_lo, end_hi;
+
+  double count[5];
+
+  unsigned i, j;
+
+  for (j = 0; j < 5; j++)
+    {
+      i = 0;
+      GET_CYCLE_COUNTER(start_hi, start_lo);
+      for (; i < BENCH_ITERATIONS; i++)
+	_nettle_salsa20_core(state, state, 20);
+
+      GET_CYCLE_COUNTER(end_hi, end_lo);
+
+      /* 64-bit subtraction on 32-bit halves, with borrow. */
+      end_hi -= (start_hi + (start_lo > end_lo));
+      end_lo -= start_lo;
+
+      count[j] = ldexp(end_hi, 32) + end_lo;
+    }
+
+  qsort(count, 5, sizeof(double), compare_double);
+  printf("salsa20_core: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);
+}
+#else
+#define bench_sha1_compress()
+#define bench_salsa20_core()
+#endif
+
 #if WITH_OPENSSL
 # define OPENSSL(x) x,
 #else
@@ -684,7 +723,7 @@ main(int argc, char **argv)
     }
 #endif
   bench_sha1_compress();
-
+  bench_salsa20_core();
   time_overhead();
 
   header();
diff --git a/x86_64/salsa20-core-internal.asm b/x86_64/salsa20-core-internal.asm
new file mode 100644
index 0000000000000000000000000000000000000000..81ca2cc83d14712654e1d9148f440e3e3685cb28
--- /dev/null
+++ b/x86_64/salsa20-core-internal.asm
@@ -0,0 +1,98 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2012 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+define(<DST>, <%rdi>)
+define(<SRC>, <%rsi>)
+define(<COUNT>, <%rdx>)
+define(<X0>, <%xmm0>)
+define(<X1>, <%xmm1>)
+define(<X2>, <%xmm2>)
+define(<X3>, <%xmm3>)
+define(<T0>, <%xmm4>)
+define(<T1>, <%xmm5>)
+define(<M0101>, <%xmm6>)
+define(<M0110>, <%xmm7>)
+define(<M0011>, <%xmm8>)
+
+include_src(<x86_64/salsa20.m4>)
+
+	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+	.text
+	ALIGN(4)
+PROLOGUE(_nettle_salsa20_core)
+	W64_ENTRY(3, 9)
+
+	C Load mask registers
+	mov	$-1, %eax
+	movd	%eax, M0101
+	pshufd	$0x09, M0101, M0011	C 01 01 00 00
+	pshufd	$0x41, M0101, M0110	C 01 00 00 01
+	pshufd	$0x22, M0101, M0101	C 01 00 01 00
+
+	movups	(SRC), X0
+	movups	16(SRC), X1
+	movups	32(SRC), X2
+	movups	48(SRC), X3
+
+	C The original rows are now diagonals.
+	SWAP(X0, X1, M0101)
+	SWAP(X2, X3, M0101)
+	SWAP(X1, X3, M0110)
+	SWAP(X0, X2, M0011)
+
+	shrl	$1, XREG(COUNT)
+
+	ALIGN(4)
+.Loop:
+	QROUND(X0, X1, X2, X3)
+	pshufd	$0x93, X1, X1	C 11 00 01 10 (least sign. left)
+	pshufd	$0x4e, X2, X2	C 10 11 00 01
+	pshufd	$0x39, X3, X3	C 01 10 11 00
+
+	QROUND(X0, X3, X2, X1)
+
+	C Inverse rotation of the rows
+	pshufd	$0x39, X1, X1	C 01 10 11 00
+	pshufd	$0x4e, X2, X2	C 10 11 00 01
+	pshufd	$0x93, X3, X3	C 11 00 01 10
+
+	decl	XREG(COUNT)
+	jnz	.Loop
+
+	SWAP(X0, X2, M0011)
+	SWAP(X1, X3, M0110)
+	SWAP(X0, X1, M0101)
+	SWAP(X2, X3, M0101)
+
+	movups	(SRC), T0
+	movups	16(SRC), T1
+	paddd	T0, X0
+	paddd	T1, X1
+	movups	X0,(DST)
+	movups	X1,16(DST)
+	movups	32(SRC), T0
+	movups	48(SRC), T1
+	paddd	T0, X2
+	paddd	T1, X3
+	movups	X2,32(DST)
+	movups	X3,48(DST)
+
+	W64_EXIT(3, 9)
+	ret
+EPILOGUE(_nettle_salsa20_core)