Commit 3e302cc0 authored by Niels Möller
Browse files

Added x86_64 assembly for _salsa20_core.

parent 0f46bfe9
2012-10-29 Niels Möller <nisse@lysator.liu.se>
* x86_64/salsa20-core-internal.asm: New file.
* configure.ac: Added salsa20-core-internal.asm.
* examples/nettle-benchmark.c (bench_salsa20_core): New function.
2012-10-27 Niels Möller <nisse@lysator.liu.se> 2012-10-27 Niels Möller <nisse@lysator.liu.se>
* testsuite/Makefile.in (TS_SOURCES, CXX_SOURCES): Include sources * testsuite/Makefile.in (TS_SOURCES, CXX_SOURCES): Include sources
......
...@@ -237,7 +237,8 @@ if test "x$enable_assembler" = xyes ; then ...@@ -237,7 +237,8 @@ if test "x$enable_assembler" = xyes ; then
found=no found=no
for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \ for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \
arcfour-crypt.asm camellia-crypt-internal.asm \ arcfour-crypt.asm camellia-crypt-internal.asm \
md5-compress.asm memxor.asm salsa20-crypt.asm \ md5-compress.asm memxor.asm \
salsa20-crypt.asm salsa20-core-internal.asm \
serpent-encrypt.asm serpent-decrypt.asm \ serpent-encrypt.asm serpent-decrypt.asm \
sha1-compress.asm machine.m4; do sha1-compress.asm machine.m4; do
# echo "Looking for $srcdir/$asm_path/$tmp_f" # echo "Looking for $srcdir/$asm_path/$tmp_f"
......
...@@ -47,6 +47,7 @@ ...@@ -47,6 +47,7 @@
#include "des.h" #include "des.h"
#include "gcm.h" #include "gcm.h"
#include "memxor.h" #include "memxor.h"
#include "salsa20.h"
#include "serpent.h" #include "serpent.h"
#include "sha.h" #include "sha.h"
#include "twofish.h" #include "twofish.h"
...@@ -563,10 +564,10 @@ compare_double(const void *ap, const void *bp) ...@@ -563,10 +564,10 @@ compare_double(const void *ap, const void *bp)
} }
/* Try to get accurate cycle times for assembler functions. */ /* Try to get accurate cycle times for assembler functions. */
#if WITH_CYCLE_COUNTER
static void static void
bench_sha1_compress(void) bench_sha1_compress(void)
{ {
#if WITH_CYCLE_COUNTER
uint32_t state[_SHA1_DIGEST_LENGTH]; uint32_t state[_SHA1_DIGEST_LENGTH];
uint8_t data[BENCH_ITERATIONS * SHA1_DATA_SIZE]; uint8_t data[BENCH_ITERATIONS * SHA1_DATA_SIZE];
uint32_t start_lo, start_hi, end_lo, end_hi; uint32_t start_lo, start_hi, end_lo, end_hi;
...@@ -594,9 +595,42 @@ bench_sha1_compress(void) ...@@ -594,9 +595,42 @@ bench_sha1_compress(void)
qsort(count, 5, sizeof(double), compare_double); qsort(count, 5, sizeof(double), compare_double);
printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS); printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);
#endif
} }
/* Time _nettle_salsa20_core with 20 rounds using the cycle counter.
   Takes five samples of BENCH_ITERATIONS calls each and prints the
   median sample, divided by BENCH_ITERATIONS, as cycles per call. */
static void
bench_salsa20_core(void)
{
  /* Zero-initialized so the core never reads indeterminate values;
     the contents are otherwise irrelevant, since the output of each
     call is simply fed back in as the input of the next one. */
  uint32_t state[_SALSA20_INPUT_LENGTH] = { 0 };
  uint32_t start_lo, start_hi, end_lo, end_hi;
  double count[5];
  unsigned i, j;

  for (j = 0; j < 5; j++)
    {
      /* Initialize the loop index before reading the counter. */
      i = 0;
      GET_CYCLE_COUNTER(start_hi, start_lo);
      for (; i < BENCH_ITERATIONS; i++)
	_nettle_salsa20_core(state, state, 20);
      GET_CYCLE_COUNTER(end_hi, end_lo);

      /* 64-bit subtraction end - start on (hi, lo) pairs; the
	 comparison supplies the borrow out of the low word. */
      end_hi -= (start_hi + (start_lo > end_lo));
      end_lo -= start_lo;
      count[j] = ldexp(end_hi, 32) + end_lo;
    }

  /* Sort the samples and report the middle one (the median). */
  qsort(count, 5, sizeof(double), compare_double);
  printf("salsa20_core: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);
}
#else
#define bench_sha1_compress()
#define bench_salsa20_core()
#endif
#if WITH_OPENSSL #if WITH_OPENSSL
# define OPENSSL(x) x, # define OPENSSL(x) x,
#else #else
...@@ -684,7 +718,7 @@ main(int argc, char **argv) ...@@ -684,7 +718,7 @@ main(int argc, char **argv)
} }
#endif #endif
bench_sha1_compress(); bench_sha1_compress();
bench_salsa20_core();
time_overhead(); time_overhead();
header(); header();
......
C nettle, low-level cryptographics library
C
C Copyright (C) 2012 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
C Register usage
C
C Arguments, in the order of the _salsa20_core prototype (dst, src,
C rounds). These are the first three integer argument registers of the
C System V AMD64 calling convention.
define(<DST>, <%rdi>)
define(<SRC>, <%rsi>)
define(<COUNT>, <%rdx>)
C The 16-word state, four 32-bit words per register; loaded from
C SRC at offsets 0, 16, 32 and 48.
define(<X0>, <%xmm0>)
define(<X1>, <%xmm1>)
define(<X2>, <%xmm2>)
define(<X3>, <%xmm3>)
C Temporaries. The function body uses them to reload the input words
C for the final addition; presumably the QROUND macro uses them as
C scratch too (its definition is in the included salsa20.m4).
define(<T0>, <%xmm4>)
define(<T1>, <%xmm5>)
C Lane masks for SWAP. The digits give the four 32-bit lanes with
C lane 0 first; lanes marked 1 are all ones, the others are zero.
define(<M0101>, <%xmm6>)
define(<M0110>, <%xmm7>)
define(<M0011>, <%xmm8>)
C Shared salsa20 macros; QROUND and SWAP as used by the function body
C are presumably defined there.
include_src(<x86_64/salsa20.m4>)
C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
.text
ALIGN(4)
PROLOGUE(_nettle_salsa20_core)
W64_ENTRY(3, 9)
C Load mask registers
mov $-1, %eax
movd %eax, M0101
pshufd $0x09, M0101, M0011 C 01 01 00 00
pshufd $0x41, M0101, M0110 C 01 00 00 01
pshufd $0x22, M0101, M0101 C 01 00 01 00
movups (SRC), X0
movups 16(SRC), X1
movups 32(SRC), X2
movups 48(SRC), X3
C The original rows are now diagonals.
SWAP(X0, X1, M0101)
SWAP(X2, X3, M0101)
SWAP(X1, X3, M0110)
SWAP(X0, X2, M0011)
shrl $1, XREG(COUNT)
ALIGN(4)
.Loop:
QROUND(X0, X1, X2, X3)
pshufd $0x93, X1, X1 C 11 00 01 10 (least sign. left)
pshufd $0x4e, X2, X2 C 10 11 00 01
pshufd $0x39, X3, X3 C 01 10 11 00
QROUND(X0, X3, X2, X1)
C Inverse rotation of the rows
pshufd $0x39, X1, X1 C 01 10 11 00
pshufd $0x4e, X2, X2 C 10 11 00 01
pshufd $0x93, X3, X3 C 11 00 01 10
decl XREG(COUNT)
jnz .Loop
SWAP(X0, X2, M0011)
SWAP(X1, X3, M0110)
SWAP(X0, X1, M0101)
SWAP(X2, X3, M0101)
movups (SRC), T0
movups 16(SRC), T1
paddd T0, X0
paddd T1, X1
movups X0,(DST)
movups X1,16(DST)
movups 32(SRC), T0
movups 48(SRC), T1
paddd T0, X2
paddd T1, X3
movups X2,32(DST)
movups X3,48(DST)
W64_EXIT(3, 9)
ret
EPILOGUE(_nettle_salsa20_core)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment