Commit 3e302cc0 authored by Niels Möller's avatar Niels Möller

Added x86_64 assembly for _salsa20_core.

parent 0f46bfe9
2012-10-29 Niels Möller <>
* x86_64/salsa20-core-internal.asm: New file.
* Added salsa20-core-internal.asm.
* examples/nettle-benchmark.c (bench_salsa20_core): New function.
2012-10-27 Niels Möller <>
* testsuite/ (TS_SOURCES, CXX_SOURCES): Include sources
......@@ -237,7 +237,8 @@ if test "x$enable_assembler" = xyes ; then
for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \
arcfour-crypt.asm camellia-crypt-internal.asm \
md5-compress.asm memxor.asm salsa20-crypt.asm \
md5-compress.asm memxor.asm \
salsa20-crypt.asm salsa20-core-internal.asm \
serpent-encrypt.asm serpent-decrypt.asm \
sha1-compress.asm machine.m4; do
# echo "Looking for $srcdir/$asm_path/$tmp_f"
......@@ -47,6 +47,7 @@
#include "des.h"
#include "gcm.h"
#include "memxor.h"
#include "salsa20.h"
#include "serpent.h"
#include "sha.h"
#include "twofish.h"
......@@ -563,10 +564,10 @@ compare_double(const void *ap, const void *bp)
/* Try to get accurate cycle times for assembler functions. */
static void
uint32_t state[_SHA1_DIGEST_LENGTH];
uint32_t start_lo, start_hi, end_lo, end_hi;
......@@ -594,9 +595,42 @@ bench_sha1_compress(void)
qsort(count, 5, sizeof(double), compare_double);
printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);
static void
uint32_t state[_SALSA20_INPUT_LENGTH];
uint32_t start_lo, start_hi, end_lo, end_hi;
double count[5];
uint8_t *p;
unsigned i, j;
for (j = 0; j < 5; j++)
i = 0;
GET_CYCLE_COUNTER(start_hi, start_lo);
for (; i < BENCH_ITERATIONS; i++)
_nettle_salsa20_core(state, state, 20);
GET_CYCLE_COUNTER(end_hi, end_lo);
end_hi -= (start_hi + (start_lo > end_lo));
end_lo -= start_lo;
count[j] = ldexp(end_hi, 32) + end_lo;
qsort(count, 5, sizeof(double), compare_double);
printf("salsa20_core: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);
#define bench_sha1_compress()
#define bench_salsa20_core()
# define OPENSSL(x) x,
......@@ -684,7 +718,7 @@ main(int argc, char **argv)
C nettle, low-level cryptographics library
C Copyright (C) 2012 Niels Möller
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
define(<DST>, <%rdi>)
define(<SRC>, <%rsi>)
define(<COUNT>, <%rdx>)
define(<X0>, <%xmm0>)
define(<X1>, <%xmm1>)
define(<X2>, <%xmm2>)
define(<X3>, <%xmm3>)
define(<T0>, <%xmm4>)
define(<T1>, <%xmm5>)
define(<M0101>, <%xmm6>)
define(<M0110>, <%xmm7>)
define(<M0011>, <%xmm8>)
C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
W64_ENTRY(3, 9)
C Load mask registers
mov $-1, %eax
movd %eax, M0101
pshufd $0x09, M0101, M0011 C 01 01 00 00
pshufd $0x41, M0101, M0110 C 01 00 00 01
pshufd $0x22, M0101, M0101 C 01 00 01 00
movups (SRC), X0
movups 16(SRC), X1
movups 32(SRC), X2
movups 48(SRC), X3
C The original rows are now diagonals.
SWAP(X0, X1, M0101)
SWAP(X2, X3, M0101)
SWAP(X1, X3, M0110)
SWAP(X0, X2, M0011)
shrl $1, XREG(COUNT)
QROUND(X0, X1, X2, X3)
pshufd $0x93, X1, X1 C 11 00 01 10 (least sign. left)
pshufd $0x4e, X2, X2 C 10 11 00 01
pshufd $0x39, X3, X3 C 01 10 11 00
QROUND(X0, X3, X2, X1)
C Inverse rotation of the rows
pshufd $0x39, X1, X1 C 01 10 11 00
pshufd $0x4e, X2, X2 C 10 11 00 01
pshufd $0x93, X3, X3 C 11 00 01 10
jnz .Loop
SWAP(X0, X2, M0011)
SWAP(X1, X3, M0110)
SWAP(X0, X1, M0101)
SWAP(X2, X3, M0101)
movups (SRC), T0
movups 16(SRC), T1
paddd T0, X0
paddd T1, X1
movups X0,(DST)
movups X1,16(DST)
movups 32(SRC), T0
movups 48(SRC), T1
paddd T0, X2
paddd T1, X3
movups X2,32(DST)
movups X3,48(DST)
W64_EXIT(3, 9)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment