diff --git a/ChangeLog b/ChangeLog
index 49f223123b06978a01b0e39425b2c50c5c1b6e4a..0f0a0ae98abe9a1cc7b33e379f8f63bd7be4ac34 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-10-29  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/salsa20-core-internal.asm: New file.
+	* configure.ac: Added salsa20-core-internal.asm.
+	* examples/nettle-benchmark.c (bench_salsa20_core): New function.
+
 2012-10-27  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/Makefile.in (TS_SOURCES, CXX_SOURCES): Include sources
diff --git a/configure.ac b/configure.ac
index 998ed5570cd499c2d4679102295d3115a28f8485..b5f3571b002629416b54dee6ab0eec79e123ba6e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -237,7 +237,8 @@ if test "x$enable_assembler" = xyes ; then
     found=no
     for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		 arcfour-crypt.asm camellia-crypt-internal.asm \
-		 md5-compress.asm memxor.asm salsa20-crypt.asm \
+		 md5-compress.asm memxor.asm \
+		 salsa20-crypt.asm salsa20-core-internal.asm \
 		 serpent-encrypt.asm serpent-decrypt.asm \
 		 sha1-compress.asm machine.m4; do
 #       echo "Looking for $srcdir/$asm_path/$tmp_f"
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index 3d0786862ec8cbb1b7edf0c4801b56c0cadd93ec..7446fbcec2c73a177bfd84e833a27e77055d230d 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -47,6 +47,7 @@
 #include "des.h"
 #include "gcm.h"
 #include "memxor.h"
+#include "salsa20.h"
 #include "serpent.h"
 #include "sha.h"
 #include "twofish.h"
@@ -563,10 +564,10 @@ compare_double(const void *ap, const void *bp)
 }
 
 /* Try to get accurate cycle times for assembler functions. */
+#if WITH_CYCLE_COUNTER
 static void
 bench_sha1_compress(void)
 {
-#if WITH_CYCLE_COUNTER
   uint32_t state[_SHA1_DIGEST_LENGTH];
   uint8_t data[BENCH_ITERATIONS * SHA1_DATA_SIZE];
   uint32_t start_lo, start_hi, end_lo, end_hi;
@@ -594,9 +595,42 @@ bench_sha1_compress(void)
 
   qsort(count, 5, sizeof(double), compare_double);
   printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);  
-#endif
 }
 
+/* Print the median, over 5 runs, of the average cycles per salsa20 core call. */
+static void
+bench_salsa20_core(void)
+{
+  /* Start from a defined (all-zero) input, not indeterminate stack contents. */
+  uint32_t state[_SALSA20_INPUT_LENGTH] = {0};
+  uint32_t start_lo, start_hi, end_lo, end_hi;
+  double count[5];
+  unsigned i, j;
+
+  for (j = 0; j < 5; j++)
+    {
+      i = 0;
+      GET_CYCLE_COUNTER(start_hi, start_lo);
+      for (; i < BENCH_ITERATIONS; i++)
+	_nettle_salsa20_core(state, state, 20);
+
+      GET_CYCLE_COUNTER(end_hi, end_lo);
+
+      /* Two-word subtraction end - start, borrowing from the high word. */
+      end_hi -= (start_hi + (start_lo > end_lo));
+      end_lo -= start_lo;
+
+      count[j] = ldexp(end_hi, 32) + end_lo;
+    }
+
+  qsort(count, 5, sizeof(double), compare_double);
+  printf("salsa20_core: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);  
+}
+#else
+#define bench_sha1_compress()
+#define bench_salsa20_core()
+#endif
+
 #if WITH_OPENSSL
 # define OPENSSL(x) x,
 #else
@@ -684,7 +718,7 @@ main(int argc, char **argv)
     }
 #endif
   bench_sha1_compress();
-
+  bench_salsa20_core();
   time_overhead();
 
   header();
diff --git a/x86_64/salsa20-core-internal.asm b/x86_64/salsa20-core-internal.asm
new file mode 100644
index 0000000000000000000000000000000000000000..81ca2cc83d14712654e1d9148f440e3e3685cb28
--- /dev/null
+++ b/x86_64/salsa20-core-internal.asm
@@ -0,0 +1,98 @@
+C nettle, low-level cryptographics library
+C 
+C Copyright (C) 2012 Niels Möller
+C  
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+define(<DST>, <%rdi>)	C uint32_t *dst
+define(<SRC>, <%rsi>)	C const uint32_t *src
+define(<COUNT>, <%rdx>)	C unsigned rounds, halved and used as the loop counter
+define(<X0>, <%xmm0>)	C X0-X3 hold the 64 bytes loaded from src
+define(<X1>, <%xmm1>)
+define(<X2>, <%xmm2>)
+define(<X3>, <%xmm3>)
+define(<T0>, <%xmm4>)	C T0, T1: temporaries
+define(<T1>, <%xmm5>)
+define(<M0101>, <%xmm6>)	C M0101, M0110, M0011: masks passed to SWAP
+define(<M0110>, <%xmm7>)
+define(<M0011>, <%xmm8>)
+
+include_src(<x86_64/salsa20.m4>)
+
+	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+	.text
+	ALIGN(4)
+PROLOGUE(_nettle_salsa20_core)
+	W64_ENTRY(3, 9)	C macro defined elsewhere; presumably 3 args, 9 xmm regs
+	C Writes 64 bytes to dst: the mixed state plus the original src words.
+	C Load mask registers; 01 marks an all-ones dword, most sign. left.
+	mov	$-1, %eax
+	movd	%eax, M0101		C all ones in dword 0 only
+	pshufd	$0x09, M0101, M0011	C 01 01 00 00
+	pshufd	$0x41, M0101, M0110	C 00 01 01 00
+	pshufd	$0x22, M0101, M0101	C 01 00 01 00
+	C Unaligned loads of the four 16-byte rows of the input.
+	movups	(SRC), X0
+	movups	16(SRC), X1
+	movups	32(SRC), X2
+	movups	48(SRC), X3
+
+	C After these swaps, the original rows are diagonals.
+	SWAP(X0, X1, M0101)
+	SWAP(X2, X3, M0101)
+	SWAP(X1, X3, M0110)
+	SWAP(X0, X2, M0011)	
+	C Each loop iteration runs QROUND twice, so iterate rounds/2 times.
+	shrl	$1, XREG(COUNT)
+	C NOTE(review): COUNT is tested after the body; rounds below 2 would wrap.
+	ALIGN(4)
+.Loop:
+	QROUND(X0, X1, X2, X3)
+	pshufd	$0x93, X1, X1	C	11 00 01 10 (least sign. left)
+	pshufd	$0x4e, X2, X2	C	10 11 00 01
+	pshufd	$0x39, X3, X3	C	01 10 11 00
+
+	QROUND(X0, X3, X2, X1)
+
+	C Inverse rotation of the rows (undoes the three shuffles above)
+	pshufd	$0x39, X1, X1	C	01 10 11 00
+	pshufd	$0x4e, X2, X2	C	10 11 00 01
+	pshufd	$0x93, X3, X3	C	11 00 01 10
+
+	decl	XREG(COUNT)
+	jnz	.Loop
+	C Mirror the swaps done before the loop to restore the row layout.
+	SWAP(X0, X2, M0011)	
+	SWAP(X1, X3, M0110)
+	SWAP(X0, X1, M0101)
+	SWAP(X2, X3, M0101)
+	C Add the input words, re-read from src, and store the sums to dst.
+	movups	(SRC), T0
+	movups	16(SRC), T1
+	paddd	T0, X0
+	paddd	T1, X1
+	movups	X0,(DST)
+	movups	X1,16(DST)
+	movups	32(SRC), T0
+	movups	48(SRC), T1
+	paddd	T0, X2
+	paddd	T1, X3
+	movups	X2,32(DST)
+	movups	X3,48(DST)
+	
+	W64_EXIT(3, 9)
+	ret
+EPILOGUE(_nettle_salsa20_core)