From f51537c4e7c6e594adcb3d8ea817cbcd3db64bd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Mon, 27 Jan 2014 19:14:11 +0100
Subject: [PATCH] Use pshufhw + pshuflw for 16-bit rotate.

---
 ChangeLog                       |  3 +++
 x86_64/chacha-core-internal.asm | 20 ++++++++++++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index d96a59db..7298cf30 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2014-01-27  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/chacha-core-internal.asm: Use pshufhw + pshuflw for the
+	16-bit rotate.
+
 	* configure.ac (asm_replace_list): Added chacha-core-internal.asm.
 	* x86_64/chacha-core-internal.asm: New file.
 
diff --git a/x86_64/chacha-core-internal.asm b/x86_64/chacha-core-internal.asm
index 744e0ddf..15e45718 100644
--- a/x86_64/chacha-core-internal.asm
+++ b/x86_64/chacha-core-internal.asm
@@ -26,15 +26,27 @@ define(<X2>, <%xmm2>)
 define(<X3>, <%xmm3>)
 define(<T0>, <%xmm4>)
 define(<T1>, <%xmm5>)
-	
+
+define(<USE_PSHUFW>, <yes>)
+
+C ROTL_BY_16(REG, TMP): rotate each 32-bit lane of REG left by 16 bits. Only the shift fallback clobbers TMP.
+ifelse(USE_PSHUFW, <yes>, <
+define(<ROTL_BY_16>, <
+	pshufhw	<$>0xb1, $1, $1
+	pshuflw	<$>0xb1, $1, $1
+>)>, <
+define(<ROTL_BY_16>, <
+	movaps	$1, $2
+	pslld	<$>16, $1
+	psrld	<$>16, $2
+	por	$2, $1
+>)
+>)
 C QROUND
 define(<QROUND>, <
 	paddd	X1, X0
 	pxor	X0, X3
-	movaps	X3, T0
-	pslld	<$>16, X3
-	psrld	<$>16, T0
-	por	T0, X3
+	ROTL_BY_16(X3, T0)
 
 	paddd	X3, X2
 	pxor	X2, X1
-- 
GitLab