diff --git a/ChangeLog b/ChangeLog
index d96a59db247cf07e29f4545f0f49b6ee5e6e1847..7298cf3053be5b92ca35f973f978812463d428bf 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2014-01-27  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/chacha-core-internal.asm: Use pshufhw + pshuflw for the
+	16-bit rotate.
+
 	* configure.ac (asm_replace_list): Added chacha-core-internal.asm.
 	* x86_64/chacha-core-internal.asm: New file.
 
diff --git a/x86_64/chacha-core-internal.asm b/x86_64/chacha-core-internal.asm
index 744e0ddfd513d0f014d59a66c392e68cd8efeb9f..15e457183dfd98c493edcbaaa096515b1fc6b96c 100644
--- a/x86_64/chacha-core-internal.asm
+++ b/x86_64/chacha-core-internal.asm
@@ -26,15 +26,36 @@ define(<X2>, <%xmm2>)
 define(<X3>, <%xmm3>)
 define(<T0>, <%xmm4>)
 define(<T1>, <%xmm5>)
-	
+
+C USE_PSHUFW selects how ROTL_BY_16 is implemented: yes uses the
+C pshufhw + pshuflw word shuffles, anything else the shift-and-or
+C sequence.
+define(<USE_PSHUFW>, <yes>)
+
+C ROTL_BY_16(REG, TMP)
+C Rotate each 32-bit element of REG left by 16 bits.
+C Shuffle variant: selector 0xb1 swaps the two 16-bit words of every
+C 32-bit element; pshufhw handles the high quadword and pshuflw the
+C low one. TMP is not touched.
+C Shift variant: copies REG to TMP, then ORs REG shifted left by 16
+C with TMP shifted right by 16. TMP is clobbered.
+ifelse(USE_PSHUFW, <yes>, <
+define(<ROTL_BY_16>, <
+	pshufhw	<$>0xb1, $1, $1
+	pshuflw	<$>0xb1, $1, $1
+>)>, <
+define(<ROTL_BY_16>, <
+	movaps	$1, $2
+	pslld	<$>16, $1
+	psrld	<$>16, $2
+	por	$2, $1
+>)
+>)
 C QROUND
 define(<QROUND>, <
 	paddd	X1, X0
 	pxor	X0, X3
-	movaps	X3, T0
-	pslld	<$>16, X3
-	psrld	<$>16, T0
-	por	T0, X3
+	ROTL_BY_16(X3, T0)
 
 	paddd	X3, X2
 	pxor	X2, X1