From 8c4ef180abef60d65d94382055e2fa9c558202a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 25 Jun 2020 23:26:56 +0200
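Subject: [PATCH] arm: Micro optimize neon implementation of salsa20 and chacha

Form each 32-bit rotation in QROUND as vshl.i32 followed by vsra.u32
(shift right and accumulate), instead of computing the two halves with
vshl/vshr and merging them with an extra veor. The bits of x << n and
x >> (32 - n) never overlap, so the addition done by vsra.u32 gives the
same result as the xor it replaces. This saves one instruction per
rotation.
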
---
 ChangeLog                          |  5 +++++
 arm/neon/chacha-core-internal.asm  | 28 ++++++++++++----------------
 arm/neon/salsa20-core-internal.asm | 12 ++++--------
 3 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index c3d1b4fa..ef037fc0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2020-06-25  Niels Möller  <nisse@lysator.liu.se>
 
+	* arm/neon/chacha-core-internal.asm (QROUND): Micro optimize
+	rotations, using the vsra.u32 instruction. Gives 10% speedup,
+	benchmarked on Cortex-A5. Suggested by Torbjörn Granlund.
+	* arm/neon/salsa20-core-internal.asm (QROUND): Likewise.
+
 	* x86_64/chacha-core-internal.asm (QROUND): Fix use of macro
 	arguments. Spotted by Torbjörn Granlund.
 
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index 22f843e8..799376f2 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -54,28 +54,24 @@ define(<QROUND>, <
 	C x2 += x3, x1 ^= x2, x1 lrot 7
 
 	vadd.i32	$1, $1, $2
-	veor		$4, $4, $1
-	vshl.i32	T0, $4, #16
-	vshr.u32	$4, $4, #16
-	veor		$4, $4, T0
+	veor		T0, $4, $1
+	vshl.i32	$4, T0, #16
+	vsra.u32	$4, T0, #16
 
 	vadd.i32	$3, $3, $4
-	veor		$2, $2, $3
-	vshl.i32	T0, $2, #12
-	vshr.u32	$2, $2, #20
-	veor		$2, $2, T0
+	veor		T0, $2, $3
+	vshl.i32	$2, T0, #12
+	vsra.u32	$2, T0, #20
 
 	vadd.i32	$1, $1, $2
-	veor		$4, $4, $1
-	vshl.i32	T0, $4, #8
-	vshr.u32	$4, $4, #24
-	veor		$4, $4, T0
+	veor		T0, $4, $1
+	vshl.i32	$4, T0, #8
+	vsra.u32	$4, T0, #24
 
 	vadd.i32	$3, $3, $4
-	veor		$2, $2, $3
-	vshl.i32	T0, $2, #7
-	vshr.u32	$2, $2, #25
-	veor		$2, $2, T0
+	veor		T0, $2, $3
+	vshl.i32	$2, T0, #7
+	vsra.u32	$2, T0, #25
 >)
 
 	.text
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index 20710499..590c3bb1 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -53,26 +53,22 @@ define(<S3>, <q15>)
 define(<QROUND>, <
 	vadd.i32	T0, $1, $4
 	vshl.i32	T1, T0, #7
-	vshr.u32	T0, T0, #25
-	veor		$2, $2, T0
+	vsra.u32	T1, T0, #25
 	veor		$2, $2, T1
 
 	vadd.i32	T0, $1, $2
 	vshl.i32	T1, T0, #9
-	vshr.u32	T0, T0, #23
-	veor		$3, $3, T0
+	vsra.u32	T1, T0, #23
 	veor		$3, $3, T1
 
 	vadd.i32	T0, $2, $3
 	vshl.i32	T1, T0, #13
-	vshr.u32	T0, T0, #19
-	veor		$4, $4, T0
+	vsra.u32	T1, T0, #19
 	veor		$4, $4, T1
 
 	vadd.i32	T0, $3, $4
 	vshl.i32	T1, T0, #18
-	vshr.u32	T0, T0, #14
-	veor		$1, $1, T0
+	vsra.u32	T1, T0, #14
 	veor		$1, $1, T1
 >)
 	
-- 
GitLab