From 3be646d1cdbffbceef956de554a005320a1aa63d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Fri, 12 Apr 2013 12:22:56 +0200
Subject: [PATCH] ARM assembly for umac_nh.

---
 ChangeLog         |  3 ++
 armv7/machine.m4  | 38 ++++++++++++++++++++
 armv7/umac-nh.asm | 91 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 armv7/umac-nh.asm

diff --git a/ChangeLog b/ChangeLog
index 86f75f11..7a163fe7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2013-04-12  Niels Möller  <nisse@lysator.liu.se>
 
+	* armv7/umac-nh.asm: New file. 2.1 times speedup.
+	* armv7/machine.m4 (D0REG, D1REG): New macros.
+
 	* configure.ac (asm_replace_list): Added umac-nh.asm and
 	umac-nh-n.asm.
 
diff --git a/armv7/machine.m4 b/armv7/machine.m4
index a829c76e..f982a66a 100644
--- a/armv7/machine.m4
+++ b/armv7/machine.m4
@@ -16,3 +16,41 @@ define(<QREG>, <ifelse(
 	$1, d28, q14,
 	$1, d30, q15,
 	<NO REGISTER>)>)dnl
+C D0REG(qN) expands to d(2N), the d register that overlaps the low 64 bits of qN; any other argument gives NO REGISTER.
+define(<D0REG>, <ifelse(
+	$1, q0, d0,
+	$1, q1, d2,
+	$1, q2, d4,
+	$1, q3, d6,
+	$1, q4, d8,
+	$1, q5, d10,
+	$1, q6, d12,
+	$1, q7, d14,
+	$1, q8, d16,
+	$1, q9, d18,
+	$1, q10, d20,
+	$1, q11, d22,
+	$1, q12, d24,
+	$1, q13, d26,
+	$1, q14, d28,
+	$1, q15, d30,
+	<NO REGISTER>)>)dnl
+C D1REG(qN) expands to d(2N+1), the d register that overlaps the high 64 bits of qN; any other argument gives NO REGISTER.
+define(<D1REG>, <ifelse(
+	$1, q0, d1,
+	$1, q1, d3,
+	$1, q2, d5,
+	$1, q3, d7,
+	$1, q4, d9,
+	$1, q5, d11,
+	$1, q6, d13,
+	$1, q7, d15,
+	$1, q8, d17,
+	$1, q9, d19,
+	$1, q10, d21,
+	$1, q11, d23,
+	$1, q12, d25,
+	$1, q13, d27,
+	$1, q14, d29,
+	$1, q15, d31,
+	<NO REGISTER>)>)dnl
diff --git a/armv7/umac-nh.asm b/armv7/umac-nh.asm
new file mode 100644
index 00000000..7c0a0290
--- /dev/null
+++ b/armv7/umac-nh.asm
@@ -0,0 +1,91 @@
+C nettle, low-level cryptographics library
+C 
+C Copyright (C) 2013 Niels Möller
+C  
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "umac-nh.asm"
+	.fpu	neon
+C Core registers: KEY, LENGTH and MSG are inputs; SHIFT is scratch, used for the alignment shift counts
+define(<KEY>, <r0>)
+define(<LENGTH>, <r1>)
+define(<MSG>, <r2>)
+define(<SHIFT>, <r3>)
+C NEON registers, none in the callee-saved range d8-d15: QA, QB message; DM carry; QLEFT, QRIGHT shift counts; QACC sum; QT0, QT1 temporaries; QK0, QK1 key
+define(<QA>, <q0>)
+define(<QB>, <q1>)
+define(<DM>, <d16>)
+define(<QLEFT>, <q9>)
+define(<QRIGHT>, <q10>)
+define(<QACC>, <q11>)
+define(<QT0>, <q12>)
+define(<QT1>, <q13>)
+define(<QK0>, <q14>)
+define(<QK1>, <q15>)
+
+	.text
+	.align	3
+C In: KEY (r0), LENGTH (r1, bytes; 32 consumed per iteration, at least one iteration runs), MSG (r2, any alignment).  Out: 64-bit sum, low word in r0, high word in r1.
+PROLOGUE(_nettle_umac_nh)
+	C Setup for 64-bit aligned reads: SHIFT <-- MSG mod 8, MSG rounded down to 8 bytes, DM <-- first doubleword
+	ands	SHIFT, MSG, #7
+	and	MSG, MSG, #-8
+	vld1.8	{DM}, [MSG :64]
+	addne	MSG, MSG, #8
+	addeq	SHIFT, SHIFT, #8
+	C Unaligned: MSG now points past DM.  Aligned: SHIFT is 8, so DM gets shifted out entirely and the loop rereads it.
+	C FIXME: Combine as rsb ?
+	lsl	SHIFT, SHIFT, #3
+	neg	SHIFT, SHIFT
+	C SHIFT <-- -8 * SHIFT; vshl takes a signed count from the low byte of each element, negative for right shift
+	C Right shift in QRIGHT (both halves)
+	vmov.i32 D0REG(QRIGHT)[0], SHIFT
+	vmov.32	 D1REG(QRIGHT), D0REG(QRIGHT)
+	add	SHIFT, SHIFT, #64
+	C Left shift count <-- 64 - right shift amount, in QLEFT (both halves)
+	vmov.i32 D0REG(QLEFT)[0], SHIFT
+	vmov.32	 D1REG(QLEFT), D0REG(QLEFT)
+	C Clear both 64-bit lanes of the accumulator
+	vmov.i64 QACC, #0
+	C DM <-- m[-1] >> RSHIFT, the part of the first doubleword carried into the first iteration
+	vshl.u64 DM, DM, D0REG(QRIGHT)
+.Loop:
+	C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
+	vld1.8 {QA, QB}, [MSG :64]!
+	vshl.u64 QT0, QA, QRIGHT
+	vshl.u64 QT1, QB, QRIGHT
+	vshl.u64 QA, QA, QLEFT
+	vshl.u64 QB, QB, QLEFT
+	veor	D0REG(QA), D0REG(QA), DM
+	veor	D1REG(QA), D1REG(QA), D0REG(QT0)
+	veor	D0REG(QB), D0REG(QB), D1REG(QT0)
+	veor	D1REG(QB), D1REG(QB), D0REG(QT1)
+	vmov	DM, D1REG(QT1)
+	C Add 32 bytes of key as 32-bit words, then multiply word j of QA by word j of QB into 64-bit products
+	vld1.i32 {QK0, QK1}, [KEY]!
+	vadd.i32 QA, QA, QK0
+	vadd.i32 QB, QB, QK1
+	vmull.u32 QT0, D0REG(QA), D0REG(QB)
+	vmull.u32 QT1, D1REG(QA), D1REG(QB)
+	subs	LENGTH, LENGTH, #32
+	vadd.i64 QACC, QACC, QT0
+	vadd.i64 QACC, QACC, QT1
+	bhi	.Loop
+	C Sum the two accumulator lanes; return the low word in r0 and the high word in r1
+	vadd.i64 D0REG(QACC), D0REG(QACC), D1REG(QACC)
+	vmov	r0, r1, D0REG(QACC)
+	bx	lr
+EPILOGUE(_nettle_umac_nh)
-- 
GitLab