From 59eb1aa12bbad340877f83f06eca1d47c10231c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 6 Mar 2013 09:33:26 +0100
Subject: [PATCH] x86_64 assembly for ecc_224_modp.

---
 ChangeLog               |   5 ++
 x86_64/ecc-224-modp.asm | 115 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 x86_64/ecc-224-modp.asm

diff --git a/ChangeLog b/ChangeLog
index a8c23743..581d86ca 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-03-06  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/ecc-224-modp.asm: New file, 5 times speedup over C
+	version.
+
 2013-03-05  Niels Möller  <nisse@lysator.liu.se>
 
 	* configure.ac (asm_optional_list): Added ecc-521-modp.asm.
diff --git a/x86_64/ecc-224-modp.asm b/x86_64/ecc-224-modp.asm
new file mode 100644
index 00000000..b759e1f2
--- /dev/null
+++ b/x86_64/ecc-224-modp.asm
@@ -0,0 +1,115 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "ecc-224-modp.asm"
+
+define(<RP>, <%rsi>) C Limb pointer: loads use offsets 0..56, stores use 0..24
+define(<T0>, <%rdi>) C Overlaps unused ecc input
+define(<T1>, <%rcx>) C With T0: the pair of low limbs currently being updated
+define(<H0>, <%rax>) C (H2,H1,H0): the high limbs that get folded downwards
+define(<H1>, <%rdx>)
+define(<H2>, <%r8>) C Extra limb on top of H1, zeroed before first use
+define(<F0>, <%r9>) C (F2,F1,F0): copy of the high limbs shifted by 32 bits
+define(<F1>, <%r10>)
+define(<F2>, <%r11>)
+
+PROLOGUE(nettle_ecc_224_modp)	C Folds the 8 limbs at RP down to 4 limbs, in place
+	W64_ENTRY(2, 0)	C Macro defined elsewhere; presumably adapts a 2-argument Win64 call
+	mov	48(RP), H0	C Load limbs 6 and 7, the top 128 bits, into (H1,H0)
+	mov	56(RP), H1
+	C Set (F2,F1,F0)  <--  (H1,H0) << 32, a 160-bit value held in three limbs
+	mov	H0, F0
+	mov	H0, F1
+	shl	$32, F0		C F0 = low half of H0, moved to the high half
+	shr	$32, F1		C F1 = high half of H0, for now
+	mov	H1, F2
+	mov	H1, T0
+	shl	$32, T0		C T0 = low half of H1, moved to the high half
+	shr	$32, F2		C F2 = high half of H1
+	or	T0, F1		C F1 = (low half of H1) : (high half of H0)
+	C Fold 1, H = (H1,H0): H*2^384 becomes H*2^256 - (H << 32)*2^128, using 2^224 = 2^96 - 1 (mod p)
+	xor	H2, H2		C H2 = 0; receives the carry of the additions below
+	mov	16(RP), T0	C Load limbs 2 and 3 into (T1,T0)
+	mov	24(RP), T1
+	sub	F0, T0		C Subtract (F2,F1,F0) from (H1,H0,T1,T0), one borrow chain
+	sbb	F1, T1
+	sbb	F2, H0
+	sbb	$0, H1		C No further borrow, so the adc chain below starts with carry clear
+
+	adc	32(RP), H0	C Add in limbs 4 and 5
+	adc	40(RP), H1
+	adc	$0, H2		C Value is now limbs 1, 0 plus (H2,H1,H0,T1,T0) * 2^128
+
+	C Set (F2,F1,F0)  <--  (H2,H1,H0) << 32 (low three limbs of the shifted value)
+	C To free registers, add in T1, T0 as soon as H0, H1 have been copied
+	mov	H0, F0
+	mov	H0, F1
+	add	T0, H0		C Start of (H2,H1,H0) += (T1,T0), interleaved with the copies
+	mov	H1, F2
+	mov	H1, T0		C mov does not touch the carry from the add
+	adc	T1, H1
+	mov	H2, T1		C T1 = H2 as it was before the carry is added in
+	adc	$0, H2
+
+	C Shift the copies left by 32 bits across the three limbs
+	shl	$32, F0
+	shr	$32, F1
+	shl	$32, T0		C T0 = low half of the copied H1, moved up
+	shr	$32, F2
+	shl	$32, T1		C T1 = copied H2, moved up
+	or	T0, F1
+	or	T1, F2
+	C Fold 2, H = copied (H2,H1,H0): H*2^256 becomes H*2^128 (added above) - (H << 32)
+	mov	(RP), T0	C Load limbs 0 and 1 into (T1,T0)
+	mov	8(RP), T1
+	sub	F0, T0		C Subtract (F2,F1,F0) from (H2,H1,H0,T1,T0), one borrow chain
+	sbb	F1, T1
+	sbb	F2, H0
+	sbb	$0, H1
+	sbb	$0, H2
+
+	C We now have H2, H1, H0, T1, T0, with 33 bits left to reduce
+	C Set F0       <-- (H2, H1) >> 32
+	C Set (F2,F1)  <-- (H2, H1 & 0xffffffff00000000)
+	C H1  <--  H1 & 0xffffffff
+
+	mov	H1, F0
+	mov	H1, F1
+	mov	H2, F2		C Copy H2 before it is shifted below
+	movl	XREG(H1), XREG(H1)	C Clears high 32 bits
+	sub	H1, F1			C Clears low 32 bits
+	shr	$32, F0
+	shl	$32, H2
+	or	H2, F0		C F0 = (high half of old H1) | (H2 << 32)
+	C Fold 3, Q = (H2,H1) >> 32: Q*2^224 becomes (F2,F1)*2^64 - F0, i.e. Q*2^96 - Q
+	sub	F0, T0		C Limb 0 -= F0
+	sbb	$0, F1		C Take the borrow out of (F2,F1) before it is added
+	sbb	$0, F2
+	add	F1, T1		C Add (F2,F1) in, starting at limb 1
+	adc	F2, H0
+	adc	$0, H1		C H1 was masked to 32 bits above; add the final carry
+
+	mov	T0, (RP)	C Store the 4-limb result over limbs 0..3 of the input
+	mov	T1, 8(RP)
+	mov	H0, 16(RP)
+	mov	H1, 24(RP)	C NOTE(review): result may not be canonical (< p) - confirm with callers
+
+	W64_EXIT(2, 0)	C Counterpart of W64_ENTRY; macro defined elsewhere
+	ret
+EPILOGUE(nettle_ecc_224_modp)
-- 
GitLab