From 8c02a9dca9ea9e1ea99083d7d1291538ecc2f39d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 6 Mar 2013 15:53:01 +0100
Subject: [PATCH] x86_64 assembly for ecc_384_modp.

---
 ChangeLog               |   1 +
 x86_64/ecc-384-modp.asm | 265 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+)
 create mode 100644 x86_64/ecc-384-modp.asm

diff --git a/ChangeLog b/ChangeLog
index 3b2a590a..1bc9b650 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,6 @@
 2013-03-06  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/ecc-384-modp.asm: New file, 3 times speedup.
 	* x86_64/ecc-256-redc.asm: New file, 2.5 time speedup.
 	* x86_64/ecc-224-modp.asm: New file, 5 time speedup over C
 	version.
diff --git a/x86_64/ecc-384-modp.asm b/x86_64/ecc-384-modp.asm
new file mode 100644
index 00000000..698838fd
--- /dev/null
+++ b/x86_64/ecc-384-modp.asm
@@ -0,0 +1,265 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02110-1301, USA.
+
+	.file "ecc-384-modp.asm"
+
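+C Sketch of the reduction, for the P-384 prime: with B = 2^64 and
+C   p = 2^384 - 2^128 - 2^96 + 2^32 - 1,
+C we have
+C
+C   B^6 = 2^384 == B^2 + 2^32 B - 2^32 + 1  (mod p)
+C
+C so the six high limbs of the 12-limb product can be folded back
+C onto the six low limbs; the top two limbs need two foldings.
+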
+define(<RP>, <%rsi>)
+define(<D4>, <%rax>)
+define(<T0>, <%rbx>)
+define(<T1>, <%rcx>)
+define(<T2>, <%rdx>)
+define(<T3>, <%rbp>)
+define(<T4>, <%rdi>)
+define(<T5>, <%r8>)
+define(<H0>, <%r9>)
+define(<H1>, <%r10>)
+define(<H2>, <%r11>)
+define(<H3>, <%r12>)
+define(<H4>, <%r13>)
+define(<H5>, <%r14>)
+define(<C2>, <%r15>)
+define(<C0>, H5)	C Overlap
+define(<D0>, RP)	C Overlap
+define(<TMP>, H4)	C Overlap
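+
+C The overlaps are safe: C0 is first written after the last use of
+C H5, D0 is used only while RP is saved on the stack, and TMP is
+C first written after the last use of H4.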
+
+PROLOGUE(nettle_ecc_384_modp)
+	W64_ENTRY(2, 0)
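+	C W64_ENTRY/W64_EXIT handle the Windows x86_64 calling
+	C convention (2 arguments, no xmm registers used)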
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	C First get top 2 limbs, which need folding twice
+	C
+	C   H5 H4
+	C     -H5
+	C  ------
+	C   H0 D4
+	C
+C Then shift left, (H1,H0,D4)  <--  (H0,D4) << 32
+	C and add
+	C
+	C     H5 H4
+	C     H1 H0
+	C ----------
+	C  C2 H1 H0
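+	C
+	C C2 ends up at weight B^8 and is folded in at the end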
+
+	mov	80(RP), D4
+	mov	88(RP), H0
+	mov	D4, H4
+	mov	H0, H5
+	sub	H0, D4
+	sbb	$0, H0
+
+	mov	D4, T2
+	mov	H0, H1
+	shl	$32, H0
+	shr	$32, T2
+	shr	$32, H1
+	or	T2, H0
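+	C (H1,H0) is now the top two limbs of (H0,D4) << 32; the bottom
+	C limb (the low 32 bits of D4) is picked up again below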
+
+	xor	C2, C2
+	add	H4, H0
+	adc	H5, H1
+	adc	$0, C2
+
+	C Add in the high part (limbs 6 and 7 of the product)
+	add	48(RP), H0
+	adc	56(RP), H1
+	adc	$0, C2		C Do C2 later
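+	C (C2,H1,H0) now holds limbs 6 and 7 plus the folded top limbs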
+
+	C +1 term
+	mov	(RP), T0
+	add	H0, T0
+	mov	8(RP), T1
+	adc	H1, T1
+	mov	16(RP), T2
+	mov	64(RP), H2
+	adc	H2, T2
+	mov	24(RP), T3
+	mov	72(RP), H3
+	adc	H3, T3
+	mov	32(RP), T4
+	adc	H4, T4
+	mov	40(RP), T5
+	adc	H5, T5
+	sbb	C0, C0
+	neg	C0		C FIXME: Switch sign of C0?
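+	C C0 is now the carry, 0 or 1, out of the +1 term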
+
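+	C Save RP; D0 overlaps it. Restored before the final store.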
+	push	RP
+
+	C +B^2 term
+	add	H0, T2
+	adc	H1, T3
+	adc	H2, T4
+	adc	H3, T5
+	adc	$0, C0
+
+	C   H3 H2 H1 H0  0
+	C - H4 H3 H2 H1 H0
+	C  ---------------
+	C   H3 H2 H1 H0 D0
+
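+	C Zero-extend D4; only its low 32 bits are still pending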
+	mov	XREG(D4), XREG(D4)
+	mov	H0, D0
+	neg	D0
+	sbb	H1, H0
+	sbb	H2, H1
+	sbb	H3, H2
+	sbb	H4, H3
+	sbb	$0, D4
+
+	C Shift left 32 bits. High bits are the sign, to be added to C0.
+	mov	D4, TMP
+	sar	$32, TMP
+	shl	$32, D4
+	add	TMP, C0
+
+	mov	H3, TMP
+	shr	$32, TMP
+	shl	$32, H3
+	or	TMP, D4
+
+	mov	H2, TMP
+	shr	$32, TMP
+	shl	$32, H2
+	or	TMP, H3
+
+	mov	H1, TMP
+	shr	$32, TMP
+	shl	$32, H1
+	or	TMP, H2
+
+	mov	H0, TMP
+	shr	$32, TMP
+	shl	$32, H0
+	or	TMP, H1
+
+	mov	D0, TMP
+	shr	$32, TMP
+	shl	$32, D0
+	or	TMP, H0
+
+	add	D0, T0
+	adc	H0, T1
+	adc	H1, T2
+	adc	H2, T3
+	adc	H3, T4
+	adc	D4, T5
+	adc	$0, C0
+
+	C Remains to add in C2 and C0
+	C                         C0  C0<<32  (-2^32+1)C0
+	C    C2  C2<<32  (-2^32+1)C2
+	C where C2 is always positive, while C0 may be -1.
+	mov	C0, H0
+	mov	C0, H1
+	mov	C0, H2
+	sar	$63, C0		C Get sign
+	shl	$32, H1
+	sub	H1, H0		C Gives borrow iff C0 > 0
+	sbb	$0, H1
+	add	C0, H2
+
+	add	H0, T0
+	adc	H1, T1
+	adc	$0, H2
+	adc	$0, C0
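+	C The limb-2 part of this fold is pending in H2, its sign in
+	C C0; both are combined with the C2 terms below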
+
+	C Set (H1 H0)  <-- C2 << 96 - C2 << 32 + C2
+	mov	C2, H0
+	mov	C2, H1
+	shl	$32, H1
+	sub	H1, H0
+	sbb	$0, H1
+
+	add	H2, H0
+	adc	C0, H1
+	adc	C2, C0
+	mov	C0, H2
+	sar	$63, C0
+	add	H0, T2
+	adc	H1, T3
+	adc	H2, T4
+	adc	C0, T5
+	sbb	C0, C0
+
+	C Final unlikely carry: C0 = -carry here, so the fold gets subtracted
+	mov	C0, H0
+	mov	C0, H1
+	mov	C0, H2
+	sar	$63, C0
+	shl	$32, H1
+	sub	H1, H0
+	sbb	$0, H1
+	add	C0, H2
+
+	pop	RP
+
+	sub	H0, T0
+	mov	T0, (RP)
+	sbb	H1, T1
+	mov	T1, 8(RP)
+	sbb	H2, T2
+	mov	T2, 16(RP)
+	sbb	C0, T3
+	mov	T3, 24(RP)
+	sbb	C0, T4
+	mov	T4, 32(RP)
+	sbb	C0, T5
+	mov	T5, 40(RP)
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	W64_EXIT(2, 0)
+	ret
+EPILOGUE(nettle_ecc_384_modp)
-- 
GitLab