From 10e0615f30f6997675985b655f1cfd6823aa8615 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 28 Feb 2013 11:45:44 +0100
Subject: [PATCH] Reduce number of additions for x86_64 ecc_192_modp.

---
 ChangeLog               |  3 ++
 x86_64/ecc-192-modp.asm | 69 +++++++++++++++++++++--------------------
 2 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 7d397d71..93643d33 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2013-02-28  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/ecc-192-modp.asm: Reorganized to reduce number of
+	additions. Use setc instruction.
+
 	* examples/Makefile.in: Let $(HOGWEED_TARGETS) depend on
 	../libhogweed.a.
 
diff --git a/x86_64/ecc-192-modp.asm b/x86_64/ecc-192-modp.asm
index 288340f9..5812070b 100644
--- a/x86_64/ecc-192-modp.asm
+++ b/x86_64/ecc-192-modp.asm
@@ -20,50 +20,53 @@ C MA 02111-1301, USA.
 	.file "ecc-192-modp.asm"
 
 define(<RP>, <%rsi>)
-define(<T1>, <%rdi>) C Overlaps unused ecc input
-define(<T2>, <%rcx>)
-define(<T3>, <%rdx>)
-define(<T4>, <%r8>)
-define(<T5>, <%r9>)
-define(<T6>, <%r10>)
+define(<T0>, <%rdi>) C Overlaps unused ecc input
+define(<T1>, <%rcx>)
+define(<T2>, <%rdx>)
+define(<T3>, <%r8>)
+define(<H>, <%r9>)
+define(<C1>, <%r10>)
+define(<C2>, <%r11>)
 
 	C ecc_192_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
 	.text
 	ALIGN(4)
 PROLOGUE(nettle_ecc_192_modp)
 	W64_ENTRY(2, 0)
-	C First: (B+1)*{r5, r4} < B^3 + B^2 - B
-	mov	32(RP), T1
-	mov	40(RP), T2
-	mov	T2, T3
-	xor	T4, T4
-	add	T1, T2
-	adc	$0, T3
-	adc	$0, T4
+	mov	16(RP), T2
+	mov	24(RP), T3
+	mov	40(RP), H
+	xor	C1, C1
+	xor	C2, C2
 
-	add	8(RP), T1
-	adc	16(RP), T2
-	adc	24(RP), T3
-	adc	$0, T4
-	C Sum is < 2B^4 + B^3 - B - 1, so {T4, T3} < 3B
+	add	H, T2
+	adc	H, T3
+	C Carry to be added in at T1 and T2
+	setc	LREG(C2)
+	
+	mov	8(RP), T1
+	mov	32(RP), H
+	adc	H, T1
+	adc	H, T2
+	C Carry to be added in at T0 and T1
+	setc	LREG(C1)
+	
+	mov	(RP), T0
+	adc	T3, T0
+	adc	T3, T1
+	adc	$0, C2
 
-	C Next: (B+1) * {T4, T3} < 3B^2 + 2B
-	mov	T4, T5
-	add	T3, T4
-	adc	$0, T5
+	C Add in C1 and C2
+	add	C1, T1
+	adc	C2, T2
+	setc	LREG(C1)
 
-	xor	T6, T6
-	add	(RP), T3
-	adc	T4, T1
-	adc	T5, T2
-	adc	$0, T6
-
-	C Fold in final carry.
-	add	T6, T3
-	adc	T6, T1
+	C Fold final carry.
+	adc	$0, T0
+	adc	C1, T1
 	adc	$0, T2
 
-	mov	T3, (RP)
+	mov	T0, (RP)
 	mov	T1, 8(RP)
 	mov	T2, 16(RP)
 
-- 
GitLab