From 10e0615f30f6997675985b655f1cfd6823aa8615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Thu, 28 Feb 2013 11:45:44 +0100 Subject: [PATCH] Reduce number of additions for x86_64 ecc_192_modp. --- ChangeLog | 3 ++ x86_64/ecc-192-modp.asm | 69 +++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7d397d71..93643d33 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ 2013-02-28 Niels Möller <nisse@lysator.liu.se> + * x86_64/ecc-192-modp.asm: Reorganized to reduce number of + additions. Use setc instruction. + * examples/Makefile.in: Let $(HOGWEED_TARGETS) depend on ../libhogweed.a. diff --git a/x86_64/ecc-192-modp.asm b/x86_64/ecc-192-modp.asm index 288340f9..5812070b 100644 --- a/x86_64/ecc-192-modp.asm +++ b/x86_64/ecc-192-modp.asm @@ -20,50 +20,53 @@ C MA 02111-1301, USA. .file "ecc-192-modp.asm" define(<RP>, <%rsi>) -define(<T1>, <%rdi>) C Overlaps unused ecc input -define(<T2>, <%rcx>) -define(<T3>, <%rdx>) -define(<T4>, <%r8>) -define(<T5>, <%r9>) -define(<T6>, <%r10>) +define(<T0>, <%rdi>) C Overlaps unused ecc input +define(<T1>, <%rcx>) +define(<T2>, <%rdx>) +define(<T3>, <%r8>) +define(<H>, <%r9>) +define(<C1>, <%r10>) +define(<C2>, <%r11>) C ecc_192_modp (const struct ecc_curve *ecc, mp_limb_t *rp) .text ALIGN(4) PROLOGUE(nettle_ecc_192_modp) W64_ENTRY(2, 0) - C First: (B+1)*{r5, r4} < B^3 + B^2 - B - mov 32(RP), T1 - mov 40(RP), T2 - mov T2, T3 - xor T4, T4 - add T1, T2 - adc $0, T3 - adc $0, T4 + mov 16(RP), T2 + mov 24(RP), T3 + mov 40(RP), H + xor C1, C1 + xor C2, C2 - add 8(RP), T1 - adc 16(RP), T2 - adc 24(RP), T3 - adc $0, T4 - C Sum is < 2B^4 + B^3 - B - 1, so {T4, T3} < 3B + add H, T2 + adc H, T3 + C Carry to be added in at T1 and T2 + setc LREG(C2) + + mov 8(RP), T1 + mov 32(RP), H + adc H, T1 + adc H, T2 + C Carry to be added in at T0 and T1 + setc LREG(C1) + + mov (RP), T0 + adc T3, T0 + adc T3, T1 + adc $0, C2 - C Next: (B+1) * {T4, T3} < 3B^2 + 2B - mov T4, T5 - add T3, T4 - adc $0, T5 + C Add in C1 and C2 + add C1, T1 + adc C2, T2 + setc LREG(C1) - xor T6, T6 - add (RP), T3 - adc T4, T1 - adc T5, T2 - adc $0, T6 - - C Fold in final carry. - add T6, T3 - adc T6, T1 + C Fold final carry. + adc $0, T0 + adc C1, T1 adc $0, T2 - mov T3, (RP) + mov T0, (RP) mov T1, 8(RP) mov T2, 16(RP) -- GitLab