Commit 16768e1f authored by Niels Möller's avatar Niels Möller

ARM assembly for ecc_384_modp.

parent 3ac426df
2013-03-04 Niels Möller <nisse@lysator.liu.se>
* configure.ac (asm_optional_list): Added ecc-384-modp.asm. Deleted
bogus reference to $asm_search_list.
* ecc-384.c: Check HAVE_NATIVE_ecc_384_modp, and use native
version if available.
* armv7/ecc-384-modp.asm: New file, 3 times speedup over the C version.
2013-03-03 Niels Möller <nisse@lysator.liu.se>
* ecc-256.c: Fixed definition of USE_REDC.
......
C nettle, low-level cryptographics library
C
C Copyright (C) 2013, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
.file "ecc-384-modp.asm"
.arm

C Register aliases
C
C   RP     r1            second argument (rp); moved up and down the limbs
C   T0-T3  r0, r2-r4     the four limbs currently being updated
C   F0-F4  r5-r8, r10    folded high limbs; F4 is the top (carry) word
C   N      r12           loop counter
C   H      lr            signed carry handed from one 4-limb group to the next
C
C r0 holds the first argument (ecc) on entry. It is never read and is
C reused as T0.
define(<RP>, <r1>)
define(<T0>, <r0>)
define(<T1>, <r2>)
define(<T2>, <r3>)
define(<T3>, <r4>)
define(<F0>, <r5>)
define(<F1>, <r6>)
define(<F2>, <r7>)
define(<F3>, <r8>)
define(<F4>, <r10>)
define(<N>, <r12>)
define(<H>, <lr>)

C void ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp)
C
C Folds the 24 32-bit limbs rp[0..23] down to 12 limbs, in place, using
C B^12 = B^4 + B^3 - B + 1 (mod p), with B = 2^32 and
C p = 2^384 - 2^128 - 2^96 + 2^32 - 1.
C
C In:       r0 = ecc (ignored), r1 = rp
C Out:      rp[0..11] rewritten; rp[12..23] are read but never written
C Saved:    r4-r8, r10 and lr are pushed on entry and restored on return
C Scratch:  r0-r3, r12 and the condition flags
C
C The only branch is the loop on the fixed counter N, and every address
C is formed from RP alone, so control flow and memory accesses do not
C depend on limb values.
C
C NOTE(review): nothing below compares the result against p, so the
C output is presumably only a 12-limb value congruent to the input,
C not the canonical residue. Confirm against the callers in ecc-384.c.
.text
.align 2

PROLOGUE(nettle_ecc_384_modp)
	push	{r4,r5,r6,r7,r8,r10,lr}

	add	RP, RP, #80
	ldm	RP, {T0, T1, T2, T3}	C 20-23

	C First get top 4 limbs, which need folding twice, as
	C
	C     T3 T2 T1 T0
	C        T3 T2 T1
	C             -T3
	C ----------------
	C  F4 F3 F2 F1 F0
	C
	C Start with
	C
	C   T3 T1 T0
	C         T1
	C        -T3
	C -----------
	C   F2 F1 F0   Always fits

	adds	F0, T0, T1
	adcs	F1, T1, #0
	adcs	F2, T3, #0
	subs	F0, F0, T3
	sbcs	F1, F1, #0
	sbcs	F2, F2, #0

	C      T3 T2 T2  0
	C         F2 F1 F0
	C  ----------------
	C   F4 F3 F2 F1 F0

	mov	F4, #0
	adds	F1, F1, T2
	adcs	F2, F2, T2
	adcs	F3, T3, #0
	adcs	F4, F4, #0

	C Add in to high part. H is not used yet; it is cleared just
	C before the low part, where carries start being collected.
	sub	RP, RP, #32
	ldm	RP, {T0, T1, T2, T3}	C 12-15
	adds	F0, T0, F0
	adcs	F1, T1, F1
	adcs	F2, T2, F2
	adcs	F3, T3, F3
	adcs	F4, F4, #0		C Do F4 later

	C Add to low part, keeping carry (positive or negative) in H
	C
	C Limbs 0-3 receive F0-F3 (the +1 term), minus F0-F2 shifted up
	C one limb (the -B term), plus F0 at limb 3 (the +B^3 term).
	sub	RP, RP, #48
	ldm	RP, {T0, T1, T2, T3}	C 0-3
	mov	H, #0
	adds	T0, T0, F0
	adcs	T1, T1, F1
	adcs	T2, T2, F2
	adcs	T3, T3, F3
	adc	H, H, #0
	subs	T1, T1, F0
	sbcs	T2, T2, F1
	sbcs	T3, T3, F2
	sbc	H, H, #0
	adds	T3, T3, F0
	adc	H, H, #0

	stm	RP!, {T0,T1,T2,T3}	C 0-3

	C Two passes: limbs 4-7, then limbs 8-11. During a pass F0-F3
	C are reloaded one by one from the limbs 48 bytes above RP, so
	C the first pass ends with F0-F3 = limbs 16-19 and the second
	C with F0-F3 = limbs 20-23.
	mov	N, #2
.Loop:
	ldm	RP, {T0,T1,T2,T3}	C 4-7 (8-11 on the second pass)

	C First, propagate carry
	adds	T0, T0, H
	asr	H, #31			C Sign extend; leaves the flags alone
	adcs	T1, T1, H
	adcs	T2, T2, H
	adcs	T3, T3, H
	adc	H, H, #0

	C +B^4 term
	adds	T0, T0, F0
	adcs	T1, T1, F1
	adcs	T2, T2, F2
	adcs	T3, T3, F3
	adc	H, H, #0

	C +B^3 terms
	ldr	F0, [RP, #+48]		C 16 (20 on the second pass)
	adds	T0, T0, F1
	adcs	T1, T1, F2
	adcs	T2, T2, F3
	adcs	T3, T3, F0
	adc	H, H, #0

	C -B
	ldr	F1, [RP, #+52]		C 17-18 (21-22 on the second pass)
	ldr	F2, [RP, #+56]
	subs	T0, T0, F3
	sbcs	T1, T1, F0
	sbcs	T2, T2, F1
	sbcs	T3, T3, F2
	sbc	H, H, #0		C Flags not needed; next adds restarts the chain

	C +1
	ldr	F3, [RP, #+60]		C 19 (23 on the second pass)
	adds	T0, T0, F0
	adcs	T1, T1, F1
	adcs	T2, T2, F2
	adcs	T3, T3, F3
	adc	H, H, #0
	subs	N, N, #1
	stm	RP!, {T0,T1,T2,T3}	C Does not change the flags set by subs
	bne	.Loop

	C Fold high limbs, we need to add in
	C
	C F4 F4 -F4 F4 H H -H H
	C
	C We always have F4 >= 0, but we can have H < 0.
	C Sign extension gets tricky when F4 = 0 and H < 0.
	sub	RP, RP, #48

	ldm	RP, {T0,T1,T2,T3}	C 0-3

	C     H  H  0 -H  H
	C  ----------------
	C  S  H F3 F2 F1 F0
	C
	C  Define S = H >> 31 (asr), we then have
	C
	C  F0 = H
	C  F1 = S - H
	C  F2 = - [H > 0]
	C  F3 = H - [H > 0]
	C   H = H + S		(new H, the word destined for limb 4)
	C
	C  And we get underflow in S - H iff H > 0

	C				H = 0	H > 0	H = -1
	mov	F0, H		C	0	H	-1
	asr	H, #31
	subs	F1, H, F0	C	0,C=1	-H,C=0	0,C=1
	sbc	F2, F2, F2	C	0	-1	0
	sbc	F3, F0, #0	C	0	H-1	-1

	adds	T0, T0, F0
	adcs	T1, T1, F1
	adcs	T2, T2, F2
	adcs	T3, T3, F3
	adc	H, H, F0	C	0+cy	H+cy	-2+cy

	stm	RP!, {T0,T1,T2,T3}	C 0-3
	ldm	RP, {T0,T1,T2,T3}	C 4-7

	C   F4  0 -F4
	C  ---------
	C   F3 F2  F1

	rsbs	F1, F4, #0
	sbc	F2, F2, F2
	sbc	F3, F4, #0

	C Sign extend H
	adds	F0, F4, H
	asr	H, H, #31
	adcs	F1, F1, H
	adcs	F2, F2, H
	adcs	F3, F3, H
	adcs	F4, F4, H
	adc	H, H, #0

	adds	T0, T0, F0
	adcs	T1, T1, F1
	adcs	T2, T2, F2
	adcs	T3, T3, F3

	C The carry from limb 7 stays in the flags across the store and
	C load, and is consumed by the adcs on limb 8.
	stm	RP!, {T0,T1,T2,T3}	C 4-7
	ldm	RP, {T0,T1,T2,T3}	C 8-11

	adcs	T0, T0, F4
	adcs	T1, T1, H
	adcs	T2, T2, H
	adcs	T3, T3, H
	adc	H, H, #0

	stm	RP, {T0,T1,T2,T3}	C 8-11; no writeback, RP stays at limb 8

	C Final (unlikely) carry
	sub	RP, RP, #32
	ldm	RP, {T0,T1,T2,T3}	C 0-3
	C Fold H into F0-F4
	mov	F0, H
	asr	H, #31
	subs	F1, H, F0
	sbc	F2, F2, F2
	sbc	F3, F0, #0
	add	F4, F0, H

	adds	T0, T0, F0
	adcs	T1, T1, F1
	adcs	T2, T2, F2
	adcs	T3, T3, F3

	C One carry chain runs through all 12 limbs; stm and ldm leave
	C the flags untouched. The carry out of limb 11 is not used.
	stm	RP!, {T0,T1,T2,T3}	C 0-3
	ldm	RP, {T0,T1,T2,T3}	C 4-7
	adcs	T0, T0, F4
	adcs	T1, T1, H
	adcs	T2, T2, H
	adcs	T3, T3, H
	stm	RP!, {T0,T1,T2,T3}	C 4-7
	ldm	RP, {T0,T1,T2,T3}	C 8-11
	adcs	T0, T0, H
	adcs	T1, T1, H
	adcs	T2, T2, H
	adcs	T3, T3, H
	stm	RP!, {T0,T1,T2,T3}	C 8-11
	pop	{r4,r5,r6,r7,r8,r10,pc}
EPILOGUE(nettle_ecc_384_modp)
......@@ -250,7 +250,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
asm_optional_list=""
if test "x$enable_public_key" = "xyes" ; then
asm_optional_list="$asm_search_list ecc-192-modp.asm ecc-224-modp.asm ecc-256-redc.asm"
asm_optional_list="ecc-192-modp.asm ecc-224-modp.asm ecc-256-redc.asm ecc-384-modp.asm"
fi
OPT_ASM_SOURCES=""
......
......@@ -36,13 +36,18 @@
#include "ecc-384.h"
#if HAVE_NATIVE_ecc_384_modp
#define ecc_384_modp nettle_ecc_384_modp
void
ecc_384_modp (const struct ecc_curve *ecc, mp_limb_t *rp);
#elif GMP_NUMB_BITS == 32
/* Use that 2^{384} = 2^{128} + 2^{96} - 2^{32} + 1, and eliminate 256
bits at a time.
We can get carry == 2 in the first iteration, and I think *only* in
the first iteration. */
#if GMP_NUMB_BITS == 32
/* p is 12 limbs, and B^12 - p = B^4 + B^3 - B + 1. We can eliminate
almost 8 at a time. Do only 7, to avoid additional carry
propagation, followed by 5. */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment