Commit 752861f7 authored by Niels Möller

Optimized x86_64 assembly for ecc_25519_modp.

parent 3a4e808e
2014-08-22 Niels Möller <nisse@lysator.liu.se>
* x86_64/ecc-25519-modp.asm: New file. Assembly implementation,
initial version yields 30% speedup of ecc_25519_modp. Early
folding eliminates one pass of carry propagation, and yields
almost 20% additional speedup.
* ecc-25519.c [HAVE_NATIVE_ecc_25519_modp]: Use assembly version
if available.
......
......@@ -33,65 +33,62 @@ ifelse(<
.file "ecc-25519-modp.asm"
C nettle_ecc_25519_modp
C
C Reduces an eight-limb (512-bit) value stored at RP modulo
C p = 2^255 - 19, in place. Limbs are 64 bits, least significant
C first: RP[i] is the quadword at byte offset 8*i.
C
C The reduction uses the congruences
C   2^256 = 38 (mod p)
C   2^255 = 19 (mod p)
C so each high limb RP[4+i] is multiplied by 38 and added in at the
C position of RP[i], and any bits at position 255 or above are
C multiplied by 19 and added in at position 0.
C
C In:    RP (%rsi) points to eight limbs. %rdi is the unused ecc
C        argument and is recycled as a scratch register.
C Out:   RP[0..3] hold a value congruent to the input modulo p. The
C        result fits in four limbs but is not necessarily fully
C        reduced to the range 0 .. p-1.
C Clobb: %rax, %rcx, %rdx, %rdi, %r8 - %r11, flags. %rbx is used for
C        the multiplier and is saved and restored here.

define(<RP>, <%rsi>)
define(<U0>, <%rdi>)	C Overlaps unused ecc input
define(<U1>, <%rcx>)
define(<U2>, <%r8>)
define(<U3>, <%r9>)
define(<T0>, <%r10>)
define(<T1>, <%r11>)
define(<M>, <%rbx>)

PROLOGUE(nettle_ecc_25519_modp)
	W64_ENTRY(2, 0)
	push	%rbx		C M lives in a callee-saved register

	C First fold the limbs affecting bit 255
	C T0:U3 = RP[3] + 38 * RP[7]
	mov	56(RP), %rax
	mov	$38, M
	mul	M
	mov	24(RP), U3
	xor	T0, T0
	add	%rax, U3
	adc	%rdx, T0

	mov	40(RP), %rax	C Do this early as possible
	mul	M		C %rdx:%rax = 38 * RP[5], consumed below

	C Shift T0:U3 left one bit, so that T0 collects everything at
	C bit 255 and above. The instructions below touch neither %rax
	C nor %rdx, so the product of RP[5] stays intact.
	add	U3, U3
	adc	T0, T0
	shr	U3		C Undo shift, clear high bit

	C Fold the high limb again, together with RP[5]
	imul	$19, T0		C 2^255 = 19 (mod p)

	mov	(RP), U0
	mov	8(RP), U1
	mov	16(RP), U2
	add	T0, U0
	adc	%rax, U1	C Low half of 38 * RP[5]
	mov	32(RP), %rax	C mov leaves the carry flag untouched
	adc	%rdx, U2	C High half of 38 * RP[5]
	adc	$0, U3		C Bit 63 of U3 is clear, cannot overflow

	C Fold final two limbs, RP[4] and RP[6]
	mul	M
	mov	%rax, T0	C T1:T0 = 38 * RP[4]
	mov	48(RP), %rax
	mov	%rdx, T1
	mul	M		C %rdx:%rax = 38 * RP[6]

	C Single carry chain: add both products and store the result.
	add	T0, U0
	mov	U0, (RP)
	adc	T1, U1
	mov	U1, 8(RP)
	adc	%rax, U2
	mov	U2, 16(RP)
	adc	%rdx, U3
	mov	U3, 24(RP)

	pop	%rbx
	W64_EXIT(2, 0)
	ret
EPILOGUE(nettle_ecc_25519_modp)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment