diff --git a/ChangeLog b/ChangeLog
index 581d86ca38f71c35b197b35fcbe51feec7462db0..3b2a590ac02d86da85bb031bb5013df3e69450e2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,6 @@
 2013-03-06  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/ecc-256-redc.asm: New file, 2.5 time speedup.
 	* x86_64/ecc-224-modp.asm: New file, 5 time speedup over C
 	version.
 
diff --git a/x86_64/ecc-256-redc.asm b/x86_64/ecc-256-redc.asm
new file mode 100644
index 0000000000000000000000000000000000000000..dc7ea340ab2b2d95952bd0e14984c298feb950fb
--- /dev/null
+++ b/x86_64/ecc-256-redc.asm
@@ -0,0 +1,116 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "ecc-256-redc.asm"
+
+define(<RP>, <%rsi>)
+define(<U0>, <%rdi>)	C Overlaps unused ecc input
+define(<U1>, <%rcx>)
+define(<U2>, <%rax>)
+define(<U3>, <%rdx>)
+define(<U4>, <%r8>)
+define(<U5>, <%r9>)
+define(<U6>, <%r10>)
+define(<F0>, <%r11>)
+define(<F1>, <%r12>)
+define(<F2>, <%rbx>)
+define(<F3>, <%rbp>)
+
+C FOLD(x), sets (F3,F2,F1,F0) <-- (x << 160) - (x << 128) - (x << 32)
+define(<FOLD>, <
+	mov	$1, F2
+	mov	$1, F3
+	shl	<$>32, F2
+	shr	<$>32, F3
+	xor	F0,F0
+	xor	F1,F1
+	sub	F2, F0
+	sbb	F3, F1
+	sbb	$1, F2
+	sbb	<$>0, F3
+>)
+PROLOGUE(nettle_ecc_256_redc)
+	W64_ENTRY(2, 0)
+	C save all registers that need to be saved
+	push	%rbx
+	push	%rbp
+	push	%r12
+
+	mov	(RP), U0
+	FOLD(U0)
+	mov	8(RP), U1
+	mov	16(RP), U2
+	mov	24(RP), U3
+	sub	F0, U1
+	sbb	F1, U2
+	sbb	F2, U3
+	sbb	F3, U0		C Add in later
+
+	FOLD(U1)
+	mov	32(RP), U4
+	sub	F0, U2
+	sbb	F1, U3
+	sbb	F2, U4
+	sbb	F3, U1
+
+	FOLD(U2)
+	mov	40(RP), U5
+	sub	F0, U3
+	sbb	F1, U4
+	sbb	F2, U5
+	sbb	F3, U2
+
+	FOLD(U3)
+	mov	48(RP), U6
+	sub	F0, U4
+	sbb	F1, U5
+	sbb	F2, U6
+	sbb	F3, U3
+
+	add	U4, U0
+	adc	U5, U1
+	adc	U6, U2
+	adc	56(RP), U3
+
+	C If carry, we need to add in
+	C 2^256 - p = <0xfffffffe, 0xff..ff, 0xffffffff00000000, 1>
+	sbb	F2, F2
+	mov	F2, F0
+	mov	F2, F1
+	mov	XREG(F2), XREG(F3)
+	neg	F0
+	shl	$32, F1
+	and	$-2, XREG(F3)
+
+	add	F0, U0
+	mov	U0, (RP)
+	adc	F1, U1
+	mov	U1, 8(RP)
+	adc	F2, U2
+	mov	U2, 16(RP)
+	adc	F3, U3
+
+	mov	U3, 24(RP)
+
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	W64_EXIT(2, 0)
+	ret
+EPILOGUE(nettle_ecc_256_redc)
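
Background note on the reduction (not part of the patch): nettle_ecc_256_redc is Montgomery REDC modulo the P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1. Because p = -1 (mod 2^64), we have -p^-1 mod 2^64 = 1, so the Montgomery quotient for each step is simply the current low word; and because the remaining terms of p sit on 32-bit boundaries, the multiplication by p can be replaced by shifts and subtractions. FOLD(x) computes x*(2^160 - 2^128 - 2^32), and subtracting that value one word up, while moving x from word 0 to word 4, changes the number by exactly x*p. The C sketch below shows the same word-by-word REDC written with ordinary multiply-and-add and a full final reduction; it is an illustrative model only (the function name is made up, not nettle's ecc interface), and it assumes a compiler providing the unsigned __int128 extension (gcc/clang).

/* Illustrative sketch of REDC mod the P-256 prime; not nettle code. */
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, least significant word first. */
static const uint64_t p256[4] = {
  0xffffffffffffffffULL,
  0x00000000ffffffffULL,
  0x0000000000000000ULL,
  0xffffffff00000001ULL
};

/* xp[0..7]: 512-bit input x < p * 2^256, least significant word first.
   Stores x * 2^-256 mod p, fully reduced, in rp[0..3]. */
static void
ecc_256_redc_sketch (uint64_t rp[4], const uint64_t xp[8])
{
  uint64_t u[9], d[4], carry, borrow;
  unsigned i, j;
  int ge_p;

  for (i = 0; i < 8; i++)
    u[i] = xp[i];
  u[8] = 0;			/* room for carries out of 2^512 */

  for (i = 0; i < 4; i++)
    {
      /* p = -1 (mod 2^64), so the Montgomery quotient digit is just
	 the current low word. */
      uint64_t q = u[i];

      /* u += q * p, shifted left by i words; this zeroes u[i]. */
      carry = 0;
      for (j = 0; j < 4; j++)
	{
	  u128 t = (u128) q * p256[j] + u[i + j] + carry;
	  u[i + j] = (uint64_t) t;
	  carry = (uint64_t) (t >> 64);
	}
      for (j = i + 4; j < 9; j++)
	{
	  u128 t = (u128) u[j] + carry;
	  u[j] = (uint64_t) t;
	  carry = (uint64_t) (t >> 64);
	}
    }

  /* u[0..3] are now zero and u[4..8] holds a value < 2p; one
     conditional subtraction of p makes the result canonical. */
  borrow = 0;
  for (j = 0; j < 4; j++)
    {
      u128 t = (u128) u[4 + j] - p256[j] - borrow;
      d[j] = (uint64_t) t;
      borrow = (uint64_t) ((t >> 64) != 0);
    }
  ge_p = (u[8] != 0) || (borrow == 0);
  for (j = 0; j < 4; j++)
    rp[j] = ge_p ? d[j] : u[4 + j];
}

int
main (void)
{
  uint64_t x[8] = {0}, r[4];
  unsigned i;

  /* redc(2^256) = 1 (mod p) */
  x[4] = 1;
  ecc_256_redc_sketch (r, x);
  printf ("redc(2^256) = %llx %llx %llx %llx (expect 1 0 0 0)\n",
	  (unsigned long long) r[0], (unsigned long long) r[1],
	  (unsigned long long) r[2], (unsigned long long) r[3]);

  /* redc(p) = 0 (mod p) */
  for (i = 0; i < 8; i++)
    x[i] = i < 4 ? p256[i] : 0;
  ecc_256_redc_sketch (r, x);
  printf ("redc(p)     = %llx %llx %llx %llx (expect 0 0 0 0)\n",
	  (unsigned long long) r[0], (unsigned long long) r[1],
	  (unsigned long long) r[2], (unsigned long long) r[3]);
  return 0;
}

The sketch finishes with a complete conditional subtraction, whereas the assembly only folds a final carry back in (adding 2^256 - p when the last addition overflows), so the assembly's result need not be fully reduced.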