diff --git a/ChangeLog b/ChangeLog
index 1bc9b6502fb0016500fd99d8ce84da94564ff405..92c2179c574fc98d203c671819bb189762052530 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-03-07  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/ecc-521-modp.asm: New file. 2.4x speedup.
+
 2013-03-06  Niels Möller  <nisse@lysator.liu.se>
 
 	* x86_64/ecc-384-modp.asm: New file, 3x speedup.
diff --git a/x86_64/ecc-521-modp.asm b/x86_64/ecc-521-modp.asm
new file mode 100644
index 0000000000000000000000000000000000000000..afe3d2aa52cde0fc0a65fd7733c4b0792bebfd52
--- /dev/null
+++ b/x86_64/ecc-521-modp.asm
@@ -0,0 +1,143 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "ecc-521-modp.asm"
+
+define(<RP>, <%rsi>)
+define(<U0>, <%rax>)
+define(<U1>, <%rbx>)
+define(<U2>, <%rcx>)
+define(<U3>, <%rdx>)
+define(<U4>, <%rbp>)
+define(<U5>, <%rdi>)
+define(<U6>, <%r8>)
+define(<U7>, <%r9>)
+define(<U8>, <%r10>)
+define(<U9>, <%r11>)
+define(<T0>, <%r12>)
+define(<T1>, <%r13>)
+
+PROLOGUE(nettle_ecc_521_modp)
+	W64_ENTRY(2, 0)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	C Read the 9 high limbs (limbs 9 through 17), shifted left 55 bits
+	mov	72(RP), U1
+	mov	U1, U0
+	shl	$55, U0
+	shr	$9, U1
+
+	mov	80(RP), U2
+	mov	U2, T0
+	shr	$9, U2
+	shl	$55, T0
+	or	T0, U1
+
+	mov	88(RP), U3
+	mov	U3, T0
+	shr	$9, U3
+	shl	$55, T0
+	or	T0, U2
+
+	mov	96(RP), U4
+	mov	U4, T0
+	shr	$9, U4
+	shl	$55, T0
+	or	T0, U3
+
+	mov	104(RP), U5
+	mov	U5, T0
+	shr	$9, U5
+	shl	$55, T0
+	or	T0, U4
+
+	mov	112(RP), U6
+	mov	U6, T0
+	shr	$9, U6
+	shl	$55, T0
+	or	T0, U5
+
+	mov	120(RP), U7
+	mov	U7, T0
+	shr	$9, U7
+	shl	$55, T0
+	or	T0, U6
+
+	mov	128(RP), U8
+	mov	U8, T0
+	shr	$9, U8
+	shl	$55, T0
+	or	T0, U7
+
+	mov	136(RP), U9
+	mov	U9, T0
+	shr	$9, U9
+	shl	$55, T0
+	or	T0, U8
+
+	add	(RP), U0
+	adc	8(RP), U1
+	adc	16(RP), U2
+	adc	24(RP), U3
+	adc	32(RP), U4
+	adc	40(RP), U5
+	adc	48(RP), U6
+	adc	56(RP), U7
+	adc	64(RP), U8
+	adc	$0, U9
+
+	C Top limbs are <U9, U8>. Keep the low 9 bits of U8, and fold the
+	C top bits (at most 65 bits).
+	mov	U8, T0
+	shr	$9, T0
+	and	$0x1ff, U8
+	mov	U9, T1
+	shl	$55, U9
+	shr	$9, T1
+	or	U9, T0
+
+	add	T0, U0
+	mov	U0, (RP)
+	adc	T1, U1
+	mov	U1, 8(RP)
+	adc	$0, U2
+	mov	U2, 16(RP)
+	adc	$0, U3
+	mov	U3, 24(RP)
+	adc	$0, U4
+	mov	U4, 32(RP)
+	adc	$0, U5
+	mov	U5, 40(RP)
+	adc	$0, U6
+	mov	U6, 48(RP)
+	adc	$0, U7
+	mov	U7, 56(RP)
+	adc	$0, U8
+	mov	U8, 64(RP)
+
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	W64_EXIT(2, 0)
+	ret
+EPILOGUE(nettle_ecc_521_modp)
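
The reduction works in two folds for p = 2^521 - 1. Since 521 = 8*64 + 9, we have
2^576 == 2^55 (mod p), so the first fold adds the high limbs rp[9..17], shifted left
55 bits, onto rp[0..8] (registers U0-U9). The second fold keeps the low 9 bits of U8
(bits 512-520 of the running sum) and, using 2^521 == 1 (mod p), adds everything from
bit 521 upward back in at bit 0.

Below is a minimal C sketch of the same computation, intended only as a cross-check
against the assembly. The name ecc_521_modp_ref and the bare uint64_t *rp interface
are illustrative assumptions (the assembly entry point also takes the curve pointer
as its first, unused argument), and it assumes a compiler providing unsigned __int128.

#include <stdint.h>

/* Reduce the 18-limb value rp[0..17] modulo p = 2^521 - 1, leaving a
   9-limb result in rp[0..8].  As in the assembly, the result may
   still be >= p; canonical reduction is left to the caller.  */
static void
ecc_521_modp_ref (uint64_t *rp)
{
  uint64_t u[10], c0, c1;
  unsigned __int128 t;
  unsigned i;

  /* First fold: 2^576 == 2^55 (mod p), so add the high limbs
     rp[9..17], shifted left 55 bits, to rp[0..8].  */
  u[0] = rp[9] << 55;
  for (i = 1; i < 9; i++)
    u[i] = (rp[8+i] >> 9) | (rp[9+i] << 55);
  u[9] = rp[17] >> 9;

  t = 0;
  for (i = 0; i < 9; i++)
    {
      t += (unsigned __int128) u[i] + rp[i];
      u[i] = (uint64_t) t;
      t >>= 64;
    }
  u[9] += (uint64_t) t;

  /* Second fold: keep the low 9 bits of u[8] (bits 512..520), and
     add the bits at position 521 and up back in at bit 0, since
     2^521 == 1 (mod p).  */
  c0 = (u[8] >> 9) | (u[9] << 55);
  c1 = u[9] >> 9;
  u[8] &= 0x1ff;

  t = (unsigned __int128) u[0] + c0;
  rp[0] = (uint64_t) t;
  t = (t >> 64) + u[1] + c1;
  rp[1] = (uint64_t) t;
  t >>= 64;
  for (i = 2; i < 9; i++)
    {
      t += u[i];
      rp[i] = (uint64_t) t;
      t >>= 64;
    }
}

For a product of two operands below p, the input is below p^2 < 2^1042, so rp[17] is
zero and u[9] ends up at most 1 (just the carry from the first addition chain); the
pair <u[9], u[8]> then holds at most 65 significant bits, which is why the single
second fold followed by one carry-propagation pass suffices.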