From e8ddf9577049a929adc0fbc8b7f418be9f6b8448 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Wed, 13 Mar 2013 10:02:32 +0100 Subject: [PATCH] x86_64 assembly for sha512. --- ChangeLog | 4 + configure.ac | 4 +- x86_64/machine.m4 | 4 + x86_64/sha512-compress.asm | 196 +++++++++++++++++++++++++++++++++++++ 4 files changed, 207 insertions(+), 1 deletion(-) create mode 100644 x86_64/sha512-compress.asm diff --git a/ChangeLog b/ChangeLog index 00837d45..074a9007 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2013-03-13 Niels Möller <nisse@lysator.liu.se> + * configure.ac (asm_replace_list): Added sha512-compress.asm. + * x86_64/machine.m4 (OFFSET64): New macro. + * x86_64/sha512-compress.asm: New file, 20% speedup. + * sha512-compress.c (ROUND): Eliminated a temporary, analogous to sha256 change below. diff --git a/configure.ac b/configure.ac index ecf38090..25695b6a 100644 --- a/configure.ac +++ b/configure.ac @@ -249,7 +249,9 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ md5-compress.asm memxor.asm \ salsa20-crypt.asm salsa20-core-internal.asm \ serpent-encrypt.asm serpent-decrypt.asm \ - sha1-compress.asm sha256-compress.asm machine.m4" + sha1-compress.asm sha256-compress.asm sha512-compress.asm \ + machine.m4" + # Assembler files which generate additional object files if they are used. asm_optional_list="" diff --git a/x86_64/machine.m4 b/x86_64/machine.m4 index b6f353c9..dc23dde8 100644 --- a/x86_64/machine.m4 +++ b/x86_64/machine.m4 @@ -2,6 +2,10 @@ C OFFSET(i) C Expands to 4*i, or to the empty string if i is zero define(<OFFSET>, <ifelse($1,0,,eval(4*$1))>) +C OFFSET64(i) +C Expands to 8*i, or to the empty string if i is zero +define(<OFFSET64>, <ifelse($1,0,,eval(8*$1))>) + dnl LREG(reg) gives the 8-bit register corresponding to the given 64-bit register. 
define(<LREG>,<ifelse( $1, %rax, %al, diff --git a/x86_64/sha512-compress.asm b/x86_64/sha512-compress.asm new file mode 100644 index 00000000..3b0391a6 --- /dev/null +++ b/x86_64/sha512-compress.asm @@ -0,0 +1,196 @@ +C nettle, low-level cryptographics library +C +C Copyright (C) 2013 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +C MA 02111-1301, USA. 
+
+        .file "sha512-compress.asm"
+define(<STATE>, <%rdi>)       C Eight 64-bit state words, read on entry, updated on exit
+define(<INPUT>, <%rsi>)       C Sixteen 64-bit message words, byte-swapped when loaded
+define(<K>, <%rdx>)           C Table of 64-bit round constants, indexed by round number
+define(<SA>, <%rax>)          C SA through SH hold the eight working variables
+define(<SB>, <%rbx>)
+define(<SC>, <%rcx>)
+define(<SD>, <%r8>)
+define(<SE>, <%r9>)
+define(<SF>, <%r10>)
+define(<SG>, <%r11>)
+define(<SH>, <%r12>)
+define(<T0>, <%r13>)          C Scratch
+define(<T1>, <%rdi>)          C Scratch; overlaps STATE, which is parked on the stack
+define(<COUNT>, <%r14>)       C Round counter, indexes both INPUT and K
+define(<W>, <%r15>)           C Message word of the current round
+
+C EXPN(i): message schedule, using the sixteen-word circular buffer w at (%rsp)
+C   W = w[i] += s1(w[(i+14) % 16]) + s0(w[(i+1) % 16]) + w[(i+9) % 16]
+C where s1(x) = x>>6 ^ x<<<3 ^ x<<<45 and s0(x) = x>>7 ^ x<<<56 ^ x<<<63
+define(<EXPN>, <
+        mov     OFFSET64($1)(%rsp), W
+        mov     OFFSET64(eval(($1 + 14) % 16))(%rsp), T0
+        mov     T0, T1
+        shr     <$>6, T0
+        rol     <$>3, T1
+        xor     T1, T0
+        rol     <$>42, T1
+        xor     T1, T0
+        add     T0, W
+        mov     OFFSET64(eval(($1 + 1) % 16))(%rsp), T0
+        mov     T0, T1
+        shr     <$>7, T0
+        rol     <$>56, T1
+        xor     T1, T0
+        rol     <$>7, T1
+        xor     T1, T0
+        add     T0, W
+        add     OFFSET64(eval(($1 + 9) % 16))(%rsp), W
+        mov     W, OFFSET64($1)(%rsp)
+>)
+
+C ROUND(A,B,C,D,E,F,G,H,K)
+C   H += S1(E) + Choice(E,F,G) + K + W
+C   D += H
+C   H += S0(A) + Majority(A,B,C)
+C where K is the round constant at index COUNT + (ninth argument), W is register W,
+C   S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
+C   S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
+C   Choice (E, F, G) = G^(E&(F^G))
+C   Majority (A,B,C) = (A&B) + (C&(A^B))
+
+define(<ROUND>, <
+        mov     $5, T0
+        mov     $5, T1
+        rol     <$>23, T0
+        rol     <$>46, T1
+        xor     T0, T1
+        rol     <$>27, T0
+        xor     T0, T1
+        add     W, $8
+        add     T1, $8
+        mov     $7, T0
+        xor     $6, T0
+        and     $5, T0
+        xor     $7, T0
+        add     OFFSET64($9)(K,COUNT,8), $8
+        add     T0, $8
+        add     $8, $4
+
+        mov     $1, T0
+        mov     $1, T1
+        rol     <$>25, T0
+        rol     <$>30, T1
+        xor     T0, T1
+        rol     <$>11, T0
+        xor     T0, T1
+        add     T1, $8
+        mov     $1, T0
+        mov     $1, T1
+        and     $2, T0
+        xor     $2, T1
+        add     T0, $8
+        and     $3, T1
+        add     T1, $8
+>)
+
+C NOEXPN(i): W = byte-swapped 64-bit input word COUNT + i, also stored in stack slot COUNT + i
+define(<NOEXPN>, <
+        mov     OFFSET64($1)(INPUT, COUNT, 8), W
+        bswap   W
+        mov     W, OFFSET64($1)(%rsp, COUNT, 8)
+>)
+
+        C void _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)
+        C Stack frame: bytes 0..127 sixteen message words, 128..183 saved registers and STATE
+
+        .text
+        ALIGN(4)
+
+PROLOGUE(_nettle_sha512_compress)
+        W64_ENTRY(3, 0)
+
+        sub     $184, %rsp
+        mov     %rbx, 128(%rsp)
+        mov     STATE, 136(%rsp)        C Save state, to free a register
+        mov     %rbp, 144(%rsp)         C Saved, though not otherwise referenced in this file
+        mov     %r12, 152(%rsp)
+        mov     %r13, 160(%rsp)
+        mov     %r14, 168(%rsp)
+        mov     %r15, 176(%rsp)
+
+        mov     (STATE), SA
+        mov     8(STATE), SB
+        mov     16(STATE), SC
+        mov     24(STATE), SD
+        mov     32(STATE), SE
+        mov     40(STATE), SF
+        mov     48(STATE), SG
+        mov     56(STATE), SH
+        xor     COUNT, COUNT
+        ALIGN(4)
+
+.Loop1:                                C Rounds 0..15, two passes of eight, words taken from INPUT
+        NOEXPN(0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,0)
+        NOEXPN(1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG,1)
+        NOEXPN(2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF,2)
+        NOEXPN(3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE,3)
+        NOEXPN(4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD,4)
+        NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
+        NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
+        NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
+        add     $8, COUNT
+        cmp     $16, COUNT
+        jne     .Loop1
+
+.Loop2:                                C Rounds 16..79, sixteen per pass, words from EXPN
+        EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,0)
+        EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG,1)
+        EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF,2)
+        EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE,3)
+        EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD,4)
+        EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
+        EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
+        EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
+        EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,8)
+        EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG,9)
+        EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF,10)
+        EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE,11)
+        EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD,12)
+        EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,13)
+        EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,14)
+        EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,15)
+        add     $16, COUNT
+        cmp     $80, COUNT
+        jne     .Loop2
+
+        mov     136(%rsp), STATE        C Reload the state pointer; T1 was using its register
+        add     SA, (STATE)
+        add     SB, 8(STATE)
+        add     SC, 16(STATE)
+        add     SD, 24(STATE)
+        add     SE, 32(STATE)
+        add     SF, 40(STATE)
+        add     SG, 48(STATE)
+        add     SH, 56(STATE)
+
+        mov     128(%rsp), %rbx
+        mov     144(%rsp), %rbp
+        mov     152(%rsp), %r12
+        mov     160(%rsp), %r13
+        mov     168(%rsp), %r14
+        mov     176(%rsp), %r15
+        add     $184, %rsp
+        W64_EXIT(3, 0)                  C Pairs with W64_ENTRY(3, 0) above (both defined outside this file)
+        ret
+EPILOGUE(_nettle_sha512_compress)
-- 
GitLab