Review notes. Everything ahead of the first "diff --git" header is commentary,
not part of any hunk.

Purpose of the patch: the x86_64 helper _nettle_gcm_gf_mul_8 (one table
multiplication of x) is replaced by _nettle_gcm_hash8 (key, x, length, data).
The new routine xors the data into x sixteen bytes at a time, doing one table
multiplication per block; a trailing partial block is read byte by byte, so the
missing bytes count as zero.  gcm.c maps gcm_hash to the assembly routine when
HAVE_NATIVE_gcm_hash8 is set and compiles its C gcm_hash only if no gcm_hash
macro is defined.

Changed relative to the submitted patch: in x86_64/gcm-hash8.asm the return
path now uses W64_EXIT(4, 0), the same arguments as the W64_ENTRY(4, 0) at the
top of the routine.  The submitted text had W64_EXIT(2, 0), which looks like a
leftover from the old two-argument routine.  NOTE(review): the macro
definitions are not part of this patch, so confirm against them; the change is
made for entry/exit consistency.

Side effects of editing the patch text: the new-blob id on the "index" line of
x86_64/gcm-hash8.asm no longer describes the patched content, and indentation
inside the hunks was re-created by hand, so it may not match the target tree
byte for byte.

diff --git a/ChangeLog b/ChangeLog
index 78ffcbb7e0b06bd2dc7a4bae999dbe8969f9e217..46bb9b56aa10ce6e15bedc34c2712bce9e8907be 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2013-09-22  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/gcm-hash8.asm: New file.
+	* x86_64/gcm-gf-mul-8.asm: Deleted.
+
+	* configure.ac (asm_nettle_optional_list): Look for gcm-hash8.asm,
+	not gcm-gf-mul-8.asm.
+	* gcm.c [HAVE_NATIVE_gcm_hash8]: Make use of (optional) assembly
+	implementation.
+
 2013-09-21  Niels Möller  <nisse@lysator.liu.se>
 
 	* Makefile.in (des.po): Add same dependencies as for des.o.
diff --git a/configure.ac b/configure.ac
index 72da6d78de566f0c8d00d3cb0ecb06345543f4e5..037fbe97278767962ae71cf17c4f9e726fac5860 100644
--- a/configure.ac
+++ b/configure.ac
@@ -266,7 +266,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-gf-mul-8.asm"
+asm_nettle_optional_list="gcm-hash8.asm"
 asm_hogweed_optional_list=""
 if test "x$enable_public_key" = "xyes" ; then
   asm_hogweed_optional_list="ecc-192-modp.asm ecc-224-modp.asm \
@@ -347,7 +347,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_384_redc
 #undef HAVE_NATIVE_ecc_521_modp
 #undef HAVE_NATIVE_ecc_521_redc
-#undef HAVE_NATIVE_gcm_gf_mul_8])
+#undef HAVE_NATIVE_gcm_hash8])
 
 # Besides getting correct dependencies, the explicit rules also tell
 # make that the .s files "ought to exist", so they are preferred over
diff --git a/gcm.c b/gcm.c
index 7b387bacfac88826a8038fc05c2437fb4b1b244e..c9ea30bc1a696849b33daa8dd1c4782854afe4e0 100644
--- a/gcm.c
+++ b/gcm.c
@@ -214,12 +214,13 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
-#  if HAVE_NATIVE_gcm_gf_mul_8
+#  if HAVE_NATIVE_gcm_hash8
 
-#define gcm_gf_mul _nettle_gcm_gf_mul_8
+#define gcm_hash _nettle_gcm_hash8
 void
-gcm_gf_mul (union gcm_block *x, const union gcm_block *y);
-#  else /* !HAVE_NATIVE_gcm_gf_mul_8 */
+_nettle_gcm_hash8 (const struct gcm_key *key, union gcm_block *x,
+                   size_t length, const uint8_t *data);
+#  else /* !HAVE_NATIVE_gcm_hash8 */
 static const uint16_t
 shift_table[0x100] = {
   W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
@@ -310,7 +311,7 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
   gcm_gf_shift_8(&Z);
   gcm_gf_add(x, &Z, &table[x->b[0]]);
 }
-#  endif /* ! HAVE_NATIVE_gcm_gf_mul_8 */
+#  endif /* ! HAVE_NATIVE_gcm_hash8 */
 # else /* GCM_TABLE_BITS != 8 */
 #  error Unsupported table size.
 # endif /* GCM_TABLE_BITS != 8 */
@@ -353,6 +354,7 @@ gcm_set_key(struct gcm_key *key,
 #endif
 }
 
+#ifndef gcm_hash
 static void
 gcm_hash(const struct gcm_key *key, union gcm_block *x,
          size_t length, const uint8_t *data)
@@ -369,6 +371,7 @@ gcm_hash(const struct gcm_key *key, union gcm_block *x,
       gcm_gf_mul (x, key->h);
     }
 }
+#endif /* !gcm_hash */
 
 static void
 gcm_hash_sizes(const struct gcm_key *key, union gcm_block *x,
diff --git a/x86_64/gcm-gf-mul-8.asm b/x86_64/gcm-hash8.asm
similarity index 66%
rename from x86_64/gcm-gf-mul-8.asm
rename to x86_64/gcm-hash8.asm
index 2296ba8738bac4840ad6b28729b0f5a0fd091e85..3a3f012eafa15fe0206d244f4e310ea77c57815e 100644
--- a/x86_64/gcm-gf-mul-8.asm
+++ b/x86_64/gcm-hash8.asm
@@ -1,17 +1,17 @@
 C nettle, low-level cryptographics library
-C 
+C
 C Copyright (C) 2013, Niels Möller
-C 
+C
 C The nettle library is free software; you can redistribute it and/or modify
 C it under the terms of the GNU Lesser General Public License as published by
 C the Free Software Foundation; either version 2.1 of the License, or (at your
 C option) any later version.
-C 
+C
 C The nettle library is distributed in the hope that it will be useful, but
 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
 C License for more details.
-C 
+C
 C You should have received a copy of the GNU Lesser General Public License
 C along with the nettle library; see the file COPYING.LIB. If not, write to
 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
@@ -19,47 +19,51 @@ C MA 02111-1301, USA.
 
 C Register usage:
 
-define(<XP>, <%rdi>)
-define(<TABLE>, <%rsi>)
-define(<XW>, <%rax>)
-define(<CNT>, <%ecx>)
-define(<Z0>, <%rdx>)
-define(<Z1>, <%r8>)
-define(<T0>, <%r9>)
-define(<T1>, <%r10>)
-define(<T2>, <%r11>)
-define(<SHIFT_TABLE>, <%rbx>)
-
-C The C code is 12.5 c/byte, slower than sha1 (10.6), while this code runs
-C at 10.2, slightly faster. Benchmarked on a low-end AMD E-350.
-
-	.file "gcm-gf-mul-8.asm"
-
-	C void _gcm_gf_mul_8(union gcm_block *x, const union gcm_block *table)
+define(<KEY>, <%rdi>)
+define(<XP>, <%rsi>)
+define(<LENGTH>, <%rdx>)
+define(<SRC>, <%rcx>)
+define(<X0>, <%rax>)
+define(<X1>, <%rbx>)
+define(<CNT>, <%ebp>)
+define(<T0>, <%r8>)
+define(<T1>, <%r9>)
+define(<T2>, <%r10>)
+define(<Z0>, <%r11>)
+define(<Z1>, <%r12>)
+define(<SHIFT_TABLE>, <%r13>)
+
+	.file "gcm-hash8.asm"
+
+	C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+	C                size_t length, const uint8_t *data)
+
 	.text
 	ALIGN(16)
-PROLOGUE(_nettle_gcm_gf_mul_8)
-	W64_ENTRY(2, 0)
+PROLOGUE(_nettle_gcm_hash8)
+	W64_ENTRY(4, 0)
 	push	%rbx
-	mov	8(XP), XW
-	rol	$8, XW
-	movzbl	LREG(XW), XREG(T0)
-	shl	$4, T0
-	mov	(TABLE, T0), Z0
-	mov	8(TABLE, T0), Z1
+	push	%rbp
+	push	%r12
+	push	%r13
+	sub	$16, LENGTH
 	lea	.Lshift_table(%rip), SHIFT_TABLE
-	movl	$7, CNT
-	call	.Lmul_word
-	mov	(XP), XW
-	movl	$8, CNT
-	call	.Lmul_word
-	mov	Z0, (XP)
-	mov	Z1, 8(XP)
-	W64_EXIT(2, 0)
-	pop	%rbx
-	ret
+	mov	(XP), X0
+	mov	8(XP), X1
+	jc	.Lfinal
+ALIGN(16)
+.Lblock_loop:
+
+	xor	(SRC), X0
+	xor	8(SRC), X1
+
+.Lblock_mul:
+	rol	$8, X1
+	movzbl	LREG(X1), XREG(T1)
+	shl	$4, T1
+	mov	(KEY, T1), Z0
+	mov	8(KEY, T1), Z1
 
-.Lmul_word:
 	C shift Z1, Z0, transforming
 	C +-----------------------+-----------------------+
 	C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00|
@@ -70,25 +74,118 @@ PROLOGUE(_nettle_gcm_gf_mul_8)
 	C +-----------------------+-----------------+-----+
 	C xor                                       |T[15]|
 	C                                           +-----+
+
+	mov	$7, CNT
+
+ALIGN(16)
+.Loop_X1:
 	mov	Z1, T1
+	shr	$56, T1
+	shl	$8, Z1
 	mov	Z0, T0
-	shl	$8, Z1		C Use shld?
 	shl	$8, Z0
-	shr	$56, T1
 	shr	$56, T0
 	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
-	rol	$8, XW
+	xor	T1, Z0
+	rol	$8, X1
+	movzbl	LREG(X1), XREG(T2)
+	shl	$4, T2
+	xor	(KEY, T2), Z0
 	add	T0, Z1
+	xor	8(KEY, T2), Z1
+	decl	CNT
+	jne	.Loop_X1
+
+	mov	$7, CNT
+
+ALIGN(16)
+.Loop_X0:
+	mov	Z1, T1
+	shr	$56, T1
+	shl	$8, Z1
+	mov	Z0, T0
+	shl	$8, Z0
+	shr	$56, T0
+	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
 	xor	T1, Z0
-	movzbl	LREG(XW), XREG(T2)
+	rol	$8, X0
+	movzbl	LREG(X0), XREG(T2)
 	shl	$4, T2
-	xor	(TABLE, T2), Z0
-	xor	8(TABLE, T2), Z1
+	xor	(KEY, T2), Z0
+	add	T0, Z1
+	xor	8(KEY, T2), Z1
 	decl	CNT
-	jne	.Lmul_word
+	jne	.Loop_X0
+
+	mov	Z1, T1
+	shr	$56, T1
+	shl	$8, Z1
+	mov	Z0, T0
+	shl	$8, Z0
+	shr	$56, T0
+	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
+	xor	T1, Z0
+	rol	$8, X0
+	movzbl	LREG(X0), XREG(T2)
+	shl	$4, T2
+	mov	(KEY, T2), X0
+	xor	Z0, X0
+	add	T0, Z1
+	mov	8(KEY, T2), X1
+	xor	Z1, X1
+
+	add	$16, SRC
+	sub	$16, LENGTH
+	jnc	.Lblock_loop
+
+.Lfinal:
+	add	$16, LENGTH
+	jnz	.Lpartial
+
+	mov	X0, (XP)
+	mov	X1, 8(XP)
+
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	W64_EXIT(4, 0)
+	ret
+
+.Lpartial:
+	C Read and xor partial block, then jump back into the loop
+	C with LENGTH == 0.
+
+	cmp	$8, LENGTH
+	jc	.Llt8
+
+	C 8 <= LENGTH < 16
+	xor	(SRC), X0
+	add	$8, SRC
+	sub	$8, LENGTH
+	jz	.Lblock_mul
+	call	.Lread_bytes
+	xor	T0, X1
+	jmp	.Lblock_mul
+
+.Llt8:	C 0 < LENGTH < 8
+	call	.Lread_bytes
+	xor	T0, X0
+	jmp	.Lblock_mul
+
+C Read 0 < LENGTH < 8 bytes at SRC, result in T0
+.Lread_bytes:
+	xor	T0, T0
+	sub	$1, SRC
+ALIGN(16)
+.Lread_loop:
+	shl	$8, T0
+	orb	(SRC, LENGTH), LREG(T0)
+.Lread_next:
+	sub	$1, LENGTH
+	jnz	.Lread_loop
 	ret
-
-EPILOGUE(_nettle_gcm_gf_mul_8)
+EPILOGUE(_nettle_gcm_hash8)
 
 define(<W>, <0x$2$1>)
 	.section .rodata
@@ -126,5 +223,3 @@ define(<W>, <0x$2$1>)
 	.hword W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e)
 	.hword W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae)
 	.hword W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be)
-
-