Commit 4611f79c authored by Niels Möller

New x86_64 assembly for gcm hashing.

parent a99c33b4
2013-09-22 Niels Möller <nisse@lysator.liu.se>
* x86_64/gcm-hash8.asm: New file.
* x86_64/gcm-gf-mul-8.asm: Deleted.
* configure.ac (asm_nettle_optional_list): Look for gcm-hash8.asm,
not gcm-gf-mul-8.asm.
* gcm.c [HAVE_NATIVE_gcm_hash8]: Make use of (optional) assembly
implementation.
2013-09-21 Niels Möller <nisse@lysator.liu.se>
* Makefile.in (des.po): Add same dependencies as for des.o.
......
......@@ -266,7 +266,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used.
asm_nettle_optional_list="gcm-gf-mul-8.asm"
asm_nettle_optional_list="gcm-hash8.asm"
asm_hogweed_optional_list=""
if test "x$enable_public_key" = "xyes" ; then
asm_hogweed_optional_list="ecc-192-modp.asm ecc-224-modp.asm \
......@@ -347,7 +347,7 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_ecc_384_redc
#undef HAVE_NATIVE_ecc_521_modp
#undef HAVE_NATIVE_ecc_521_redc
#undef HAVE_NATIVE_gcm_gf_mul_8])
#undef HAVE_NATIVE_gcm_hash8])
# Besides getting correct dependencies, the explicit rules also tell
# make that the .s files "ought to exist", so they are preferred over
......
......@@ -214,12 +214,13 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
memcpy (x->b, Z.b, sizeof(Z));
}
# elif GCM_TABLE_BITS == 8
# if HAVE_NATIVE_gcm_gf_mul_8
# if HAVE_NATIVE_gcm_hash8
#define gcm_gf_mul _nettle_gcm_gf_mul_8
#define gcm_hash _nettle_gcm_hash8
void
gcm_gf_mul (union gcm_block *x, const union gcm_block *y);
# else /* !HAVE_NATIVE_gcm_gf_mul_8 */
_nettle_gcm_hash8 (const struct gcm_key *key, union gcm_block *x,
size_t length, const uint8_t *data);
# else /* !HAVE_NATIVE_gcm_hash8 */
static const uint16_t
shift_table[0x100] = {
W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
......@@ -310,7 +311,7 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
gcm_gf_shift_8(&Z);
gcm_gf_add(x, &Z, &table[x->b[0]]);
}
# endif /* ! HAVE_NATIVE_gcm_gf_mul_8 */
# endif /* ! HAVE_NATIVE_gcm_hash8 */
# else /* GCM_TABLE_BITS != 8 */
# error Unsupported table size.
# endif /* GCM_TABLE_BITS != 8 */
......@@ -353,6 +354,7 @@ gcm_set_key(struct gcm_key *key,
#endif
}
#ifndef gcm_hash
static void
gcm_hash(const struct gcm_key *key, union gcm_block *x,
size_t length, const uint8_t *data)
......@@ -369,6 +371,7 @@ gcm_hash(const struct gcm_key *key, union gcm_block *x,
gcm_gf_mul (x, key->h);
}
}
#endif /* !gcm_hash */
static void
gcm_hash_sizes(const struct gcm_key *key, union gcm_block *x,
......
C nettle, low-level cryptographics library
C
C
C Copyright (C) 2013, Niels Möller
C
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
......@@ -19,47 +19,51 @@ C MA 02111-1301, USA.
C Register usage:
define(<XP>, <%rdi>)
define(<TABLE>, <%rsi>)
define(<XW>, <%rax>)
define(<CNT>, <%ecx>)
define(<Z0>, <%rdx>)
define(<Z1>, <%r8>)
define(<T0>, <%r9>)
define(<T1>, <%r10>)
define(<T2>, <%r11>)
define(<SHIFT_TABLE>, <%rbx>)
C The C code is 12.5 c/byte, slower than sha1 (10.6), while this code runs
C at 10.2, slightly faster. Benchmarked on a low-end AMD E-350.
.file "gcm-gf-mul-8.asm"
C void _gcm_gf_mul_8(union gcm_block *x, const union gcm_block *table)
define(<KEY>, <%rdi>)
define(<XP>, <%rsi>)
define(<LENGTH>, <%rdx>)
define(<SRC>, <%rcx>)
define(<X0>, <%rax>)
define(<X1>, <%rbx>)
define(<CNT>, <%ebp>)
define(<T0>, <%r8>)
define(<T1>, <%r9>)
define(<T2>, <%r10>)
define(<Z0>, <%r11>)
define(<Z1>, <%r12>)
define(<SHIFT_TABLE>, <%r13>)
.file "gcm-hash8.asm"
C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
C size_t length, const uint8_t *data)
.text
ALIGN(16)
PROLOGUE(_nettle_gcm_gf_mul_8)
W64_ENTRY(2, 0)
PROLOGUE(_nettle_gcm_hash8)
W64_ENTRY(4, 0)
push %rbx
mov 8(XP), XW
rol $8, XW
movzbl LREG(XW), XREG(T0)
shl $4, T0
mov (TABLE, T0), Z0
mov 8(TABLE, T0), Z1
push %rbp
push %r12
push %r13
sub $16, LENGTH
lea .Lshift_table(%rip), SHIFT_TABLE
movl $7, CNT
call .Lmul_word
mov (XP), XW
movl $8, CNT
call .Lmul_word
mov Z0, (XP)
mov Z1, 8(XP)
W64_EXIT(2, 0)
pop %rbx
ret
mov (XP), X0
mov 8(XP), X1
jc .Lfinal
ALIGN(16)
.Lblock_loop:
xor (SRC), X0
xor 8(SRC), X1
.Lblock_mul:
rol $8, X1
movzbl LREG(X1), XREG(T1)
shl $4, T1
mov (KEY, T1), Z0
mov 8(KEY, T1), Z1
.Lmul_word:
C shift Z1, Z0, transforming
C +-----------------------+-----------------------+
C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00|
......@@ -70,25 +74,118 @@ PROLOGUE(_nettle_gcm_gf_mul_8)
C +-----------------------+-----------------+-----+
C xor |T[15]|
C +-----+
mov $7, CNT
ALIGN(16)
.Loop_X1:
mov Z1, T1
shr $56, T1
shl $8, Z1
mov Z0, T0
shl $8, Z1 C Use shld?
shl $8, Z0
shr $56, T1
shr $56, T0
movzwl (SHIFT_TABLE, T1, 2), XREG(T1)
rol $8, XW
xor T1, Z0
rol $8, X1
movzbl LREG(X1), XREG(T2)
shl $4, T2
xor (KEY, T2), Z0
add T0, Z1
xor 8(KEY, T2), Z1
decl CNT
jne .Loop_X1
mov $7, CNT
ALIGN(16)
.Loop_X0:
mov Z1, T1
shr $56, T1
shl $8, Z1
mov Z0, T0
shl $8, Z0
shr $56, T0
movzwl (SHIFT_TABLE, T1, 2), XREG(T1)
xor T1, Z0
movzbl LREG(XW), XREG(T2)
rol $8, X0
movzbl LREG(X0), XREG(T2)
shl $4, T2
xor (TABLE, T2), Z0
xor 8(TABLE, T2), Z1
xor (KEY, T2), Z0
add T0, Z1
xor 8(KEY, T2), Z1
decl CNT
jne .Lmul_word
jne .Loop_X0
mov Z1, T1
shr $56, T1
shl $8, Z1
mov Z0, T0
shl $8, Z0
shr $56, T0
movzwl (SHIFT_TABLE, T1, 2), XREG(T1)
xor T1, Z0
rol $8, X0
movzbl LREG(X0), XREG(T2)
shl $4, T2
mov (KEY, T2), X0
xor Z0, X0
add T0, Z1
mov 8(KEY, T2), X1
xor Z1, X1
add $16, SRC
sub $16, LENGTH
jnc .Lblock_loop
.Lfinal:
	C LENGTH is kept biased by -16 while looping; undo the bias to
	C get the number of leftover input bytes (0 to 15).
	add	$16, LENGTH
	jnz	.Lpartial

	C No input left: store the updated state back through XP.
	mov	X0, (XP)
	mov	X1, 8(XP)

	C Restore registers in the reverse order of the pushes at entry.
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	C Same argument count as the W64_ENTRY(4, 0) at function entry.
	W64_EXIT(4, 0)
	ret
.Lpartial:
	C Fewer than 16 input bytes remain (LENGTH is 1 to 15). Fold them
	C into the state and run one more multiplication; every path
	C below reaches .Lblock_mul with LENGTH == 0.
	cmp	$8, LENGTH
	jb	.Lpartial_low

	C 8 to 15 bytes: the first 8 are xored into X0 as one full word.
	xor	(SRC), X0
	add	$8, SRC
	sub	$8, LENGTH
	jz	.Lblock_mul		C exactly 8 bytes, X1 is left as is
	C 1 to 7 bytes left over: gather them in T0 and fold into X1.
	call	.Lread_bytes
	xor	T0, X1
	jmp	.Lblock_mul

.Lpartial_low:
	C 1 to 7 bytes in total: gather them in T0 and fold into X0.
	call	.Lread_bytes
	xor	T0, X0
	jmp	.Lblock_mul
C Helper: gather LENGTH bytes (0 < LENGTH < 8) starting at SRC into T0,
C with the byte at the lowest address in the least significant position.
C On return LENGTH is zero and SRC has been decremented by one.
.Lread_bytes:
	xor	XREG(T0), XREG(T0)	C 32-bit xor clears all 64 bits
	sub	$1, SRC			C now (SRC, LENGTH) is the last byte
	ALIGN(16)
.Lread_byte:
	C Make room for one more byte, then read from the highest
	C remaining address so that earlier bytes end up lowest.
	shl	$8, T0
	orb	(SRC, LENGTH), LREG(T0)
.Lread_next:
	sub	$1, LENGTH
	jnz	.Lread_byte
	ret
EPILOGUE(_nettle_gcm_gf_mul_8)
EPILOGUE(_nettle_gcm_hash8)
define(<W>, <0x$2$1>)
.section .rodata
......@@ -126,5 +223,3 @@ define(<W>, <0x$2$1>)
.hword W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e)
.hword W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae)
.hword W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment