diff --git a/ChangeLog b/ChangeLog index 82de3a48172cb5e7668f6e2eaa43a30e408cb31f..c8a1fd84437045f82137d0ccb57e23d031a9aacb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +2020-11-26 Niels Möller <nisse@lysator.liu.se> + + Enable powerpc64 gcm code in fat builds. Based on patch + contributed by Mamone Tarsha: + * powerpc64/fat/gcm-hash.asm: New file. + * configure.ac: Add HAVE_NATIVE_fat_gcm_init_key and + HAVE_NATIVE_fat_gcm_hash. + * gcm.c (gcm_init_key): Renamed, to ... + (_nettle_gcm_init_key_c): ... new name. Add fat setup conditionals. + (gcm_hash): Renamed, to... + (_nettle_gcm_hash_c): ... new name. Add fat setup conditionals. + * fat-setup.h (gcm_init_key_func, gcm_hash_func): New typedefs. + * fat-ppc.c: Select implementations of _nettle_gcm_init_key and _nettle_gcm_hash. + * gcm-internal.h: New file. + * Makefile.in (DISTFILES): Add gcm-internal.h. + 2020-11-28 Niels Möller <nisse@lysator.liu.se> * powerpc64/p7/chacha-2core.asm: Simplify counter carry handling diff --git a/Makefile.in b/Makefile.in index d955774d76ebebb09191f1239d630793bbfb928f..c4df14e158b7581dfb7cf0434069c10588fc33d8 100644 --- a/Makefile.in +++ b/Makefile.in @@ -259,8 +259,8 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \ nettle.pc.in hogweed.pc.in \ desdata.stamp $(des_headers) descore.README \ aes-internal.h block-internal.h blowfish-internal.h camellia-internal.h \ - gost28147-internal.h poly1305-internal.h serpent-internal.h \ - cast128_sboxes.h desinfo.h desCode.h \ + gcm-internal.h gost28147-internal.h poly1305-internal.h \ + serpent-internal.h cast128_sboxes.h desinfo.h desCode.h \ ripemd160-internal.h sha2-internal.h \ memxor-internal.h nettle-internal.h nettle-write.h \ ctr-internal.h chacha-internal.h sha3-internal.h \ diff --git a/configure.ac b/configure.ac index 09c73a53c35de89e9b9617bc019b218f9caa5436..6fafaa776bab2993c985f3647b08d6fde71cea94 100644 --- a/configure.ac +++ b/configure.ac @@ -497,7 +497,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4" # Assembler files which generate additional object files if they are used. 
-asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-2core.asm chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -623,9 +623,11 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_fat_gcm_init_key +#undef HAVE_NATIVE_gcm_hash +#undef HAVE_NATIVE_fat_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/fat-ppc.c b/fat-ppc.c index 2839747330e3e57e61166c17bc59e4ca13850d80..8d4a703d3a40763c4fcf1350adac62874b2ca8e5 100644 --- a/fat-ppc.c +++ b/fat-ppc.c @@ -62,6 +62,7 @@ #include "aes-internal.h" #include "chacha-internal.h" #include "gcm.h" +#include "gcm-internal.h" #include "fat-setup.h" /* Defines from arch/powerpc/include/uapi/asm/cputable.h in Linux kernel */ @@ -110,7 +111,7 @@ get_ppc_features (struct ppc_features *features) { #if defined(_AIX) features->have_crypto_ext - = _system_configuration.implementation >= 0x10000u; + = _system_configuration.implementation >= 0x10000u; features->have_altivec = _system_configuration.vmx_version > 1; #else unsigned long hwcap = 0; @@ -149,6 +150,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64) +#if GCM_TABLE_BITS == 8 +DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64) + +DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64) +#endif /* GCM_TABLE_BITS == 8 */ + DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c); DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, altivec); @@ -180,11 +191,23 @@ fat_init (void) fprintf (stderr, "libnettle: enabling arch 2.07 code.\n"); _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64; +#if GCM_TABLE_BITS == 8 + /* Make sure _nettle_gcm_init_key_vec function is compatible + with _nettle_gcm_hash_vec function e.g. 
_nettle_gcm_init_key_c() + fills gcm_key table with values that are incompatible with + _nettle_gcm_hash_ppc64() */ + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64; + _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64; +#endif /* GCM_TABLE_BITS == 8 */ } else { _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c; _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c; +#if GCM_TABLE_BITS == 8 + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c; + _nettle_gcm_hash_vec = _nettle_gcm_hash_c; +#endif /* GCM_TABLE_BITS == 8 */ } if (features.have_altivec) { @@ -216,6 +239,17 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void, const uint8_t *src), (rounds, keys, T, length, dst, src)) +#if GCM_TABLE_BITS == 8 +DEFINE_FAT_FUNC(_nettle_gcm_init_key, void, + (union nettle_block16 *table), + (table)) + +DEFINE_FAT_FUNC(_nettle_gcm_hash, void, + (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data), + (key, x, length, data)) +#endif /* GCM_TABLE_BITS == 8 */ + DEFINE_FAT_FUNC(_nettle_chacha_core, void, (uint32_t *dst, const uint32_t *src, unsigned rounds), (dst, src, rounds)) diff --git a/fat-setup.h b/fat-setup.h index 99f1ea678abdcd092096648d713d51cf4b7edda7..10177390cf02aae85decd82da56cf1faeb261b99 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -162,6 +162,11 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys, size_t length, uint8_t *dst, const uint8_t *src); +typedef void gcm_init_key_func (union nettle_block16 *table); + +typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); + typedef void *(memxor_func)(void *dst, const void *src, size_t n); typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds); diff --git a/gcm-internal.h b/gcm-internal.h new file mode 100644 index 0000000000000000000000000000000000000000..2e28be2def769b1024901df9d968c4351d5f5daa --- /dev/null +++ b/gcm-internal.h @@ -0,0 +1,54 @@ +/* gcm-internal.h + + Copyright (C) 2020 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+*/ + +#ifndef NETTLE_GCM_INTERNAL_H_INCLUDED +#define NETTLE_GCM_INTERNAL_H_INCLUDED + +/* Functions available only in some configurations */ +void +_nettle_gcm_init_key (union nettle_block16 *table); + +void +_nettle_gcm_hash(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); + +#if HAVE_NATIVE_fat_gcm_init_key +void +_nettle_gcm_init_key_c (union nettle_block16 *table); +#endif + +#if HAVE_NATIVE_fat_gcm_hash +void +_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data); +#endif + +#endif /* NETTLE_GCM_INTERNAL_H_INCLUDED */ diff --git a/gcm.c b/gcm.c index 48b3e75a5c37866729ecba09f47054fbb5b4d915..d6d68070f39fef89da216523038ce28839b768cf 100644 --- a/gcm.c +++ b/gcm.c @@ -49,13 +49,24 @@ #include "gcm.h" +#include "gcm-internal.h" #include "memxor.h" #include "nettle-internal.h" #include "macros.h" #include "ctr-internal.h" #include "block-internal.h" -#if GCM_TABLE_BITS == 0 +#if GCM_TABLE_BITS != 8 +/* The native implementations (currently ppc64 only) depend on the + GCM_TABLE_BITS == 8 layout */ +#undef HAVE_NATIVE_gcm_hash +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_fat_gcm_hash +#undef HAVE_NATIVE_fat_gcm_init_key +#endif + +#if !HAVE_NATIVE_gcm_hash +# if GCM_TABLE_BITS == 0 /* Sets x <- x * y mod r, using the plain bitwise algorithm from the specification. y may be shorter than a full block, missing bytes are assumed zero. */ @@ -83,15 +94,15 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *y) } memcpy (x->b, Z.b, sizeof(Z)); } -#else /* GCM_TABLE_BITS != 0 */ +# else /* GCM_TABLE_BITS != 0 */ -# if WORDS_BIGENDIAN -# define W(left,right) (0x##left##right) -# else -# define W(left,right) (0x##right##left) -# endif +# if WORDS_BIGENDIAN +# define W(left,right) (0x##left##right) +# else +# define W(left,right) (0x##right##left) +# endif -# if GCM_TABLE_BITS == 4 +# if GCM_TABLE_BITS == 4 static const uint16_t shift_table[0x10] = { W(00,00),W(1c,20),W(38,40),W(24,60),W(70,80),W(6c,a0),W(48,c0),W(54,e0), @@ -110,7 +121,7 @@ gcm_gf_shift_4(union nettle_block16 *x) u64[1] = (u64[1] >> 4) | ((u64[0] & 0xf) << 60); u64[0] = (u64[0] >> 4) ^ (reduce << 48); #else /* ! WORDS_BIGENDIAN */ -#define RSHIFT_WORD_4(x) \ +# define RSHIFT_WORD_4(x) \ ((((x) & UINT64_C(0xf0f0f0f0f0f0f0f0)) >> 4) \ | (((x) & UINT64_C(0x000f0f0f0f0f0f0f)) << 12)) reduce = shift_table[(u64[1] >> 56) & 0xf]; @@ -139,14 +150,14 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) } memcpy (x->b, Z.b, sizeof(Z)); } -# elif GCM_TABLE_BITS == 8 -# if HAVE_NATIVE_gcm_hash8 +# elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_hash8 -#define gcm_hash _nettle_gcm_hash8 +#define _nettle_gcm_hash _nettle_gcm_hash8 void _nettle_gcm_hash8 (const struct gcm_key *key, union nettle_block16 *x, size_t length, const uint8_t *data); -# else /* !HAVE_NATIVE_gcm_hash8 */ +# else /* !HAVE_NATIVE_gcm_hash8 */ static const uint16_t shift_table[0x100] = { W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e), @@ -216,18 +227,46 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) gcm_gf_shift_8(&Z); block16_xor3(x, &Z, &table[x->b[0]]); } -# endif /* ! HAVE_NATIVE_gcm_hash8 */ -# else /* GCM_TABLE_BITS != 8 */ -# error Unsupported table size. -# endif /* GCM_TABLE_BITS != 8 */ +# endif /* ! HAVE_NATIVE_gcm_hash8 */ +# else /* GCM_TABLE_BITS != 8 */ +# error Unsupported table size. 
+# endif /* GCM_TABLE_BITS != 8 */ -#undef W +# undef W +# endif /* GCM_TABLE_BITS != 0 */ +#endif /* !HAVE_NATIVE_gcm_hash */ -#endif /* GCM_TABLE_BITS */ /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4) +#if !HAVE_NATIVE_gcm_init_key +# if !HAVE_NATIVE_fat_gcm_hash +# define _nettle_gcm_init_key _nettle_gcm_init_key_c +static +# endif +void +_nettle_gcm_init_key_c(union nettle_block16 *table) +{ +#if GCM_TABLE_BITS + /* Middle element if GCM_TABLE_BITS > 0, otherwise the first + element */ + unsigned i = (1<<GCM_TABLE_BITS)/2; + + /* Algorithm 3 from the gcm paper. First do powers of two, then do + the rest by adding. */ + while (i /= 2) + block16_mulx_ghash(&table[i], &table[2*i]); + for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) + { + unsigned j; + for (j = 1; j < i; j++) + block16_xor3(&table[i+j], &table[i], &table[j]); + } +#endif +} +#endif /* !HAVE_NATIVE_gcm_init_key */ + /* Initialization of GCM. * @ctx: The context of GCM * @cipher: The context of the underlying block cipher @@ -244,25 +283,18 @@ gcm_set_key(struct gcm_key *key, /* H */ memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b); - -#if GCM_TABLE_BITS - /* Algorithm 3 from the gcm paper. First do powers of two, then do - the rest by adding. */ - while (i /= 2) - block16_mulx_ghash(&key->h[i], &key->h[2*i]); - for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2) - { - unsigned j; - for (j = 1; j < i; j++) - block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]); - } -#endif + + _nettle_gcm_init_key(key->h); } -#ifndef gcm_hash -static void -gcm_hash(const struct gcm_key *key, union nettle_block16 *x, - size_t length, const uint8_t *data) +#if !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8) +# if !HAVE_NATIVE_fat_gcm_hash +# define _nettle_gcm_hash _nettle_gcm_hash_c +static +# endif +void +_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data) { for (; length >= GCM_BLOCK_SIZE; length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) @@ -276,7 +308,7 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x, gcm_gf_mul (x, key->h); } } -#endif /* !gcm_hash */ +#endif /* !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8) */ static void gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, @@ -290,7 +322,7 @@ gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x, WRITE_UINT64 (buffer, auth_size); WRITE_UINT64 (buffer + 8, data_size); - gcm_hash(key, x, GCM_BLOCK_SIZE, buffer); + _nettle_gcm_hash(key, x, GCM_BLOCK_SIZE, buffer); } /* NOTE: The key is needed only if length != GCM_IV_SIZE */ @@ -309,7 +341,7 @@ gcm_set_iv(struct gcm_ctx *ctx, const struct gcm_key *key, else { memset(ctx->iv.b, 0, GCM_BLOCK_SIZE); - gcm_hash(key, &ctx->iv, length, iv); + _nettle_gcm_hash(key, &ctx->iv, length, iv); gcm_hash_sizes(key, &ctx->iv, 0, length); } @@ -328,7 +360,7 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key, assert(ctx->auth_size % GCM_BLOCK_SIZE == 0); assert(ctx->data_size == 0); - gcm_hash(key, &ctx->x, length, data); + _nettle_gcm_hash(key, &ctx->x, length, data); ctx->auth_size += length; } @@ -399,7 +431,7 @@ gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key, assert(ctx->data_size % GCM_BLOCK_SIZE == 0); _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src); - gcm_hash(key, &ctx->x, length, dst); + _nettle_gcm_hash(key, &ctx->x, length, dst); ctx->data_size += length; } @@ -411,7 +443,7 @@ gcm_decrypt(struct gcm_ctx *ctx, const struct 
gcm_key *key, { assert(ctx->data_size % GCM_BLOCK_SIZE == 0); - gcm_hash(key, &ctx->x, length, src); + _nettle_gcm_hash(key, &ctx->x, length, src); _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src); ctx->data_size += length; diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm new file mode 100644 index 0000000000000000000000000000000000000000..57c343d783e3f01359ea1a04fe4509802fee2ed3 --- /dev/null +++ b/powerpc64/fat/gcm-hash.asm @@ -0,0 +1,39 @@ +C powerpc64/fat/gcm-hash.asm + + +ifelse(` + Copyright (C) 2020 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_fat_gcm_init_key) +dnl PROLOGUE(_nettle_fat_gcm_hash) + +define(`fat_transform', `$1_ppc64') +include_src(`powerpc64/p8/gcm-hash.asm') diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 0000000000000000000000000000000000000000..ad0ff6b38f84fa899bfe754fa055fcf58caf2ea1 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,499 @@ +C powerpc64/p8/gcm-hash.asm + +ifelse(` + Copyright (C) 2020 Niels Möller and Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C gcm_set_key() assigns H value in the middle element of the table +define(`H_Idx', `128') + +C Register usage: + +define(`SP', `r1') +define(`TOCP', `r2') + +define(`TABLE', `r3') + +define(`ZERO', `v0') +define(`B1', `v1') +define(`EMSB', `v16') +define(`POLY', `v17') +define(`POLY_L', `v1') + +define(`H', `v2') +define(`H2', `v3') +define(`H3', `v4') +define(`H4', `v5') +define(`H1M', `v6') +define(`H1L', `v7') +define(`H2M', `v8') +define(`H2L', `v9') +define(`Hl', `v10') +define(`Hm', `v11') +define(`Hp', `v12') +define(`Hl2', `v13') +define(`Hm2', `v14') +define(`Hp2', `v15') +define(`R', `v13') +define(`F', `v14') +define(`T', `v15') +define(`R2', `v16') +define(`F2', `v17') +define(`T2', `v18') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + +.file "gcm-hash.asm" + +.text + + C void gcm_init_key (union gcm_block *table) + +C This function populates the gcm table as the following layout +C ******************************************************************************* +C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) | +C | | +C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) | +C | | +C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) | +C | | +C | H4M = (H3 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H4L = (H3 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) | +C ******************************************************************************* + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key) + DATA_LOAD_VEC(POLY,.polynomial,r7) C 0xC2000000000000000000000000000001 +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 C 0x000102030405060708090A0B0C0D0E0F + vspltisb LE_TEMP,0x07 C 0x07070707070707070707070707070707 + vxor LE_MASK,LE_MASK,LE_TEMP C 0x07060504030201000F0E0D0C0B0A0908 +') + + C 'H' is assigned by gcm_set_key() to the middle element of the table + li r10,H_Idx*16 + lxvd2x VSR(H),r10,TABLE C load 'H' + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm H,H,H,LE_MASK +') + + C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) --- + + vupkhsb EMSB,H C extend most significant bit to first byte + vspltisb B1,1 C 0x01010101010101010101010101010101 + vspltb EMSB,EMSB,0 C first byte quadword-extend + vsl H,H,B1 C H = H << 1 + vand EMSB,EMSB,POLY C EMSB &= 0xC2000000000000000000000000000001 + vxor ZERO,ZERO,ZERO C 0x00000000000000000000000000000000 + vxor H,H,EMSB C H ^= EMSB + + C --- calculate H^2 = H*H --- + + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000 + + C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) --- + C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 --- + C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) --- + vpmsumd Hp,H,POLY_L C Hp = (H mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷) + xxswapd VSR(Hm),VSR(H) + xxmrgld VSR(Hl),VSR(H),VSR(ZERO) C Hl = (H mod x⁶⁴) × x⁶⁴ + vxor Hm,Hm,Hp C Hm = Hm + Hp + vxor Hl,Hl,Hp C Hl = Hl + Hp + xxmrgld VSR(H1L),VSR(H),VSR(Hm) C H1L = (H mod x⁶⁴)||(Hl mod x⁶⁴) + xxmrghd VSR(H1M),VSR(H),VSR(Hl) C H1M = (H div x⁶⁴)||(Hl div x⁶⁴) + + vpmsumd F,H1L,H C F = (H1Lh × Hh) + (H1Ll × Hl) + vpmsumd R,H1M,H C R = (H1Mh × Hh) + (H1Ml × Hl) + + C --- rduction --- + vpmsumd T,F,POLY_L C T = (F mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷) + xxswapd VSR(H2),VSR(F) + vxor 
R,R,T C R = R + T + vxor H2,R,H2 + + xxmrgld VSR(Hl),VSR(H2),VSR(ZERO) + xxswapd VSR(Hm),VSR(H2) + vpmsumd Hp,H2,POLY_L + vxor Hl,Hl,Hp + vxor Hm,Hm,Hp + xxmrghd VSR(H2M),VSR(H2),VSR(Hl) + xxmrgld VSR(H2L),VSR(H2),VSR(Hm) + + C store H1M, H1L, H2M, H2L + li r8,1*16 + li r9,2*16 + li r10,3*16 + stxvd2x VSR(H1M),0,TABLE + stxvd2x VSR(H1L),r8,TABLE + stxvd2x VSR(H2M),r9,TABLE + stxvd2x VSR(H2L),r10,TABLE + + C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 --- + + vpmsumd F,H1L,H2 + vpmsumd F2,H2L,H2 + vpmsumd R,H1M,H2 + vpmsumd R2,H2M,H2 + + vpmsumd T,F,POLY_L + vpmsumd T2,F2,POLY_L + xxswapd VSR(H3),VSR(F) + xxswapd VSR(H4),VSR(F2) + vxor R,R,T + vxor R2,R2,T2 + vxor H3,R,H3 + vxor H4,R2,H4 + + xxmrgld VSR(Hl),VSR(H3),VSR(ZERO) + xxmrgld VSR(Hl2),VSR(H4),VSR(ZERO) + xxswapd VSR(Hm),VSR(H3) + xxswapd VSR(Hm2),VSR(H4) + vpmsumd Hp,H3,POLY_L + vpmsumd Hp2,H4,POLY_L + vxor Hl,Hl,Hp + vxor Hl2,Hl2,Hp2 + vxor Hm,Hm,Hp + vxor Hm2,Hm2,Hp2 + xxmrghd VSR(H1M),VSR(H3),VSR(Hl) + xxmrghd VSR(H2M),VSR(H4),VSR(Hl2) + xxmrgld VSR(H1L),VSR(H3),VSR(Hm) + xxmrgld VSR(H2L),VSR(H4),VSR(Hm2) + + C store H3M, H3L, H4M, H4L + li r7,4*16 + li r8,5*16 + li r9,6*16 + li r10,7*16 + stxvd2x VSR(H1M),r7,TABLE + stxvd2x VSR(H1L),r8,TABLE + stxvd2x VSR(H2M),r9,TABLE + stxvd2x VSR(H2L),r10,TABLE + + blr +EPILOGUE(_nettle_gcm_init_key) + +define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6') + +define(`ZERO', `v16') +define(`POLY', `v17') +define(`POLY_L', `v0') + +define(`D', `v1') +define(`C0', `v2') +define(`C1', `v3') +define(`C2', `v4') +define(`C3', `v5') +define(`H1M', `v6') +define(`H1L', `v7') +define(`H2M', `v8') +define(`H2L', `v9') +define(`H3M', `v10') +define(`H3L', `v11') +define(`H4M', `v12') +define(`H4L', `v13') +define(`R', `v14') +define(`F', `v15') +define(`R2', `v16') +define(`F2', `v17') +define(`T', `v18') +define(`R3', `v20') +define(`F3', `v21') +define(`R4', `v22') +define(`F4', `v23') + +define(`LE_TEMP', `v18') +define(`LE_MASK', `v19') + + C void gcm_hash (const struct gcm_key *key, union gcm_block *x, + C size_t length, const uint8_t *data) + +define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash) + vxor ZERO,ZERO,ZERO + DATA_LOAD_VEC(POLY,.polynomial,r7) +IF_LE(` + li r8,0 + lvsl LE_MASK,0,r8 + vspltisb LE_TEMP,0x07 + vxor LE_MASK,LE_MASK,LE_TEMP +') + xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) + + lxvd2x VSR(D),0,X C load 'X' pointer + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + + C --- process 4 blocks '128-bit each' per one loop --- + + srdi. 
r7,LENGTH,6 C 4-blocks loop count 'LENGTH / (4 * 16)' + beq L2x + + mtctr r7 C assign counter register to loop count + + C store non-volatile vector registers + addi r8,SP,-64 + stvx v20,0,r8 + addi r8,r8,16 + stvx v21,0,r8 + addi r8,r8,16 + stvx v22,0,r8 + addi r8,r8,16 + stvx v23,0,r8 + + C load table elements + li r8,1*16 + li r9,2*16 + li r10,3*16 + lxvd2x VSR(H1M),0,TABLE + lxvd2x VSR(H1L),r8,TABLE + lxvd2x VSR(H2M),r9,TABLE + lxvd2x VSR(H2L),r10,TABLE + li r7,4*16 + li r8,5*16 + li r9,6*16 + li r10,7*16 + lxvd2x VSR(H3M),r7,TABLE + lxvd2x VSR(H3L),r8,TABLE + lxvd2x VSR(H4M),r9,TABLE + lxvd2x VSR(H4L),r10,TABLE + + li r8,0x10 + li r9,0x20 + li r10,0x30 +.align 5 +L4x_loop: + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r8,DATA C load C1 + lxvd2x VSR(C2),r9,DATA C load C2 + lxvd2x VSR(C3),r10,DATA C load C3 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK + vperm C2,C2,C2,LE_MASK + vperm C3,C3,C3,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication + vpmsumd F2,H3L,C1 + vpmsumd R2,H3M,C1 + vpmsumd F3,H2L,C2 + vpmsumd R3,H2M,C2 + vpmsumd F4,H1L,C3 + vpmsumd R4,H1M,C3 + vpmsumd F,H4L,C0 + vpmsumd R,H4M,C0 + + C deferred recombination of partial products + vxor F3,F3,F4 + vxor R3,R3,R4 + vxor F,F,F2 + vxor R,R,R2 + vxor F,F,F3 + vxor R,R,R3 + + C reduction + vpmsumd T,F,POLY_L + xxswapd VSR(D),VSR(F) + vxor R,R,T + vxor D,R,D + + addi DATA,DATA,0x40 + bdnz L4x_loop + + C restore non-volatile vector registers + addi r8,SP,-64 + lvx v20,0,r8 + addi r8,r8,16 + lvx v21,0,r8 + addi r8,r8,16 + lvx v22,0,r8 + addi r8,r8,16 + lvx v23,0,r8 + + clrldi LENGTH,LENGTH,58 C 'set the high-order 58 bits to zeros' +L2x: + C --- process 2 blocks --- + + srdi. r7,LENGTH,5 C 'LENGTH / (2 * 16)' + beq L1x + + C load table elements + li r8,1*16 + li r9,2*16 + li r10,3*16 + lxvd2x VSR(H1M),0,TABLE + lxvd2x VSR(H1L),r8,TABLE + lxvd2x VSR(H2M),r9,TABLE + lxvd2x VSR(H2L),r10,TABLE + + C input loading + li r10,0x10 + lxvd2x VSR(C0),0,DATA C load C0 + lxvd2x VSR(C1),r10,DATA C load C1 + +IF_LE(` + vperm C0,C0,C0,LE_MASK + vperm C1,C1,C1,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication + vpmsumd F2,H1L,C1 + vpmsumd R2,H1M,C1 + vpmsumd F,H2L,C0 + vpmsumd R,H2M,C0 + + C deferred recombination of partial products + vxor F,F,F2 + vxor R,R,R2 + + C reduction + vpmsumd T,F,POLY_L + xxswapd VSR(D),VSR(F) + vxor R,R,T + vxor D,R,D + + addi DATA,DATA,0x20 + clrldi LENGTH,LENGTH,59 C 'set the high-order 59 bits to zeros' +L1x: + C --- process 1 block --- + + srdi. r7,LENGTH,4 C 'LENGTH / (1 * 16)' + beq Lmod + + C load table elements + li r8,1*16 + lxvd2x VSR(H1M),0,TABLE + lxvd2x VSR(H1L),r8,TABLE + + C input loading + lxvd2x VSR(C0),0,DATA C load C0 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication + vpmsumd F,H1L,C0 + vpmsumd R,H1M,C0 + + C reduction + vpmsumd T,F,POLY_L + xxswapd VSR(D),VSR(F) + vxor R,R,T + vxor D,R,D + + addi DATA,DATA,0x10 + clrldi LENGTH,LENGTH,60 C 'set the high-order 60 bits to zeros' +Lmod: + C --- process the modulo bytes, padding the low-order bytes with zeros --- + + cmpldi LENGTH,0 + beq Ldone + + C load table elements + li r8,1*16 + lxvd2x VSR(H1M),0,TABLE + lxvd2x VSR(H1L),r8,TABLE + + C push every modulo byte to the stack and load them with padding into vector register + vxor ZERO,ZERO,ZERO + addi r8,SP,-16 + stvx ZERO,0,r8 +Lstb_loop: + subic. 
LENGTH,LENGTH,1 + lbzx r7,LENGTH,DATA + stbx r7,LENGTH,r8 + bne Lstb_loop + lxvd2x VSR(C0),0,r8 + +IF_LE(` + vperm C0,C0,C0,LE_MASK +') + + C previous digest combining + vxor C0,C0,D + + C polynomial multiplication + vpmsumd F,H1L,C0 + vpmsumd R,H1M,C0 + + C reduction + vpmsumd T,F,POLY_L + xxswapd VSR(D),VSR(F) + vxor R,R,T + vxor D,R,D + +Ldone: + C byte-reverse of each doubleword permuting on little-endian mode +IF_LE(` + vperm D,D,D,LE_MASK +') + stxvd2x VSR(D),0,X C store digest 'D' + + blr +EPILOGUE(_nettle_gcm_hash) + +.data + C 0xC2000000000000000000000000000001 +.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
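
For reference, the fat-call machinery that fat-ppc.c relies on reduces to a function-pointer indirection that is resolved the first time one of the wrapped entry points runs. The sketch below is illustrative only: the gcm_hash_func typedef matches the one added to fat-setup.h above, but the _default trampoline, the hardware probe and the simplified struct gcm_key / union nettle_block16 definitions are stand-ins, not the literal DECLARE_FAT_FUNC/DEFINE_FAT_FUNC expansions (which also handle IFUNC linking).

    /* Illustrative sketch of the fat-call indirection; simplified
       stand-ins, not the real fat-setup.h macro expansions.  */
    #include <stddef.h>
    #include <stdint.h>

    union nettle_block16 { uint8_t b[16]; uint64_t u64[2]; };
    struct gcm_key { union nettle_block16 h[256]; }; /* 1 << GCM_TABLE_BITS, for GCM_TABLE_BITS == 8 */

    typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x,
                                size_t length, const uint8_t *data);

    /* Candidate implementations, assumed to be provided elsewhere
       (gcm.c and powerpc64/p8/gcm-hash.asm in this patch).  */
    gcm_hash_func _nettle_gcm_hash_c;
    gcm_hash_func _nettle_gcm_hash_ppc64;

    static void gcm_hash_default (const struct gcm_key *key, union nettle_block16 *x,
                                  size_t length, const uint8_t *data);

    /* The _vec pointer starts out at a trampoline that runs the probe.  */
    static gcm_hash_func *_nettle_gcm_hash_vec = gcm_hash_default;

    /* Placeholder for the HWCAP2 / _system_configuration probe.  */
    static int have_crypto_ext (void) { return 0; }

    static void
    fat_init (void)
    {
      _nettle_gcm_hash_vec = have_crypto_ext ()
        ? _nettle_gcm_hash_ppc64 : _nettle_gcm_hash_c;
    }

    static void
    gcm_hash_default (const struct gcm_key *key, union nettle_block16 *x,
                      size_t length, const uint8_t *data)
    {
      fat_init ();
      _nettle_gcm_hash_vec (key, x, length, data);
    }

    /* Public entry point: always one indirect call through the pointer.  */
    void
    _nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
                      size_t length, const uint8_t *data)
    {
      _nettle_gcm_hash_vec (key, x, length, data);
    }

The important property, and the reason fat_init() selects _nettle_gcm_init_key_ppc64 together with _nettle_gcm_hash_ppc64 (or the two _c variants together), is that both halves of the pair must agree on the key-table layout; as the comment in fat-ppc.c notes, a table filled by _nettle_gcm_init_key_c cannot be consumed by _nettle_gcm_hash_ppc64.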
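
_nettle_gcm_init_key_c (gcm.c above) builds the table purely from doublings and xors: block16_mulx_ghash computes the product of a table entry by x, and the inner loops fill in the remaining entries by addition (Algorithm 3 of the GCM paper). A byte-wise model of that doubling step, assuming the bit-reflected GHASH convention of NIST SP 800-38D; nettle's real block16_mulx_ghash in block-internal.h operates on 64-bit words instead:

    #include <stdint.h>

    /* Multiply a GHASH element by x in GF(2^128), bit-reflected convention:
       shift the 128-bit string right by one bit and, if a one bit fell off
       the low end, xor the reduction constant R = 0xE1 || 0^120 into the
       leading byte (x^128 = x^7 + x^2 + x + 1).  */
    static void
    ghash_mulx (uint8_t r[16], const uint8_t a[16])
    {
      unsigned carry = a[15] & 1;          /* bit 127, the bit shifted out */
      for (int i = 15; i > 0; i--)
        r[i] = (uint8_t) ((a[i] >> 1) | (a[i-1] << 7));
      r[0] = a[0] >> 1;
      if (carry)
        r[0] ^= 0xE1;                      /* reduction constant 11100001b */
    }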
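
The L4x_loop in powerpc64/p8/gcm-hash.asm evaluates the GHASH recurrence X <- (X xor C) * H four blocks at a time, using the powers H^1..H^4 that _nettle_gcm_init_key stored (split into the H?M/H?L halves) at the start of the table: X <- (X + C0)*H^4 + C1*H^3 + C2*H^2 + C3*H. A rough C model of one such iteration follows; gf128_mul() is a hypothetical helper standing in for a full GF(2^128) multiplication, which the assembly never forms explicitly (it uses vpmsumd on the pre-split halves and defers the reduction):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    union nettle_block16 { uint8_t b[16]; uint64_t u64[2]; };

    /* Hypothetical GF(2^128) GHASH multiplication, x <- x * y.  */
    void gf128_mul (union nettle_block16 *x, const union nettle_block16 *y);

    static void
    block16_xor (union nettle_block16 *dst, const union nettle_block16 *src)
    {
      dst->u64[0] ^= src->u64[0];
      dst->u64[1] ^= src->u64[1];
    }

    /* One iteration of the four-block loop:
       X <- (X + C0)*H^4 + C1*H^3 + C2*H^2 + C3*H  in GF(2^128).  */
    static void
    ghash_update_4 (const union nettle_block16 h_pow[4], /* H^1, H^2, H^3, H^4 */
                    union nettle_block16 *x,
                    const uint8_t data[4 * 16])
    {
      union nettle_block16 acc, t;
      unsigned i;

      memset (acc.b, 0, sizeof (acc.b));
      for (i = 0; i < 4; i++)
        {
          memcpy (t.b, data + 16*i, 16);
          if (i == 0)
            block16_xor (&t, x);          /* fold in the previous digest */
          gf128_mul (&t, &h_pow[3 - i]);  /* C_i * H^(4-i) */
          block16_xor (&acc, &t);
        }
      *x = acc;
    }

The two-block and one-block tails apply the same identity with H^1..H^2 and H^1 respectively, and the final Lmod path zero-pads a partial block on the stack before one last single-block step.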