diff --git a/ChangeLog b/ChangeLog
index 82de3a48172cb5e7668f6e2eaa43a30e408cb31f..c8a1fd84437045f82137d0ccb57e23d031a9aacb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2020-11-26  Niels Möller  <nisse@lysator.liu.se>
+
+	Enable powerpc64 gcm code in fat builds. Based on patch
+	contributed by Mamone Tarsha:
+	* powerpc64/fat/gcm-hash.asm: New file.
+	* configure.ac: Add HAVE_NATIVE_fat_gcm_init_key and
+	HAVE_NATIVE_fat_gcm_hash.
+	* gcm.c (gcm_init_key): Renamed, to ...
+	(_nettle_gcm_init_key_c): ... new name. Add fat setup conditionals.
+	(gcm_hash): Renamed, to...
+	(_nettle_gcm_hash_c): ... new name. Add fat setup conditionals.
+	* fat-setup.h (gcm_init_key_func, gcm_hash_func): New typedefs.
+	* fat-ppc.c: Select implementations of _nettle_gcm_init_key and _nettle_gcm_hash.
+	* gcm-internal.h: New file.
+	* Makefile.in (DISTFILES): Add gcm-internal.h.
+
 2020-11-28  Niels Möller  <nisse@lysator.liu.se>
 
 	* powerpc64/p7/chacha-2core.asm: Simplify counter carry handling
diff --git a/Makefile.in b/Makefile.in
index d955774d76ebebb09191f1239d630793bbfb928f..c4df14e158b7581dfb7cf0434069c10588fc33d8 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -259,8 +259,8 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt.h getopt_int.h \
 	nettle.pc.in hogweed.pc.in \
 	desdata.stamp $(des_headers) descore.README \
 	aes-internal.h block-internal.h blowfish-internal.h camellia-internal.h \
-	gost28147-internal.h poly1305-internal.h serpent-internal.h \
-	cast128_sboxes.h desinfo.h desCode.h \
+	gcm-internal.h gost28147-internal.h poly1305-internal.h \
+	serpent-internal.h cast128_sboxes.h desinfo.h desCode.h \
 	ripemd160-internal.h sha2-internal.h \
 	memxor-internal.h nettle-internal.h nettle-write.h \
 	ctr-internal.h chacha-internal.h sha3-internal.h \
diff --git a/configure.ac b/configure.ac
index 09c73a53c35de89e9b9617bc019b218f9caa5436..6fafaa776bab2993c985f3647b08d6fde71cea94 100644
--- a/configure.ac
+++ b/configure.ac
@@ -497,7 +497,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
+asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
   chacha-2core.asm chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
@@ -623,9 +623,11 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_secp384r1_redc
 #undef HAVE_NATIVE_ecc_secp521r1_modp
 #undef HAVE_NATIVE_ecc_secp521r1_redc
-#undef HAVE_NATIVE_gcm_init_key8
+#undef HAVE_NATIVE_gcm_init_key
+#undef HAVE_NATIVE_fat_gcm_init_key
+#undef HAVE_NATIVE_gcm_hash
+#undef HAVE_NATIVE_fat_gcm_hash
 #undef HAVE_NATIVE_gcm_hash8
-#undef HAVE_NATIVE_gcm_fill
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_salsa20_2core
 #undef HAVE_NATIVE_fat_salsa20_2core
diff --git a/fat-ppc.c b/fat-ppc.c
index 2839747330e3e57e61166c17bc59e4ca13850d80..8d4a703d3a40763c4fcf1350adac62874b2ca8e5 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -62,6 +62,7 @@
 #include "aes-internal.h"
 #include "chacha-internal.h"
 #include "gcm.h"
+#include "gcm-internal.h"
 #include "fat-setup.h"
 
 /* Defines from arch/powerpc/include/uapi/asm/cputable.h in Linux kernel */
@@ -110,7 +111,7 @@ get_ppc_features (struct ppc_features *features)
     {
 #if defined(_AIX)
       features->have_crypto_ext
- = _system_configuration.implementation >= 0x10000u;
+	= _system_configuration.implementation >= 0x10000u;
       features->have_altivec = _system_configuration.vmx_version > 1;
 #else
       unsigned long hwcap = 0;
@@ -149,6 +150,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
 DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
 DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
 
+#if GCM_TABLE_BITS == 8
+DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func)
+DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c)
+DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64)
+
+DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func)
+DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c)
+DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64)
+#endif /* GCM_TABLE_BITS == 8 */
+
 DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
 DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
 DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, altivec);
@@ -180,11 +191,23 @@ fat_init (void)
 	fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
       _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
       _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
+#if GCM_TABLE_BITS == 8
+      /* Make sure _nettle_gcm_init_key_vec function is compatible
+         with _nettle_gcm_hash_vec function e.g. _nettle_gcm_init_key_c()
+         fills gcm_key table with values that are incompatible with
+         _nettle_gcm_hash_ppc64() */
+      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64;
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64;
+#endif /* GCM_TABLE_BITS == 8 */
     }
   else
     {
       _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
       _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
+#if GCM_TABLE_BITS == 8
+      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c;
+      _nettle_gcm_hash_vec = _nettle_gcm_hash_c;
+#endif /* GCM_TABLE_BITS == 8 */
     }
   if (features.have_altivec)
     {
@@ -216,6 +239,17 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
  const uint8_t *src),
  (rounds, keys, T, length, dst, src))
 
+#if GCM_TABLE_BITS == 8
+DEFINE_FAT_FUNC(_nettle_gcm_init_key, void,
+		(union nettle_block16 *table),
+		(table))
+
+DEFINE_FAT_FUNC(_nettle_gcm_hash, void,
+		(const struct gcm_key *key, union nettle_block16 *x,
+		 size_t length, const uint8_t *data),
+		(key, x, length, data))
+#endif /* GCM_TABLE_BITS == 8 */
+
 DEFINE_FAT_FUNC(_nettle_chacha_core, void,
 		(uint32_t *dst, const uint32_t *src, unsigned rounds),
 		(dst, src, rounds))
diff --git a/fat-setup.h b/fat-setup.h
index 99f1ea678abdcd092096648d713d51cf4b7edda7..10177390cf02aae85decd82da56cf1faeb261b99 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -162,6 +162,11 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys,
 				      size_t length, uint8_t *dst,
 				      const uint8_t *src);
 
+typedef void gcm_init_key_func (union nettle_block16 *table);
+
+typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x,
+			    size_t length, const uint8_t *data);
+
 typedef void *(memxor_func)(void *dst, const void *src, size_t n);
 
 typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds);
diff --git a/gcm-internal.h b/gcm-internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e28be2def769b1024901df9d968c4351d5f5daa
--- /dev/null
+++ b/gcm-internal.h
@@ -0,0 +1,54 @@
+/* gcm-internal.h
+
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#ifndef NETTLE_GCM_INTERNAL_H_INCLUDED
+#define NETTLE_GCM_INTERNAL_H_INCLUDED
+
+/* Functions available only in some configurations */
+void
+_nettle_gcm_init_key (union nettle_block16 *table);
+
+void
+_nettle_gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
+		 size_t length, const uint8_t *data);
+
+#if HAVE_NATIVE_fat_gcm_init_key
+void
+_nettle_gcm_init_key_c (union nettle_block16 *table);
+#endif
+
+#if HAVE_NATIVE_fat_gcm_hash
+void
+_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x,
+		    size_t length, const uint8_t *data);
+#endif
+
+#endif /* NETTLE_GCM_INTERNAL_H_INCLUDED */
diff --git a/gcm.c b/gcm.c
index 48b3e75a5c37866729ecba09f47054fbb5b4d915..d6d68070f39fef89da216523038ce28839b768cf 100644
--- a/gcm.c
+++ b/gcm.c
@@ -49,13 +49,24 @@
 
 #include "gcm.h"
 
+#include "gcm-internal.h"
 #include "memxor.h"
 #include "nettle-internal.h"
 #include "macros.h"
 #include "ctr-internal.h"
 #include "block-internal.h"
 
-#if GCM_TABLE_BITS == 0
+#if GCM_TABLE_BITS != 8
+/* The native implementations (currently ppc64 only) depend on the
+   GCM_TABLE_BITS == 8 layout */
+#undef HAVE_NATIVE_gcm_hash
+#undef HAVE_NATIVE_gcm_init_key
+#undef HAVE_NATIVE_fat_gcm_hash
+#undef HAVE_NATIVE_fat_gcm_init_key
+#endif
+
+#if !HAVE_NATIVE_gcm_hash
+# if GCM_TABLE_BITS == 0
 /* Sets x <- x * y mod r, using the plain bitwise algorithm from the
    specification. y may be shorter than a full block, missing bytes
    are assumed zero. */
@@ -83,15 +94,15 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *y)
     }
   memcpy (x->b, Z.b, sizeof(Z));
 }
-#else /* GCM_TABLE_BITS != 0 */
+# else /* GCM_TABLE_BITS != 0 */
 
-# if WORDS_BIGENDIAN
-#  define W(left,right) (0x##left##right)
-# else
-#  define W(left,right) (0x##right##left)
-# endif
+#  if WORDS_BIGENDIAN
+#   define W(left,right) (0x##left##right)
+#  else
+#   define W(left,right) (0x##right##left)
+#  endif
 
-# if GCM_TABLE_BITS == 4
+#  if GCM_TABLE_BITS == 4
 static const uint16_t
 shift_table[0x10] = {
   W(00,00),W(1c,20),W(38,40),W(24,60),W(70,80),W(6c,a0),W(48,c0),W(54,e0),
@@ -110,7 +121,7 @@ gcm_gf_shift_4(union nettle_block16 *x)
   u64[1] = (u64[1] >> 4) | ((u64[0] & 0xf) << 60);
   u64[0] = (u64[0] >> 4) ^ (reduce << 48);
 #else /* ! WORDS_BIGENDIAN */
-#define RSHIFT_WORD_4(x) \
+# define RSHIFT_WORD_4(x) \
   ((((x) & UINT64_C(0xf0f0f0f0f0f0f0f0)) >> 4) \
    | (((x) & UINT64_C(0x000f0f0f0f0f0f0f)) << 12))
   reduce = shift_table[(u64[1] >> 56) & 0xf];
@@ -139,14 +150,14 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
     }
   memcpy (x->b, Z.b, sizeof(Z));
 }
-# elif GCM_TABLE_BITS == 8
-#  if HAVE_NATIVE_gcm_hash8
+#  elif GCM_TABLE_BITS == 8
+#   if HAVE_NATIVE_gcm_hash8
 
-#define gcm_hash _nettle_gcm_hash8
+#define _nettle_gcm_hash _nettle_gcm_hash8
 void
 _nettle_gcm_hash8 (const struct gcm_key *key, union nettle_block16 *x,
 		   size_t length, const uint8_t *data);
-#  else /* !HAVE_NATIVE_gcm_hash8 */
+#   else /* !HAVE_NATIVE_gcm_hash8 */
 static const uint16_t
 shift_table[0x100] = {
   W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
@@ -216,18 +227,46 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
   gcm_gf_shift_8(&Z);
   block16_xor3(x, &Z, &table[x->b[0]]);
 }
-#  endif /* ! HAVE_NATIVE_gcm_hash8 */
-# else /* GCM_TABLE_BITS != 8 */
-#  error Unsupported table size. 
-# endif /* GCM_TABLE_BITS != 8 */
+#   endif /* ! HAVE_NATIVE_gcm_hash8 */
+#  else /* GCM_TABLE_BITS != 8 */
+#   error Unsupported table size.
+#  endif /* GCM_TABLE_BITS != 8 */
 
-#undef W
+#  undef W
+# endif /* GCM_TABLE_BITS != 0 */
+#endif /* !HAVE_NATIVE_gcm_hash */
 
-#endif /* GCM_TABLE_BITS */
 
 /* Increment the rightmost 32 bits. */
 #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
 
+#if !HAVE_NATIVE_gcm_init_key
+# if !HAVE_NATIVE_fat_gcm_hash
+#  define _nettle_gcm_init_key _nettle_gcm_init_key_c
+static
+# endif
+void
+_nettle_gcm_init_key_c(union nettle_block16 *table)
+{
+#if GCM_TABLE_BITS
+  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
+     element */
+  unsigned i = (1<<GCM_TABLE_BITS)/2;
+
+  /* Algorithm 3 from the gcm paper. First do powers of two, then do
+     the rest by adding. */
+  while (i /= 2)
+    block16_mulx_ghash(&table[i], &table[2*i]);
+  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
+    {
+      unsigned j;
+      for (j = 1; j < i; j++)
+	block16_xor3(&table[i+j], &table[i], &table[j]);
+    }
+#endif
+}
+#endif /* !HAVE_NATIVE_gcm_init_key */
+
 /* Initialization of GCM.
  * @ctx: The context of GCM
  * @cipher: The context of the underlying block cipher
@@ -244,25 +283,18 @@ gcm_set_key(struct gcm_key *key,
   /* H */  
   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-  
-#if GCM_TABLE_BITS
-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
-     the rest by adding. */
-  while (i /= 2)
-    block16_mulx_ghash(&key->h[i], &key->h[2*i]);
-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
-    {
-      unsigned j;
-      for (j = 1; j < i; j++)
-	block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]);
-    }
-#endif
+
+  _nettle_gcm_init_key(key->h);
 }
 
-#ifndef gcm_hash
-static void
-gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
-	 size_t length, const uint8_t *data)
+#if !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8)
+# if !HAVE_NATIVE_fat_gcm_hash
+#  define _nettle_gcm_hash _nettle_gcm_hash_c
+static
+# endif
+void
+_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x,
+		   size_t length, const uint8_t *data)
 {
   for (; length >= GCM_BLOCK_SIZE;
        length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
@@ -276,7 +308,7 @@ gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
       gcm_gf_mul (x, key->h);
     }
 }
-#endif /* !gcm_hash */
+#endif /* !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8) */
 
 static void
 gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
@@ -290,7 +322,7 @@ gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
   WRITE_UINT64 (buffer, auth_size);
   WRITE_UINT64 (buffer + 8, data_size);
 
-  gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
+  _nettle_gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
 }
 
 /* NOTE: The key is needed only if length != GCM_IV_SIZE */
@@ -309,7 +341,7 @@ gcm_set_iv(struct gcm_ctx *ctx, const struct gcm_key *key,
   else
     {
       memset(ctx->iv.b, 0, GCM_BLOCK_SIZE);
-      gcm_hash(key, &ctx->iv, length, iv);
+      _nettle_gcm_hash(key, &ctx->iv, length, iv);
       gcm_hash_sizes(key, &ctx->iv, 0, length);
     }
 
@@ -328,7 +360,7 @@ gcm_update(struct gcm_ctx *ctx, const struct gcm_key *key,
   assert(ctx->auth_size % GCM_BLOCK_SIZE == 0);
   assert(ctx->data_size == 0);
 
-  gcm_hash(key, &ctx->x, length, data);
+  _nettle_gcm_hash(key, &ctx->x, length, data);
 
   ctx->auth_size += length;
 }
@@ -399,7 +431,7 @@ gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
   assert(ctx->data_size % GCM_BLOCK_SIZE == 0);
 
   _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src);
-  gcm_hash(key, &ctx->x, length, dst);
+  _nettle_gcm_hash(key, &ctx->x, length, dst);
 
   ctx->data_size += length;
 }
@@ -411,7 +443,7 @@ gcm_decrypt(struct gcm_ctx *ctx, const struct gcm_key *key,
 {
   assert(ctx->data_size % GCM_BLOCK_SIZE == 0);
 
-  gcm_hash(key, &ctx->x, length, src);
+  _nettle_gcm_hash(key, &ctx->x, length, src);
   _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src);
 
   ctx->data_size += length;
diff --git a/powerpc64/fat/gcm-hash.asm b/powerpc64/fat/gcm-hash.asm
new file mode 100644
index 0000000000000000000000000000000000000000..57c343d783e3f01359ea1a04fe4509802fee2ed3
--- /dev/null
+++ b/powerpc64/fat/gcm-hash.asm
@@ -0,0 +1,39 @@
+C powerpc64/fat/gcm-hash.asm
+
+
+ifelse(`
+   Copyright (C) 2020 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl picked up by configure
+dnl PROLOGUE(_nettle_fat_gcm_init_key)
+dnl PROLOGUE(_nettle_fat_gcm_hash)
+
+define(`fat_transform', `$1_ppc64')
+include_src(`powerpc64/p8/gcm-hash.asm')
diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm
new file mode 100644
index 0000000000000000000000000000000000000000..ad0ff6b38f84fa899bfe754fa055fcf58caf2ea1
--- /dev/null
+++ b/powerpc64/p8/gcm-hash.asm
@@ -0,0 +1,499 @@
+C powerpc64/p8/gcm-hash.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C gcm_set_key() assigns the H value to the middle element of the table
+define(`H_Idx', `128')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+define(`TABLE', `r3')
+
+define(`ZERO', `v0')
+define(`B1', `v1')
+define(`EMSB', `v16')
+define(`POLY', `v17')
+define(`POLY_L', `v1')
+
+define(`H', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v5')
+define(`H1M', `v6')
+define(`H1L', `v7')
+define(`H2M', `v8')
+define(`H2L', `v9')
+define(`Hl', `v10')
+define(`Hm', `v11')
+define(`Hp', `v12')
+define(`Hl2', `v13')
+define(`Hm2', `v14')
+define(`Hp2', `v15')
+define(`R', `v13')
+define(`F', `v14')
+define(`T', `v15')
+define(`R2', `v16')
+define(`F2', `v17')
+define(`T2', `v18')
+
+define(`LE_TEMP', `v18')
+define(`LE_MASK', `v19')
+
+.file "gcm-hash.asm"
+
+.text
+
+    C void _nettle_gcm_init_key (union nettle_block16 *table)
+
+C This function populates the gcm table with the following layout
+C *******************************************************************************
+C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
+C |                                                                             |
+C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
+C |                                                                             |
+C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
+C |                                                                             |
+C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
+C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
+C *******************************************************************************
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_init_key)
+    DATA_LOAD_VEC(POLY,.polynomial,r7)           C 0xC2000000000000000000000000000001
+IF_LE(`
+    li             r8,0
+    lvsl           LE_MASK,0,r8                  C 0x000102030405060708090A0B0C0D0E0F
+    vspltisb       LE_TEMP,0x07                  C 0x07070707070707070707070707070707
+    vxor           LE_MASK,LE_MASK,LE_TEMP       C 0x07060504030201000F0E0D0C0B0A0908
+')
+
+    C 'H' is assigned by gcm_set_key() to the middle element of the table
+    li             r10,H_Idx*16
+    lxvd2x         VSR(H),r10,TABLE              C load 'H'
+    C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
+    vperm          H,H,H,LE_MASK
+')
+
+    C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
+
+    vupkhsb        EMSB,H                        C extend most significant bit to first byte
+    vspltisb       B1,1                          C 0x01010101010101010101010101010101
+    vspltb         EMSB,EMSB,0                   C first byte quadword-extend
+    vsl            H,H,B1                        C H = H << 1
+    vand           EMSB,EMSB,POLY                C EMSB &= 0xC2000000000000000000000000000001
+    vxor           ZERO,ZERO,ZERO                C 0x00000000000000000000000000000000
+    vxor           H,H,EMSB                      C H ^= EMSB
+
+    C --- calculate H^2 = H*H ---
+
+    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000
+
+    C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) ---
+    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 ---
+    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) ---
+    vpmsumd        Hp,H,POLY_L                   C Hp = (H mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
+    xxswapd        VSR(Hm),VSR(H)
+    xxmrgld        VSR(Hl),VSR(H),VSR(ZERO)      C Hl = (H mod x⁶⁴) × x⁶⁴
+    vxor           Hm,Hm,Hp                      C Hm = Hm + Hp
+    vxor           Hl,Hl,Hp                      C Hl = Hl + Hp
+    xxmrgld        VSR(H1L),VSR(H),VSR(Hm)       C H1L = (H mod x⁶⁴)||(Hl mod x⁶⁴)
+    xxmrghd        VSR(H1M),VSR(H),VSR(Hl)       C H1M = (H div x⁶⁴)||(Hl div x⁶⁴)
+
+    vpmsumd        F,H1L,H                       C F = (H1Lh × Hh) + (H1Ll × Hl)
+    vpmsumd        R,H1M,H                       C R = (H1Mh × Hh) + (H1Ml × Hl)
+
+    C --- reduction ---
+    vpmsumd        T,F,POLY_L                    C T = (F mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
+    xxswapd        VSR(H2),VSR(F)
+    vxor           R,R,T                         C R = R + T
+    vxor           H2,R,H2
+
+    xxmrgld        VSR(Hl),VSR(H2),VSR(ZERO)
+    xxswapd        VSR(Hm),VSR(H2)
+    vpmsumd        Hp,H2,POLY_L
+    vxor           Hl,Hl,Hp
+    vxor           Hm,Hm,Hp
+    xxmrghd        VSR(H2M),VSR(H2),VSR(Hl)
+    xxmrgld        VSR(H2L),VSR(H2),VSR(Hm)
+
+    C store H1M, H1L, H2M, H2L
+    li             r8,1*16
+    li             r9,2*16
+    li             r10,3*16
+    stxvd2x        VSR(H1M),0,TABLE
+    stxvd2x        VSR(H1L),r8,TABLE
+    stxvd2x        VSR(H2M),r9,TABLE
+    stxvd2x        VSR(H2L),r10,TABLE
+
+    C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 ---
+
+    vpmsumd        F,H1L,H2
+    vpmsumd        F2,H2L,H2
+    vpmsumd        R,H1M,H2
+    vpmsumd        R2,H2M,H2
+
+    vpmsumd        T,F,POLY_L
+    vpmsumd        T2,F2,POLY_L
+    xxswapd        VSR(H3),VSR(F)
+    xxswapd        VSR(H4),VSR(F2)
+    vxor           R,R,T
+    vxor           R2,R2,T2
+    vxor           H3,R,H3
+    vxor           H4,R2,H4
+
+    xxmrgld        VSR(Hl),VSR(H3),VSR(ZERO)
+    xxmrgld        VSR(Hl2),VSR(H4),VSR(ZERO)
+    xxswapd        VSR(Hm),VSR(H3)
+    xxswapd        VSR(Hm2),VSR(H4)
+    vpmsumd        Hp,H3,POLY_L
+    vpmsumd        Hp2,H4,POLY_L
+    vxor           Hl,Hl,Hp
+    vxor           Hl2,Hl2,Hp2
+    vxor           Hm,Hm,Hp
+    vxor           Hm2,Hm2,Hp2
+    xxmrghd        VSR(H1M),VSR(H3),VSR(Hl)
+    xxmrghd        VSR(H2M),VSR(H4),VSR(Hl2)
+    xxmrgld        VSR(H1L),VSR(H3),VSR(Hm)
+    xxmrgld        VSR(H2L),VSR(H4),VSR(Hm2)
+
+    C store H3M, H3L, H4M, H4L
+    li             r7,4*16
+    li             r8,5*16
+    li             r9,6*16
+    li             r10,7*16
+    stxvd2x        VSR(H1M),r7,TABLE
+    stxvd2x        VSR(H1L),r8,TABLE
+    stxvd2x        VSR(H2M),r9,TABLE
+    stxvd2x        VSR(H2L),r10,TABLE
+
+    blr
+EPILOGUE(_nettle_gcm_init_key)
+
+define(`TABLE', `r3')
+define(`X', `r4')
+define(`LENGTH', `r5')
+define(`DATA', `r6')
+
+define(`ZERO', `v16')
+define(`POLY', `v17')
+define(`POLY_L', `v0')
+
+define(`D', `v1')
+define(`C0', `v2')
+define(`C1', `v3')
+define(`C2', `v4')
+define(`C3', `v5')
+define(`H1M', `v6')
+define(`H1L', `v7')
+define(`H2M', `v8')
+define(`H2L', `v9')
+define(`H3M', `v10')
+define(`H3L', `v11')
+define(`H4M', `v12')
+define(`H4L', `v13')
+define(`R', `v14')
+define(`F', `v15')
+define(`R2', `v16')
+define(`F2', `v17')
+define(`T', `v18')
+define(`R3', `v20')
+define(`F3', `v21')
+define(`R4', `v22')
+define(`F4', `v23')
+
+define(`LE_TEMP', `v18')
+define(`LE_MASK', `v19')
+
+    C void _nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
+    C                        size_t length, const uint8_t *data)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_gcm_hash)
+    vxor           ZERO,ZERO,ZERO
+    DATA_LOAD_VEC(POLY,.polynomial,r7)
+IF_LE(`
+    li             r8,0
+    lvsl           LE_MASK,0,r8
+    vspltisb       LE_TEMP,0x07
+    vxor           LE_MASK,LE_MASK,LE_TEMP
+')
+    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY)
+
+    lxvd2x         VSR(D),0,X                    C load 'X' pointer
+    C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
+    vperm          D,D,D,LE_MASK
+')
+
+    C --- process 4 blocks '128-bit each' per one loop ---
+
+    srdi.          r7,LENGTH,6                   C 4-blocks loop count 'LENGTH / (4 * 16)'
+    beq            L2x
+
+    mtctr          r7                            C assign counter register to loop count
+
+    C store non-volatile vector registers
+    addi           r8,SP,-64
+    stvx           v20,0,r8
+    addi           r8,r8,16
+    stvx           v21,0,r8
+    addi           r8,r8,16
+    stvx           v22,0,r8
+    addi           r8,r8,16
+    stvx           v23,0,r8
+
+    C load table elements
+    li             r8,1*16
+    li             r9,2*16
+    li             r10,3*16
+    lxvd2x         VSR(H1M),0,TABLE
+    lxvd2x         VSR(H1L),r8,TABLE
+    lxvd2x         VSR(H2M),r9,TABLE
+    lxvd2x         VSR(H2L),r10,TABLE
+    li             r7,4*16
+    li             r8,5*16
+    li             r9,6*16
+    li             r10,7*16
+    lxvd2x         VSR(H3M),r7,TABLE
+    lxvd2x         VSR(H3L),r8,TABLE
+    lxvd2x         VSR(H4M),r9,TABLE
+    lxvd2x         VSR(H4L),r10,TABLE
+
+    li             r8,0x10
+    li             r9,0x20
+    li             r10,0x30
+.align 5
+L4x_loop:
+    C input loading
+    lxvd2x         VSR(C0),0,DATA                C load C0
+    lxvd2x         VSR(C1),r8,DATA               C load C1
+    lxvd2x         VSR(C2),r9,DATA               C load C2
+    lxvd2x         VSR(C3),r10,DATA              C load C3
+
+IF_LE(`
+    vperm          C0,C0,C0,LE_MASK
+    vperm          C1,C1,C1,LE_MASK
+    vperm          C2,C2,C2,LE_MASK
+    vperm          C3,C3,C3,LE_MASK
+')
+
+    C previous digest combining
+    vxor           C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd        F2,H3L,C1
+    vpmsumd        R2,H3M,C1
+    vpmsumd        F3,H2L,C2
+    vpmsumd        R3,H2M,C2
+    vpmsumd        F4,H1L,C3
+    vpmsumd        R4,H1M,C3
+    vpmsumd        F,H4L,C0
+    vpmsumd        R,H4M,C0
+
+    C deferred recombination of partial products
+    vxor           F3,F3,F4
+    vxor           R3,R3,R4
+    vxor           F,F,F2
+    vxor           R,R,R2
+    vxor           F,F,F3
+    vxor           R,R,R3
+
+    C reduction
+    vpmsumd        T,F,POLY_L
+    xxswapd        VSR(D),VSR(F)
+    vxor           R,R,T
+    vxor           D,R,D
+
+    addi           DATA,DATA,0x40
+    bdnz           L4x_loop
+
+    C restore non-volatile vector registers
+    addi           r8,SP,-64
+    lvx            v20,0,r8
+    addi           r8,r8,16
+    lvx            v21,0,r8
+    addi           r8,r8,16
+    lvx            v22,0,r8
+    addi           r8,r8,16
+    lvx            v23,0,r8
+
+    clrldi         LENGTH,LENGTH,58              C 'set the high-order 58 bits to zeros'
+L2x:
+    C --- process 2 blocks ---
+
+    srdi.          r7,LENGTH,5                   C 'LENGTH / (2 * 16)'
+    beq            L1x
+
+    C load table elements
+    li             r8,1*16
+    li             r9,2*16
+    li             r10,3*16
+    lxvd2x         VSR(H1M),0,TABLE
+    lxvd2x         VSR(H1L),r8,TABLE
+    lxvd2x         VSR(H2M),r9,TABLE
+    lxvd2x         VSR(H2L),r10,TABLE
+
+    C input loading
+    li             r10,0x10
+    lxvd2x         VSR(C0),0,DATA                C load C0
+    lxvd2x         VSR(C1),r10,DATA              C load C1
+
+IF_LE(`
+    vperm          C0,C0,C0,LE_MASK
+    vperm          C1,C1,C1,LE_MASK
+')
+
+    C previous digest combining
+    vxor           C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd        F2,H1L,C1
+    vpmsumd        R2,H1M,C1
+    vpmsumd        F,H2L,C0
+    vpmsumd        R,H2M,C0
+
+    C deferred recombination of partial products
+    vxor           F,F,F2
+    vxor           R,R,R2
+
+    C reduction
+    vpmsumd        T,F,POLY_L
+    xxswapd        VSR(D),VSR(F)
+    vxor           R,R,T
+    vxor           D,R,D
+
+    addi           DATA,DATA,0x20
+    clrldi         LENGTH,LENGTH,59              C 'set the high-order 59 bits to zeros'
+L1x:
+    C --- process 1 block ---
+
+    srdi.          r7,LENGTH,4                   C 'LENGTH / (1 * 16)'
+    beq            Lmod
+
+    C load table elements
+    li             r8,1*16
+    lxvd2x         VSR(H1M),0,TABLE
+    lxvd2x         VSR(H1L),r8,TABLE
+
+    C input loading
+    lxvd2x         VSR(C0),0,DATA                C load C0
+
+IF_LE(`
+    vperm          C0,C0,C0,LE_MASK
+')
+
+    C previous digest combining
+    vxor           C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd        F,H1L,C0
+    vpmsumd        R,H1M,C0
+
+    C reduction
+    vpmsumd        T,F,POLY_L
+    xxswapd        VSR(D),VSR(F)
+    vxor           R,R,T
+    vxor           D,R,D
+
+    addi           DATA,DATA,0x10
+    clrldi         LENGTH,LENGTH,60              C 'set the high-order 60 bits to zeros'
+Lmod:
+    C --- process the modulo bytes, padding the low-order bytes with zeros ---
+
+    cmpldi         LENGTH,0
+    beq            Ldone
+
+    C load table elements
+    li             r8,1*16
+    lxvd2x         VSR(H1M),0,TABLE
+    lxvd2x         VSR(H1L),r8,TABLE
+
+    C push every modulo byte to the stack and load them with padding into vector register
+    vxor           ZERO,ZERO,ZERO
+    addi           r8,SP,-16
+    stvx           ZERO,0,r8
+Lstb_loop:
+    subic.         LENGTH,LENGTH,1
+    lbzx           r7,LENGTH,DATA
+    stbx           r7,LENGTH,r8
+    bne            Lstb_loop
+    lxvd2x         VSR(C0),0,r8
+
+IF_LE(`
+    vperm          C0,C0,C0,LE_MASK
+')
+
+    C previous digest combining
+    vxor           C0,C0,D
+
+    C polynomial multiplication
+    vpmsumd        F,H1L,C0
+    vpmsumd        R,H1M,C0
+
+    C reduction
+    vpmsumd        T,F,POLY_L
+    xxswapd        VSR(D),VSR(F)
+    vxor           R,R,T
+    vxor           D,R,D
+
+Ldone:
+    C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
+    vperm          D,D,D,LE_MASK
+')
+    stxvd2x        VSR(D),0,X                    C store digest 'D'
+
+    blr
+EPILOGUE(_nettle_gcm_hash)
+
+.data
+    C 0xC2000000000000000000000000000001
+.polynomial:
+.align 4
+IF_BE(`
+.byte 0xC2
+.rept 14
+.byte 0x00
+.endr
+.byte 0x01
+',`
+.byte 0x01
+.rept 14
+.byte 0x00
+.endr
+.byte 0xC2
+')