diff --git a/ChangeLog b/ChangeLog
index 6798693200839a3f9461a263159def2605f23396..41f814fe2fc5a67eaf101d7bea48753c0be4a3af 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2023-04-13  Niels Möller  <nisse@lysator.liu.se>
+
+	* ghash-update.c (gcm_gf_mul): Rewrite to avoid side-channel
+	leakage. Now processes the message bits one at a time, using
+	tabulated values of the key premultiplied by appropriate powers of
+	x, so that the table is accessed in a fixed sequential order.
+	Performance penalty, on x86_64, is roughly 3 times.
+	(shift_table): Deleted table.
+	(gcm_gf_shift_8): Deleted function.
+	* ghash-set-key.c (_ghash_set_key): Rewrite table generation.
+	* gcmdata.c: Deleted.
+	* Makefile.in: Delete references to gcmdata.
+
+	* x86_64/ghash-update.asm: Rewritten to use the same side-channel
+	silent method and table layout as the C implementation, but with
+	sse2 instructions.
+
+	* testsuite/gcm-test.c (test_ghash_internal): Add valgrind
+	annotations, to verify that the ghash implementation makes no
+	data-dependent branches or memory accesses.
+
+	* examples/nettle-benchmark.c (bench_ghash_update): New function.
+
 2023-04-03  Niels Möller  <nisse@lysator.liu.se>
 
 	From Mamone Tarsha:
diff --git a/Makefile.in b/Makefile.in
index 081337a8f2918211e7602c30c6c92d100cc50621..2464e17ef6fab6cab5505c45e84f0acc20bbdc5f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -250,7 +250,7 @@ INSTALL_HEADERS = $(HEADERS) version.h @IF_MINI_GMP@ mini-gmp.h
 SOURCES = $(nettle_SOURCES) $(hogweed_SOURCES) \
 	  $(getopt_SOURCES) $(internal_SOURCES) \
 	  $(OPT_SOURCES) \
-	  aesdata.c desdata.c twofishdata.c shadata.c gcmdata.c eccdata.c
+	  aesdata.c desdata.c twofishdata.c shadata.c eccdata.c
 
 # NOTE: This list must include all source files, with no duplicates,
 # independently of which source files are included in the build.
@@ -683,7 +683,6 @@ clean-here:
 		desdata$(EXEEXT_FOR_BUILD) \
 		twofishdata$(EXEEXT_FOR_BUILD) \
 		shadata$(EXEEXT_FOR_BUILD) \
-		gcmdata$(EXEEXT_FOR_BUILD) \
 		eccdata$(EXEEXT_FOR_BUILD) eccdata.stamp
 	-rm -rf .lib libnettle.stamp libhogweed.stamp
 
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index 613b1dcfe3e3760215f689a5b1fc06f8d867e687..04c3e0f29eb5af239c44dc2c65e56ca9b0b66e3c 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -56,6 +56,7 @@
 #include "des.h"
 #include "eax.h"
 #include "gcm.h"
+#include "ghash-internal.h"
 #include "memxor.h"
 #include "salsa20.h"
 #include "salsa20-internal.h"
@@ -875,10 +876,22 @@ bench_sha3_permute(void)
   TIME_CYCLES (t, sha3_permute (&state));
   printf("sha3_permute: %.2f cycles (%.2f / round)\n", t, t / 24.0);
 }
+static void
+bench_ghash_update(void)
+{
+  struct gcm_key key = {0};
+  union nettle_block16 state = {0};
+  const uint8_t data[160] = {0};	/* 10 blocks of 16 bytes */
+  double t;
+  /* Zero-filled inputs: no indeterminate reads. Time 10 blocks per call. */
+  TIME_CYCLES (t, _ghash_update (&key, &state, 10, data));
+  printf("ghash_update: %.2f cycles / block\n", t / 10.0);
+}
 #else
 #define bench_sha1_compress()
 #define bench_salsa20_core()
 #define bench_sha3_permute()
+#define bench_ghash_update()
 #endif
 
 #if WITH_OPENSSL
@@ -986,6 +999,7 @@ main(int argc, char **argv)
   bench_sha1_compress();
   bench_salsa20_core();
   bench_sha3_permute();
+  bench_ghash_update();
   printf("\n");
 
   header();
diff --git a/gcmdata.c b/gcmdata.c
deleted file mode 100644
index 2d57b46a619d63358bb4cbdee35dab8ee46b66fd..0000000000000000000000000000000000000000
--- a/gcmdata.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/* gcmdata.c
-
-   Galois counter mode, specified by NIST,
-   http://csrc.nist.gov/publications/nistpubs/800-38D/SP-800-38D.pdf
-
-   Generation of fixed multiplication tables.
-
-   Copyright (C) 2011 Niels Möller
-
-   This file is part of GNU Nettle.
-
-   GNU Nettle is free software: you can redistribute it and/or
-   modify it under the terms of either:
-
-     * the GNU Lesser General Public License as published by the Free
-       Software Foundation; either version 3 of the License, or (at your
-       option) any later version.
-
-   or
-
-     * the GNU General Public License as published by the Free
-       Software Foundation; either version 2 of the License, or (at your
-       option) any later version.
-
-   or both in parallel, as here.
-
-   GNU Nettle is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received copies of the GNU General Public License and
-   the GNU Lesser General Public License along with this program.  If
-   not, see http://www.gnu.org/licenses/.
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define GHASH_POLYNOMIAL 0xE1
-
-
-/* When x is shifted out over the block edge, add multiples of the
-   defining polynomial to eliminate each bit. */
-static unsigned
-reduce(unsigned x)
-{
-  unsigned p = GHASH_POLYNOMIAL << 1;
-  unsigned y = 0;
-  for (; x; x >>= 1, p <<= 1)
-    if (x & 1)
-      y ^= p;
-  return y;
-}
-
-int
-main(int argc, char **argv)
-{
-  unsigned i;
-  printf("4-bit table:\n");
-  
-  for (i = 0; i<16; i++)
-    {
-      unsigned x;
-      if (i && !(i%8))
-	printf("\n");
-
-      x = reduce(i << 4);
-      printf("W(%02x,%02x),", x >> 8, x & 0xff);
-    }
-  printf("\n\n");
-  printf("8-bit table:\n");
-  for (i = 0; i<256; i++)
-    {
-      unsigned x;
-      if (i && !(i%8))
-	printf("\n");
-
-      x = reduce(i);
-      printf("W(%02x,%02x),", x >> 8, x & 0xff);
-    }
-  printf("\n");
-  return EXIT_SUCCESS;
-}
diff --git a/ghash-set-key.c b/ghash-set-key.c
index 0e91afcbda39f921e880b1c7c75c3233a76d0a92..da1c90f03c37dc624c5e7a02cadb6311fbf2beb3 100644
--- a/ghash-set-key.c
+++ b/ghash-set-key.c
@@ -51,25 +51,46 @@ _nettle_ghash_set_key_c (struct gcm_key *ctx, const union nettle_block16 *key);
 #define _nettle_ghash_set_key _nettle_ghash_set_key_c
 #endif
 
+#if GCM_TABLE_BITS < 7
+# error Unsupported table size.
+#endif
+
 /* Implements a lookup table for processors without carryless-mul
    instruction. */
 void
 _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
 {
-  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
-     element */
-  unsigned i = (1<<GCM_TABLE_BITS)/2;
-  block16_zero (&ctx->h[0]);
-  ctx->h[i] = *key;
-
-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
-     the rest by adding. */
-  while (i /= 2)
-    block16_mulx_ghash (&ctx->h[i], &ctx->h[2*i]);
-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
-    {
-      unsigned j;
-      for (j = 1; j < i; j++)
-	block16_xor3 (&ctx->h[i+j], &ctx->h[i], &ctx->h[j]);
-    }
+  /* Table elements hold the key, premultiplied by all needed powers
+     of x. Element ordering follows the order bits are processed in
+     _ghash_update, alternating u64[0] and u64[1] bits, starting from
+     the least significant end. In the gcm bit order, bits (left to
+     right) correspond to x powers (the numbers) like
+
+       |0...7|8...15|...|56...63|64...71|72...79|...|120...127|
+
+     where | indicates the byte boundaries. On little endian, these
+     bits are in u64 like
+
+       u64[0]: | 56...63   48...55   40...47  32...39  24...31 16...23  8...15  0...7|
+       u64[1]: |120...127 112...119 104...111 96...103 88...95 80...87 72...79 64...71|
+
+     With big-endian, we instead get
+
+       u64[0]:  |0...63|
+       u64[1]: |64...127|
+  */
+#if WORDS_BIGENDIAN
+# define INDEX_PERMUTE 63
+#else
+# define INDEX_PERMUTE 7
+#endif
+  unsigned i;
+
+  block16_set (&ctx->h[2*INDEX_PERMUTE], key);
+  for (i = 1; i < 64; i++)
+    block16_mulx_ghash(&ctx->h[2*(i ^ INDEX_PERMUTE)], &ctx->h[2*((i-1) ^ INDEX_PERMUTE)]);
+
+  block16_mulx_ghash(&ctx->h[2*INDEX_PERMUTE + 1], &ctx->h[2*(63^INDEX_PERMUTE)]);
+  for (i = 1; i < 64; i++)
+    block16_mulx_ghash(&ctx->h[2*(i ^ INDEX_PERMUTE)+1], &ctx->h[2*((i-1) ^ INDEX_PERMUTE)+1]);
 }
diff --git a/ghash-update.c b/ghash-update.c
index 6eb19d807cc5f451ac825dce95d55002c095a2e1..bdeaa38d5c0a5e548fd2c3a6fd81af414aa563e9 100644
--- a/ghash-update.c
+++ b/ghash-update.c
@@ -44,7 +44,7 @@
 #include "ghash-internal.h"
 #include "block-internal.h"
 
-#if GCM_TABLE_BITS != 8
+#if GCM_TABLE_BITS < 7
 # error Unsupported table size.
 #endif
 
@@ -54,83 +54,26 @@ const uint8_t *
 _nettle_ghash_update_c (const struct gcm_key *ctx, union nettle_block16 *state,
 			size_t blocks, const uint8_t *data);
 #define _nettle_ghash_update _nettle_ghash_update_c
-
-#endif
-#if WORDS_BIGENDIAN
-# define W(left,right) (0x##left##right)
-#else
-# define W(left,right) (0x##right##left)
 #endif
 
-static const uint16_t
-shift_table[0x100] = {
-  W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
-  W(0e,10),W(0f,d2),W(0d,94),W(0c,56),W(09,18),W(08,da),W(0a,9c),W(0b,5e),
-  W(1c,20),W(1d,e2),W(1f,a4),W(1e,66),W(1b,28),W(1a,ea),W(18,ac),W(19,6e),
-  W(12,30),W(13,f2),W(11,b4),W(10,76),W(15,38),W(14,fa),W(16,bc),W(17,7e),
-  W(38,40),W(39,82),W(3b,c4),W(3a,06),W(3f,48),W(3e,8a),W(3c,cc),W(3d,0e),
-  W(36,50),W(37,92),W(35,d4),W(34,16),W(31,58),W(30,9a),W(32,dc),W(33,1e),
-  W(24,60),W(25,a2),W(27,e4),W(26,26),W(23,68),W(22,aa),W(20,ec),W(21,2e),
-  W(2a,70),W(2b,b2),W(29,f4),W(28,36),W(2d,78),W(2c,ba),W(2e,fc),W(2f,3e),
-  W(70,80),W(71,42),W(73,04),W(72,c6),W(77,88),W(76,4a),W(74,0c),W(75,ce),
-  W(7e,90),W(7f,52),W(7d,14),W(7c,d6),W(79,98),W(78,5a),W(7a,1c),W(7b,de),
-  W(6c,a0),W(6d,62),W(6f,24),W(6e,e6),W(6b,a8),W(6a,6a),W(68,2c),W(69,ee),
-  W(62,b0),W(63,72),W(61,34),W(60,f6),W(65,b8),W(64,7a),W(66,3c),W(67,fe),
-  W(48,c0),W(49,02),W(4b,44),W(4a,86),W(4f,c8),W(4e,0a),W(4c,4c),W(4d,8e),
-  W(46,d0),W(47,12),W(45,54),W(44,96),W(41,d8),W(40,1a),W(42,5c),W(43,9e),
-  W(54,e0),W(55,22),W(57,64),W(56,a6),W(53,e8),W(52,2a),W(50,6c),W(51,ae),
-  W(5a,f0),W(5b,32),W(59,74),W(58,b6),W(5d,f8),W(5c,3a),W(5e,7c),W(5f,be),
-  W(e1,00),W(e0,c2),W(e2,84),W(e3,46),W(e6,08),W(e7,ca),W(e5,8c),W(e4,4e),
-  W(ef,10),W(ee,d2),W(ec,94),W(ed,56),W(e8,18),W(e9,da),W(eb,9c),W(ea,5e),
-  W(fd,20),W(fc,e2),W(fe,a4),W(ff,66),W(fa,28),W(fb,ea),W(f9,ac),W(f8,6e),
-  W(f3,30),W(f2,f2),W(f0,b4),W(f1,76),W(f4,38),W(f5,fa),W(f7,bc),W(f6,7e),
-  W(d9,40),W(d8,82),W(da,c4),W(db,06),W(de,48),W(df,8a),W(dd,cc),W(dc,0e),
-  W(d7,50),W(d6,92),W(d4,d4),W(d5,16),W(d0,58),W(d1,9a),W(d3,dc),W(d2,1e),
-  W(c5,60),W(c4,a2),W(c6,e4),W(c7,26),W(c2,68),W(c3,aa),W(c1,ec),W(c0,2e),
-  W(cb,70),W(ca,b2),W(c8,f4),W(c9,36),W(cc,78),W(cd,ba),W(cf,fc),W(ce,3e),
-  W(91,80),W(90,42),W(92,04),W(93,c6),W(96,88),W(97,4a),W(95,0c),W(94,ce),
-  W(9f,90),W(9e,52),W(9c,14),W(9d,d6),W(98,98),W(99,5a),W(9b,1c),W(9a,de),
-  W(8d,a0),W(8c,62),W(8e,24),W(8f,e6),W(8a,a8),W(8b,6a),W(89,2c),W(88,ee),
-  W(83,b0),W(82,72),W(80,34),W(81,f6),W(84,b8),W(85,7a),W(87,3c),W(86,fe),
-  W(a9,c0),W(a8,02),W(aa,44),W(ab,86),W(ae,c8),W(af,0a),W(ad,4c),W(ac,8e),
-  W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e),
-  W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae),
-  W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be),
-};
-#undef W
-
-static void
-gcm_gf_shift_8(union nettle_block16 *x)
-{
-  uint64_t reduce;
-
-  /* Shift uses big-endian representation. */
-#if WORDS_BIGENDIAN
-  reduce = shift_table[x->u64[1] & 0xff];
-  x->u64[1] = (x->u64[1] >> 8) | ((x->u64[0] & 0xff) << 56);
-  x->u64[0] = (x->u64[0] >> 8) ^ (reduce << 48);
-#else /* ! WORDS_BIGENDIAN */
-  reduce = shift_table[(x->u64[1] >> 56) & 0xff];
-  x->u64[1] = (x->u64[1] << 8) | (x->u64[0] >> 56);
-  x->u64[0] = (x->u64[0] << 8) ^ reduce;
-#endif /* ! WORDS_BIGENDIAN */
-}
-
 static void
 gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table)
 {
-  union nettle_block16 Z;
+  uint64_t x0 = x->u64[0];
+  uint64_t x1 = x->u64[1];
+  uint64_t r0 = 0;
+  uint64_t r1 = 0;
   unsigned i;
-
-  Z = table[x->b[GCM_BLOCK_SIZE-1]];
-
-  for (i = GCM_BLOCK_SIZE-2; i > 0; i--)
+  for (i = 0; i < 64; i++, x0 >>= 1, x1 >>= 1)
     {
-      gcm_gf_shift_8(&Z);
-      block16_xor(&Z, &table[x->b[i]]);
+      uint64_t m0 = -(x0 & 1);
+      uint64_t m1 = -(x1 & 1);
+      r0 ^= m0 & table[2*i].u64[0];
+      r1 ^= m0 & table[2*i].u64[1];
+      r0 ^= m1 & table[2*i+1].u64[0];
+      r1 ^= m1 & table[2*i+1].u64[1];
     }
-  gcm_gf_shift_8(&Z);
-  block16_xor3(x, &Z, &table[x->b[0]]);
+  x->u64[0] = r0; x->u64[1] = r1;
 }
 
 const uint8_t *
diff --git a/testsuite/gcm-test.c b/testsuite/gcm-test.c
index d70cdd1e98b68816efe92bf3fe78f926d2d8e26d..bc555d60819bff5ae078df9743e891e76115e9a3 100644
--- a/testsuite/gcm-test.c
+++ b/testsuite/gcm-test.c
@@ -6,6 +6,13 @@
 #include "gcm.h"
 #include "ghash-internal.h"
 
+#if HAVE_VALGRIND_MEMCHECK_H
+# include <valgrind/memcheck.h>
+#else
+# define VALGRIND_MAKE_MEM_UNDEFINED(p, n)
+# define VALGRIND_MAKE_MEM_DEFINED(p, n)
+#endif
+
 static void
 test_gcm_hash (const struct tstring *msg, const struct tstring *ref)
 {
@@ -42,11 +49,19 @@ test_ghash_internal (const struct tstring *key,
   struct gcm_key gcm_key;
   union nettle_block16 state;
 
+  /* Use VALGRIND_MAKE_MEM_UNDEFINED to mark inputs as "undefined", to
+     get valgrind to warn about any branches or memory accesses
+     depending on secret data. */
   memcpy (state.b, key->data, GCM_BLOCK_SIZE);
+  VALGRIND_MAKE_MEM_UNDEFINED (&state, sizeof(state));
   _ghash_set_key (&gcm_key, &state);
 
   memcpy (state.b, iv->data, GCM_BLOCK_SIZE);
+  VALGRIND_MAKE_MEM_UNDEFINED (&state, sizeof(state));
+  VALGRIND_MAKE_MEM_UNDEFINED (message->data, message->length);
   _ghash_update (&gcm_key, &state, message->length / GCM_BLOCK_SIZE, message->data);
+  VALGRIND_MAKE_MEM_DEFINED (&state, sizeof(state));
+  VALGRIND_MAKE_MEM_DEFINED (message->data, message->length);
   if (!MEMEQ(GCM_BLOCK_SIZE, state.b, digest->data))
     {
       fprintf (stderr, "gcm_hash (internal) failed\n");
diff --git a/x86_64/ghash-update.asm b/x86_64/ghash-update.asm
index f3a66210850634c1fc03c7b6c718e07ad263d4db..b3417e45349da4abd7dcd0c206b497b8fb24ca40 100644
--- a/x86_64/ghash-update.asm
+++ b/x86_64/ghash-update.asm
@@ -36,15 +36,14 @@ define(`KEY', `%rdi')
 define(`XP', `%rsi')
 define(`BLOCKS', `%rdx')
 define(`SRC', `%rcx')
-define(`X0', `%rax')
-define(`X1', `%rbx')
-define(`CNT', `%ebp')
-define(`T0', `%r8')
-define(`T1', `%r9')
-define(`T2', `%r10')
-define(`Z0', `%r11')
-define(`Z1', `%r12')
-define(`SHIFT_TABLE', `%r13')
+define(`CNT', `%rax')
+define(`KEY32', `%r8')
+define(`X', `%xmm0')
+define(`R', `%xmm1')
+define(`M0', `%xmm2')
+define(`M1', `%xmm3')
+define(`M2', `%xmm4')
+define(`M3', `%xmm5')
 
 	.file "ghash-update.asm"
 
@@ -55,150 +54,50 @@ define(`SHIFT_TABLE', `%r13')
 	.text
 	ALIGN(16)
 PROLOGUE(_nettle_ghash_update)
-	W64_ENTRY(4, 0)
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
+	W64_ENTRY(4, 6)
 	sub	$1, BLOCKS
-	lea	.Lshift_table(%rip), SHIFT_TABLE
-	mov	(XP), X0
-	mov	8(XP), X1
+	movups	(XP), X
 	jc	.Ldone
-ALIGN(16)
-.Lblock_loop:
-
-	xor (SRC), X0
-	xor 8(SRC), X1
-
-.Lblock_mul:
-	rol	$8, X1
-	movzbl	LREG(X1), XREG(T1)
-	shl	$4, T1
-	mov	(KEY, T1), Z0
-	mov	8(KEY, T1), Z1
-
-	C shift Z1, Z0, transforming
-	C +-----------------------+-----------------------+
-	C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00|
-	C +-----------------------+-----------------------+
-	C into
-	C +-----------------------+-----------------------+
-	C |14 13 12 11 10 09 08 07|06 05 04 03 02 01 00   |
-	C +-----------------------+-----------------+-----+
-	C                               xor         |T[15]|
-	C                                           +-----+
-
-	mov	$7, CNT
+	C Table offset corresponding to 32 bits.
+	lea	1024(KEY), KEY32
 
 ALIGN(16)
-.Loop_X1:
-	mov	Z1, T1
-	shr	$56, T1
-	shl	$8, Z1
-	mov	Z0, T0
-	shl	$8, Z0
-	shr	$56, T0
-	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
-	xor	T1, Z0
-	rol	$8, X1
-	movzbl	LREG(X1), XREG(T2)
-	shl	$4, T2
-	xor	(KEY, T2), Z0
-	add	T0, Z1
-	xor	8(KEY, T2), Z1
-	decl	CNT
-	jne	.Loop_X1
-
-	mov	$7, CNT
-
+.Lblock_loop:
+	C Unaligned input
+	movups	(SRC), M0
+	pxor	M0, X
+	pxor	R, R
+	mov	$992, CNT
 ALIGN(16)
-.Loop_X0:
-	mov	Z1, T1
-	shr	$56, T1
-	shl	$8, Z1
-	mov	Z0, T0
-	shl	$8, Z0
-	shr	$56, T0
-	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
-	xor	T1, Z0
-	rol	$8, X0
-	movzbl	LREG(X0), XREG(T2)
-	shl	$4, T2
-	xor	(KEY, T2), Z0
-	add	T0, Z1
-	xor	8(KEY, T2), Z1
-	decl	CNT
-	jne	.Loop_X0
-
-	mov	Z1, T1
-	shr	$56, T1
-	shl	$8, Z1
-	mov	Z0, T0
-	shl	$8, Z0
-	shr	$56, T0
-	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
-	xor	T1, Z0
-	rol	$8, X0
-	movzbl	LREG(X0), XREG(T2)
-	shl	$4, T2
-	mov	(KEY, T2), X0
-	xor	Z0, X0
-	add	T0, Z1
-	mov	8(KEY, T2), X1
-	xor	Z1, X1
+.Loop_bit:
+	movdqa	X, M3
+	psrad	$31, M3
+	pshufd	$0x00, M3, M0
+	pshufd	$0x55, M3, M1
+	pshufd	$0xaa, M3, M2
+	pshufd	$0xff, M3, M3
+	pslld	$1, X
+	pand	(KEY, CNT), M0
+	pand	(KEY32, CNT), M1
+	pand	16(KEY, CNT), M2
+	pand	16(KEY32, CNT), M3
+	pxor	M0, M1
+	pxor	M2, M3
+	pxor	M1, R
+	pxor	M3, R
+
+	sub	$32, CNT
+	jnc	.Loop_bit
+
+	movaps	R, X
 
 	add	$16, SRC
 	sub	$1, BLOCKS
 	jnc	.Lblock_loop
 
 .Ldone:
-	mov	X0, (XP)
-	mov	X1, 8(XP)
+	movups	X, (XP)
 	mov	SRC, %rax
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	W64_EXIT(4, 0)
+	W64_EXIT(4, 6)
 	ret
 EPILOGUE(_nettle_ghash_update)
-
-define(`W', `0x$2$1')
-	RODATA
-	ALIGN(2)
-C NOTE: Sun/Oracle assembler doesn't support ".short".
-C Using ".value" seems more portable.
-.Lshift_table:
-.value W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e)
-.value W(0e,10),W(0f,d2),W(0d,94),W(0c,56),W(09,18),W(08,da),W(0a,9c),W(0b,5e)
-.value W(1c,20),W(1d,e2),W(1f,a4),W(1e,66),W(1b,28),W(1a,ea),W(18,ac),W(19,6e)
-.value W(12,30),W(13,f2),W(11,b4),W(10,76),W(15,38),W(14,fa),W(16,bc),W(17,7e)
-.value W(38,40),W(39,82),W(3b,c4),W(3a,06),W(3f,48),W(3e,8a),W(3c,cc),W(3d,0e)
-.value W(36,50),W(37,92),W(35,d4),W(34,16),W(31,58),W(30,9a),W(32,dc),W(33,1e)
-.value W(24,60),W(25,a2),W(27,e4),W(26,26),W(23,68),W(22,aa),W(20,ec),W(21,2e)
-.value W(2a,70),W(2b,b2),W(29,f4),W(28,36),W(2d,78),W(2c,ba),W(2e,fc),W(2f,3e)
-.value W(70,80),W(71,42),W(73,04),W(72,c6),W(77,88),W(76,4a),W(74,0c),W(75,ce)
-.value W(7e,90),W(7f,52),W(7d,14),W(7c,d6),W(79,98),W(78,5a),W(7a,1c),W(7b,de)
-.value W(6c,a0),W(6d,62),W(6f,24),W(6e,e6),W(6b,a8),W(6a,6a),W(68,2c),W(69,ee)
-.value W(62,b0),W(63,72),W(61,34),W(60,f6),W(65,b8),W(64,7a),W(66,3c),W(67,fe)
-.value W(48,c0),W(49,02),W(4b,44),W(4a,86),W(4f,c8),W(4e,0a),W(4c,4c),W(4d,8e)
-.value W(46,d0),W(47,12),W(45,54),W(44,96),W(41,d8),W(40,1a),W(42,5c),W(43,9e)
-.value W(54,e0),W(55,22),W(57,64),W(56,a6),W(53,e8),W(52,2a),W(50,6c),W(51,ae)
-.value W(5a,f0),W(5b,32),W(59,74),W(58,b6),W(5d,f8),W(5c,3a),W(5e,7c),W(5f,be)
-.value W(e1,00),W(e0,c2),W(e2,84),W(e3,46),W(e6,08),W(e7,ca),W(e5,8c),W(e4,4e)
-.value W(ef,10),W(ee,d2),W(ec,94),W(ed,56),W(e8,18),W(e9,da),W(eb,9c),W(ea,5e)
-.value W(fd,20),W(fc,e2),W(fe,a4),W(ff,66),W(fa,28),W(fb,ea),W(f9,ac),W(f8,6e)
-.value W(f3,30),W(f2,f2),W(f0,b4),W(f1,76),W(f4,38),W(f5,fa),W(f7,bc),W(f6,7e)
-.value W(d9,40),W(d8,82),W(da,c4),W(db,06),W(de,48),W(df,8a),W(dd,cc),W(dc,0e)
-.value W(d7,50),W(d6,92),W(d4,d4),W(d5,16),W(d0,58),W(d1,9a),W(d3,dc),W(d2,1e)
-.value W(c5,60),W(c4,a2),W(c6,e4),W(c7,26),W(c2,68),W(c3,aa),W(c1,ec),W(c0,2e)
-.value W(cb,70),W(ca,b2),W(c8,f4),W(c9,36),W(cc,78),W(cd,ba),W(cf,fc),W(ce,3e)
-.value W(91,80),W(90,42),W(92,04),W(93,c6),W(96,88),W(97,4a),W(95,0c),W(94,ce)
-.value W(9f,90),W(9e,52),W(9c,14),W(9d,d6),W(98,98),W(99,5a),W(9b,1c),W(9a,de)
-.value W(8d,a0),W(8c,62),W(8e,24),W(8f,e6),W(8a,a8),W(8b,6a),W(89,2c),W(88,ee)
-.value W(83,b0),W(82,72),W(80,34),W(81,f6),W(84,b8),W(85,7a),W(87,3c),W(86,fe)
-.value W(a9,c0),W(a8,02),W(aa,44),W(ab,86),W(ae,c8),W(af,0a),W(ad,4c),W(ac,8e)
-.value W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e)
-.value W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae)
-.value W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be)