diff --git a/block-internal.h b/block-internal.h
index 8972d0ac2b5be04b8170364cbc515fe9325e0ddc..88e19be333c8f1372adc10b649595f5380af1d35 100644
--- a/block-internal.h
+++ b/block-internal.h
@@ -95,11 +95,15 @@ block8_xor_bytes (union nettle_block8 *r,
 #define LSHIFT_ALIEN_UINT64(x) \
 	((((x) & UINT64_C(0x7f7f7f7f7f7f7f7f)) << 1) | \
 	 (((x) & UINT64_C(0x8080808080808080)) >> 15))
+#define RSHIFT_ALIEN_UINT64(x) /* byte-wise >> 1, carry into the next byte */ \
+	((((x) & UINT64_C(0xfefefefefefefefe)) >> 1) | \
+	 (((x) & UINT64_C(0x0001010101010101)) << 15))
 
-/* Two typical defining polynoms */
+/* Typical defining polynomials */
 
 #define BLOCK16_POLY (UINT64_C(0x87))
 #define BLOCK8_POLY (UINT64_C(0x1b))
+#define GHASH_POLY (UINT64_C(0xE1))
 
 /* Galois multiplications by 2:
  * functions differ in shifting right or left, big- or little- endianness
@@ -133,6 +137,18 @@ block8_mulx_be (union nettle_block8 *dst,
 
   dst->u64 = (src->u64 << 1) ^ (BLOCK8_POLY & -carry);
 }
+
+static inline void
+block16_mulx_ghash (union nettle_block16 *r,
+		    const union nettle_block16 *x)
+{
+  uint64_t mask;
+  /* Multiplication by 010...0; if a one is shifted out, GHASH_POLY is added. */
+  /* Shift uses big-endian representation. r == x is allowed. */
+  mask = - (x->u64[1] & 1);
+  r->u64[1] = (x->u64[1] >> 1) | ((x->u64[0] & 1) << 63);
+  r->u64[0] = (x->u64[0] >> 1) ^ (mask & (GHASH_POLY << 56));
+}
 #else /* !WORDS_BIGENDIAN */
 static inline void
 block16_mulx_be (union nettle_block16 *dst,
@@ -160,6 +176,18 @@ block8_mulx_be (union nettle_block8 *dst,
 
   dst->u64 = LSHIFT_ALIEN_UINT64(src->u64) ^ ((BLOCK8_POLY << 56) & -carry);
 }
-#endif /* !WORDS_BIGENDIAN */
+
+static inline void
+block16_mulx_ghash (union nettle_block16 *r,
+		    const union nettle_block16 *x)
+{
+  uint64_t mask;
+  /* Multiplication by 010...0; if a one is shifted out, GHASH_POLY is added. */
+  /* Shift uses big-endian representation. r == x is allowed. */
+  mask = - ((x->u64[1] >> 56) & 1);
+  r->u64[1] = RSHIFT_ALIEN_UINT64(x->u64[1]) | ((x->u64[0] >> 49) & 0x80);
+  r->u64[0] = RSHIFT_ALIEN_UINT64(x->u64[0]) ^ (mask & GHASH_POLY);
+}
+#endif /* !WORDS_BIGENDIAN */
 
 #endif /* NETTLE_BLOCK_INTERNAL_H_INCLUDED */
diff --git a/gcm.c b/gcm.c
index 4a04a0a10842ece12f77edb8b90e3c3bd2d50d57..cf615daf18bd60efc6a49459213c392bd09d0066 100644
--- a/gcm.c
+++ b/gcm.c
@@ -55,32 +55,6 @@
 #include "ctr-internal.h"
 #include "block-internal.h"
 
-#define GHASH_POLYNOMIAL 0xE1UL
-
-/* Multiplication by 010...0; a big-endian shift right. If the bit
-   shifted out is one, the defining polynomial is added to cancel it
-   out. r == x is allowed. */
-static void
-gcm_gf_shift (union nettle_block16 *r, const union nettle_block16 *x)
-{
-  uint64_t mask;
-
-  /* Shift uses big-endian representation. */
-#if WORDS_BIGENDIAN
-  mask = - (x->u64[1] & 1);
-  r->u64[1] = (x->u64[1] >> 1) | ((x->u64[0] & 1) << 63);
-  r->u64[0] = (x->u64[0] >> 1) ^ (mask & ((uint64_t) GHASH_POLYNOMIAL << 56));
-#else /* ! WORDS_BIGENDIAN */
-#define RSHIFT_WORD(x) \
-  ((((x) & 0xfefefefefefefefeUL) >> 1) \
-   | (((x) & 0x0001010101010101UL) << 15))
-  mask = - ((x->u64[1] >> 56) & 1);
-  r->u64[1] = RSHIFT_WORD(x->u64[1]) | ((x->u64[0] >> 49) & 0x80);
-  r->u64[0] = RSHIFT_WORD(x->u64[0]) ^ (mask & GHASH_POLYNOMIAL);
-# undef RSHIFT_WORD
-#endif /* ! WORDS_BIGENDIAN */
-}
-
 #if GCM_TABLE_BITS == 0
 /* Sets x <- x * y mod r, using the plain bitwise algorithm from the
    specification. y may be shorter than a full block, missing bytes
@@ -104,7 +78,7 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *y)
 	  if (b & 0x80)
 	    block16_xor(&Z, &V);
 	  
-	  gcm_gf_shift(&V, &V);
+	  block16_mulx_ghash(&V, &V);
 	}
     }
   memcpy (x->b, Z.b, sizeof(Z));
@@ -275,7 +249,7 @@ gcm_set_key(struct gcm_key *key,
   /* Algorithm 3 from the gcm paper. First do powers of two, then do
      the rest by adding. */
   while (i /= 2)
-    gcm_gf_shift(&key->h[i], &key->h[2*i]);
+    block16_mulx_ghash(&key->h[i], &key->h[2*i]);
   for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
     {
       unsigned j;