diff --git a/ChangeLog b/ChangeLog
index 78ffcbb7e0b06bd2dc7a4bae999dbe8969f9e217..46bb9b56aa10ce6e15bedc34c2712bce9e8907be 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2013-09-22  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/gcm-hash8.asm: New file.
+	* x86_64/gcm-gf-mul-8.asm: Deleted.
+
+	* configure.ac (asm_nettle_optional_list): Look for gcm-hash8.asm,
+	not gcm-gf-mul-8.asm.
+	* gcm.c [HAVE_NATIVE_gcm_hash8]: Make use of (optional) assembly
+	implementation.
+
 2013-09-21  Niels Möller  <nisse@lysator.liu.se>
 
 	* Makefile.in (des.po): Add same dependencies as for des.o.
diff --git a/configure.ac b/configure.ac
index 72da6d78de566f0c8d00d3cb0ecb06345543f4e5..037fbe97278767962ae71cf17c4f9e726fac5860 100644
--- a/configure.ac
+++ b/configure.ac
@@ -266,7 +266,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
-asm_nettle_optional_list="gcm-gf-mul-8.asm"
+asm_nettle_optional_list="gcm-hash8.asm"
 asm_hogweed_optional_list=""
 if test "x$enable_public_key" = "xyes" ; then
   asm_hogweed_optional_list="ecc-192-modp.asm ecc-224-modp.asm \
@@ -347,7 +347,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_384_redc
 #undef HAVE_NATIVE_ecc_521_modp
 #undef HAVE_NATIVE_ecc_521_redc
-#undef HAVE_NATIVE_gcm_gf_mul_8])
+#undef HAVE_NATIVE_gcm_hash8])
 
 # Besides getting correct dependencies, the explicit rules also tell
 # make that the .s files "ought to exist", so they are preferred over
diff --git a/gcm.c b/gcm.c
index 7b387bacfac88826a8038fc05c2437fb4b1b244e..c9ea30bc1a696849b33daa8dd1c4782854afe4e0 100644
--- a/gcm.c
+++ b/gcm.c
@@ -214,12 +214,13 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
   memcpy (x->b, Z.b, sizeof(Z));
 }
 # elif GCM_TABLE_BITS == 8
-#  if HAVE_NATIVE_gcm_gf_mul_8
+#  if HAVE_NATIVE_gcm_hash8
 
-#define gcm_gf_mul _nettle_gcm_gf_mul_8
+#define gcm_hash _nettle_gcm_hash8
 void
-gcm_gf_mul (union gcm_block *x, const union gcm_block *y);
-#  else /* !HAVE_NATIVE_gcm_gf_mul_8 */
+_nettle_gcm_hash8 (const struct gcm_key *key, union gcm_block *x,
+		   size_t length, const uint8_t *data);
+#  else /* !HAVE_NATIVE_gcm_hash8 */
 static const uint16_t
 shift_table[0x100] = {
   W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
@@ -310,7 +311,7 @@ gcm_gf_mul (union gcm_block *x, const union gcm_block *table)
   gcm_gf_shift_8(&Z);
   gcm_gf_add(x, &Z, &table[x->b[0]]);
 }
-#  endif /* ! HAVE_NATIVE_gcm_gf_mul_8 */
+#  endif /* ! HAVE_NATIVE_gcm_hash8 */
 # else /* GCM_TABLE_BITS != 8 */
 #  error Unsupported table size. 
 # endif /* GCM_TABLE_BITS != 8 */
@@ -353,6 +354,7 @@ gcm_set_key(struct gcm_key *key,
 #endif
 }
 
+#ifndef gcm_hash
 static void
 gcm_hash(const struct gcm_key *key, union gcm_block *x,
 	 size_t length, const uint8_t *data)
@@ -369,6 +371,7 @@ gcm_hash(const struct gcm_key *key, union gcm_block *x,
       gcm_gf_mul (x, key->h);
     }
 }
+#endif /* !gcm_hash */
 
 static void
 gcm_hash_sizes(const struct gcm_key *key, union gcm_block *x,
diff --git a/x86_64/gcm-gf-mul-8.asm b/x86_64/gcm-hash8.asm
similarity index 66%
rename from x86_64/gcm-gf-mul-8.asm
rename to x86_64/gcm-hash8.asm
index 2296ba8738bac4840ad6b28729b0f5a0fd091e85..3a3f012eafa15fe0206d244f4e310ea77c57815e 100644
--- a/x86_64/gcm-gf-mul-8.asm
+++ b/x86_64/gcm-hash8.asm
@@ -1,17 +1,17 @@
 C nettle, low-level cryptographics library
-C 
+C
 C Copyright (C) 2013, Niels Möller
-C  
+C
 C The nettle library is free software; you can redistribute it and/or modify
 C it under the terms of the GNU Lesser General Public License as published by
 C the Free Software Foundation; either version 2.1 of the License, or (at your
 C option) any later version.
-C 
+C
 C The nettle library is distributed in the hope that it will be useful, but
 C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 C License for more details.
-C 
+C
 C You should have received a copy of the GNU Lesser General Public License
 C along with the nettle library; see the file COPYING.LIB.  If not, write to
 C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
@@ -19,47 +19,51 @@ C MA 02111-1301, USA.
 
 C Register usage:
 
-define(<XP>, <%rdi>)
-define(<TABLE>, <%rsi>)
-define(<XW>, <%rax>)
-define(<CNT>, <%ecx>)
-define(<Z0>, <%rdx>)
-define(<Z1>, <%r8>)
-define(<T0>, <%r9>)
-define(<T1>, <%r10>)
-define(<T2>, <%r11>)
-define(<SHIFT_TABLE>, <%rbx>)
-	
-C The C code is 12.5 c/byte, slower than sha1 (10.6), while this code runs
-C at 10.2, slightly faster. Benchmarked on a low-end AMD E-350.
-
-	.file "gcm-gf-mul-8.asm"
-	
-	C void _gcm_gf_mul_8(union gcm_block *x, const union gcm_block *table)
+define(<KEY>, <%rdi>)
+define(<XP>, <%rsi>)
+define(<LENGTH>, <%rdx>)
+define(<SRC>, <%rcx>)
+define(<X0>, <%rax>)
+define(<X1>, <%rbx>)
+define(<CNT>, <%ebp>)
+define(<T0>, <%r8>)
+define(<T1>, <%r9>)
+define(<T2>, <%r10>)
+define(<Z0>, <%r11>)
+define(<Z1>, <%r12>)
+define(<SHIFT_TABLE>, <%r13>)
+
+	.file "gcm-hash8.asm"
+
+	C void _nettle_gcm_hash8 (const struct gcm_key *key, union gcm_block *x,
+	C                         size_t length, const uint8_t *data)
+
 	.text
 	ALIGN(16)
-PROLOGUE(_nettle_gcm_gf_mul_8)
-	W64_ENTRY(2, 0)
+PROLOGUE(_nettle_gcm_hash8)
+	W64_ENTRY(4, 0)
 	push	%rbx
-	mov	8(XP), XW
-	rol	$8, XW
-	movzbl	LREG(XW), XREG(T0)
-	shl	$4, T0
-	mov	(TABLE, T0), Z0
-	mov	8(TABLE, T0), Z1
+	push	%rbp
+	push	%r12
+	push	%r13
+	sub	$16, LENGTH
 	lea	.Lshift_table(%rip), SHIFT_TABLE
-	movl	$7, CNT
-	call	.Lmul_word
-	mov	(XP), XW
-	movl	$8, CNT
-	call	.Lmul_word
-	mov	Z0, (XP)
-	mov	Z1, 8(XP)
-	W64_EXIT(2, 0)
-	pop	%rbx
-	ret
+	mov	(XP), X0
+	mov	8(XP), X1
+	jc	.Lfinal
+ALIGN(16)
+.Lblock_loop:
+
+	xor (SRC), X0
+	xor 8(SRC), X1
+
+.Lblock_mul:
+	rol	$8, X1
+	movzbl	LREG(X1), XREG(T1)
+	shl	$4, T1
+	mov	(KEY, T1), Z0
+	mov	8(KEY, T1), Z1
 
-.Lmul_word:
 	C shift Z1, Z0, transforming
 	C +-----------------------+-----------------------+
 	C |15 14 13 12 11 10 09 08|07 06 05 04 03 02 01 00|
@@ -70,25 +74,118 @@ PROLOGUE(_nettle_gcm_gf_mul_8)
 	C +-----------------------+-----------------+-----+
 	C                               xor         |T[15]|
 	C                                           +-----+
+
+	mov	$7, CNT
+
+ALIGN(16)
+.Loop_X1:
 	mov	Z1, T1
+	shr	$56, T1
+	shl	$8, Z1
 	mov	Z0, T0
-	shl	$8, Z1		C Use shld?
 	shl	$8, Z0
-	shr	$56, T1
 	shr	$56, T0
 	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
-	rol	$8, XW
+	xor	T1, Z0
+	rol	$8, X1
+	movzbl	LREG(X1), XREG(T2)
+	shl	$4, T2
+	xor	(KEY, T2), Z0
 	add	T0, Z1
+	xor	8(KEY, T2), Z1
+	decl	CNT
+	jne	.Loop_X1
+
+	mov	$7, CNT
+
+ALIGN(16)
+.Loop_X0:
+	mov	Z1, T1
+	shr	$56, T1
+	shl	$8, Z1
+	mov	Z0, T0
+	shl	$8, Z0
+	shr	$56, T0
+	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
 	xor	T1, Z0
-	movzbl	LREG(XW), XREG(T2)
+	rol	$8, X0
+	movzbl	LREG(X0), XREG(T2)
 	shl	$4, T2
-	xor	(TABLE, T2), Z0
-	xor	8(TABLE, T2), Z1
+	xor	(KEY, T2), Z0
+	add	T0, Z1
+	xor	8(KEY, T2), Z1
 	decl	CNT
-	jne	.Lmul_word
+	jne	.Loop_X0
+
+	mov	Z1, T1
+	shr	$56, T1
+	shl	$8, Z1
+	mov	Z0, T0
+	shl	$8, Z0
+	shr	$56, T0
+	movzwl	(SHIFT_TABLE, T1, 2), XREG(T1)
+	xor	T1, Z0
+	rol	$8, X0
+	movzbl	LREG(X0), XREG(T2)
+	shl	$4, T2
+	mov	(KEY, T2), X0
+	xor	Z0, X0
+	add	T0, Z1
+	mov	8(KEY, T2), X1
+	xor	Z1, X1
+
+	add	$16, SRC
+	sub	$16, LENGTH
+	jnc	.Lblock_loop
+
+.Lfinal:	C Full blocks done; LENGTH holds the remaining byte count minus 16
+	add	$16, LENGTH	C Undo the bias, leaving 0 to 15 remaining bytes
+	jnz	.Lpartial
+
+	mov	X0, (XP)	C Store the updated hash state back to x
+	mov	X1, 8(XP)
+
+	pop	%r13		C Restore in reverse order of the pushes at entry
+	pop	%r12
+	pop	%rbp
+	pop	%rbx		C Exit macro below mirrors the 4-argument entry macro
+	W64_EXIT(4, 0)
+	ret
+
+.Lpartial:
+	C Read and xor the trailing partial block (1 to 15 bytes) into the
+	C state, then jump back into the loop with LENGTH == 0.
+
+	cmp	$8, LENGTH
+	jc	.Llt8		C Fewer than 8 bytes left
+
+	C 	8 <= LENGTH < 16
+	xor	(SRC), X0	C First 8 bytes form a whole word for X0
+	add	$8, SRC
+	sub	$8, LENGTH
+	jz	.Lblock_mul	C Exactly 8 bytes, nothing to xor into X1
+	call	.Lread_bytes	C Remaining 1 to 7 bytes, returned in T0
+	xor	T0, X1
+	jmp	.Lblock_mul
+
+.Llt8:	C 0 < LENGTH < 8
+	call	.Lread_bytes	C All remaining bytes, returned in T0
+	xor	T0, X0		C X1 is left as is
+	jmp	.Lblock_mul
+
+C Read 0 < LENGTH < 8 bytes at SRC, result in T0. Clobbers SRC and LENGTH.
+.Lread_bytes:
+	xor	T0, T0
+	sub	$1, SRC		C So that (SRC, LENGTH) addresses the last byte
+ALIGN(16)
+.Lread_loop:
+	shl	$8, T0		C Last byte is read first, first byte ends up lowest
+	orb	(SRC, LENGTH), LREG(T0)
+.Lread_next:	C NOTE(review): no visible reference to this label
+	sub	$1, LENGTH
+	jnz	.Lread_loop	C LENGTH is zero on return
+	ret
-	
-EPILOGUE(_nettle_gcm_gf_mul_8)
+EPILOGUE(_nettle_gcm_hash8)
 
 define(<W>, <0x$2$1>)
 	.section .rodata
@@ -126,5 +223,3 @@ define(<W>, <0x$2$1>)
 .hword W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e)
 .hword W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae)
 .hword W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be)
-	
-