diff --git a/ChangeLog b/ChangeLog
index 6ee1bb0aa943444f9e83fee64d6d2bb414b3250f..5f718808a3785a5bfbc61e53b7da31e95cbfa319 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2014-06-01  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/gcm-hash8.asm: Pass correct argument count to W64_EXIT.
+	* x86_64/camellia-crypt-internal.asm: Pass correct argument count
+	to W64_ENTRY and W64_EXIT.
+
+	* x86_64/machine.m4 [W64_ABI]: Fix for the case of 6 function
+	arguments. Also push %rdi unconditionally, and use aligned
+	accesses (movdqa) for saving and restoring %xmm registers.
+
 2014-05-31  Niels Möller  <nisse@lysator.liu.se>
 
 	* configure.ac: Check for COFF type directives.
diff --git a/x86_64/README b/x86_64/README
index ae693be55b3591be1542d2ba87b1c7fb332655e7..d04e5dfc8d9481c9685dda77fb64db6f48bae934 100644
--- a/x86_64/README
+++ b/x86_64/README
@@ -4,6 +4,8 @@ Up to 6 integer and pointer arguments are passed in registers. Nine
 registers, %rax, %rcx, %rdx, %rsi, %rdi and %r8-%r11 can be used
 freely. Integers and pointers are returned in %rax.
 
+At entry, it is required that %rsp == 8 (mod 16).
+
 Registers	May be		Argument
 		clobbered	number
 
@@ -51,14 +53,19 @@ Additional arguments are passed on the stack. "backing store" on the
 stack for the four register arguments is also required. %xmm6 to
 %xmm15 are callee-saved. The "long" type is just 32 bits.
 
-If we have five arguments, and push the additional callee-save
+If we have six arguments, and push the additional callee-save
 registers %rdi and %rsi on the stack, we get a stack frame like
 
+64(%rsp): Sixth argument
 56(%rsp): Fifth argument
 48(%rsp): Space for fourth argument
 40(%rsp): Space for third argument
 32(%rsp): Space for second argument
 24(%rsp): Space for first argument
 16(%rsp): Return address
-8(%rsp): Saved %rsi
-(%rsp) : Saved %rdi
+8(%rsp): Saved %rdi
+(%rsp) : Saved %rsi
+
+If, in addition, we use more than 6 %xmm registers, the callee-saved
+ones (%xmm6 and up) are saved *after* pushing %rdi (but before pushing
+%rsi), so that they are stored at 16-byte aligned addresses.
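
(For illustration: with the layout above, W64_ENTRY(6, 0) is meant to
emit roughly the following prologue. This is a sketch of the intended
expansion, not the literal m4 output; see x86_64/machine.m4.)

	push	%rdi			C %rsp was 8 (mod 16), now 16-byte aligned
	mov	%rcx, %rdi		C first argument
	push	%rsi			C %rsp is 8 (mod 16) again
	mov	%rdx, %rsi		C second argument
	mov	%r8, %rdx		C third argument
	mov	%r9, %rcx		C fourth argument
	mov	56(%rsp), %r8		C fifth argument (see frame above)
	mov	64(%rsp), %r9		C sixth argument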
diff --git a/x86_64/camellia-crypt-internal.asm b/x86_64/camellia-crypt-internal.asm
index c6032a5d11d024377c7e7c7769de8403111a64e8..040e030f168f4094e5fb26579ca09c1d7bab03c1 100644
--- a/x86_64/camellia-crypt-internal.asm
+++ b/x86_64/camellia-crypt-internal.asm
@@ -138,7 +138,7 @@ C	xorl	XREG(TMP), XREG($1)
 	ALIGN(16)
 PROLOGUE(_nettle_camellia_crypt)
 
-	W64_ENTRY(5, 0)
+	W64_ENTRY(6, 0)
 	test	LENGTH, LENGTH
 	jz	.Lend
 
@@ -197,6 +197,6 @@ PROLOGUE(_nettle_camellia_crypt)
 	pop	%rbp
 	pop	%rbx
 .Lend:
-	W64_EXIT(5, 0)
+	W64_EXIT(6, 0)
 	ret
 EPILOGUE(_nettle_camellia_crypt)
diff --git a/x86_64/gcm-hash8.asm b/x86_64/gcm-hash8.asm
index f74f2f4b061648a3fd3eaa6c899016770f474999..6dec3b8c54a700b6f7779a36d9e11d2163dec3a2 100644
--- a/x86_64/gcm-hash8.asm
+++ b/x86_64/gcm-hash8.asm
@@ -162,7 +162,7 @@ ALIGN(16)
 	pop	%r12
 	pop	%rbp
 	pop	%rbx
-	W64_EXIT(2, 0)
+	W64_EXIT(4, 0)
 	ret
 
 .Lpartial:
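
(Note: with xmm_used = 0, the updated W64_EXIT expands identically for
any argument count from 2 through 6, namely

	pop	%rsi
	pop	%rdi

so this change does not alter the generated code; it only makes the
count match the four arguments the function actually takes.)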
diff --git a/x86_64/machine.m4 b/x86_64/machine.m4
index b9556a27b26005629209097503e8dcfbf7f621de..397e9b25840b4d47bbd122c466bd9caa3ddc5da8 100644
--- a/x86_64/machine.m4
+++ b/x86_64/machine.m4
@@ -67,44 +67,48 @@ define(<XREG>,<ifelse(
 dnl W64_ENTRY(nargs, xmm_used)
 define(<W64_ENTRY>, <
   changequote([,])dnl
-  ifelse(<<<<<<<<<<<<<<<< ignored; only for balancing)
+  ifelse(<<<<<<<<<<<<<<<<<< ignored; only for balancing)
   ifelse(W64_ABI,yes,[
+    dnl Unconditionally push %rdi, making %rsp 16-byte aligned
+    push	%rdi
+    dnl Save %xmm6, ..., if needed
     ifelse(eval($2 > 6), 1, [
-      sub	[$]eval(8 + 16*($2 - 6)), %rsp
-      movdqu	%xmm6, 0(%rsp)
+      sub	[$]eval(16*($2 - 6)), %rsp
+      movdqa	%xmm6, 0(%rsp)
     ])
     ifelse(eval($2 > 7), 1, [
-      movdqu	%xmm7, 16(%rsp)
+      movdqa	%xmm7, 16(%rsp)
     ])
     ifelse(eval($2 > 8), 1, [
-      movdqu	%xmm8, 32(%rsp)
+      movdqa	%xmm8, 32(%rsp)
     ])
     ifelse(eval($2 > 9), 1, [
-      movdqu	%xmm9, 48(%rsp)
+      movdqa	%xmm9, 48(%rsp)
     ])
     ifelse(eval($2 > 10), 1, [
-      movdqu	%xmm10, 64(%rsp)
+      movdqa	%xmm10, 64(%rsp)
     ])
     ifelse(eval($2 > 11), 1, [
-      movdqu	%xmm11, 80(%rsp)
+      movdqa	%xmm11, 80(%rsp)
     ])
     ifelse(eval($2 > 12), 1, [
-      movdqu	%xmm12, 96(%rsp)
+      movdqa	%xmm12, 96(%rsp)
     ])
     ifelse(eval($2 > 13), 1, [
-      movdqu	%xmm13, 112(%rsp)
+      movdqa	%xmm13, 112(%rsp)
     ])
     ifelse(eval($2 > 14), 1, [
-      movdqu	%xmm14, 128(%rsp)
+      movdqa	%xmm14, 128(%rsp)
     ])
     ifelse(eval($2 > 15), 1, [
-      movdqu	%xmm15, 144(%rsp)
+      movdqa	%xmm15, 144(%rsp)
     ])
+    dnl Move around arguments
     ifelse(eval($1 >= 1), 1, [
-      push	%rdi
       mov	%rcx, %rdi
     ])
     ifelse(eval($1 >= 2), 1, [
+      dnl NOTE: Breaks 16-byte %rsp alignment
       push	%rsi
       mov	%rdx, %rsi
     ])
@@ -115,11 +119,10 @@ define(<W64_ENTRY>, <
       mov	%r9, %rcx
     ])
     ifelse(eval($1 >= 5), 1, [
-      ifelse(eval($2 > 6), 1, [
-        mov	eval(8 + 16*($2 - 6) + 56)(%rsp), %r8
-      ], [
-        mov	56(%rsp), %r8
-      ])
+      mov	ifelse(eval($2 > 6), 1, eval(16*($2-6)+56),56)(%rsp), %r8
+    ])
+    ifelse(eval($1 >= 6), 1, [
+      mov	ifelse(eval($2 > 6), 1, eval(16*($2-6)+64),64)(%rsp), %r9
     ])
   ])
   changequote(<,>)dnl
@@ -128,45 +131,43 @@ define(<W64_ENTRY>, <
 dnl W64_EXIT(nargs, xmm_used)
 define(<W64_EXIT>, <
   changequote([,])dnl
-  ifelse(<<<<<<<<<<<< ignored; only for balancing)
+  ifelse(<<<<<<<<<<< ignored; only for balancing)
   ifelse(W64_ABI,yes,[
     ifelse(eval($1 >= 2), 1, [
       pop	%rsi
-    ])
-    ifelse(eval($1 >= 1), 1, [
-      pop	%rdi
-    ])
+    ])
     ifelse(eval($2 > 15), 1, [
-      movdqu	144(%rsp), %xmm15
+      movdqa	144(%rsp), %xmm15
     ])
     ifelse(eval($2 > 14), 1, [
-      movdqu	128(%rsp), %xmm14
+      movdqa	128(%rsp), %xmm14
     ])
     ifelse(eval($2 > 13), 1, [
-      movdqu	112(%rsp), %xmm13
+      movdqa	112(%rsp), %xmm13
     ])
     ifelse(eval($2 > 12), 1, [
-      movdqu	96(%rsp), %xmm12
+      movdqa	96(%rsp), %xmm12
     ])
     ifelse(eval($2 > 11), 1, [
-      movdqu	80(%rsp), %xmm11
+      movdqa	80(%rsp), %xmm11
     ])
     ifelse(eval($2 > 10), 1, [
-      movdqu	64(%rsp), %xmm10
+      movdqa	64(%rsp), %xmm10
     ])
     ifelse(eval($2 > 9), 1, [
-      movdqu	48(%rsp), %xmm9
+      movdqa	48(%rsp), %xmm9
     ])
     ifelse(eval($2 > 8), 1, [
-      movdqu	32(%rsp), %xmm8
+      movdqa	32(%rsp), %xmm8
     ])
     ifelse(eval($2 > 7), 1, [
-      movdqu	16(%rsp), %xmm7
+      movdqa	16(%rsp), %xmm7
     ])
     ifelse(eval($2 > 6), 1, [
-      movdqu	0(%rsp), %xmm6
-      add	[$]eval(8 + 16*($2 - 6)), %rsp
+      movdqa	(%rsp), %xmm6
+      add	[$]eval(16*($2 - 6)), %rsp
     ])
+    pop	%rdi
   ])
   changequote(<,>)dnl
 >)
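
For reference, a sketch of what the updated macros are intended to emit
for a function with two arguments that uses %xmm0-%xmm6, i.e.
W64_ENTRY(2, 7) and W64_EXIT(2, 7); illustration only, not the literal
m4 output:

	C W64_ENTRY(2, 7)
	push	%rdi			C %rsp was 8 (mod 16), now 16-byte aligned
	sub	$16, %rsp		C space for %xmm6, alignment preserved
	movdqa	%xmm6, 0(%rsp)		C aligned store, hence movdqa
	mov	%rcx, %rdi		C first argument
	push	%rsi			C save callee-saved %rsi
	mov	%rdx, %rsi		C second argument

	C W64_EXIT(2, 7)
	pop	%rsi
	movdqa	(%rsp), %xmm6		C aligned restore
	add	$16, %rsp
	pop	%rdi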