diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm
index d3a48f26ea298ed08197c1d7eeea242c13dad780..1e9d700baeee8c42f67f0dd02afe9d5db324a7d6 100644
--- a/x86_64/aes-decrypt-internal.asm
+++ b/x86_64/aes-decrypt-internal.asm
@@ -24,28 +24,25 @@ C Register usage:
 C AES state, use two of them
 define(<SA>,<%eax>)
 define(<SB>,<%ebx>)
-define(<SC>,<%ebp>)
-define(<SD>,<%r9d>)
+define(<SC>,<%ecx>)
+define(<SD>,<%edx>)
 
 define(<TA>,<%r10d>)
 define(<TB>,<%r11d>)
 define(<TC>,<%r12d>)
-define(<TD>,<%r13d>)
 
 define(<CTX>,	<%rdi>)
 define(<TABLE>,	<%rsi>)
-define(<LENGTH>,<%edx>)		C Length is only 32 bits
-define(<DST>,	<%rcx>)
+define(<PARAM_LENGTH>,<%edx>)		C Length is only 32 bits
+define(<PARAM_DST>,	<%rcx>)
 define(<SRC>,	<%r8>)
 
+define(<DST>, <%r9>) 
 define(<KEY>,<%r14>)
 define(<COUNT>,	<%r15d>)
+define(<BLOCK_COUNT>, <%r13d>)
 
-C Put the outer loop counter on the stack, and reuse the LENGTH
-C register as a temporary. 
-	
-define(<FRAME_COUNT>,	<(%rsp)>)
-define(<TMP>,<%rdx>)
+define(<TMP>,<%rbp>)
 
 	.file "aes-decrypt-internal.asm"
 	
@@ -56,7 +53,7 @@ define(<TMP>,<%rdx>)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_decrypt)
-	test	LENGTH, LENGTH
+	test	PARAM_LENGTH, PARAM_LENGTH
 	jz	.Lend
 
         C save all registers that need to be saved
@@ -67,11 +64,9 @@ PROLOGUE(_nettle_aes_decrypt)
 	push	%r14
 	push	%r15	
 
-	C Allocates 4 bytes more than we need, for nicer alignment.
-	sub	$8, %rsp
-
-	shrl	$4, LENGTH
-	movl	LENGTH, FRAME_COUNT
+	mov	PARAM_DST, DST
+	movl	PARAM_LENGTH, BLOCK_COUNT
+	shrl	$4, BLOCK_COUNT
 .Lblock_loop:
 	mov	CTX,KEY
 	
@@ -95,19 +90,19 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_ROUND(TABLE, SC,SB,SA,SD, TC, TMP)
 	xorl	8(KEY),TC
 
-	AES_ROUND(TABLE, SD,SC,SB,SA, TD, TMP)
-	xorl	12(KEY),TD
+	AES_ROUND(TABLE, SD,SC,SB,SA, SD, TMP)
+	xorl	12(KEY),SD
 
-	AES_ROUND(TABLE, TA,TD,TC,TB, SA, TMP)
+	AES_ROUND(TABLE, TA,SD,TC,TB, SA, TMP)
 	xorl	16(KEY), SA
 
-	AES_ROUND(TABLE, TB,TA,TD,TC, SB, TMP)
+	AES_ROUND(TABLE, TB,TA,SD,TC, SB, TMP)
 	xorl	20(KEY),SB
 
-	AES_ROUND(TABLE, TC,TB,TA,TD, SC, TMP)
+	AES_ROUND(TABLE, TC,TB,TA,SD, SC, TMP)
 	xorl	24(KEY),SC
 
-	AES_ROUND(TABLE, TD,TC,TB,TA, SD, TMP)
+	AES_ROUND(TABLE, SD,TC,TB,TA, SD, TMP)
 	xorl	28(KEY),SD
 	
 	add	$32,KEY	C  point to next key
@@ -125,13 +120,13 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_ROUND(TABLE, SC,SB,SA,SD, TC, TMP)
 	xorl	8(KEY),TC
 
-	AES_ROUND(TABLE, SD,SC,SB,SA, TD, TMP)
-	xorl	12(KEY),TD
+	AES_ROUND(TABLE, SD,SC,SB,SA, SD, TMP)
+	xorl	12(KEY),SD
 
-	AES_FINAL_ROUND(TA,TD,TC,TB, TABLE, SA, TMP)
-	AES_FINAL_ROUND(TB,TA,TD,TC, TABLE, SB, TMP)
-	AES_FINAL_ROUND(TC,TB,TA,TD, TABLE, SC, TMP)
-	AES_FINAL_ROUND(TD,TC,TB,TA, TABLE, SD, TMP)
+	AES_FINAL_ROUND(TA,SD,TC,TB, TABLE, SA, TMP)
+	AES_FINAL_ROUND(TB,TA,SD,TC, TABLE, SB, TMP)
+	AES_FINAL_ROUND(TC,TB,TA,SD, TABLE, SC, TMP)
+	AES_FINAL_ROUND(SD,TC,TB,TA, TABLE, SD, TMP)
 
 	C Inverse S-box substitution
 	mov	$3, COUNT
@@ -145,11 +140,10 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_STORE(SA,SB,SC,SD, KEY, DST)
 	
 	add	$16, DST
-	decl	FRAME_COUNT
+	decl	BLOCK_COUNT
 
 	jnz	.Lblock_loop
 
-	add	$8, %rsp
 	pop	%r15	
 	pop	%r14
 	pop	%r13
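
For reference, the outer loop that both of these files implement converts the
byte length to a count of 16-byte blocks (the shrl $4) and steps DST by 16 per
block; the test/jz at the top handles length == 0 before the loop is entered.
A rough C sketch of that structure, with purely illustrative names (this is
not the nettle API):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the per-block round code in the .asm. */
    static void
    aes_decrypt_block(uint8_t *dst, const uint8_t *src)
    {
      (void) dst; (void) src;   /* rounds omitted */
    }

    /* length is in bytes; length >> 4 is the number of 16-byte blocks,
       matching "shrl $4, BLOCK_COUNT" and "add $16, DST" above. */
    static void
    decrypt_blocks(size_t length, uint8_t *dst, const uint8_t *src)
    {
      size_t blocks = length >> 4;
      while (blocks--)
        {
          aes_decrypt_block(dst, src);
          dst += 16;
          src += 16;
        }
    }
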
diff --git a/x86_64/aes-encrypt-internal.asm b/x86_64/aes-encrypt-internal.asm
index e89dc2bac2d1c2bddfc3ba63eb9dd22635541a25..839d3341ffa8289ba5dba34d7116498390389262 100644
--- a/x86_64/aes-encrypt-internal.asm
+++ b/x86_64/aes-encrypt-internal.asm
@@ -1,4 +1,3 @@
-C -*- mode: asm; asm-comment-char: ?C; -*-  
 C nettle, low-level cryptographics library
 C 
 C Copyright (C) 2001, 2002, 2005, 2008 Rafael R. Sevilla, Niels Möller
@@ -25,28 +24,25 @@ C Register usage:
 C AES state, use two of them
 define(<SA>,<%eax>)
 define(<SB>,<%ebx>)
-define(<SC>,<%ebp>)
-define(<SD>,<%r9d>)
+define(<SC>,<%ecx>)
+define(<SD>,<%edx>)
 
 define(<TA>,<%r10d>)
 define(<TB>,<%r11d>)
 define(<TC>,<%r12d>)
-define(<TD>,<%r13d>)
 
 define(<CTX>,	<%rdi>)
 define(<TABLE>,	<%rsi>)
-define(<LENGTH>,<%edx>)		C Length is only 32 bits
-define(<DST>,	<%rcx>)
+define(<PARAM_LENGTH>,<%edx>)		C Length is only 32 bits
+define(<PARAM_DST>,	<%rcx>)
 define(<SRC>,	<%r8>)
 
+define(<DST>, <%r9>) 
 define(<KEY>,<%r14>)
 define(<COUNT>,	<%r15d>)
+define(<BLOCK_COUNT>, <%r13d>)
 
-C Put the outer loop counter on the stack, and reuse the LENGTH
-C register as a temporary. 
-	
-define(<FRAME_COUNT>,	<(%rsp)>)
-define(<TMP>,<%rdx>)
+define(<TMP>,<%rbp>)
 
 	.file "aes-encrypt-internal.asm"
 	
@@ -57,7 +53,7 @@ define(<TMP>,<%rdx>)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_encrypt)
-	test	LENGTH, LENGTH
+	test	PARAM_LENGTH, PARAM_LENGTH
 	jz	.Lend
 
         C save all registers that need to be saved
@@ -68,11 +64,9 @@ PROLOGUE(_nettle_aes_encrypt)
 	push	%r14
 	push	%r15	
 
-	C Allocates 4 bytes more than we need, for nicer alignment.
-	sub	$8, %rsp
-
-	shrl	$4, LENGTH
-	movl	LENGTH, FRAME_COUNT
+	mov	PARAM_DST, DST
+	movl	PARAM_LENGTH, BLOCK_COUNT
+	shrl	$4, BLOCK_COUNT
 .Lblock_loop:
 	mov	CTX,KEY
 	
@@ -96,19 +90,19 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ROUND(TABLE, SC,SD,SA,SB, TC, TMP)
 	xorl	8(KEY),TC
 
-	AES_ROUND(TABLE, SD,SA,SB,SC, TD, TMP)
-	xorl	12(KEY),TD
+	AES_ROUND(TABLE, SD,SA,SB,SC, SD, TMP)
+	xorl	12(KEY),SD
 
-	AES_ROUND(TABLE, TA,TB,TC,TD, SA, TMP)
+	AES_ROUND(TABLE, TA,TB,TC,SD, SA, TMP)
 	xorl	16(KEY), SA
 
-	AES_ROUND(TABLE, TB,TC,TD,TA, SB, TMP)
+	AES_ROUND(TABLE, TB,TC,SD,TA, SB, TMP)
 	xorl	20(KEY),SB
 
-	AES_ROUND(TABLE, TC,TD,TA,TB, SC, TMP)
+	AES_ROUND(TABLE, TC,SD,TA,TB, SC, TMP)
 	xorl	24(KEY),SC
 
-	AES_ROUND(TABLE, TD,TA,TB,TC, SD, TMP)
+	AES_ROUND(TABLE, SD,TA,TB,TC, SD, TMP)
 	xorl	28(KEY),SD
 	
 	add	$32,KEY	C  point to next key
@@ -126,13 +120,13 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ROUND(TABLE, SC,SD,SA,SB, TC, TMP)
 	xorl	8(KEY),TC
 
-	AES_ROUND(TABLE, SD,SA,SB,SC, TD, TMP)
-	xorl	12(KEY),TD
+	AES_ROUND(TABLE, SD,SA,SB,SC, SD, TMP)
+	xorl	12(KEY),SD
 
-	AES_FINAL_ROUND(TA,TB,TC,TD, TABLE, SA, TMP)
-	AES_FINAL_ROUND(TB,TC,TD,TA, TABLE, SB, TMP)
-	AES_FINAL_ROUND(TC,TD,TA,TB, TABLE, SC, TMP)
-	AES_FINAL_ROUND(TD,TA,TB,TC, TABLE, SD, TMP)
+	AES_FINAL_ROUND(TA,TB,TC,SD, TABLE, SA, TMP)
+	AES_FINAL_ROUND(TB,TC,SD,TA, TABLE, SB, TMP)
+	AES_FINAL_ROUND(TC,SD,TA,TB, TABLE, SC, TMP)
+	AES_FINAL_ROUND(SD,TA,TB,TC, TABLE, SD, TMP)
 
 	C S-box substitution
 	mov	$3, COUNT
@@ -146,11 +140,10 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_STORE(SA,SB,SC,SD, KEY, DST)
 	
 	add	$16, DST
-	decl	FRAME_COUNT
+	decl	BLOCK_COUNT
 
 	jnz	.Lblock_loop
 
-	add	$8, %rsp
 	pop	%r15	
 	pop	%r14
 	pop	%r13
diff --git a/x86_64/aes.m4 b/x86_64/aes.m4
index 26f4b29ac6b74092dfa6e325fd63f55f4b830c7a..9f251c50d2e8ba8ed609e5d752d7a126b8815900 100644
--- a/x86_64/aes.m4
+++ b/x86_64/aes.m4
@@ -17,6 +17,28 @@ define(<LREG>,<ifelse(
 	$1, %r14d, %r14b,
 	$1, %r15d, %r15b)>)dnl
 
+define(<HREG>,<ifelse(
+	$1, %eax, %ah,
+	$1, %ebx, %bh,
+	$1, %ecx, %ch,
+	$1, %edx, %dh,
+	error)>)
+
+dnl MOVE_HREG(src, dst)
+define(<MOVE_HREG>, <ifelse(
+	$1, %eax, <movzb	%ah, $2
+	>,
+	$1, %ebx, <movzb	%bh, $2
+	>,
+	$1, %ecx, <movzb	%ch, $2
+	>,
+	$1, %edx, <movzb	%dh, $2
+	>,
+	<movl	$1, $2
+	shr	<$>8, $2
+	and	<$>0xff, $2
+	>)>)
+
 define(<XREG>,<ifelse(
 	$1, %rax, %eax,
 	$1, %rbx, %ebx,
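
For reference, MOVE_HREG(src, dst) extracts the second-lowest byte of src:
when src is one of %eax..%edx the macro can read it directly from the
corresponding high-byte register (%ah..%dh) with a single movzb, otherwise it
falls back to the mov/shr/and sequence. A minimal C sketch of the value it
computes (the function name is illustrative only):

    #include <stdint.h>

    /* Byte 1 of src (bits 8-15), zero-extended, as MOVE_HREG leaves in dst. */
    static uint32_t
    move_hreg(uint32_t src)
    {
      return (src >> 8) & 0xff;
    }

The special cases cover only the four legacy registers because %r8d-%r15d have
no addressable high-byte sub-register on x86_64.
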
@@ -26,8 +48,8 @@ define(<XREG>,<ifelse(
 	$1, %rdi, %edi,
 	$1, %rbp, %ebp,
 	$1, %rsp, %esp,
-	$1, %r8d, %r8d,
-	$1, %r9d, %r9d,
+	$1, %r8, %r8d,
+	$1, %r9, %r9d,
 	$1, %r10,%r10d,
 	$1, %r11,%r11d,
 	$1, %r12,%r12d,
@@ -72,9 +94,7 @@ dnl Computes one word of the AES round. Leaves result in $6.
 define(<AES_ROUND>, <
 	movzb	LREG($2), $7
 	movl	AES_TABLE0 ($1, $7, 4),$6
-	movl	$3, XREG($7)
-	shr	<$>8,$7
-	and	<$>0xff,$7
+	MOVE_HREG($3, XREG($7))
 	xorl	AES_TABLE1 ($1, $7, 4),$6
 	movl	$4,XREG($7)
 	shr	<$>16,$7
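
The AES_ROUND macro edited here computes one word of a table-driven AES round;
only its first half is visible in this hunk, but assuming the remaining lines
continue the pattern with AES_TABLE2 and AES_TABLE3, the word it accumulates
(before the callers xor in the round key) is roughly this C sketch (names are
illustrative, not nettle API):

    #include <stdint.h>

    static uint32_t
    aes_round_word(const uint32_t t0[256], const uint32_t t1[256],
                   const uint32_t t2[256], const uint32_t t3[256],
                   uint32_t a, uint32_t b, uint32_t c, uint32_t d)
    {
      return t0[a & 0xff]            /* movzb LREG(a); AES_TABLE0 lookup */
           ^ t1[(b >> 8) & 0xff]     /* MOVE_HREG(b); AES_TABLE1 lookup */
           ^ t2[(c >> 16) & 0xff]    /* shr $16; AES_TABLE2 lookup */
           ^ t3[d >> 24];            /* AES_TABLE3 lookup (not shown) */
    }

Note that in the rewired calls above the destination may alias the first
source (e.g. AES_ROUND(TABLE, SD,SC,SB,SA, SD, TMP)); that works because the
macro consumes its first argument before the first write to the destination,
and the later arguments are distinct from the destination at every call site.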