diff --git a/aes-decrypt-internal.c b/aes-decrypt-internal.c
index fcd7289a06c630dd45fc5c26944e0181bed7b84c..47747ab868dc00dd0ce0cd1f3004fd101bfa3a82 100644
--- a/aes-decrypt-internal.c
+++ b/aes-decrypt-internal.c
@@ -60,22 +60,23 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
     {
       uint32_t w0, w1, w2, w3;		/* working ciphertext */
       uint32_t t0, t1, t2, t3;
+      const uint32_t *p;
       unsigned i;
       
       /* Get clear text, using little-endian byte order.
        * Also XOR with the first subkey. */
 
-      w0 = LE_READ_UINT32(src)      ^ keys[4*rounds];
-      w1 = LE_READ_UINT32(src + 4)  ^ keys[4*rounds + 1];
-      w2 = LE_READ_UINT32(src + 8)  ^ keys[4*rounds + 2];
-      w3 = LE_READ_UINT32(src + 12) ^ keys[4*rounds + 3];
+      w0 = LE_READ_UINT32(src)      ^ keys[0];
+      w1 = LE_READ_UINT32(src + 4)  ^ keys[1];
+      w2 = LE_READ_UINT32(src + 8)  ^ keys[2];
+      w3 = LE_READ_UINT32(src + 12) ^ keys[3];
 
-      for (i = rounds - 1; i > 0; i--)
+      for (i = 1, p = keys - 4; i < rounds; i++, p -= 4)
 	{
-	  t0 = AES_ROUND(T, w0, w3, w2, w1, keys[4*i]);
-	  t1 = AES_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]);
-	  t2 = AES_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]);
-	  t3 = AES_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]);
+	  t0 = AES_ROUND(T, w0, w3, w2, w1, p[0]);
+	  t1 = AES_ROUND(T, w1, w0, w3, w2, p[1]);
+	  t2 = AES_ROUND(T, w2, w1, w0, w3, p[2]);
+	  t3 = AES_ROUND(T, w3, w2, w1, w0, p[3]);
 
 	  /* We could unroll the loop twice, to avoid these
 	     assignments. If all eight variables fit in registers,
@@ -88,10 +89,10 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 
       /* Final round */
 
-      t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, keys[0]);
-      t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, keys[1]);
-      t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, keys[2]);
-      t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, keys[3]);
+      t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, p[0]);
+      t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, p[1]);
+      t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, p[2]);
+      t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, p[3]);
 
       LE_WRITE_UINT32(dst, t0);
       LE_WRITE_UINT32(dst + 4, t1);
diff --git a/aes-internal.h b/aes-internal.h
index 64cf7be56ed1d27ecc414154866770b977593e99..442b045b279121612cf3b2a17177ecf852f2a1e5 100644
--- a/aes-internal.h
+++ b/aes-internal.h
@@ -66,6 +66,8 @@ _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
 		    size_t length, uint8_t *dst,
 		    const uint8_t *src);
 
+/* The keys pointer points at the subkeys for the first decrypt round, at
+   the end of the array; later rounds read subkeys at lower addresses. */
 void
 _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 		    const struct aes_table *T,
diff --git a/aes128-decrypt.c b/aes128-decrypt.c
index 436438cc38cd36961ae01572bb629ea82a3a9762..96d6bc6ed57c64c1bd90b80266ce1e0f1f461ff6 100644
--- a/aes128-decrypt.c
+++ b/aes128-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes128_decrypt(const struct aes128_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys + 4*_AES128_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/aes192-decrypt.c b/aes192-decrypt.c
index 7746c76e574c09072c90836623e8c42d2581c82f..bf8702e4c679b0129210ad84efd3a0716dfa3562 100644
--- a/aes192-decrypt.c
+++ b/aes192-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes192_decrypt(const struct aes192_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys + 4 * _AES192_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/aes256-decrypt.c b/aes256-decrypt.c
index 89411c10ee3ecad7688271f4996b5c268b2a4826..ae050797abacdbc1b5afe6346956be12f8c76a36 100644
--- a/aes256-decrypt.c
+++ b/aes256-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes256_decrypt(const struct aes256_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys + 4 * _AES256_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/sparc64/aes-decrypt-internal.asm b/sparc64/aes-decrypt-internal.asm
index 5e7952337e2c1831b91b5ddabe0a3d195650f6eb..8918de1baf84a9f700d76ca8ff8c508651e34a76 100644
--- a/sparc64/aes-decrypt-internal.asm
+++ b/sparc64/aes-decrypt-internal.asm
@@ -88,9 +88,6 @@ PROLOGUE(_nettle_aes_decrypt)
 	add	T, AES_TABLE2, T2
 	add	T, AES_TABLE3, T3
 
-	sll	ROUNDS, 4, W0	C Can use W0 as scratch
-	add	KEYS, W0, KEYS	C Point to last subkey
-
 	C	Must be even, and includes the final round
 	srl	ROUNDS, 1, ROUNDS
 	C	Last two rounds handled specially
diff --git a/x86/aes-decrypt-internal.asm b/x86/aes-decrypt-internal.asm
index 36d6ca44dc4b50b972a054107c50d5f737f10042..d8d980da8b4880a56e1c022ba38770fa93bfdf37 100644
--- a/x86/aes-decrypt-internal.asm
+++ b/x86/aes-decrypt-internal.asm
@@ -90,10 +90,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	jz	.Lend
 
 	shrl	$4, PARAM_LENGTH
-	movl	PARAM_ROUNDS, TMP
 	decl	PARAM_ROUNDS
-	shll	$4, TMP
-	addl	TMP, PARAM_KEYS
 .Lblock_loop:
 	movl	PARAM_KEYS, KEY	C  address of subkeys
 	
diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm
index ed753a2c00fc596c348f009f8878fba43944e87f..afa44f3568d72bba9038d536d3d3f2c4075dfcce 100644
--- a/x86_64/aes-decrypt-internal.asm
+++ b/x86_64/aes-decrypt-internal.asm
@@ -83,9 +83,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	push	%r15	
 
 	subl	$1, XREG(ROUNDS)
-	push	ROUNDS			C Rounds stored at (%rsp)
-	shl	$4, XREG(ROUNDS)	C Zero-extends
-	lea	16(KEYS, ROUNDS), KEYS
+	push	ROUNDS		C Rounds stored at (%rsp)
 
 	mov	PARAM_TABLE, TABLE
 	mov	PARAM_LENGTH, LENGTH
@@ -94,8 +92,8 @@ PROLOGUE(_nettle_aes_decrypt)
 	mov	KEYS, KEY
 	
 	AES_LOAD(SA, SB, SC, SD, SRC, KEY)
+	add	$16, SRC	C Increment src pointer
 
-	add	$16, SRC	C  increment src pointer
 	movl	(%rsp), XREG(ROUNDS)
 
 	sub	$16, KEY	C  point to next key