diff --git a/ChangeLog b/ChangeLog
index d9166612c0ffb01d92b032eba8017312dd7fe5c5..a1243072651406ec0cab5388c920ca90630bed6b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -4,6 +4,31 @@
 	for indexing, and reducing number of used callee-save registers.
 	* powerpc64/p8/aes-decrypt-internal.asm: Likewise.
 
+2024-01-27  Niels Möller  <nisse@lysator.liu.se>
+
+	* aes-invert-internal.c (_nettle_aes_invert): Don't reorder the subkeys.
+	* aes-decrypt-internal.c (_nettle_aes_decrypt): Updated to process
+	subkeys starting from the end, and let subkeys pointer point at
+	the subkey for the first decrypt round, located at the end of the
+	array.
+	* aes128-decrypt.c (nettle_aes128_decrypt): Updated accordingly.
+	* aes192-decrypt.c (nettle_aes192_decrypt): Likewise.
+	* aes256-decrypt.c (nettle_aes256_decrypt): Likewise.
+	* arm/aes.m4 (AES_LOAD_INCR): New macro, specifying desired
+	increment of key pointer.
+	* arm/aes-decrypt-internal.asm: Updated for new conventions.
+	* arm/v6/aes-decrypt-internal.asm: Likewise.
+	* arm64/crypto/aes128-decrypt.asm: Likewise.
+	* arm64/crypto/aes192-decrypt.asm: Likewise.
+	* arm64/crypto/aes256-decrypt.asm: Likewise.
+	* powerpc64/p8/aes-decrypt-internal.asm: Likewise.
+	* sparc64/aes-decrypt-internal.asm: Likewise.
+	* x86/aes-decrypt-internal.asm: Likewise.
+	* x86_64/aes-decrypt-internal.asm: Likewise.
+	* x86_64/aesni/aes128-decrypt.asm: Likewise.
+	* x86_64/aesni/aes192-decrypt.asm: Likewise.
+	* x86_64/aesni/aes256-decrypt.asm: Likewise.
+
 2024-01-26  Niels Möller  <nisse@lysator.liu.se>
 
 	Delete all sparc32 assembly.
diff --git a/aes-decrypt-internal.c b/aes-decrypt-internal.c
index 9e8cf34a100f773e94c4c4e35062feb3c4f00a55..47747ab868dc00dd0ce0cd1f3004fd101bfa3a82 100644
--- a/aes-decrypt-internal.c
+++ b/aes-decrypt-internal.c
@@ -60,6 +60,7 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
     {
       uint32_t w0, w1, w2, w3;		/* working ciphertext */
       uint32_t t0, t1, t2, t3;
+      const uint32_t *p;
       unsigned i;
       
       /* Get clear text, using little-endian byte order.
@@ -70,12 +71,12 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
       w2 = LE_READ_UINT32(src + 8)  ^ keys[2];
       w3 = LE_READ_UINT32(src + 12) ^ keys[3];
 
-      for (i = 1; i < rounds; i++)
+      for (i = 1, p = keys - 4; i < rounds; i++, p -= 4)
 	{
-	  t0 = AES_ROUND(T, w0, w3, w2, w1, keys[4*i]);
-	  t1 = AES_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]);
-	  t2 = AES_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]);
-	  t3 = AES_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]);
+	  t0 = AES_ROUND(T, w0, w3, w2, w1, p[0]);
+	  t1 = AES_ROUND(T, w1, w0, w3, w2, p[1]);
+	  t2 = AES_ROUND(T, w2, w1, w0, w3, p[2]);
+	  t3 = AES_ROUND(T, w3, w2, w1, w0, p[3]);
 
 	  /* We could unroll the loop twice, to avoid these
 	     assignments. If all eight variables fit in registers,
@@ -88,10 +89,10 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 
       /* Final round */
 
-      t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, keys[4*i]);
-      t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]);
-      t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]);
-      t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]);
+      t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, p[0]);
+      t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, p[1]);
+      t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, p[2]);
+      t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, p[3]);
 
       LE_WRITE_UINT32(dst, t0);
       LE_WRITE_UINT32(dst + 4, t1);
diff --git a/aes-internal.h b/aes-internal.h
index 64cf7be56ed1d27ecc414154866770b977593e99..442b045b279121612cf3b2a17177ecf852f2a1e5 100644
--- a/aes-internal.h
+++ b/aes-internal.h
@@ -66,6 +66,8 @@ _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
 		    size_t length, uint8_t *dst,
 		    const uint8_t *src);
 
+/* The keys pointer points at the subkeys for the first decrypt round,
+   located at the end of the array. */
 void
 _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 		    const struct aes_table *T,
diff --git a/aes-invert-internal.c b/aes-invert-internal.c
index a2faefa4d0262f7c2ed3581f7f2b8889f0d37796..7364616cb84edaea7eacf7ac1a1f09000307d44c 100644
--- a/aes-invert-internal.c
+++ b/aes-invert-internal.c
@@ -111,9 +111,9 @@ static const uint32_t mtable[0x100] =
   0xbe805d9f,0xb58d5491,0xa89a4f83,0xa397468d,
 };
 
-#define MIX_COLUMN(T, key) do { \
+#define MIX_COLUMN(T, out, in) do {		\
     uint32_t _k, _nk, _t;	\
-    _k = (key);			\
+    _k = (in);			\
     _nk = T[_k & 0xff];		\
     _k >>= 8;			\
     _t = T[_k & 0xff];		\
@@ -124,7 +124,7 @@ static const uint32_t mtable[0x100] =
     _k >>= 8;			\
     _t = T[_k & 0xff];		\
     _nk ^= ROTL32(24, _t);	\
-    (key) = _nk;		\
+    (out) = _nk;		\
   } while(0)
   
 
@@ -136,29 +136,13 @@ _nettle_aes_invert(unsigned rounds, uint32_t *dst, const uint32_t *src)
 {
   unsigned i;
 
-  /* Reverse the order of subkeys, in groups of 4. */
-  /* FIXME: Instead of reordering the subkeys, change the access order
-     of aes_decrypt, since it's a separate function anyway? */
-  if (src == dst)
-    {
-      unsigned j, k;
+  /* Transform all subkeys but the first and last. */
+  for (i = 4; i < 4 * rounds; i++)
+    MIX_COLUMN (mtable, dst[i], src[i]);
 
-      for (i = 0, j = rounds * 4;
-	   i < j;
-	   i += 4, j -= 4)
-	for (k = 0; k<4; k++)
-	  SWAP(dst[i+k], dst[j+k]);
-    }
-  else
+  if (src != dst)
     {
-      unsigned k;
-
-      for (i = 0; i <= rounds * 4; i += 4)
-	for (k = 0; k < 4; k++)
-	  dst[i+k] = src[rounds * 4 - i + k];
+      dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
+      dst[i] = src[i]; dst[i+1] = src[i+1]; dst[i+2] = src[i+2]; dst[i+3] = src[i+3];
     }
-
-  /* Transform all subkeys but the first and last. */
-  for (i = 4; i < 4 * rounds; i++)
-    MIX_COLUMN (mtable, dst[i]);
 }
diff --git a/aes128-decrypt.c b/aes128-decrypt.c
index 436438cc38cd36961ae01572bb629ea82a3a9762..96d6bc6ed57c64c1bd90b80266ce1e0f1f461ff6 100644
--- a/aes128-decrypt.c
+++ b/aes128-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes128_decrypt(const struct aes128_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys + 4*_AES128_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/aes192-decrypt.c b/aes192-decrypt.c
index 7746c76e574c09072c90836623e8c42d2581c82f..bf8702e4c679b0129210ad84efd3a0716dfa3562 100644
--- a/aes192-decrypt.c
+++ b/aes192-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes192_decrypt(const struct aes192_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys + 4 * _AES192_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/aes256-decrypt.c b/aes256-decrypt.c
index 89411c10ee3ecad7688271f4996b5c268b2a4826..ae050797abacdbc1b5afe6346956be12f8c76a36 100644
--- a/aes256-decrypt.c
+++ b/aes256-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes256_decrypt(const struct aes256_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys + 4 * _AES256_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm
index 1b04ed9a0a61aeb0c745e86fc39a24935c91c2d3..ed0d66f8cb87cf57f181549a9e5c5f4ca4730a6d 100644
--- a/arm/aes-decrypt-internal.asm
+++ b/arm/aes-decrypt-internal.asm
@@ -111,11 +111,12 @@ define(`AES_DECRYPT_ROUND', `
 	and	T0, MASK, $1, ror #22
 	ldr	T0, [TABLE, T0]
 
-	ldm	$9!, {$1,$2,$3,$4}
+	ldm	$9, {$1,$2,$3,$4}
 	eor	$8, $8, T0
 	sub	TABLE, TABLE, #3072
 	eor	$5, $5, $1
 	eor	$6, $6, $2
+	sub	$9, $9, #16
 	eor	$7, $7, $3
 	eor	$8, $8, $4
 ')
@@ -142,7 +143,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_LOAD(X0,KEY,W0)
 	AES_LOAD(X0,KEY,W1)
 	AES_LOAD(X0,KEY,W2)
-	AES_LOAD(X0,KEY,W3)
+	AES_LOAD_INCR(X0,KEY,W3, -28)
 
 	str	X0, FRAME_SRC
 
diff --git a/arm/aes.m4 b/arm/aes.m4
index 95382de28122fcdf977e0e5ed90ee2031de38296..59706ba9e6784ee50dc2fc744d9b336a847098c6 100644
--- a/arm/aes.m4
+++ b/arm/aes.m4
@@ -1,6 +1,6 @@
 C Loads one word, and adds it to the subkey. Uses T0
-C AES_LOAD(SRC, KEY, REG)
-define(`AES_LOAD', `
+C AES_LOAD_INCR(SRC, KEY, REG, INCR)
+define(`AES_LOAD_INCR', `
 	ldrb	$3, [$1], #+1
 	ldrb	T0, [$1], #+1
 	orr	$3, T0, lsl #8
@@ -8,9 +8,13 @@ define(`AES_LOAD', `
 	orr	$3, T0, lsl #16
 	ldrb	T0, [$1], #+1
 	orr	$3, T0, lsl #24
-	ldr	T0, [$2], #+4
+	ldr	T0, [$2], #$4
 	eor	$3, T0
 ')
+C Loads one word, and adds it to the subkey. Uses T0
+C AES_LOAD(SRC, KEY, REG)
+define(`AES_LOAD', `AES_LOAD_INCR($1, $2, $3, +4)')
+
 C Stores one word. Destroys input.
 C AES_STORE(DST, X)
 define(`AES_STORE', `
diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm
index e8c6e91ad67304dcd8d8cc097c6ce028252b101b..10e0373ed5d32745e8d9cce979ac278b629b8909 100644
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -114,11 +114,12 @@ define(`AES_DECRYPT_ROUND', `
 	uxtb	T0, $1, ror #24
 	ldr	T0, [TABLE, T0, lsl #2]
 
-	ldm	$9!, {$1,$2,$3,$4}
+	ldm	$9, {$1,$2,$3,$4}
 	eor	$8, $8, T0
 	sub	TABLE, TABLE, #3072
 	eor	$5, $5, $1
 	eor	$6, $6, $2
+	sub	$9, $9, #16
 	eor	$7, $7, $3
 	eor	$8, $8, $4
 ')
@@ -148,7 +149,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_LOAD(SRC,KEY,W0)
 	AES_LOAD(SRC,KEY,W1)
 	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
+	AES_LOAD_INCR(SRC,KEY,W3, -28)
 
 	str	SRC, FRAME_SRC
 
diff --git a/arm64/crypto/aes128-decrypt.asm b/arm64/crypto/aes128-decrypt.asm
index aadfc480509b6aadec171808e5317c2ffbe7428e..70f2a64ece0f403b40b405fe37dca27c5bc49b8d 100644
--- a/arm64/crypto/aes128-decrypt.asm
+++ b/arm64/crypto/aes128-decrypt.asm
@@ -73,16 +73,16 @@ PROLOGUE(nettle_aes128_decrypt)
 L4B_loop:
     ld1            {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
     
-    AESD_ROUND_4B(S0,S1,S2,S3,K0)
-    AESD_ROUND_4B(S0,S1,S2,S3,K1)
-    AESD_ROUND_4B(S0,S1,S2,S3,K2)
-    AESD_ROUND_4B(S0,S1,S2,S3,K3)
-    AESD_ROUND_4B(S0,S1,S2,S3,K4)
-    AESD_ROUND_4B(S0,S1,S2,S3,K5)
-    AESD_ROUND_4B(S0,S1,S2,S3,K6)
-    AESD_ROUND_4B(S0,S1,S2,S3,K7)
+    AESD_ROUND_4B(S0,S1,S2,S3,K10)
+    AESD_ROUND_4B(S0,S1,S2,S3,K9)
     AESD_ROUND_4B(S0,S1,S2,S3,K8)
-    AESD_LAST_ROUND_4B(S0,S1,S2,S3,K9,K10)
+    AESD_ROUND_4B(S0,S1,S2,S3,K7)
+    AESD_ROUND_4B(S0,S1,S2,S3,K6)
+    AESD_ROUND_4B(S0,S1,S2,S3,K5)
+    AESD_ROUND_4B(S0,S1,S2,S3,K4)
+    AESD_ROUND_4B(S0,S1,S2,S3,K3)
+    AESD_ROUND_4B(S0,S1,S2,S3,K2)
+    AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0)
 
     st1            {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
 
@@ -97,16 +97,16 @@ L1B:
 L1B_loop:
     ld1            {S0.16b},[SRC],#16
     
-    AESD_ROUND_1B(S0,K0)
-    AESD_ROUND_1B(S0,K1)
-    AESD_ROUND_1B(S0,K2)
-    AESD_ROUND_1B(S0,K3)
-    AESD_ROUND_1B(S0,K4)
-    AESD_ROUND_1B(S0,K5)
-    AESD_ROUND_1B(S0,K6)
-    AESD_ROUND_1B(S0,K7)
+    AESD_ROUND_1B(S0,K10)
+    AESD_ROUND_1B(S0,K9)
     AESD_ROUND_1B(S0,K8)
-    AESD_LAST_ROUND_1B(S0,K9,K10)
+    AESD_ROUND_1B(S0,K7)
+    AESD_ROUND_1B(S0,K6)
+    AESD_ROUND_1B(S0,K5)
+    AESD_ROUND_1B(S0,K4)
+    AESD_ROUND_1B(S0,K3)
+    AESD_ROUND_1B(S0,K2)
+    AESD_LAST_ROUND_1B(S0,K1,K0)
 
     st1            {S0.16b},[DST],#16
 
diff --git a/arm64/crypto/aes192-decrypt.asm b/arm64/crypto/aes192-decrypt.asm
index 769edd15dbe65a363d9eba0e64aef659b57f1de0..b8eb30fd58776b321cd26ebd5d96466dd001d8e9 100644
--- a/arm64/crypto/aes192-decrypt.asm
+++ b/arm64/crypto/aes192-decrypt.asm
@@ -76,18 +76,18 @@ PROLOGUE(nettle_aes192_decrypt)
 L4B_loop:
     ld1            {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
     
-    AESD_ROUND_4B(S0,S1,S2,S3,K0)
-    AESD_ROUND_4B(S0,S1,S2,S3,K1)
-    AESD_ROUND_4B(S0,S1,S2,S3,K2)
-    AESD_ROUND_4B(S0,S1,S2,S3,K3)
-    AESD_ROUND_4B(S0,S1,S2,S3,K4)
-    AESD_ROUND_4B(S0,S1,S2,S3,K5)
-    AESD_ROUND_4B(S0,S1,S2,S3,K6)
-    AESD_ROUND_4B(S0,S1,S2,S3,K7)
-    AESD_ROUND_4B(S0,S1,S2,S3,K8)
-    AESD_ROUND_4B(S0,S1,S2,S3,K9)
+    AESD_ROUND_4B(S0,S1,S2,S3,K12)
+    AESD_ROUND_4B(S0,S1,S2,S3,K11)
     AESD_ROUND_4B(S0,S1,S2,S3,K10)
-    AESD_LAST_ROUND_4B(S0,S1,S2,S3,K11,K12)
+    AESD_ROUND_4B(S0,S1,S2,S3,K9)
+    AESD_ROUND_4B(S0,S1,S2,S3,K8)
+    AESD_ROUND_4B(S0,S1,S2,S3,K7)
+    AESD_ROUND_4B(S0,S1,S2,S3,K6)
+    AESD_ROUND_4B(S0,S1,S2,S3,K5)
+    AESD_ROUND_4B(S0,S1,S2,S3,K4)
+    AESD_ROUND_4B(S0,S1,S2,S3,K3)
+    AESD_ROUND_4B(S0,S1,S2,S3,K2)
+    AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0)
 
     st1            {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
 
@@ -102,18 +102,18 @@ L1B:
 L1B_loop:
     ld1            {S0.16b},[SRC],#16
     
-    AESD_ROUND_1B(S0,K0)
-    AESD_ROUND_1B(S0,K1)
-    AESD_ROUND_1B(S0,K2)
-    AESD_ROUND_1B(S0,K3)
-    AESD_ROUND_1B(S0,K4)
-    AESD_ROUND_1B(S0,K5)
-    AESD_ROUND_1B(S0,K6)
-    AESD_ROUND_1B(S0,K7)
-    AESD_ROUND_1B(S0,K8)
-    AESD_ROUND_1B(S0,K9)
+    AESD_ROUND_1B(S0,K12)
+    AESD_ROUND_1B(S0,K11)
     AESD_ROUND_1B(S0,K10)
-    AESD_LAST_ROUND_1B(S0,K11,K12)
+    AESD_ROUND_1B(S0,K9)
+    AESD_ROUND_1B(S0,K8)
+    AESD_ROUND_1B(S0,K7)
+    AESD_ROUND_1B(S0,K6)
+    AESD_ROUND_1B(S0,K5)
+    AESD_ROUND_1B(S0,K4)
+    AESD_ROUND_1B(S0,K3)
+    AESD_ROUND_1B(S0,K2)
+    AESD_LAST_ROUND_1B(S0,K1,K0)
 
     st1            {S0.16b},[DST],#16
 
diff --git a/arm64/crypto/aes256-decrypt.asm b/arm64/crypto/aes256-decrypt.asm
index d787a74a40e7f2092612d6c63fc64d50d1e7c4f0..dc1633157bccf5701fc66efa18568e4461cb4de5 100644
--- a/arm64/crypto/aes256-decrypt.asm
+++ b/arm64/crypto/aes256-decrypt.asm
@@ -78,20 +78,20 @@ PROLOGUE(nettle_aes256_decrypt)
 L4B_loop:
     ld1            {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
     
-    AESD_ROUND_4B(S0,S1,S2,S3,K0)
-    AESD_ROUND_4B(S0,S1,S2,S3,K1)
-    AESD_ROUND_4B(S0,S1,S2,S3,K2)
-    AESD_ROUND_4B(S0,S1,S2,S3,K3)
-    AESD_ROUND_4B(S0,S1,S2,S3,K4)
-    AESD_ROUND_4B(S0,S1,S2,S3,K5)
-    AESD_ROUND_4B(S0,S1,S2,S3,K6)
-    AESD_ROUND_4B(S0,S1,S2,S3,K7)
-    AESD_ROUND_4B(S0,S1,S2,S3,K8)
-    AESD_ROUND_4B(S0,S1,S2,S3,K9)
-    AESD_ROUND_4B(S0,S1,S2,S3,K10)
-    AESD_ROUND_4B(S0,S1,S2,S3,K11)
+    AESD_ROUND_4B(S0,S1,S2,S3,K14)
+    AESD_ROUND_4B(S0,S1,S2,S3,K13)
     AESD_ROUND_4B(S0,S1,S2,S3,K12)
-    AESD_LAST_ROUND_4B(S0,S1,S2,S3,K13,K14)
+    AESD_ROUND_4B(S0,S1,S2,S3,K11)
+    AESD_ROUND_4B(S0,S1,S2,S3,K10)
+    AESD_ROUND_4B(S0,S1,S2,S3,K9)
+    AESD_ROUND_4B(S0,S1,S2,S3,K8)
+    AESD_ROUND_4B(S0,S1,S2,S3,K7)
+    AESD_ROUND_4B(S0,S1,S2,S3,K6)
+    AESD_ROUND_4B(S0,S1,S2,S3,K5)
+    AESD_ROUND_4B(S0,S1,S2,S3,K4)
+    AESD_ROUND_4B(S0,S1,S2,S3,K3)
+    AESD_ROUND_4B(S0,S1,S2,S3,K2)
+    AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0)
 
     st1            {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
 
@@ -106,20 +106,20 @@ L1B:
 L1B_loop:
     ld1            {S0.16b},[SRC],#16
     
-    AESD_ROUND_1B(S0,K0)
-    AESD_ROUND_1B(S0,K1)
-    AESD_ROUND_1B(S0,K2)
-    AESD_ROUND_1B(S0,K3)
-    AESD_ROUND_1B(S0,K4)
-    AESD_ROUND_1B(S0,K5)
-    AESD_ROUND_1B(S0,K6)
-    AESD_ROUND_1B(S0,K7)
-    AESD_ROUND_1B(S0,K8)
-    AESD_ROUND_1B(S0,K9)
-    AESD_ROUND_1B(S0,K10)
-    AESD_ROUND_1B(S0,K11)
+    AESD_ROUND_1B(S0,K14)
+    AESD_ROUND_1B(S0,K13)
     AESD_ROUND_1B(S0,K12)
-    AESD_LAST_ROUND_1B(S0,K13,K14)
+    AESD_ROUND_1B(S0,K11)
+    AESD_ROUND_1B(S0,K10)
+    AESD_ROUND_1B(S0,K9)
+    AESD_ROUND_1B(S0,K8)
+    AESD_ROUND_1B(S0,K7)
+    AESD_ROUND_1B(S0,K6)
+    AESD_ROUND_1B(S0,K5)
+    AESD_ROUND_1B(S0,K4)
+    AESD_ROUND_1B(S0,K3)
+    AESD_ROUND_1B(S0,K2)
+    AESD_LAST_ROUND_1B(S0,K1,K0)
 
     st1            {S0.16b},[DST],#16
 
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
index 6336e5b78c01265e0b925100c5bee65f6a981678..d2a07a3824fc7005664975ff92c02595f891c9cb 100644
--- a/powerpc64/p8/aes-decrypt-internal.asm
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -115,7 +115,7 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3,S4,S5,S6,S7)')
  OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
 
  mtctr ROUNDS
- li r9,0x10
+ li r9,-0x10
 
 .align 5
 L8x_round_loop:
@@ -123,7 +123,7 @@ L8x_round_loop:
  vperm   K,K,K,SWAP_MASK
  OPN_XXY(vncipher, ZERO, S0, S1, S2, S3, S4, S5, S6, S7)
  OPN_XXY(vxor, K, S0, S1, S2, S3, S4, S5, S6, S7)
- addi r9,r9,0x10
+ subi r9,r9,0x10
  bdnz L8x_round_loop
 
  lxvd2x VSR(K),r9,KEYS
@@ -172,14 +172,14 @@ IF_LE(`OPN_XXXY(vperm, SWAP_MASK, S0,S1,S2,S3)')
  OPN_XXY(vxor, K, S0, S1, S2, S3)
 
  mtctr ROUNDS
- li r9,0x10
+ li r9,-0x10
 .align 5
 L4x_round_loop:
  lxvd2x VSR(K),r9,KEYS
  vperm  K,K,K,SWAP_MASK
  OPN_XXY(vncipher, ZERO, S0, S1, S2, S3)
  OPN_XXY(vxor, K, S0, S1, S2, S3)
- addi   r9,r9,0x10
+ subi   r9,r9,0x10
  bdnz  L4x_round_loop
 
  lxvd2x VSR(K),r9,KEYS
@@ -216,7 +216,7 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK
  vxor   S1,S1,K
 
  mtctr   ROUNDS
- li  r9,0x10
+ li  r9,-0x10
 .align 5
 L2x_round_loop:
  lxvd2x VSR(K),r9,KEYS
@@ -225,7 +225,7 @@ L2x_round_loop:
  vncipher S1,S1,ZERO
  vxor  S0,S0,K
  vxor  S1,S1,K
- addi   r9,r9,0x10
+ subi   r9,r9,0x10
  bdnz   L2x_round_loop
 
  lxvd2x VSR(K),r9,KEYS
@@ -258,14 +258,14 @@ IF_LE(`vperm S0,S0,S0,SWAP_MASK')
  vxor   S0,S0,K
 
  mtctr   ROUNDS
- li   r9,0x10
+ li   r9,-0x10
 .align 5
 L1x_round_loop:
  lxvd2x VSR(K),r9,KEYS
  vperm  K,K,K,SWAP_MASK
  vncipher S0,S0,ZERO
  vxor   S0,S0,K
- addi   r9,r9,0x10
+ subi   r9,r9,0x10
  bdnz   L1x_round_loop
 
  lxvd2x VSR(K),r9,KEYS
diff --git a/sparc64/aes-decrypt-internal.asm b/sparc64/aes-decrypt-internal.asm
index a43ced3646dbc54f34660672ffa8f0de64ece29f..8918de1baf84a9f700d76ca8ff8c508651e34a76 100644
--- a/sparc64/aes-decrypt-internal.asm
+++ b/sparc64/aes-decrypt-internal.asm
@@ -103,33 +103,32 @@ PROLOGUE(_nettle_aes_decrypt)
 
 	mov	ROUNDS, COUNT
 	add	SRC, 16, SRC
-	add	KEY, 16, KEY
+	sub	KEY, 32, KEY
 
 .Lround_loop:
 	C The AES_ROUND macro uses T0,... T3
 	C	Transform W -> X
-	AES_ROUND(0, W0, W3, W2, W1, KEY, X0)
-	AES_ROUND(1, W1, W0, W3, W2, KEY, X1)
-	AES_ROUND(2, W2, W1, W0, W3, KEY, X2)
-	AES_ROUND(3, W3, W2, W1, W0, KEY, X3)
+	AES_ROUND(4, W0, W3, W2, W1, KEY, X0)
+	AES_ROUND(5, W1, W0, W3, W2, KEY, X1)
+	AES_ROUND(6, W2, W1, W0, W3, KEY, X2)
+	AES_ROUND(7, W3, W2, W1, W0, KEY, X3)
 
 	C	Transform X -> W
-	AES_ROUND(4, X0, X3, X2, X1, KEY, W0)
-	AES_ROUND(5, X1, X0, X3, X2, KEY, W1)
-	AES_ROUND(6, X2, X1, X0, X3, KEY, W2)
-	AES_ROUND(7, X3, X2, X1, X0, KEY, W3)
+	AES_ROUND(0, X0, X3, X2, X1, KEY, W0)
+	AES_ROUND(1, X1, X0, X3, X2, KEY, W1)
+	AES_ROUND(2, X2, X1, X0, X3, KEY, W2)
+	AES_ROUND(3, X3, X2, X1, X0, KEY, W3)
 
 	subcc	COUNT, 1, COUNT
 	bne	.Lround_loop
-	add	KEY, 32, KEY
+	sub	KEY, 32, KEY
 
 	C	Penultimate round
-	AES_ROUND(0, W0, W3, W2, W1, KEY, X0)
-	AES_ROUND(1, W1, W0, W3, W2, KEY, X1)
-	AES_ROUND(2, W2, W1, W0, W3, KEY, X2)
-	AES_ROUND(3, W3, W2, W1, W0, KEY, X3)
+	AES_ROUND(4, W0, W3, W2, W1, KEY, X0)
+	AES_ROUND(5, W1, W0, W3, W2, KEY, X1)
+	AES_ROUND(6, W2, W1, W0, W3, KEY, X2)
+	AES_ROUND(7, W3, W2, W1, W0, KEY, X3)
 
-	add	KEY, 16, KEY
 	C	Final round
 	AES_FINAL_ROUND(0, T, X0, X3, X2, X1, KEY, DST)
 	AES_FINAL_ROUND(1, T, X1, X0, X3, X2, KEY, DST)
diff --git a/x86/aes-decrypt-internal.asm b/x86/aes-decrypt-internal.asm
index 730ce2ed88c78aa32f6417f01e28e831065329eb..d8d980da8b4880a56e1c022ba38770fa93bfdf37 100644
--- a/x86/aes-decrypt-internal.asm
+++ b/x86/aes-decrypt-internal.asm
@@ -90,7 +90,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	jz	.Lend
 
 	shrl	$4, PARAM_LENGTH
-	subl	$1, PARAM_ROUNDS
+	decl	PARAM_ROUNDS
 .Lblock_loop:
 	movl	PARAM_KEYS, KEY	C  address of subkeys
 	
@@ -103,7 +103,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	C Loop counter on stack
 	movl	TMP, FRAME_COUNT
 
-	addl	$16,KEY		C  point to next key
+	subl	$16,KEY		C  point to next key
 	movl	KEY,FRAME_KEY
 	ALIGN(16)
 .Lround_loop:
@@ -128,7 +128,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	xorl	4(KEY),SB
 	xorl	8(KEY),SC
 	xorl	12(KEY),SD
-	addl	$16,FRAME_KEY	C  point to next key
+	subl	$16,FRAME_KEY	C  point to next key
 	decl	FRAME_COUNT
 	jnz	.Lround_loop
 
diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm
index d3bedc25ac557d163e84b4bb5afa6607d5aea84c..afa44f3568d72bba9038d536d3d3f2c4075dfcce 100644
--- a/x86_64/aes-decrypt-internal.asm
+++ b/x86_64/aes-decrypt-internal.asm
@@ -83,8 +83,8 @@ PROLOGUE(_nettle_aes_decrypt)
 	push	%r15	
 
 	subl	$1, XREG(ROUNDS)
-	push	ROUNDS		C Rounds at (%rsp) 
-	
+	push	ROUNDS		C Rounds stored at (%rsp)
+
 	mov	PARAM_TABLE, TABLE
 	mov	PARAM_LENGTH, LENGTH
 	shr	$4, LENGTH
@@ -96,7 +96,8 @@ PROLOGUE(_nettle_aes_decrypt)
 
 	movl	(%rsp), XREG(ROUNDS)
 
-	add	$16, KEY	C  point to next key
+	sub	$16, KEY	C  point to next key
+
 	ALIGN(16)
 .Lround_loop:
 	AES_ROUND(TABLE, SA,SD,SC,SB, TA, TMP)
@@ -113,7 +114,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	xorl	8(KEY),SC
 	xorl	12(KEY),SD
 
-	add	$16, KEY	C  point to next key
+	sub	$16, KEY	C  point to next key
 	decl	XREG(ROUNDS)
 	jnz	.Lround_loop
 
diff --git a/x86_64/aesni/aes128-decrypt.asm b/x86_64/aesni/aes128-decrypt.asm
index 79111e470fcc092495cf344547b791486985d371..b200989403f80596bd68a5a588ace74f0722115b 100644
--- a/x86_64/aesni/aes128-decrypt.asm
+++ b/x86_64/aesni/aes128-decrypt.asm
@@ -64,17 +64,17 @@ PROLOGUE(nettle_aes128_decrypt)
 	test	LENGTH, LENGTH
 	jz	.Lend
 
-	movups	(CTX), KEY0
-	movups	16(CTX), KEY1
-	movups	32(CTX), KEY2
-	movups	48(CTX), KEY3
-	movups	64(CTX), KEY4
+	movups	160(CTX), KEY0
+	movups	144(CTX), KEY1
+	movups	128(CTX), KEY2
+	movups	112(CTX), KEY3
+	movups	96(CTX), KEY4
 	movups	80(CTX), KEY5
-	movups	96(CTX), KEY6
-	movups	112(CTX), KEY7
-	movups	128(CTX), KEY8
-	movups	144(CTX), KEY9
-	movups	160(CTX), KEY10
+	movups	64(CTX), KEY6
+	movups	48(CTX), KEY7
+	movups	32(CTX), KEY8
+	movups	16(CTX), KEY9
+	movups	(CTX), KEY10
 	shr	LENGTH
 	jnc	.Lblock_loop
 
diff --git a/x86_64/aesni/aes192-decrypt.asm b/x86_64/aesni/aes192-decrypt.asm
index 399f89b6ebe11d5473161eaecb22fa3d7951836b..24c178278876ccae8ab0a684651824cfd5340a53 100644
--- a/x86_64/aesni/aes192-decrypt.asm
+++ b/x86_64/aesni/aes192-decrypt.asm
@@ -66,19 +66,19 @@ PROLOGUE(nettle_aes192_decrypt)
 	test	LENGTH, LENGTH
 	jz	.Lend
 
-	movups	(CTX), KEY0
-	movups	16(CTX), KEY1
-	movups	32(CTX), KEY2
-	movups	48(CTX), KEY3
-	movups	64(CTX), KEY4
-	movups	80(CTX), KEY5
+	movups	192(CTX), KEY0
+	movups	176(CTX), KEY1
+	movups	160(CTX), KEY2
+	movups	144(CTX), KEY3
+	movups	128(CTX), KEY4
+	movups	112(CTX), KEY5
 	movups	96(CTX), KEY6
-	movups	112(CTX), KEY7
-	movups	128(CTX), KEY8
-	movups	144(CTX), KEY9
-	movups	160(CTX), KEY10
-	movups	176(CTX), KEY11
-	movups	192(CTX), KEY12
+	movups	80(CTX), KEY7
+	movups	64(CTX), KEY8
+	movups	48(CTX), KEY9
+	movups	32(CTX), KEY10
+	movups	16(CTX), KEY11
+	movups	(CTX), KEY12
 	shr	LENGTH
 	jnc	.Lblock_loop
 
diff --git a/x86_64/aesni/aes256-decrypt.asm b/x86_64/aesni/aes256-decrypt.asm
index 0fc5ad2a705f191a35982c5d9d3dda3328fee4f2..247655a33eec578954adb2772f3fe5659449d2c1 100644
--- a/x86_64/aesni/aes256-decrypt.asm
+++ b/x86_64/aesni/aes256-decrypt.asm
@@ -67,20 +67,20 @@ PROLOGUE(nettle_aes256_decrypt)
 	test	LENGTH, LENGTH
 	jz	.Lend
 
-	movups	(CTX), KEY0_7
-	movups	16(CTX), KEY1
-	movups	32(CTX), KEY2
-	movups	48(CTX), KEY3
-	movups	64(CTX), KEY4
-	movups	80(CTX), KEY5
-	movups	96(CTX), KEY6
-	movups	128(CTX), KEY8
-	movups	144(CTX), KEY9
-	movups	160(CTX), KEY10
-	movups	176(CTX), KEY11
-	movups	192(CTX), KEY12
-	movups	208(CTX), KEY13
-	movups	224(CTX), KEY14
+	movups	224(CTX), KEY0_7
+	movups	208(CTX), KEY1
+	movups	192(CTX), KEY2
+	movups	176(CTX), KEY3
+	movups	160(CTX), KEY4
+	movups	144(CTX), KEY5
+	movups	128(CTX), KEY6
+	movups	96(CTX), KEY8
+	movups	80(CTX), KEY9
+	movups	64(CTX), KEY10
+	movups	48(CTX), KEY11
+	movups	32(CTX), KEY12
+	movups	16(CTX), KEY13
+	movups	(CTX), KEY14
 
 	shr	LENGTH
 	jnc	.Lblock_loop
@@ -95,7 +95,7 @@ PROLOGUE(nettle_aes256_decrypt)
 	aesdec	KEY5, X
 	aesdec	KEY6, X
 	aesdec	KEY0_7, X
-	movups	(CTX), KEY0_7
+	movups	224(CTX), KEY0_7
 	aesdec	KEY8, X
 	aesdec	KEY9, X
 	aesdec	KEY10, X
@@ -130,7 +130,7 @@ PROLOGUE(nettle_aes256_decrypt)
 	aesdec	KEY6, Y
 	aesdec	KEY0_7, X
 	aesdec	KEY0_7, Y
-	movups	(CTX), KEY0_7
+	movups	224(CTX), KEY0_7
 	aesdec	KEY8, X
 	aesdec	KEY8, Y
 	aesdec	KEY9, X