From c54e0270b010ca1a52c988835c170c6b32bd2935 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 25 Jan 2024 20:36:28 +0100
Subject: [PATCH] Change _nettle_aes_decrypt to pass pointer to last subkey.

---
 aes-decrypt-internal.c           | 27 ++++++++++++++-------------
 aes-internal.h                   |  2 ++
 aes128-decrypt.c                 |  4 ++--
 aes192-decrypt.c                 |  4 ++--
 aes256-decrypt.c                 |  4 ++--
 sparc64/aes-decrypt-internal.asm |  3 ---
 x86/aes-decrypt-internal.asm     |  3 ---
 x86_64/aes-decrypt-internal.asm  |  6 ++----
 8 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/aes-decrypt-internal.c b/aes-decrypt-internal.c
index fcd7289a..47747ab8 100644
--- a/aes-decrypt-internal.c
+++ b/aes-decrypt-internal.c
@@ -60,22 +60,23 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
     {
       uint32_t w0, w1, w2, w3;		/* working ciphertext */
       uint32_t t0, t1, t2, t3;
+      const uint32_t *p;
       unsigned i;
       
       /* Get clear text, using little-endian byte order.
        * Also XOR with the first subkey. */
 
-      w0 = LE_READ_UINT32(src)      ^ keys[4*rounds];
-      w1 = LE_READ_UINT32(src + 4)  ^ keys[4*rounds + 1];
-      w2 = LE_READ_UINT32(src + 8)  ^ keys[4*rounds + 2];
-      w3 = LE_READ_UINT32(src + 12) ^ keys[4*rounds + 3];
+      w0 = LE_READ_UINT32(src)      ^ keys[0];
+      w1 = LE_READ_UINT32(src + 4)  ^ keys[1];
+      w2 = LE_READ_UINT32(src + 8)  ^ keys[2];
+      w3 = LE_READ_UINT32(src + 12) ^ keys[3];
 
-      for (i = rounds - 1; i > 0; i--)
+      for (i = 1, p = keys - 4; i < rounds; i++, p -= 4)
 	{
-	  t0 = AES_ROUND(T, w0, w3, w2, w1, keys[4*i]);
-	  t1 = AES_ROUND(T, w1, w0, w3, w2, keys[4*i + 1]);
-	  t2 = AES_ROUND(T, w2, w1, w0, w3, keys[4*i + 2]);
-	  t3 = AES_ROUND(T, w3, w2, w1, w0, keys[4*i + 3]);
+	  t0 = AES_ROUND(T, w0, w3, w2, w1, p[0]);
+	  t1 = AES_ROUND(T, w1, w0, w3, w2, p[1]);
+	  t2 = AES_ROUND(T, w2, w1, w0, w3, p[2]);
+	  t3 = AES_ROUND(T, w3, w2, w1, w0, p[3]);
 
 	  /* We could unroll the loop twice, to avoid these
 	     assignments. If all eight variables fit in registers,
@@ -88,10 +89,10 @@ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 
       /* Final round */
 
-      t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, keys[0]);
-      t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, keys[1]);
-      t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, keys[2]);
-      t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, keys[3]);
+      t0 = AES_FINAL_ROUND(T, w0, w3, w2, w1, p[0]);
+      t1 = AES_FINAL_ROUND(T, w1, w0, w3, w2, p[1]);
+      t2 = AES_FINAL_ROUND(T, w2, w1, w0, w3, p[2]);
+      t3 = AES_FINAL_ROUND(T, w3, w2, w1, w0, p[3]);
 
       LE_WRITE_UINT32(dst, t0);
       LE_WRITE_UINT32(dst + 4, t1);
diff --git a/aes-internal.h b/aes-internal.h
index 64cf7be5..442b045b 100644
--- a/aes-internal.h
+++ b/aes-internal.h
@@ -66,6 +66,8 @@ _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
 		    size_t length, uint8_t *dst,
 		    const uint8_t *src);
 
+/* The keys pointer points at the subkey for the first decrypt round,
+   i.e., the last subkey of the array; later rounds step backwards. */
 void
 _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 		    const struct aes_table *T,
diff --git a/aes128-decrypt.c b/aes128-decrypt.c
index 436438cc..96d6bc6e 100644
--- a/aes128-decrypt.c
+++ b/aes128-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes128_decrypt(const struct aes128_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES128_ROUNDS, ctx->keys + 4*_AES128_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/aes192-decrypt.c b/aes192-decrypt.c
index 7746c76e..bf8702e4 100644
--- a/aes192-decrypt.c
+++ b/aes192-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes192_decrypt(const struct aes192_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES192_ROUNDS, ctx->keys + 4 * _AES192_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/aes256-decrypt.c b/aes256-decrypt.c
index 89411c10..ae050797 100644
--- a/aes256-decrypt.c
+++ b/aes256-decrypt.c
@@ -54,6 +54,6 @@ nettle_aes256_decrypt(const struct aes256_ctx *ctx,
 	       const uint8_t *src)
 {
   assert(!(length % AES_BLOCK_SIZE) );
-  _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys, &_nettle_aes_decrypt_table,
-		      length, dst, src);
+  _nettle_aes_decrypt(_AES256_ROUNDS, ctx->keys + 4 * _AES256_ROUNDS,
+		      &_nettle_aes_decrypt_table, length, dst, src);
 }
diff --git a/sparc64/aes-decrypt-internal.asm b/sparc64/aes-decrypt-internal.asm
index 5e795233..8918de1b 100644
--- a/sparc64/aes-decrypt-internal.asm
+++ b/sparc64/aes-decrypt-internal.asm
@@ -88,9 +88,6 @@ PROLOGUE(_nettle_aes_decrypt)
 	add	T, AES_TABLE2, T2
 	add	T, AES_TABLE3, T3
 
-	sll	ROUNDS, 4, W0	C Can use W0 as scratch
-	add	KEYS, W0, KEYS	C Point to last subkey
-
 	C	Must be even, and includes the final round
 	srl	ROUNDS, 1, ROUNDS
 	C	Last two rounds handled specially
diff --git a/x86/aes-decrypt-internal.asm b/x86/aes-decrypt-internal.asm
index 36d6ca44..d8d980da 100644
--- a/x86/aes-decrypt-internal.asm
+++ b/x86/aes-decrypt-internal.asm
@@ -90,10 +90,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	jz	.Lend
 
 	shrl	$4, PARAM_LENGTH
-	movl	PARAM_ROUNDS, TMP
 	decl	PARAM_ROUNDS
-	shll	$4, TMP
-	addl	TMP, PARAM_KEYS
 .Lblock_loop:
 	movl	PARAM_KEYS, KEY	C  address of subkeys
 	
diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm
index ed753a2c..afa44f35 100644
--- a/x86_64/aes-decrypt-internal.asm
+++ b/x86_64/aes-decrypt-internal.asm
@@ -83,9 +83,7 @@ PROLOGUE(_nettle_aes_decrypt)
 	push	%r15	
 
 	subl	$1, XREG(ROUNDS)
-	push	ROUNDS			C Rounds stored at (%rsp)
-	shl	$4, XREG(ROUNDS)	C Zero-extends
-	lea	16(KEYS, ROUNDS), KEYS
+	push	ROUNDS		C Rounds stored at (%rsp)
 
 	mov	PARAM_TABLE, TABLE
 	mov	PARAM_LENGTH, LENGTH
@@ -94,8 +92,8 @@ PROLOGUE(_nettle_aes_decrypt)
 	mov	KEYS, KEY
 	
 	AES_LOAD(SA, SB, SC, SD, SRC, KEY)
+	add	$16, SRC	C Increment src pointer
 
-	add	$16, SRC	C  increment src pointer
 	movl	(%rsp), XREG(ROUNDS)
 
 	sub	$16, KEY	C  point to next key
-- 
GitLab