From ec5c42a8a28b6931a60e26fdd3373dc35c5c52a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 13 Feb 2002 14:05:04 +0100
Subject: [PATCH] * aes.c (aes_encrypt): Don't unroll the innerloop.
 (aes_encrypt): Don't unroll the loop for the final round. (aes_decrypt):
 Likewise, no loop unrolling.

Rev: src/nettle/aes.c:1.9
---
 aes.c | 329 +++++++++++++---------------------------------------------
 1 file changed, 73 insertions(+), 256 deletions(-)

diff --git a/aes.c b/aes.c
index a32e6df4..f6db8714 100644
--- a/aes.c
+++ b/aes.c
@@ -47,19 +47,31 @@
 
 /* Column j are the shifts used when computing t[j].
  * Row i is says which byte is used */
-#if AES_SMALL
+
+/* FIXME: Figure out how the indexing should really be done. It looks
+ * like this code shifts the rows in the wrong direction, but it
+ * passes the testsuite. Perhaps the tables are rotated in the wrong
+ * direction, but I don't think so. */
+
+/* The row shift counts C1, C2 and C3 are (1, 2, 3) */
+
 static const unsigned idx[4][4] = {
   { 0, 1, 2, 3 },
   { 1, 2, 3, 0 },
   { 2, 3, 0, 1 },
   { 3, 0, 1, 2 } };
-
+#if 0
+static const unsigned idx4[4][4] = {
+  { 0, 4, 8, 12 }, 
+  { 4, 8, 12, 0 }, 
+  { 8, 12, 0, 4 }, 
+  { 12, 0, 4, 8 } };
+#endif
 static const unsigned iidx[4][4] = {
   { 0, 1, 2, 3 },
   { 3, 0, 1, 2 },
   { 2, 3, 0, 1 },
   { 1, 2, 3, 0 } };
-#endif /* AES_SMALL */
 
 void
 aes_encrypt(struct aes_ctx *ctx,
@@ -82,53 +94,23 @@ aes_encrypt(struct aes_ctx *ctx,
 	  uint32_t t[4];
 	  unsigned j;
 
-#if DEBUG
-	  fprintf(stderr, "encrypt, round: %d\n  wtxt: ", round);
-	  for (j = 0; j<4; j++)
-	    fprintf(stderr, "%08x, ", wtxt[j]);
-	  fprintf(stderr, "\n  key: ");
-	  for (j = 0; j<4; j++)
-	    fprintf(stderr, "%08x, ", ctx->keys[4*round + j]);
-	  fprintf(stderr, "\n");
-#endif
-	  /* The row shift counts C1, C2 and C3 are (1, 2, 3) */
 	  /* What's the best way to order this loop? Ideally,
 	   * we'd want to keep both t and wtxt in registers. */
 
-#if AES_SMALL
 	  for (j=0; j<4; j++)
-	    t[j] =         dtable[0][ B0(wtxt[j]) ] ^
-	      ROTRBYTE(    dtable[0][ B1(wtxt[idx[1][j]]) ]^
-		ROTRBYTE(  dtable[0][ B2(wtxt[idx[2][j]]) ] ^
-		  ROTRBYTE(dtable[0][ B3(wtxt[idx[3][j]]) ])));
+	    {
+#if AES_SMALL
+	      t[j] =         dtable[0][ B0(wtxt[j]) ] ^
+		ROTRBYTE(    dtable[0][ B1(wtxt[idx[1][j]]) ]^
+		  ROTRBYTE(  dtable[0][ B2(wtxt[idx[2][j]]) ] ^
+		    ROTRBYTE(dtable[0][ B3(wtxt[idx[3][j]]) ])));
 #else /* !AES_SMALL */
-	  
-	  /* FIXME: Figure out how the indexing should really be done.
-	   * It looks like this code shifts the rows in the wrong
-	   * direction, but it passes the testsuite. */
-	  t[0] = (  dtable[0][ B0(wtxt[0]) ]
-		  ^ dtable[1][ B1(wtxt[1]) ]
-		  ^ dtable[2][ B2(wtxt[2]) ]
-		  ^ dtable[3][ B3(wtxt[3]) ]);
-	  t[1] = (  dtable[0][ B0(wtxt[1]) ]
-		  ^ dtable[1][ B1(wtxt[2]) ]
-		  ^ dtable[2][ B2(wtxt[3]) ]
-		  ^ dtable[3][ B3(wtxt[0]) ]);
-	  t[2] = (  dtable[0][ B0(wtxt[2]) ]
-		  ^ dtable[1][ B1(wtxt[3]) ]
-		  ^ dtable[2][ B2(wtxt[0]) ]
-		  ^ dtable[3][ B3(wtxt[1]) ]);
-	  t[3] = (  dtable[0][ B0(wtxt[3]) ]
-		  ^ dtable[1][ B1(wtxt[0]) ]
-		  ^ dtable[2][ B2(wtxt[1]) ]
-		  ^ dtable[3][ B3(wtxt[2]) ]);
+	      t[j] = (  dtable[0][ B0(wtxt[idx[0][j]]) ]
+		      ^ dtable[1][ B1(wtxt[idx[1][j]]) ]
+		      ^ dtable[2][ B2(wtxt[idx[2][j]]) ]
+		      ^ dtable[3][ B3(wtxt[idx[3][j]]) ]);
 #endif /* !AES_SMALL */
-#if DEBUG
-	  fprintf(stderr, "\n  t: ");
-	  for (j = 0; j<4; j++)
-	    fprintf(stderr, "%08x, ", t[j]);
-	  fprintf(stderr, "\n");
-#endif
+	    }
 
 	  for (j = 0; j<4; j++)
 	    wtxt[j] = t[j] ^ ctx->keys[4*round + j];
@@ -136,63 +118,29 @@ aes_encrypt(struct aes_ctx *ctx,
       /* Final round */
       {
 	uint32_t cipher;
-
-	/* FIXME: Figure out how the indexing should really be done.
-	 * It looks like this code shifts the rows in the wrong
-	 * direction, but it passes the testsuite. */
-
-	cipher = (   (uint32_t) sbox[ B0(wtxt[0]) ]
-		  | ((uint32_t) sbox[ B1(wtxt[1]) ] << 8)
-		  | ((uint32_t) sbox[ B2(wtxt[2]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[3]) ] << 24));
+	unsigned j;
+	for (j = 0; j<4; j++)
+	  {
+	    /* FIXME: Figure out how the indexing should really be done.
+	     * It looks like this code shifts the rows in the wrong
+	     * direction, but it passes the testsuite. */
+
+	    cipher = (   (uint32_t) sbox[ B0(wtxt[j]) ]
+			 | ((uint32_t) sbox[ B1(wtxt[idx[1][j]]) ] << 8)
+			 | ((uint32_t) sbox[ B2(wtxt[idx[2][j]]) ] << 16)
+			 | ((uint32_t) sbox[ B3(wtxt[idx[3][j]]) ] << 24));
 #if DEBUG
-	fprintf(stderr, "  t[0]: %x, key: %x\n",
-		cipher, ctx->keys[4*round]);
+	    fprintf(stderr, "  t[%d]: %x, key: %x\n",
+		    j, cipher, ctx->keys[4*round + j]);
 #endif
-	cipher ^= ctx->keys[4*round];
+	    cipher ^= ctx->keys[4*round + j];
 
-	LE_WRITE_UINT32(dst, cipher);
-
-	cipher = (   (uint32_t) sbox[ B0(wtxt[1]) ]
-		  | ((uint32_t) sbox[ B1(wtxt[2]) ] << 8)
-		  | ((uint32_t) sbox[ B2(wtxt[3]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[0]) ] << 24));
-#if DEBUG
-	fprintf(stderr, "  t[1]: %x, key: %x\n",
-		cipher, ctx->keys[4*round + 1]);
-#endif
-	cipher ^= ctx->keys[4*round + 1];
-	
-	LE_WRITE_UINT32(dst + 4, cipher);
-		
-	cipher = (   (uint32_t) sbox[ B0(wtxt[2]) ]
-		  | ((uint32_t) sbox[ B1(wtxt[3]) ] << 8)
-		  | ((uint32_t) sbox[ B2(wtxt[0]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[1]) ] << 24));
-#if DEBUG
-	fprintf(stderr, "  t[2]: %x, key: %x\n",
-		cipher, ctx->keys[4*round + 2]);
-#endif
-	cipher ^= ctx->keys[4*round + 2];
-
-	LE_WRITE_UINT32(dst + 8, cipher);
-
-	cipher = (   (uint32_t) sbox[ B0(wtxt[3]) ]
-		  | ((uint32_t) sbox[ B1(wtxt[0]) ] << 8)
-		  | ((uint32_t) sbox[ B2(wtxt[1]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[2]) ] << 24));
-#if DEBUG
-	fprintf(stderr, "  t[3]: %x, key: %x\n",
-		cipher, ctx->keys[4*round + 3]);
-#endif
-	cipher ^= ctx->keys[4*round + 3];
-
-	LE_WRITE_UINT32(dst + 12, cipher);
+	    LE_WRITE_UINT32(dst + 4*j, cipher);
+	  }
       }
     }
 }
 
-#if 1
 void
 aes_decrypt(struct aes_ctx *ctx,
 	    unsigned length, uint8_t *dst,
@@ -240,33 +188,24 @@ aes_decrypt(struct aes_ctx *ctx,
 	  /* What's the best way to order this loop? Ideally,
 	   * we'd want to keep both t and wtxt in registers. */
 
-#if AES_SMALL
 	  for (j=0; j<4; j++)
-	    t[j] =         itable[0][ B0(wtxt[j]) ] ^
-	      ROTRBYTE(    itable[0][ B1(wtxt[iidx[1][j]]) ]^
-		ROTRBYTE(  itable[0][ B2(wtxt[iidx[2][j]]) ] ^
-		  ROTRBYTE(itable[0][ B3(wtxt[iidx[3][j]]) ])));
+	    {
+#if AES_SMALL
+	      t[j] =         itable[0][ B0(wtxt[j]) ] ^
+		ROTRBYTE(    itable[0][ B1(wtxt[iidx[1][j]]) ]^
+		  ROTRBYTE(  itable[0][ B2(wtxt[iidx[2][j]]) ] ^
+		    ROTRBYTE(itable[0][ B3(wtxt[iidx[3][j]]) ])));
 #else /* !AES_SMALL */
-	  /* FIXME: Figure out how the indexing should really be done.
-	   * It looks like this code shifts the rows in the wrong
-	   * direction, but it passes the testsuite. */
-	  t[0] = (  itable[0][ B0(wtxt[0]) ] /* 0 1 2 3 */
-		  ^ itable[1][ B1(wtxt[3]) ]
-		  ^ itable[2][ B2(wtxt[2]) ]
-		  ^ itable[3][ B3(wtxt[1]) ]);
-	  t[1] = (  itable[0][ B0(wtxt[1]) ] /* 3 0 1 2 */
-		  ^ itable[1][ B1(wtxt[0]) ]
-		  ^ itable[2][ B2(wtxt[3]) ]
-		  ^ itable[3][ B3(wtxt[2]) ]);
-	  t[2] = (  itable[0][ B0(wtxt[2]) ] /* 2 3 0 1 */
-		  ^ itable[1][ B1(wtxt[1]) ]
-		  ^ itable[2][ B2(wtxt[0]) ]
-		  ^ itable[3][ B3(wtxt[3]) ]);
-	  t[3] = (  itable[0][ B0(wtxt[3]) ] /* 1 2 3 0 */
-		  ^ itable[1][ B1(wtxt[2]) ]
-		  ^ itable[2][ B2(wtxt[1]) ]
-		  ^ itable[3][ B3(wtxt[0]) ]);
+	      /* Computes one word t[j] per pass of the enclosing loop over j.
+	       * FIXME: Figure out how the indexing should really be done.
+	       * It looks like this code shifts the rows in the wrong
+	       * direction, but it passes the testsuite. */
+	      t[j] = (  itable[0][ B0(wtxt[iidx[0][j]]) ]
+		      ^ itable[1][ B1(wtxt[iidx[1][j]]) ]
+		      ^ itable[2][ B2(wtxt[iidx[2][j]]) ]
+		      ^ itable[3][ B3(wtxt[iidx[3][j]]) ]);
 #endif /* !AES_SMALL */
+	    }
 #if DEBUG
 	  fprintf(stderr, "  t: ");
 	  for (j = 0; j<4; j++)
@@ -279,148 +218,26 @@ aes_decrypt(struct aes_ctx *ctx,
       /* Final round */
       {
 	uint32_t clear;
+	unsigned j;
+	for (j = 0; j<4; j++)
+	  {
+	    /* FIXME: Figure out how the indexing should really be done.
+	     * It looks like this code shifts the rows in the wrong
+	     * direction, but it passes the testsuite. */
+
+	    clear = (   (uint32_t) isbox[ B0(wtxt[j]) ]
+			| ((uint32_t) isbox[ B1(wtxt[iidx[1][j]]) ] << 8)
+			| ((uint32_t) isbox[ B2(wtxt[iidx[2][j]]) ] << 16)
+			| ((uint32_t) isbox[ B3(wtxt[iidx[3][j]]) ] << 24));
 
-	/* FIXME: Figure out how the indexing should really be done.
-	 * It looks like this code shifts the rows in the wrong
-	 * direction, but it passes the testsuite. */
-
-	clear = (   (uint32_t) isbox[ B0(wtxt[0]) ]
-		 | ((uint32_t) isbox[ B1(wtxt[3]) ] << 8)
-		 | ((uint32_t) isbox[ B2(wtxt[2]) ] << 16)
-		 | ((uint32_t) isbox[ B3(wtxt[1]) ] << 24));
-#if DEBUG
-	fprintf(stderr, "  t[0]: %x, key: %x\n",
-		clear, ctx->ikeys[4*round]);
-#endif
-	clear ^= ctx->ikeys[4*round];
-
-	LE_WRITE_UINT32(dst, clear);
-
-	clear = (   (uint32_t) isbox[ B0(wtxt[1]) ]
-		 | ((uint32_t) isbox[ B1(wtxt[0]) ] << 8)
-		 | ((uint32_t) isbox[ B2(wtxt[3]) ] << 16)
-		 | ((uint32_t) isbox[ B3(wtxt[2]) ] << 24));
-#if DEBUG
-	fprintf(stderr, "  t[1]: %x, key: %x\n",
-		clear, ctx->ikeys[4*round + 1]);
-#endif
-	clear ^= ctx->ikeys[4*round + 1];
-	
-	LE_WRITE_UINT32(dst + 4, clear);
-		
-	clear = (   (uint32_t) isbox[ B0(wtxt[2]) ]
-		 | ((uint32_t) isbox[ B1(wtxt[1]) ] << 8)
-		 | ((uint32_t) isbox[ B2(wtxt[0]) ] << 16)
-		 | ((uint32_t) isbox[ B3(wtxt[3]) ] << 24));
 #if DEBUG
-	fprintf(stderr, "  t[2]: %x, key: %x\n",
-		clear, ctx->ikeys[4*round+2]);
+	    fprintf(stderr, "  t[%d]: %x, key: %x\n",
+		    j, clear, ctx->ikeys[4*round + j]);
 #endif
-	clear ^= ctx->ikeys[4*round + 2];
+	    clear ^= ctx->ikeys[4*round + j];
 
-	LE_WRITE_UINT32(dst + 8, clear);
-
-	clear = (   (uint32_t) isbox[ B0(wtxt[3]) ]
-		 | ((uint32_t) isbox[ B1(wtxt[2]) ] << 8)
-		 | ((uint32_t) isbox[ B2(wtxt[1]) ] << 16)
-		 | ((uint32_t) isbox[ B3(wtxt[0]) ] << 24));
-#if DEBUG
-	fprintf(stderr, "  t[3]: %x, key: %x\n",
-		clear, ctx->ikeys[4*round+3]);
-#endif
-	clear ^= ctx->ikeys[4*round + 3];
-
-	LE_WRITE_UINT32(dst + 12, clear);
+	    LE_WRITE_UINT32(dst + 4*j, clear);
+	  }
       }
     }
 }
-
-#else
-/* Key addition that also packs every byte in the key to a word rep. */
-static void
-key_addition_8to32(const uint8_t *txt, const uint32_t *keys, uint32_t *out)
-{
-  const uint8_t *ptr;
-  unsigned i, j;
-  uint32_t val;
-
-  ptr = txt;
-  for (i=0; i<4; i++)
-    {
-      /* FIXME: Use the READ_UINT32 or LE_READ_UINT32 macro. */
-      val = 0;
-      for (j=0; j<4; j++)
-	val |= (*ptr++ << 8*j);
-      out[i] = keys[i]^val;
-    }
-}
-
-static void
-key_addition32(const uint32_t *txt, const uint32_t *keys, uint32_t *out)
-{
-  unsigned i;
-
-  for (i=0; i<4; i++)
-    out[i] = keys[i] ^ txt[i];
-}
-
-static void
-key_addition32to8(const uint32_t *txt, const uint32_t *keys, uint8_t *out)
-{
-  uint8_t *ptr;
-  unsigned i, j;
-  uint32_t val;
-
-  ptr = out;
-  for (i=0; i<4; i++)
-    {
-      /* FIXME: Use WRITE_UINT32 or LE_WRITE_UINT32 */
-      val = txt[i] ^ keys[i];
-      for (j=0; j<4; j++)
-	*ptr++ = (val >> 8*j) & 0xff;
-    }
-}
-
-void
-aes_decrypt(struct aes_ctx *ctx,
-	    unsigned length, uint8_t *dst,
-	    const uint8_t *src)
-{
-  unsigned r, j;
-  uint32_t wtxt[4], t[4];		/* working ciphertext */
-  uint32_t e;
-
-  assert(!(length % AES_BLOCK_SIZE));
-
-  for (; length;
-       length -= AES_BLOCK_SIZE, src += AES_BLOCK_SIZE, dst += AES_BLOCK_SIZE)
-    {
-      key_addition_8to32(src, ctx->ikeys + 4*ctx->nrounds, wtxt);
-      for (r=ctx->nrounds-1; r> 0;  r--)
-	{
-	  for (j=0; j<4; j++)
-	    {
-	      t[j] = itbl[wtxt[j] & 0xff] ^
-		ROTRBYTE(itbl[(wtxt[iidx[1][j]] >> 8) & 0xff]^
-			 ROTRBYTE(itbl[(wtxt[iidx[2][j]] >> 16) & 0xff] ^
-				  ROTRBYTE(itbl[(wtxt[iidx[3][j]] >> 24) & 0xff])));
-	    }
-	  key_addition32(t, ctx->ikeys + r*4, wtxt);
-	}
-      /* last round is special: there is no mixcolumn, so we can't use the big
-	 tables. */
-      for (j=0; j<4; j++)
-	{
-	  e = wtxt[j] & 0xff;
-	  e |= (wtxt[iidx[1][j]]) & (0xff << 8);
-	  e |= (wtxt[iidx[2][j]]) & (0xff << 16);
-	  e |= (wtxt[iidx[3][j]]) & (0xff << 24);
-	  t[j] = e;
-	}
-      for (j=0; j<4; j++)
-	t[j] = SUBBYTE(t[j], isbox);
-
-      key_addition32to8(t, ctx->ikeys, dst);
-    }
-}
-#endif
-- 
GitLab