diff --git a/aes.c b/aes.c
index 47a054729dac9b54fed5c1c08b80e9a36b1a3690..a32e6df43fe76f7979d0ff67f7f8074dd9b35cec 100644
--- a/aes.c
+++ b/aes.c
@@ -32,7 +32,7 @@
 #include <assert.h>
 
 #ifndef DEBUG
-#define DEBUG 0
+# define DEBUG 0
 #endif
 
 #if DEBUG
@@ -45,12 +45,20 @@
 #define B2(x) (((x) >> 16) & 0xff)
 #define B3(x) (((x) >> 24) & 0xff)
 
+/* Column j holds the shifts used when computing t[j].
+ * Row i says which byte is used. */
 #if AES_SMALL
 static const unsigned idx[4][4] = {
   { 0, 1, 2, 3 },
   { 1, 2, 3, 0 },
   { 2, 3, 0, 1 },
   { 3, 0, 1, 2 } };
+
+static const unsigned iidx[4][4] = {
+  { 0, 1, 2, 3 },
+  { 3, 0, 1, 2 },
+  { 2, 3, 0, 1 },
+  { 1, 2, 3, 0 } };
 #endif /* AES_SMALL */
 
 void
@@ -75,71 +83,53 @@ aes_encrypt(struct aes_ctx *ctx,
 	  unsigned j;
 
 #if DEBUG
-	  fprintf(stderr, "round: %d\n  wtxt: ", round);
+	  fprintf(stderr, "encrypt, round: %d\n  wtxt: ", round);
 	  for (j = 0; j<4; j++)
 	    fprintf(stderr, "%08x, ", wtxt[j]);
 	  fprintf(stderr, "\n  key: ");
 	  for (j = 0; j<4; j++)
 	    fprintf(stderr, "%08x, ", ctx->keys[4*round + j]);
 	  fprintf(stderr, "\n");
-
-	  fprintf(stderr,
-		  "  B0(wtxt[0]): %x\n"
-		  "    dtbl[0]: %x\n",
-		  B0(wtxt[0]), dtbl[0][ B0(wtxt[0]) ]);
-	  fprintf(stderr,
-		  "  B1(wtxt[1]): %x\n"
-		  "    dtbl[1]: %x\n",
-		  B1(wtxt[1]), dtbl[1][ B1(wtxt[1]) ]);
-	  fprintf(stderr,
-		  "  B2(wtxt[2]): %x\n"
-		  "    dtbl[2]: %x\n",
-		  B2(wtxt[2]), dtbl[2][ B2(wtxt[2]) ]);
-	  fprintf(stderr,
-		  "  B3(wtxt[3]): %x\n"
-		  "    dtbl[3]: %x\n",
-		  B3(wtxt[3]), dtbl[3][ B3(wtxt[3]) ]);
 #endif
-
 	  /* The row shift counts C1, C2 and C3 are (1, 2, 3) */
 	  /* What's the best way to order this loop? Ideally,
 	   * we'd want to keep both t and wtxt in registers. */
 
 #if AES_SMALL
 	  for (j=0; j<4; j++)
-	    t[j] =         dtbl[0][ B0(wtxt[j]) ] ^
-	      ROTRBYTE(    dtbl[0][ B1(wtxt[idx[1][j]]) ]^
-		ROTRBYTE(  dtbl[0][ B2(wtxt[idx[2][j]]) ] ^
-		  ROTRBYTE(dtbl[0][ B3(wtxt[idx[3][j]]) ])));
+	    t[j] =         dtable[0][ B0(wtxt[j]) ] ^
+	      ROTRBYTE(    dtable[0][ B1(wtxt[idx[1][j]]) ]^
+		ROTRBYTE(  dtable[0][ B2(wtxt[idx[2][j]]) ] ^
+		  ROTRBYTE(dtable[0][ B3(wtxt[idx[3][j]]) ])));
 #else /* !AES_SMALL */
 	  
 	  /* FIXME: Figure out how the indexing should really be done.
 	   * It looks like this code shifts the rows in the wrong
 	   * direction, but it passes the testsuite. */
-	  t[0] = (  dtbl[0][ B0(wtxt[0]) ]
-		  ^ dtbl[1][ B1(wtxt[1]) ]
-		  ^ dtbl[2][ B2(wtxt[2]) ]
-		  ^ dtbl[3][ B3(wtxt[3]) ]);
-	  t[3] = (  dtbl[0][ B0(wtxt[3]) ]
-		  ^ dtbl[1][ B1(wtxt[0]) ]
-		  ^ dtbl[2][ B2(wtxt[1]) ]
-		  ^ dtbl[3][ B3(wtxt[2]) ]);
-	  t[2] = (  dtbl[0][ B0(wtxt[2]) ]
-		  ^ dtbl[1][ B1(wtxt[3]) ]
-		  ^ dtbl[2][ B2(wtxt[0]) ]
-		  ^ dtbl[3][ B3(wtxt[1]) ]);
-	  t[1] = (  dtbl[0][ B0(wtxt[1]) ]
-		  ^ dtbl[1][ B1(wtxt[2]) ]
-		  ^ dtbl[2][ B2(wtxt[3]) ]
-		  ^ dtbl[3][ B3(wtxt[0]) ]);
+	  t[0] = (  dtable[0][ B0(wtxt[0]) ]
+		  ^ dtable[1][ B1(wtxt[1]) ]
+		  ^ dtable[2][ B2(wtxt[2]) ]
+		  ^ dtable[3][ B3(wtxt[3]) ]);
+	  t[1] = (  dtable[0][ B0(wtxt[1]) ]
+		  ^ dtable[1][ B1(wtxt[2]) ]
+		  ^ dtable[2][ B2(wtxt[3]) ]
+		  ^ dtable[3][ B3(wtxt[0]) ]);
+	  t[2] = (  dtable[0][ B0(wtxt[2]) ]
+		  ^ dtable[1][ B1(wtxt[3]) ]
+		  ^ dtable[2][ B2(wtxt[0]) ]
+		  ^ dtable[3][ B3(wtxt[1]) ]);
+	  t[3] = (  dtable[0][ B0(wtxt[3]) ]
+		  ^ dtable[1][ B1(wtxt[0]) ]
+		  ^ dtable[2][ B2(wtxt[1]) ]
+		  ^ dtable[3][ B3(wtxt[2]) ]);
 #endif /* !AES_SMALL */
-
 #if DEBUG
 	  fprintf(stderr, "\n  t: ");
 	  for (j = 0; j<4; j++)
 	    fprintf(stderr, "%08x, ", t[j]);
 	  fprintf(stderr, "\n");
 #endif
+
 	  for (j = 0; j<4; j++)
 	    wtxt[j] = t[j] ^ ctx->keys[4*round + j];
 	}
@@ -154,39 +144,198 @@ aes_encrypt(struct aes_ctx *ctx,
 	cipher = (   (uint32_t) sbox[ B0(wtxt[0]) ]
 		  | ((uint32_t) sbox[ B1(wtxt[1]) ] << 8)
 		  | ((uint32_t) sbox[ B2(wtxt[2]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[3]) ] << 24))
-	  ^ ctx->keys[4*round];
+		  | ((uint32_t) sbox[ B3(wtxt[3]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[0]: %x, key: %x\n",
+		cipher, ctx->keys[4*round]);
+#endif
+	cipher ^= ctx->keys[4*round];
 
 	LE_WRITE_UINT32(dst, cipher);
 
 	cipher = (   (uint32_t) sbox[ B0(wtxt[1]) ]
 		  | ((uint32_t) sbox[ B1(wtxt[2]) ] << 8)
 		  | ((uint32_t) sbox[ B2(wtxt[3]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[0]) ] << 24))
-	  ^ ctx->keys[4*round + 1];
+		  | ((uint32_t) sbox[ B3(wtxt[0]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[1]: %x, key: %x\n",
+		cipher, ctx->keys[4*round + 1]);
+#endif
+	cipher ^= ctx->keys[4*round + 1];
 	
 	LE_WRITE_UINT32(dst + 4, cipher);
 		
 	cipher = (   (uint32_t) sbox[ B0(wtxt[2]) ]
 		  | ((uint32_t) sbox[ B1(wtxt[3]) ] << 8)
 		  | ((uint32_t) sbox[ B2(wtxt[0]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[1]) ] << 24))
-	  ^ ctx->keys[4*round + 2];
+		  | ((uint32_t) sbox[ B3(wtxt[1]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[2]: %x, key: %x\n",
+		cipher, ctx->keys[4*round + 2]);
+#endif
+	cipher ^= ctx->keys[4*round + 2];
 
 	LE_WRITE_UINT32(dst + 8, cipher);
 
 	cipher = (   (uint32_t) sbox[ B0(wtxt[3]) ]
 		  | ((uint32_t) sbox[ B1(wtxt[0]) ] << 8)
 		  | ((uint32_t) sbox[ B2(wtxt[1]) ] << 16)
-		  | ((uint32_t) sbox[ B3(wtxt[2]) ] << 24))
-	  ^ ctx->keys[4*round + 3];
+		  | ((uint32_t) sbox[ B3(wtxt[2]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[3]: %x, key: %x\n",
+		cipher, ctx->keys[4*round + 3]);
+#endif
+	cipher ^= ctx->keys[4*round + 3];
 
 	LE_WRITE_UINT32(dst + 12, cipher);
       }
     }
 }
-      
-      
+
+#if 1
+void
+aes_decrypt(struct aes_ctx *ctx,
+	    unsigned length, uint8_t *dst,
+	    const uint8_t *src)
+{
+#if DEBUG
+  {
+    unsigned i, j;
+    fprintf(stderr, "subkeys:\n");
+    for (j = 0; j<=ctx->nrounds; j++)
+      {
+	printf(" %d: ", j);
+	for (i = 0; i<4; i++)
+	  printf("%08x, ", ctx->ikeys[i + 4*j]);
+	printf("\n");
+      }
+  }
+#endif
+  FOR_BLOCKS(length, dst, src, AES_BLOCK_SIZE)
+    {
+      uint32_t wtxt[4];		/* working ciphertext */
+      unsigned i;
+      unsigned round;
+
+      /* Get cipher text, using little-endian byte order.
+       * Also XOR with the first subkey. */
+      for (i = 0; i<4; i++)
+	wtxt[i] = LE_READ_UINT32(src + 4*i) ^ ctx->ikeys[i];
+
+      for (round = 1; round < ctx->nrounds; round++)
+	{
+	  uint32_t t[4];
+	  unsigned j;
+
+#if DEBUG
+	  fprintf(stderr, "decrypt, round: %d\n  wtxt: ", round);
+	  for (j = 0; j<4; j++)
+	    fprintf(stderr, "%08x, ", wtxt[j]);
+	  fprintf(stderr, "\n  key: ");
+	  for (j = 0; j<4; j++)
+	    fprintf(stderr, "%08x, ", ctx->ikeys[4*round + j]);
+	  fprintf(stderr, "\n");
+#endif
+	  /* The row shift counts C1, C2 and C3 are (1, 2, 3) */
+	  /* What's the best way to order this loop? Ideally,
+	   * we'd want to keep both t and wtxt in registers. */
+
+#if AES_SMALL
+	  for (j=0; j<4; j++)
+	    t[j] =         itable[0][ B0(wtxt[j]) ] ^
+	      ROTRBYTE(    itable[0][ B1(wtxt[iidx[1][j]]) ]^
+		ROTRBYTE(  itable[0][ B2(wtxt[iidx[2][j]]) ] ^
+		  ROTRBYTE(itable[0][ B3(wtxt[iidx[3][j]]) ])));
+#else /* !AES_SMALL */
+	  /* FIXME: Figure out how the indexing should really be done.
+	   * It looks like this code shifts the rows in the wrong
+	   * direction, but it passes the testsuite. */
+	  t[0] = (  itable[0][ B0(wtxt[0]) ] /* 0 1 2 3 */
+		  ^ itable[1][ B1(wtxt[3]) ]
+		  ^ itable[2][ B2(wtxt[2]) ]
+		  ^ itable[3][ B3(wtxt[1]) ]);
+	  t[1] = (  itable[0][ B0(wtxt[1]) ] /* 3 0 1 2 */
+		  ^ itable[1][ B1(wtxt[0]) ]
+		  ^ itable[2][ B2(wtxt[3]) ]
+		  ^ itable[3][ B3(wtxt[2]) ]);
+	  t[2] = (  itable[0][ B0(wtxt[2]) ] /* 2 3 0 1 */
+		  ^ itable[1][ B1(wtxt[1]) ]
+		  ^ itable[2][ B2(wtxt[0]) ]
+		  ^ itable[3][ B3(wtxt[3]) ]);
+	  t[3] = (  itable[0][ B0(wtxt[3]) ] /* 1 2 3 0 */
+		  ^ itable[1][ B1(wtxt[2]) ]
+		  ^ itable[2][ B2(wtxt[1]) ]
+		  ^ itable[3][ B3(wtxt[0]) ]);
+#endif /* !AES_SMALL */
+#if DEBUG
+	  fprintf(stderr, "  t: ");
+	  for (j = 0; j<4; j++)
+	    fprintf(stderr, "%08x, ", t[j]);
+	  fprintf(stderr, "\n");
+#endif
+	  for (j = 0; j<4; j++)
+	    wtxt[j] = t[j] ^ ctx->ikeys[4*round + j];
+	}
+      /* Final round */
+      {
+	uint32_t clear;
+
+	/* FIXME: Figure out how the indexing should really be done.
+	 * It looks like this code shifts the rows in the wrong
+	 * direction, but it passes the testsuite. */
+
+	clear = (   (uint32_t) isbox[ B0(wtxt[0]) ]
+		 | ((uint32_t) isbox[ B1(wtxt[3]) ] << 8)
+		 | ((uint32_t) isbox[ B2(wtxt[2]) ] << 16)
+		 | ((uint32_t) isbox[ B3(wtxt[1]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[0]: %x, key: %x\n",
+		clear, ctx->ikeys[4*round]);
+#endif
+	clear ^= ctx->ikeys[4*round];
+
+	LE_WRITE_UINT32(dst, clear);
+
+	clear = (   (uint32_t) isbox[ B0(wtxt[1]) ]
+		 | ((uint32_t) isbox[ B1(wtxt[0]) ] << 8)
+		 | ((uint32_t) isbox[ B2(wtxt[3]) ] << 16)
+		 | ((uint32_t) isbox[ B3(wtxt[2]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[1]: %x, key: %x\n",
+		clear, ctx->ikeys[4*round + 1]);
+#endif
+	clear ^= ctx->ikeys[4*round + 1];
+	
+	LE_WRITE_UINT32(dst + 4, clear);
+		
+	clear = (   (uint32_t) isbox[ B0(wtxt[2]) ]
+		 | ((uint32_t) isbox[ B1(wtxt[1]) ] << 8)
+		 | ((uint32_t) isbox[ B2(wtxt[0]) ] << 16)
+		 | ((uint32_t) isbox[ B3(wtxt[3]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[2]: %x, key: %x\n",
+		clear, ctx->ikeys[4*round+2]);
+#endif
+	clear ^= ctx->ikeys[4*round + 2];
+
+	LE_WRITE_UINT32(dst + 8, clear);
+
+	clear = (   (uint32_t) isbox[ B0(wtxt[3]) ]
+		 | ((uint32_t) isbox[ B1(wtxt[2]) ] << 8)
+		 | ((uint32_t) isbox[ B2(wtxt[1]) ] << 16)
+		 | ((uint32_t) isbox[ B3(wtxt[0]) ] << 24));
+#if DEBUG
+	fprintf(stderr, "  t[3]: %x, key: %x\n",
+		clear, ctx->ikeys[4*round+3]);
+#endif
+	clear ^= ctx->ikeys[4*round + 3];
+
+	LE_WRITE_UINT32(dst + 12, clear);
+      }
+    }
+}
+
+#else
 /* Key addition that also packs every byte in the key to a word rep. */
 static void
 key_addition_8to32(const uint8_t *txt, const uint32_t *keys, uint32_t *out)
@@ -232,12 +381,6 @@ key_addition32to8(const uint32_t *txt, const uint32_t *keys, uint8_t *out)
     }
 }
 
-static const unsigned iidx[4][4] = {
-  { 0, 1, 2, 3 },
-  { 3, 0, 1, 2 },
-  { 2, 3, 0, 1 },
-  { 1, 2, 3, 0 } };
-
 void
 aes_decrypt(struct aes_ctx *ctx,
 	    unsigned length, uint8_t *dst,
@@ -280,3 +423,4 @@ aes_decrypt(struct aes_ctx *ctx,
       key_addition32to8(t, ctx->ikeys, dst);
     }
 }
+#endif