diff --git a/ChangeLog b/ChangeLog
index 7cdde45dede94953c3f5b1fd9ced45756a0315c4..66fedf0670712ec8e0f239293f0fbd6da0fff80b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,7 +1,14 @@
 2016-10-12  Niels Möller  <nisse@lysator.liu.se>
 
+	* skein.h (_SKEIN256_NKEYS, _SKEIN_NTWEAK): Increase to 6 and 4,
+	respectively, to reduce number of modulo operations.
+	* skein256.c (_skein256_expand): Initialize the additional words.
+	(_skein256_block): Avoid modulo operations in the loop. Almost 50%
+	speedup.
+
 	* skein256.c (_skein256_expand): New function.
 	(skein256_process_block): Use it.
+
 	* testsuite/skein256-test.c (test_skein256_block): Use it.
 
 2016-10-10  Niels Möller  <nisse@lysator.liu.se>
diff --git a/skein.h b/skein.h
index a5d09de18f7e2e22c4094e29a31c8ec59693707f..c22f06d6fa7a267d7621f23e8a879c2377a901ef 100644
--- a/skein.h
+++ b/skein.h
@@ -50,10 +50,12 @@ extern "C" {
 #define SKEIN256_BLOCK_SIZE 32
 #define SKEIN256_DIGEST_SIZE 32
 
-/* Internal lengths, as 64-bit words. */
-#define _SKEIN_NTWEAK 3
+/* Internal lengths, as 64-bit words. We use *two* redundant words for
+   both key and tweak, to reduce the number of index mod
+   operations. */
 #define _SKEIN256_LENGTH 4
-#define _SKEIN256_NKEYS 5
+#define _SKEIN256_NKEYS 6
+#define _SKEIN_NTWEAK 4
 
 struct skein256_ctx {
   uint64_t state[_SKEIN256_NKEYS];
diff --git a/skein256.c b/skein256.c
index 703182a91b1d218add4e1888d5d2cca6e64f9a13..12fab3f88d4984de531a05d16580bbe6c754b73a 100644
--- a/skein256.c
+++ b/skein256.c
@@ -92,11 +92,11 @@
     w3 ^= w2;								\
   } while(0)
 
-#define ADD_SUBKEY(w0, w1, w2, w3, keys, tw, i) do { \
-    w0 += (keys)[(i) % 5u];			    \
-    w1 += (keys)[((i)+1u) % 5u] + (tw)[(i) % 3u];	    \
-    w2 += (keys)[((i)+2u) % 5u] + (tw)[((i)+1u) % 3u]; \
-    w3 += (keys)[((i)+3u) % 5u] + (i);		       \
+#define ADD_SUBKEY(w0, w1, w2, w3, k0, k1, k2, k3, t0, t1, i) do { \
+    w0 += (k0);			    \
+    w1 += (k1) + (t0);	    \
+    w2 += (k2) + (t1); \
+    w3 += (k3) + (i);		       \
   } while (0)
 
 void
@@ -108,29 +108,47 @@ _skein256_block (uint64_t dst[_SKEIN256_LENGTH],
   uint64_t s0, s1, s2, s3;
   uint64_t w0, w1, w2, w3;
   unsigned i;
+  unsigned imod5, ip2mod5, imod3;
 
   w0 = s0 = LE_READ_UINT64(src);
   w1 = s1 = LE_READ_UINT64(src + 8);
   w2 = s2 = LE_READ_UINT64(src + 16);
   w3 = s3 = LE_READ_UINT64(src + 24);
 
-  for (i = 0; i < 9; i++)
+  for (i = imod5 = imod3 = 0, ip2mod5 = 2; i < 18; i+=2)
     {
-      ADD_SUBKEY(w0, w1, w2, w3, keys, tweak, 2*i);
+      unsigned ip4mod5;
+      unsigned ip2mod3;
+
+      ADD_SUBKEY(w0, w1, w2, w3,
+		 keys[imod5], keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1],
+		 tweak[imod3], tweak[imod3+1], i);
 
       ROUND(w0, w1, w2, w3, 14, 16);
       ROUND(w0, w3, w2, w1, 52, 57);
       ROUND(w0, w1, w2, w3, 23, 40);
       ROUND(w0, w3, w2, w1, 5, 37);
 
-      ADD_SUBKEY(w0, w1, w2, w3, keys, tweak, 2*i+1);
+      /* Hopefully compiled to conditional moves, but gcc-6.1.1 doesn't do so. */
+      ip4mod5 = imod5 ? imod5 - 1 : 4;
+      ip2mod3 = imod3 ? imod3 - 1 : 2;
+
+      ADD_SUBKEY(w0, w1, w2, w3,
+		 keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1], keys[ip4mod5],
+		 tweak[imod3+1], tweak[ip2mod3], i + 1);
 
       ROUND(w0, w1, w2, w3, 25, 33);
       ROUND(w0, w3, w2, w1, 46, 12);
       ROUND(w0, w1, w2, w3, 58, 22);
       ROUND(w0, w3, w2, w1, 32, 32);
+
+      imod5 = ip2mod5;
+      ip2mod5 = ip4mod5;
+      imod3 = ip2mod3;
     }
-  ADD_SUBKEY(w0, w1, w2, w3, keys, tweak, 18);
+  ADD_SUBKEY(w0, w1, w2, w3, /* 18 mod 5 = 3, 18 mod 3 = 0 */
+	     keys[3], keys[4], keys[0], keys[1],
+	     tweak[0], tweak[1], 18);
 
   dst[0] = s0 ^ w0;
   dst[1] = s1 ^ w1;
@@ -148,7 +166,9 @@ _skein256_expand(uint64_t keys[_SKEIN256_NKEYS],
   for (i = 0, sum = _SKEIN_C240; i < _SKEIN256_LENGTH; i++)
     sum ^= keys[i];
   keys[_SKEIN256_LENGTH] = sum;
+  keys[_SKEIN256_LENGTH + 1] = keys[0];
   tweak[2] = tweak[0] ^ tweak[1];
+  tweak[3] = tweak[0];
 }
 
 void