diff --git a/ChangeLog b/ChangeLog index 7cdde45dede94953c3f5b1fd9ced45756a0315c4..66fedf0670712ec8e0f239293f0fbd6da0fff80b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,14 @@ 2016-10-12 Niels Möller <nisse@lysator.liu.se> + * skein.h (_SKEIN256_NKEYS, _SKEIN_NTWEAK): Increase to 6 and 4, + respectively, to reduce number of modulo operations. + * skein256.c (_skein256_expand): Initialize the additional words. + (_skein256_block): Avoid modulo operations in the loop. Almost 50% + speedup. + * skein256.c (_skein256_expand): New function. (skein256_process_block): Use it. + * testsuite/skein256-test.c (test_skein256_block): Use it. 2016-10-10 Niels Möller <nisse@lysator.liu.se> diff --git a/skein.h b/skein.h index a5d09de18f7e2e22c4094e29a31c8ec59693707f..c22f06d6fa7a267d7621f23e8a879c2377a901ef 100644 --- a/skein.h +++ b/skein.h @@ -50,10 +50,12 @@ extern "C" { #define SKEIN256_BLOCK_SIZE 32 #define SKEIN256_DIGEST_SIZE 32 -/* Internal lengths, as 64-bit words. */ -#define _SKEIN_NTWEAK 3 +/* Internal lengths, as 64-bit words. We use *two* redundant words for + both key and tweak, to reduce the number of index mod + operations. 
*/ #define _SKEIN256_LENGTH 4 -#define _SKEIN256_NKEYS 5 +#define _SKEIN256_NKEYS 6 +#define _SKEIN_NTWEAK 4 struct skein256_ctx { uint64_t state[_SKEIN256_NKEYS]; diff --git a/skein256.c b/skein256.c index 703182a91b1d218add4e1888d5d2cca6e64f9a13..12fab3f88d4984de531a05d16580bbe6c754b73a 100644 --- a/skein256.c +++ b/skein256.c @@ -92,11 +92,11 @@ w3 ^= w2; \ } while(0) -#define ADD_SUBKEY(w0, w1, w2, w3, keys, tw, i) do { \ - w0 += (keys)[(i) % 5u]; \ - w1 += (keys)[((i)+1u) % 5u] + (tw)[(i) % 3u]; \ - w2 += (keys)[((i)+2u) % 5u] + (tw)[((i)+1u) % 3u]; \ - w3 += (keys)[((i)+3u) % 5u] + (i); \ +#define ADD_SUBKEY(w0, w1, w2, w3, k0, k1, k2, k3, t0, t1, i) do { \ + w0 += (k0); \ + w1 += (k1) + (t0); \ + w2 += (k2) + (t1); \ + w3 += (k3) + (i); \ } while (0) void @@ -108,29 +108,47 @@ _skein256_block (uint64_t dst[_SKEIN256_LENGTH], uint64_t s0, s1, s2, s3; uint64_t w0, w1, w2, w3; unsigned i; + unsigned imod5, ip2mod5, imod3; w0 = s0 = LE_READ_UINT64(src); w1 = s1 = LE_READ_UINT64(src + 8); w2 = s2 = LE_READ_UINT64(src + 16); w3 = s3 = LE_READ_UINT64(src + 24); - for (i = 0; i < 9; i++) + for (i = imod5 = imod3 = 0, ip2mod5 = 2; i < 18; i+=2) { - ADD_SUBKEY(w0, w1, w2, w3, keys, tweak, 2*i); + unsigned ip4mod5; + unsigned ip2mod3; + + ADD_SUBKEY(w0, w1, w2, w3, + keys[imod5], keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1], + tweak[imod3], tweak[imod3+1], i); ROUND(w0, w1, w2, w3, 14, 16); ROUND(w0, w3, w2, w1, 52, 57); ROUND(w0, w1, w2, w3, 23, 40); ROUND(w0, w3, w2, w1, 5, 37); - ADD_SUBKEY(w0, w1, w2, w3, keys, tweak, 2*i+1); + /* Hopefully compiled to a conditional move, but gcc-6.1.1 doesn't. */ + ip4mod5 = imod5 ? imod5 - 1 : 4; + ip2mod3 = imod3 ? 
imod3 - 1 : 2; + + ADD_SUBKEY(w0, w1, w2, w3, + keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1], keys[ip4mod5], + tweak[imod3+1], tweak[ip2mod3], i + 1); ROUND(w0, w1, w2, w3, 25, 33); ROUND(w0, w3, w2, w1, 46, 12); ROUND(w0, w1, w2, w3, 58, 22); ROUND(w0, w3, w2, w1, 32, 32); + + imod5 = ip2mod5; + ip2mod5 = ip4mod5; + imod3 = ip2mod3; } - ADD_SUBKEY(w0, w1, w2, w3, keys, tweak, 18); + ADD_SUBKEY(w0, w1, w2, w3, /* 18 mod 5 = 3, 18 mod 3 = 0 */ + keys[3], keys[4], keys[0], keys[1], + tweak[0], tweak[1], 18); dst[0] = s0 ^ w0; dst[1] = s1 ^ w1; @@ -148,7 +166,9 @@ _skein256_expand(uint64_t keys[_SKEIN256_NKEYS], for (i = 0, sum = _SKEIN_C240; i < _SKEIN256_LENGTH; i++) sum ^= keys[i]; keys[_SKEIN256_LENGTH] = sum; + keys[_SKEIN256_LENGTH + 1] = keys[0]; tweak[2] = tweak[0] ^ tweak[1]; + tweak[3] = tweak[0]; } void