From a7457dfaee08e24f52568582dab1d897a786d85a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Sun, 9 Dec 2012 19:34:19 +0100 Subject: [PATCH] Rewrote C implementation of sha3-permute. --- ChangeLog | 5 ++ sha3-permute.c | 169 ++++++++++++++++++++++++++++--------------------- 2 files changed, 103 insertions(+), 71 deletions(-) diff --git a/ChangeLog b/ChangeLog index 93390291..c9270851 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2012-12-09 Niels Möller <nisse@lysator.liu.se> + + * sha3-permute.c (sha3_permute): Rewrote to do permutation in + place. 80% speedup on x86_64, 2500 cycles. + 2012-12-04 Niels Möller <nisse@lysator.liu.se> * ctr.c (ctr_crypt): Fix bug reported by Tim Kosse. Don't diff --git a/sha3-permute.c b/sha3-permute.c index 1235bdc0..59dc303e 100644 --- a/sha3-permute.c +++ b/sha3-permute.c @@ -33,11 +33,24 @@ #define SHA3_ROUNDS 24 -/* Based on the pseudocode description at - http://keccak.noekeon.org/specs_summary.html */ void sha3_permute (struct sha3_state *state) { + static const uint64_t rc[SHA3_ROUNDS] = { + 0x0000000000000001ULL, 0X0000000000008082ULL, + 0X800000000000808AULL, 0X8000000080008000ULL, + 0X000000000000808BULL, 0X0000000080000001ULL, + 0X8000000080008081ULL, 0X8000000000008009ULL, + 0X000000000000008AULL, 0X0000000000000088ULL, + 0X0000000080008009ULL, 0X000000008000000AULL, + 0X000000008000808BULL, 0X800000000000008BULL, + 0X8000000000008089ULL, 0X8000000000008003ULL, + 0X8000000000008002ULL, 0X8000000000000080ULL, + 0X000000000000800AULL, 0X800000008000000AULL, + 0X8000000080008081ULL, 0X8000000000008080ULL, + 0X0000000080000001ULL, 0X8000000080008008ULL, + }; + /* Original permutation: 0,10,20, 5,15, @@ -55,43 +68,39 @@ sha3_permute (struct sha3_state *state) 18, 2, 61, 56, 14, */ - /* Inverse permutation, to generate the output array in order. */ - static const unsigned char iperm[25] = - { - 0, 6, 12, 18, 24, - 3, 9, 10, 16, 22, - 1, 7, 13, 19, 20, - 4, 5, 11, 17, 23, - 2, 8, 14, 15, 21 - }; - - /* Correspondingly permuted rotation counts. */ - static const unsigned char irot[25] = - { - 0, 44, 43, 21, 14, - 28, 20, 3, 45, 61, - 1, 6, 25, 8, 18, - 27, 36, 10, 15, 56, - 62, 55, 39, 41, 2 - }; + /* In-place implementation. Permutation done as a long sequence of + 25 moves "following" the permutation. + + T <-- 1 + 1 <-- 6 + 6 <-- 9 + 9 <-- 22 + 22 <-- 14 + 14 <-- 20 + 20 <-- 2 + 2 <-- 12 + 12 <-- 13 + 13 <-- 19 + 19 <-- 23 + 23 <-- 15 + 15 <-- 4 + 4 <-- 24 + 24 <-- 21 + 21 <-- 8 + 8 <-- 16 + 16 <-- 5 + 5 <-- 3 + 3 <-- 18 + 18 <-- 17 + 17 <-- 11 + 11 <-- 7 + 7 <-- 10 + 10 <-- T + + */ + uint64_t C[5], D[5], T, X; + unsigned i, y; - static const uint64_t rc[SHA3_ROUNDS] = { - 0x0000000000000001ULL, 0X0000000000008082ULL, - 0X800000000000808AULL, 0X8000000080008000ULL, - 0X000000000000808BULL, 0X0000000080000001ULL, - 0X8000000080008081ULL, 0X8000000000008009ULL, - 0X000000000000008AULL, 0X0000000000000088ULL, - 0X0000000080008009ULL, 0X000000008000000AULL, - 0X000000008000808BULL, 0X800000000000008BULL, - 0X8000000000008089ULL, 0X8000000000008003ULL, - 0X8000000000008002ULL, 0X8000000000000080ULL, - 0X000000000000800AULL, 0X800000008000000AULL, - 0X8000000080008081ULL, 0X8000000000008080ULL, - 0X0000000080000001ULL, 0X8000000080008008ULL, - }; - unsigned i; - uint64_t C[5]; - #define A state->a C[0] = A[0] ^ A[5+0] ^ A[10+0] ^ A[15+0] ^ A[20+0]; @@ -102,46 +111,64 @@ sha3_permute (struct sha3_state *state) for (i = 0; i < SHA3_ROUNDS; i++) { - uint64_t D[5], B[25]; - unsigned x, y; - - /* theta step */ D[0] = C[4] ^ ROTL64(1, C[1]); D[1] = C[0] ^ ROTL64(1, C[2]); D[2] = C[1] ^ ROTL64(1, C[3]); D[3] = C[2] ^ ROTL64(1, C[4]); D[4] = C[3] ^ ROTL64(1, C[0]); - for (x = 0; x < 5; x++) - for (y = 0; y < 25; y += 5) - B[y + x] = A[y + x] ^ D[x]; - - /* rho, pi, chi ant iota steps */ - D[0] = B[0]; - D[1] = ROTL64 (irot[1], B[iperm[1]]); - D[2] = ROTL64 (irot[2], B[iperm[2]]); - D[3] = ROTL64 (irot[3], B[iperm[3]]); - D[4] = ROTL64 (irot[4], B[iperm[4]]); - - A[0] = C[0] = D[0] ^(~D[1] & D[2]) ^ rc[i]; - A[1] = C[1] = D[1] ^(~D[2] & D[3]); - A[2] = C[2] = D[2] ^(~D[3] & D[4]); - A[3] = C[3] = D[3] ^(~D[4] & D[0]); - A[4] = C[4] = D[4] ^(~D[0] & D[1]); - - for (y = 5; y < 25; y += 5) + A[0] ^= D[0]; + X = A[ 1] ^ D[1]; T = ROTL64(1, X); + X = A[ 6] ^ D[1]; A[ 1] = ROTL64 (44, X); + X = A[ 9] ^ D[4]; A[ 6] = ROTL64 (20, X); + X = A[22] ^ D[2]; A[ 9] = ROTL64 (61, X); + X = A[14] ^ D[4]; A[22] = ROTL64 (39, X); + X = A[20] ^ D[0]; A[14] = ROTL64 (18, X); + X = A[ 2] ^ D[2]; A[20] = ROTL64 (62, X); + X = A[12] ^ D[2]; A[ 2] = ROTL64 (43, X); + X = A[13] ^ D[3]; A[12] = ROTL64 (25, X); + X = A[19] ^ D[4]; A[13] = ROTL64 ( 8, X); + X = A[23] ^ D[3]; A[19] = ROTL64 (56, X); + X = A[15] ^ D[0]; A[23] = ROTL64 (41, X); + X = A[ 4] ^ D[4]; A[15] = ROTL64 (27, X); + X = A[24] ^ D[4]; A[ 4] = ROTL64 (14, X); + X = A[21] ^ D[1]; A[24] = ROTL64 ( 2, X); + X = A[ 8] ^ D[3]; A[21] = ROTL64 (55, X); /* row 4 done */ + X = A[16] ^ D[1]; A[ 8] = ROTL64 (45, X); + X = A[ 5] ^ D[0]; A[16] = ROTL64 (36, X); + X = A[ 3] ^ D[3]; A[ 5] = ROTL64 (28, X); + X = A[18] ^ D[3]; A[ 3] = ROTL64 (21, X); /* row 0 done */ + X = A[17] ^ D[2]; A[18] = ROTL64 (15, X); + X = A[11] ^ D[1]; A[17] = ROTL64 (10, X); /* row 3 done */ + X = A[ 7] ^ D[2]; A[11] = ROTL64 ( 6, X); /* row 1 done */ + X = A[10] ^ D[0]; A[ 7] = ROTL64 ( 3, X); + A[10] = T; /* row 2 done */ + + D[0] = ~A[1] & A[2]; + D[1] = ~A[2] & A[3]; + D[2] = ~A[3] & A[4]; + D[3] = ~A[4] & A[0]; + D[4] = ~A[0] & A[1]; + + A[0] ^= D[0] ^ rc[i]; C[0] = A[0]; + A[1] ^= D[1]; C[1] = A[1]; + A[2] ^= D[2]; C[2] = A[2]; + A[3] ^= D[3]; C[3] = A[3]; + A[4] ^= D[4]; C[4] = A[4]; + + for (y = 5; y < 25; y+= 5) { - D[0] = ROTL64 (irot[y], B[iperm[y]]); - D[1] = ROTL64 (irot[y+1], B[iperm[y+1]]); - D[2] = ROTL64 (irot[y+2], B[iperm[y+2]]); - D[3] = ROTL64 (irot[y+3], B[iperm[y+3]]); - D[4] = ROTL64 (irot[y+4], B[iperm[y+4]]); - - C[0] ^= (A[y] = D[0] ^ (~D[1] & D[2])); - C[1] ^= (A[y+1] = D[1] ^ (~D[2] & D[3])); - C[2] ^= (A[y+2] = D[2] ^ (~D[3] & D[4])); - C[3] ^= (A[y+3] = D[3] ^ (~D[4] & D[0])); - C[4] ^= (A[y+4] = D[4] ^ (~D[0] & D[1])); + D[0] = ~A[y+1] & A[y+2]; + D[1] = ~A[y+2] & A[y+3]; + D[2] = ~A[y+3] & A[y+4]; + D[3] = ~A[y+4] & A[y+0]; + D[4] = ~A[y+0] & A[y+1]; + + A[y+0] ^= D[0]; C[0] ^= A[y+0]; + A[y+1] ^= D[1]; C[1] ^= A[y+1]; + A[y+2] ^= D[2]; C[2] ^= A[y+2]; + A[y+3] ^= D[3]; C[3] ^= A[y+3]; + A[y+4] ^= D[4]; C[4] ^= A[y+4]; } } #undef A -- GitLab