From 17f9a2da2a65c0399cee190bb217d206b8adab3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Sun, 16 Oct 2016 07:33:19 +0200 Subject: [PATCH] Keep skein256 subkey words in registers. --- ChangeLog | 7 ++ skein256-internal.c | 56 ++++++++-------- x86_64/skein256-internal.asm | 121 ++++++++++++++++++----------------- 3 files changed, 98 insertions(+), 86 deletions(-) diff --git a/ChangeLog b/ChangeLog index 61f33b01..43092436 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2016-10-16 Niels Möller <nisse@lysator.liu.se> + + * skein256-internal.c (_skein256_block): Keep the subkey words in + scalar variables. + * x86_64/skein256-internal.asm: Likewise, keep subkey words in + registers. + 2016-10-15 Niels Möller <nisse@lysator.liu.se> * skein256-internal.c (_skein256_block): Keep tweak words in diff --git a/skein256-internal.c b/skein256-internal.c index 725b3a61..87b5165e 100644 --- a/skein256-internal.c +++ b/skein256-internal.c @@ -88,13 +88,6 @@ w3 ^= w2; \ } while(0) -#define ADD_SUBKEY(w0, w1, w2, w3, k0, k1, k2, k3, t0, t1, i) do { \ - w0 += (k0); \ - w1 += (k1) + (t0); \ - w2 += (k2) + (t1); \ - w3 += (k3) + (i); \ - } while (0) - void _skein256_block (uint64_t dst[_SKEIN256_LENGTH], const uint64_t keys[_SKEIN256_NKEYS], @@ -103,9 +96,9 @@ _skein256_block (uint64_t dst[_SKEIN256_LENGTH], { uint64_t s0, s1, s2, s3; uint64_t w0, w1, w2, w3; + uint64_t k0, k1, k2, k3, k4; uint64_t t0, t1; unsigned i; - unsigned imod5, ip2mod5; w0 = s0 = LE_READ_UINT64(src); w1 = s1 = LE_READ_UINT64(src + 8); @@ -114,40 +107,49 @@ _skein256_block (uint64_t dst[_SKEIN256_LENGTH], t0 = tweak[0]; t1 = tweak[1]; - for (i = imod5 = 0, ip2mod5 = 2; i < 18; i+=2) - { - unsigned ip4mod5; - ADD_SUBKEY(w0, w1, w2, w3, - keys[imod5], keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1], - t0, t1, i); - t0 ^= t1; + k0 = keys[0]; + k1 = keys[1] + t0; + k2 = keys[2] + t1; + k3 = keys[3]; + k4 = keys[4]; + + for (i = 0; i < 18; i+=2) + { + uint64_t tmp; + w0 += k0; + w1 += k1; + w2 += k2; + w3 += k3 + i; ROUND(w0, w1, w2, w3, 14, 16); ROUND(w0, w3, w2, w1, 52, 57); ROUND(w0, w1, w2, w3, 23, 40); ROUND(w0, w3, w2, w1, 5, 37); - /* Hopefully compiled to a conditional move, but gcc-6.1.1 doesn't. */ - ip4mod5 = imod5 ? imod5 - 1 : 4; - - ADD_SUBKEY(w0, w1, w2, w3, - keys[imod5+1], keys[ip2mod5], keys[ip2mod5+1], keys[ip4mod5], - t1, t0, i + 1); + w0 += k1 - t0; /* Right-hand side equal to new k4, below. */ + w1 += k2; + t0 ^= t1; + w2 += k3 + t0; /* Right-hand side equal to new k1, below. */ + w3 += k4 + i + 1; + tmp = k1; + k1 = k3 + t0; + k3 = k0; + k0 = k2 - t1; t1 ^= t0; + k2 = k4 + t1; + k4 = tmp - t1; ROUND(w0, w1, w2, w3, 25, 33); ROUND(w0, w3, w2, w1, 46, 12); ROUND(w0, w1, w2, w3, 58, 22); ROUND(w0, w3, w2, w1, 32, 32); - - imod5 = ip2mod5; - ip2mod5 = ip4mod5; } - ADD_SUBKEY(w0, w1, w2, w3, /* 18 mod 5 = 3, 18 mod 3 = 0 */ - keys[3], keys[4], keys[0], keys[1], - t0, t1, 18); + w0 += k0; + w1 += k1; + w2 += k2; + w3 += k3 + 18; dst[0] = s0 ^ w0; dst[1] = s1 ^ w1; diff --git a/x86_64/skein256-internal.asm b/x86_64/skein256-internal.asm index 9eea7695..e8b63959 100644 --- a/x86_64/skein256-internal.asm +++ b/x86_64/skein256-internal.asm @@ -41,15 +41,14 @@ ifelse(< define(<W3>, <%r11>) define(<COUNT>, <%rcx>) C Overlaps SRC - define(<CMOD5>, <%rdi>) C Overlaps DST - define(<CP2MOD5>, <%rax>) define(<T0>, <%rbx>) define(<T1>, <%rdx>) C Overlaps TWEAK - define(<S0>, <%r12>) - define(<S1>, <%r13>) - define(<S2>, <%r14>) - define(<S3>, <%r15>) - define(<TMP>, <%rbp>) + define(<K0>, <%r12>) + define(<K1>, <%r13>) + define(<K2>, <%r14>) + define(<K3>, <%r15>) + define(<K4>, <%rsi>) C Overlaps KEYS + define(<TMP>, <%rax>) C ROUND(W0, W1, W2, W3, C0, C1) define(<ROUND>, < @@ -69,98 +68,102 @@ define(<ROUND>, < ALIGN(16) PROLOGUE(_nettle_skein256_block) W64_ENTRY(4, 0) - C Save registers, %rdi (DST) last + C Save registers, %rcx (SRC) last push %rbx - push %rbp push %r12 push %r13 push %r14 push %r15 - push DST + push SRC C Unaligned read of source data. - mov (SRC), S0 - mov 8(SRC), S1 - mov 16(SRC), S2 - mov 24(SRC), S3 - - C Read and add in first subkeys. - mov (KEYS), W0 - mov 8(KEYS), W1 - mov 16(KEYS), W2 - mov 24(KEYS), W3 - add S0, W0 - add S1, W1 - add S2, W2 - add S3, W3 + mov (SRC), W0 + mov 8(SRC), W1 + mov 16(SRC), W2 + mov 24(SRC), W3 + + C Read subkeys. + mov (KEYS), K0 + mov 8(KEYS), K1 + mov 16(KEYS), K2 + mov 24(KEYS), K3 + mov 32(KEYS), K4 C Read and add in tweak words. mov (TWEAK), T0 mov 8(TWEAK), T1 - add T0, W1 - add T1, W2 + add T0, K1 + add T1, K2 - mov $1, XREG(CMOD5) - mov $3, XREG(CP2MOD5) - mov $1, XREG(COUNT) + mov $0, XREG(COUNT) ALIGN(16) .Loop: + C Add subkeys + add K0, W0 + add K1, W1 + add K2, W2 + add K3, W3 + add COUNT, W3 + ROUND(W0, W1, W2, W3, 14, 16) ROUND(W0, W3, W2, W1, 52, 57) ROUND(W0, W1, W2, W3, 23, 40) ROUND(W0, W3, W2, W1, 5, 37) + mov K1, TMP + sub T0, TMP C New value for K4 + add TMP, W0 + + add K2, W1 + add K4, W3 + lea 1(W3, COUNT), W3 + xor T1, T0 C Next tweak word always xor of preceeding ones - add (KEYS, CMOD5, 8), W0 - add 8(KEYS, CMOD5, 8), W1 - add (KEYS, CP2MOD5, 8), W2 - add 8(KEYS, CP2MOD5, 8), W3 - add T1, W1 - add T0, W2 - add COUNT, W3 + lea (K3, T0), K1 + add K1, W2 + + mov K0, K3 + mov K2, K0 + sub T1, K0 + xor T0, T1 + lea (K4, T1), K2 + + mov TMP, K4 ROUND(W0, W1, W2, W3, 25, 33) ROUND(W0, W3, W2, W1, 46, 12) ROUND(W0, W1, W2, W3, 58, 22) ROUND(W0, W3, W2, W1, 32, 32) - xor T0, T1 - - add 8(KEYS, CMOD5, 8), W0 - add (KEYS, CP2MOD5, 8), W1 - add 8(KEYS, CP2MOD5, 8), W2 - lea 4(CMOD5), TMP - sub $1, XREG(CMOD5) - cmovnc XREG(CMOD5), XREG(TMP) - add (KEYS, TMP, 8), W3 - mov XREG(CP2MOD5), XREG(CMOD5) - mov XREG(TMP), XREG(CP2MOD5) - - add T0, W1 - add T1, W2 - lea 1(W3, COUNT), W3 - add $2, XREG(COUNT) - cmp $19, XREG(COUNT) + cmp $18, XREG(COUNT) jne .Loop - pop DST - xor S0, W0 + pop SRC + + add K0, W0 + add K1, W1 + add K2, W2 + lea 18(K3, W3), W3 + + C Repeats the unaligned reads. Keep in registers, + C if we get any spare registers. Or consider copying + C to stack? + xor (SRC), W0 mov W0, (DST) - xor S1, W1 + xor 8(SRC), W1 mov W1, 8(DST) - xor S2, W2 + xor 16(SRC), W2 mov W2, 16(DST) - xor S3, W3 + xor 24(SRC), W3 mov W3, 24(DST) pop %r15 pop %r14 pop %r13 pop %r12 - pop %rbp pop %rbx W64_EXIT(4, 0) -- GitLab