From fb49f89ffc9d3f5ae66e7c4bb9853faef2fcd2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Mon, 20 Jun 2011 13:01:41 +0200 Subject: [PATCH] Added an SSE2 loop, doing four blocks at a time in parallel. Rev: nettle/x86_64/serpent-encrypt.asm:1.3 --- x86_64/serpent-encrypt.asm | 429 ++++++++++++++++++++++++++++++++++++- 1 file changed, 421 insertions(+), 8 deletions(-) diff --git a/x86_64/serpent-encrypt.asm b/x86_64/serpent-encrypt.asm index 9b7a25b5..f342685f 100644 --- a/x86_64/serpent-encrypt.asm +++ b/x86_64/serpent-encrypt.asm @@ -41,6 +41,12 @@ define(<Y1>, <%xmm5>) define(<Y2>, <%xmm6>) define(<Y3>, <%xmm7>) +define(<MINUS1>, <%xmm8>) +define(<T0>, <%xmm9>) +define(<T1>, <%xmm10>) +define(<T2>, <%xmm11>) +define(<T3>, <%xmm12>) + C Arguments define(<CTX>, <%rdi>) define(<N>, <%rsi>) @@ -48,9 +54,9 @@ define(<DST>, <%rdx>) define(<SRC>, <%rcx>) define(<CNT>, <%r13>) -define(<TMP>, <%r14d>) C 32-bit temporary +define(<TMP32>, <%r14d>) -C Sbox macros. Inputs $1 - $4 (destroyed), outputs $5 - $8 +C SBOX macros. Inputs $1 - $4 (destroyed), outputs $5 - $8 define(<SBOX0>, < mov $2, $8 C y3 = x1 ^ x2 @@ -160,6 +166,7 @@ define(<SBOX3>, < mov $1, $5 xor $2, $5 >) + define(<SBOX4>, < mov $1, $8 or $2, $8 @@ -272,22 +279,351 @@ define(<LT>, < rol <$>3, $3 xor $1, $2 xor $3, $2 - mov $1, TMP - shl <$>3, TMP + mov $1, TMP32 + shl <$>3, TMP32 xor $3, $4 - xor TMP, $4 + xor TMP32, $4 rol $2 rol <$>7, $4 xor $2, $1 xor $4, $1 - mov $2, TMP - shl <$>7, TMP + mov $2, TMP32 + shl <$>7, TMP32 xor $4, $3 - xor TMP, $3 + xor TMP32, $3 rol <$>5, $1 rol <$>22, $3 >) +C Parallel operation on four blocks at a time. + +C pnot instruction is missing. For lack of a spare register, XOR with +C constant in memory. + +define(<PNOT>, < + pxor MINUS1, $1 +>) + +define(<WSBOX0>, < + movdqa $2, $8 C y3 = x1 ^ x2 + pxor $3, $8 + movdqa $1, $5 C y0 = x0 | x3 + por $4, $5 + movdqa $1, $6 C y1 = x0 ^ x1 + pxor $2, $6 + pxor $5, $8 C y3 ^= y0 + movdqa $3, $7 C y2 = x2 | y3 + por $8, $7 + pxor $4, $1 C x0 ^= x3 + pand $4, $7 C y2 &= x3 + pxor $3, $4 C x3 ^= x2 + por $2, $3 C x2 |= x1 + movdqa $6, $5 C y0 = y1 & x2 + pand $3, $5 + pxor $5, $7 C y2 ^= y0 + pand $7, $5 C y0 &= y2 + pxor $3, $5 C y0 ^= x2 + pand $1, $2 C x1 &= x0 + pxor $1, $5 C y0 ^= x0 + PNOT($5) C y0 = ~y0 + movdqa $5, $6 C y1 = y0 ^ x1 + pxor $2, $6 + pxor $4, $6 C y1 ^= x3 +>) + +define(<WSBOX1>, < + movdqa $1, $6 C y1 = x0 | x3 + por $4, $6 + movdqa $3, $7 C y2 = x2 ^ x3 + pxor $4, $7 + movdqa $2, $5 C y0 = ~x1 + PNOT($5) + movdqa $1, $8 C y3 = x0 ^ x2 + pxor $3, $8 + por $1, $5 C y0 |= x0 + pand $4, $8 C y3 &= x3 + movdqa $6, $1 C x0 = y1 & y2 + pand $7, $1 + por $2, $8 C y3 |= x1 + pxor $5, $7 C y2 ^= y0 + pxor $1, $8 C y3 ^= x0 + movdqa $6, $1 C x0 = y1 ^ y3 + pxor $8, $1 + pxor $7, $1 C x0 ^= y2 + movdqa $2, $6 C y1 = x1 & x3 + pand $4, $6 + pxor $1, $6 C y1 ^= x0 + movdqa $6, $4 C x3 = y1 | y3 + por $8, $4 + PNOT($8) C y3 = ~y3 + pand $4, $5 C y0 &= x3 + pxor $3, $5 C y0 ^= x2 +>) + +define(<WSBOX2>, < + movdqa $1, $7 C y2 = x1 | x2 + por $3, $7 + movdqa $1, $6 + pxor $2, $6 + movdqa $4, $8 + pxor $7, $8 + movdqa $6, $5 + pxor $8, $5 + por $1, $4 + pxor $5, $3 + movdqa $2, $1 + pxor $3, $1 + por $2, $3 + pand $7, $1 + pxor $3, $8 + por $8, $6 + pxor $1, $6 + movdqa $8, $7 + pxor $6, $7 + pxor $2, $7 + PNOT($8) + pxor $4, $7 +>) + +define(<WSBOX3>, < + movdqa $1, $6 + pxor $3, $6 + movdqa $1, $5 + por $4, $5 + movdqa $1, $8 + pand $4, $8 + pand $5, $6 + por $2, $8 + movdqa $1, $7 + pand $2, $7 + por $3, $7 + movdqa $4, $3 + pxor $6, $3 + pxor $8, $6 + por $3, $1 + pxor $2, $3 + pand $4, $8 + pxor $8, $5 + movdqa $7, $8 + pxor $3, $8 + pxor $5, $7 + por $8, $4 + pand $4, $2 + movdqa $1, $5 + pxor $2, $5 +>) + +define(<WSBOX4>, < + movdqa $1, $8 + por $2, $8 + movdqa $2, $7 + por $3, $7 + pxor $1, $7 + pand $4, $8 + movdqa $2, $5 + pxor $4, $5 + por $7, $4 + pand $4, $1 + pand $3, $2 + pxor $8, $3 + pxor $7, $8 + por $2, $7 + movdqa $8, $6 + pand $5, $6 + pxor $6, $7 + pxor $5, $6 + por $2, $6 + pxor $1, $6 + pand $4, $5 + pxor $3, $5 + PNOT($5) +>) + +define(<WSBOX5>, < + movdqa $2, $5 + por $4, $5 + pxor $3, $5 + movdqa $2, $3 + pxor $4, $3 + movdqa $1, $7 + pxor $3, $7 + pand $3, $1 + pxor $1, $5 + movdqa $2, $8 + por $7, $8 + por $5, $2 + PNOT($5) + por $5, $1 + pxor $3, $8 + pxor $1, $8 + movdqa $4, $6 + por $5, $6 + pxor $6, $4 + pxor $7, $6 + por $4, $7 + pxor $2, $7 +>) + +define(<WSBOX6>, < + movdqa $1, $5 + pxor $4, $5 + movdqa $1, $6 + pand $4, $6 + movdqa $1, $7 + por $3, $7 + por $2, $4 + pxor $3, $4 + pxor $2, $1 + movdqa $2, $8 + por $3, $8 + pxor $2, $3 + pand $5, $8 + pxor $3, $6 + PNOT($6) + pand $6, $5 + pand $6, $2 + pxor $8, $2 + pxor $4, $8 + pxor $2, $7 + PNOT($7) + pxor $7, $5 + pxor $1, $5 +>) + +define(<WSBOX7>, < + movdqa $1, $5 + pand $3, $5 + movdqa $2, $8 + por $5, $8 C t04 + pxor $3, $8 + movdqa $4, $6 + pandn $1, $6 C t02 implicit + pxor $6, $8 + movdqa $3, $6 + por $8, $6 + pxor $1, $6 + movdqa $1, $7 + pand $2, $7 + pxor $7, $3 + por $4, $7 + pxor $7, $6 + movdqa $2, $7 + por $5, $7 C t04 + pand $8, $7 + pxor $6, $2 + por $2, $7 + pxor $1, $7 + pxor $6, $5 + PNOT($4) C t02 + por $4, $5 + pxor $3, $5 +>) + +C WROL(count, w) +define(<WROL>, < + movdqa $2, T0 + pslld <$>$1, $2 + psrld <$>eval(32 - $1), T0 + por T0, $2 +>) + +C WLT(x0, x1, x2, x3) +define(<WLT>, < + WROL(13, $1) + WROL(3, $3) + pxor $1, $2 + pxor $3, $2 + movdqa $1, T0 + pslld <$>3, T0 + pxor $3, $4 + pxor T0, $4 + WROL(1, $2) + WROL(7, $4) + pxor $2, $1 + pxor $4, $1 + movdqa $2, T0 + pslld <$>7, T0 + pxor $4, $3 + pxor T0, $3 + WROL(5, $1) + WROL(22, $3) +>) + +C Note: Diagrams use little-endian representation, with least +C significant word to the left. + +C Transpose values from: +C +----+----+----+----+ +C x0: | a0 | a1 | a2 | a3 | +C x1: | b0 | b1 | b2 | b3 | +C x2: | c0 | c1 | c2 | c3 | +C x3: | d0 | d1 | d2 | d3 | +C +----+----+----+----+ +C To: +C +----+----+----+----+ +C x0: | a0 | b0 | c0 | d0 | +C x1: | a1 | b1 | c1 | d1 | +C x2: | a2 | b2 | c2 | d2 | +C x3: | a3 | b3 | c3 | d3 | +C +----+----+----+----+ + +define(<WTRANSPOSE>, < + movdqa $1, T0 + punpcklqdq $3, T0 C |a0 a1 c0 c1| + punpckhqdq $3, $1 C |a2 a3 c2 c3| + pshufd <$>0xd8, T0, T0 C |a0 c0 a1 c1| + pshufd <$>0xd8, $1, T1 C |a2 c2 a3 c3| + + movdqa $2, T2 + punpcklqdq $4, T2 C |b0 b1 d0 11| + punpckhqdq $4, $2 C |b2 b3 d2 d3| + pshufd <$>0xd8, T2, T2 C |b0 d0 b1 d1| + pshufd <$>0xd8, $2, T3 C |b2 d2 b3 d3| + + movdqa T0, $1 + punpckldq T2, $1 C |a0 b0 c0 d0| + movdqa T0, $2 + punpckhdq T2, $2 C |a1 b1 c1 d1| + + movdqa T1, $3 + punpckldq T3, $3 C |a2 b2 c2 d2| + movdqa T1, $4 + punpckhdq T3, $4 C |a3 b3 c3 d3| +>) + +C Copy subkeys, from: +C +C +----+----+----+----+ +C k0: | s3 | s2 | s1 | s0 | +C +----+----+----+----+ +C To: +C +----+----+----+----+ +C k0: | s0 | s0 | s0 | s0 | +C k1: | s1 | s1 | s1 | s1 | +C k2: | s2 | s2 | s2 | s2 | +C k3: | s3 | s3 | s3 | s3 | +C +----+----+----+----+ + +dnl define(<WCOPY>, < +dnl pshufd $55, $1, $2 +dnl pshufd $aa, $1, $3 +dnl pshufd $ff, $1, $4 +dnl pshufd $00, $1, $1 +dnl >) + +C FIXME: Arrange 16-byte alignment, so we can use movaps? +define(<WKEYXOR>, < + movups $1(CTX, CNT), T0 + pshufd <$>0x55, T0, T1 + pshufd <$>0xaa, T0, T2 + pxor T1, $3 + pxor T2, $4 + pshufd <$>0xff, T0, T1 + pshufd <$>0x00, T0, T0 + pxor T1, $5 + pxor T0, $2 +>) + .file "aes-serpent-encrypt.asm" C serpent_encrypt(struct serpent_context *ctx, @@ -311,9 +647,86 @@ PROLOGUE(nettle_serpent_encrypt) C Point at the final subkey. lea 512(CTX), CTX + cmp $-64, N + ja .Lwide_end + + pcmpeqd MINUS1, MINUS1 + +.Lwblock_loop: + movups (SRC, N), X0 + movups 16(SRC, N), X1 + movups 32(SRC, N), X2 + movups 48(SRC, N), X3 + + WTRANSPOSE(X0, X1, X2, X3) + + mov $-512, CNT + jmp .Lwround_start + + ALIGN(4) +.Lwround_loop: + WLT(X0,X1,X2,X3) +.Lwround_start: + WKEYXOR(, X0,X1,X2,X3) + WSBOX0(X0,X1,X2,X3, Y0,Y1,Y2,Y3) + WLT(Y0,Y1,Y2,Y3) + + WKEYXOR(16, Y0,Y1,Y2,Y3) + WSBOX1(Y0,Y1,Y2,Y3, X0,X1,X2,X3) + WLT(X0,X1,X2,X3) + + WKEYXOR(32, X0,X1,X2,X3) + WSBOX2(X0,X1,X2,X3, Y0,Y1,Y2,Y3) + WLT(Y0,Y1,Y2,Y3) + + WKEYXOR(48, Y0,Y1,Y2,Y3) + WSBOX3(Y0,Y1,Y2,Y3, X0,X1,X2,X3) + WLT(X0,X1,X2,X3) + + WKEYXOR(64, X0,X1,X2,X3) + WSBOX4(X0,X1,X2,X3, Y0,Y1,Y2,Y3) + WLT(Y0,Y1,Y2,Y3) + + WKEYXOR(80, Y0,Y1,Y2,Y3) + WSBOX5(Y0,Y1,Y2,Y3, X0,X1,X2,X3) + WLT(X0,X1,X2,X3) + + WKEYXOR(96, X0,X1,X2,X3) + WSBOX6(X0,X1,X2,X3, Y0,Y1,Y2,Y3) + WLT(Y0,Y1,Y2,Y3) + + WKEYXOR(112, Y0,Y1,Y2,Y3) + WSBOX7(Y0,Y1,Y2,Y3, X0,X1,X2,X3) + add $128, CNT + jnz .Lwround_loop + + C FIXME CNT known to be zero, no index register needed + WKEYXOR(, X0,X1,X2,X3) + + WTRANSPOSE(X0,X1,X2,X3) + + movups X0, (DST, N) + movups X1, 16(DST, N) + movups X2, 32(DST, N) + movups X3, 48(DST, N) + + C FIXME: Adjust N, so we can use just jnc without an extra cmp. + add $64, N + jz .Lend + + cmp $-64, N + jbe .Lwblock_loop + +.Lwide_end: + + C The single-block loop here is slightly slower than the double-block C loop in serpent-encrypt.c. +C FIXME: Should use non-sse2 code only if we have a sngle block left. +C With two or three blocks, it should be better to do them in +C parallell. + .Lblock_loop: movl (SRC, N), x0 movl 4(SRC, N), x1 -- GitLab