From e951e4ddd7b66c5adc3d5daf48c9de149965cd0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Fri, 10 Jul 2020 20:53:09 +0200
Subject: [PATCH] x86_64: Replace salsa20_crypt assembly with salsa20_2core
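
Do two blocks at a time, keeping corresponding state words of the two
blocks in adjacent 32-bit lanes of the xmm registers. This addresses
the possible improvements noted in the old salsa20-crypt.asm: data
dependencies within a single block no longer limit throughput, and the
input permutation is done once for two blocks. The routine writes two
64-byte output blocks to dst; the XOR with the message and the handling
of partial blocks are now left to C code. The second block is generated
from the same input state with the 64-bit block counter (words 8 and 9)
incremented by one.

For reference, a rough portable C sketch of what the routine is meant
to compute; the sketch and its helper names (ROTL32, QR, core,
salsa20_2core_ref) are illustrative only, not Nettle API:

  #include <stdint.h>
  #include <string.h>

  #define ROTL32(n, x) (((x) << (n)) | ((x) >> (32 - (n))))
  #define QR(x0, x1, x2, x3) do {       \
      x1 ^= ROTL32(7,  x0 + x3);        \
      x2 ^= ROTL32(9,  x1 + x0);        \
      x3 ^= ROTL32(13, x2 + x1);        \
      x0 ^= ROTL32(18, x3 + x2);        \
    } while (0)

  /* One Salsa20 core: rounds/2 double rounds, then add the input. */
  static void
  core(uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    uint32_t x[16];
    unsigned i;
    memcpy(x, src, sizeof(x));
    for (i = 0; i < rounds; i += 2)
      {
        /* Column round */
        QR(x[0],  x[4],  x[8],  x[12]);
        QR(x[5],  x[9],  x[13], x[1]);
        QR(x[10], x[14], x[2],  x[6]);
        QR(x[15], x[3],  x[7],  x[11]);
        /* Row round */
        QR(x[0],  x[1],  x[2],  x[3]);
        QR(x[5],  x[6],  x[7],  x[4]);
        QR(x[10], x[11], x[8],  x[9]);
        QR(x[15], x[12], x[13], x[14]);
      }
    for (i = 0; i < 16; i++)
      dst[i] = x[i] + src[i];
  }

  /* Two blocks; the second uses the input state with the 64-bit
     block counter (words 8 and 9) incremented by one. */
  static void
  salsa20_2core_ref(uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    uint32_t s2[16];
    core(dst, src, rounds);
    memcpy(s2, src, sizeof(s2));
    if (++s2[8] == 0)
      s2[9]++;
    core(dst + 16, s2, rounds);
  }

The assembly computes the same function, with the two core computations
interleaved lane-wise in the xmm registers.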

---
 x86_64/salsa20-2core.asm | 318 +++++++++++++++++++++++++++++++++++++++
 x86_64/salsa20-crypt.asm | 247 ------------------------------
 2 files changed, 318 insertions(+), 247 deletions(-)
 create mode 100644 x86_64/salsa20-2core.asm
 delete mode 100644 x86_64/salsa20-crypt.asm

diff --git a/x86_64/salsa20-2core.asm b/x86_64/salsa20-2core.asm
new file mode 100644
index 00000000..36f7438d
--- /dev/null
+++ b/x86_64/salsa20-2core.asm
@@ -0,0 +1,318 @@
+C x86_64/salsa20-2core.asm
+
+ifelse(<
+   Copyright (C) 2012, 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+define(<DST>, <%rdi>)
+define(<SRC>, <%rsi>)
+define(<COUNT>, <%rdx>)
+
+C State, even elements in X, odd elements in Y
+define(<X0>, <%xmm0>)
+define(<X1>, <%xmm1>)
+define(<X2>, <%xmm2>)
+define(<X3>, <%xmm3>)
+define(<Y0>, <%xmm4>)
+define(<Y1>, <%xmm5>)
+define(<Y2>, <%xmm6>)
+define(<Y3>, <%xmm7>)
+
+define(<T0>, <%xmm8>)
+define(<T1>, <%xmm9>)
+define(<T2>, <%xmm10>)
+define(<T3>, <%xmm11>)
+
+define(<M0011>, <%xmm12>)
+
+include_src(<x86_64/salsa20.m4>)
+
+	.text
+	ALIGN(16)
+	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_salsa20_2core)
+	W64_ENTRY(3, 13)
+
+	movups	(SRC), T0	C [0, 1, 2, 3]
+	movups	16(SRC), T1	C [4, 5, 6, 7]
+	movups	32(SRC), T2	C [8, 9, 10, 11]
+	movups	48(SRC), T3	C [12, 13, 14, 15]
+
+	pshufd	$0xa0, T0, X0	C X0: [0,0,2,2]
+	pshufd	$0xf5, T0, Y3	C Y3: [1,1,3,3]
+	pshufd	$0xa0, T1, X1	C X1: [4,4,6,6]
+	pshufd	$0xf5, T1, Y0	C Y0: [5,5,7,7]
+	pshufd	$0xa0, T2, X2	C X2: [8,8,10,10]
+	pshufd	$0xf5, T2, Y1	C Y1: [9,9,11,11]
+	pshufd	$0xa0, T3, X3	C [12,12,14,14]
+	pshufd	$0xf5, T3, Y2	C [13,13,15,15]
+
+	C Complicated counter increment. It could be done earlier, with
+	C mov $1, %eax; movd %eax, TMP; paddq T2, TMP
+	C but then it gets more complicated to construct X2 and Y1.
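+	C Below, paddd adds 1 to the second block's counter low word
+	C (word 8, in lane 1 of X2); pcmpeqd detects a wrap to zero, and
+	C pand/paddd propagate the carry into word 9, in lane 1 of Y1.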
+
+	mov	$1, %eax
+	movd	%eax, T0	C [1,0,0,0]
+	pshufd	$0x51, T0, T0	C [0,1,0,0]
+	pxor	T1, T1
+	paddd	T0, X2
+	pcmpeqd	X2, T1
+	pand	T0, T1
+	paddd	T1, Y1
+
+	C Load mask register
+	mov	$-1, %eax
+	movd	%eax, M0011
+	pshufd	$0x09, M0011, M0011	C 01 01 00 00
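+	C M0011 is all-ones in elements 2 and 3; SWAP uses it to exchange
+	C the high halves of its first two arguments.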
+
+	C Swap, to get
+	C X0:  0 10  Y0:  5 15
+	C X1:  4 14  Y1:  9  3
+	C X2:  8  2  Y2: 13  7
+	C X3: 12  6  Y3:  1 11
+	SWAP(X0, X2, M0011)
+	SWAP(X1, X3, M0011)
+	SWAP(Y0, Y2, M0011)
+	SWAP(Y1, Y3, M0011)
+
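+	C The loop below does a double round per iteration, so halve the
+	C round count.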
+	shrl	$1, XREG(COUNT)
+
+	ALIGN(16)
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0  B0  A10 B10  Y0: A5  B5  A15 B15
+C X1: A4  B4  A14 B14  Y1: A9  B9  A3  B3
+C X2: A8  B8  A2  B2   Y2: A13 B13 A7  B7
+C X3: A12 B12 A6  B6   Y3: A1  B1  A11 B11
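+C
+C The quarter-rounds on the X registers and on the Y registers form
+C two interleaved instruction streams; the Y/T2/T3 stream is indented
+C by one extra space.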
+
+	movaps	X0, T0
+	paddd	X3, T0
+	movaps	T0, T1
+	 movaps	Y0, T2
+	pslld	$7, T0
+	 paddd	Y3, T2
+	psrld	$25, T1
+	 movaps	T2, T3
+	pxor	T0, X1
+	 pslld	$7, T2
+	pxor	T1, X1
+	 psrld	$25, T3
+
+	movaps	X0, T0
+	 pxor	T2, Y1
+	paddd	X1, T0
+	 pxor	T3, Y1
+	movaps	T0, T1
+	 movaps	Y0, T2
+	pslld	$9, T0
+	 paddd	Y1, T2
+	psrld	$23, T1
+	 movaps	T2, T3
+	pxor	T0, X2
+	 pslld	$9, T2
+	pxor	T1, X2
+	 psrld	$23, T3
+
+	movaps	X1, T0
+	 pxor	T2, Y2
+	paddd	X2, T0
+	 pxor	T3, Y2
+	movaps	T0, T1
+	 movaps	Y1, T2
+	pslld	$13, T0
+	 paddd	Y2, T2
+	psrld	$19, T1
+	 movaps	T2, T3
+	pxor	T0, X3
+	 pslld	$13, T2
+	pxor	T1, X3
+	 psrld	$19, T3
+
+	movaps	X2, T0
+	 pxor	T2, Y3
+	paddd	X3, T0
+	 pxor	T3, Y3
+	movaps	T0, T1
+	 movaps	Y2, T2
+	pslld	$18, T0
+	 paddd	Y3, T2
+	psrld	$14, T1
+	 movaps	T2, T3
+	pxor	T0, X0
+	 pslld	$18, T2
+	pxor	T1, X0
+	 psrld	$14, T3
+	 pxor	T2, Y0
+	 pxor	T3, Y0
+
+C Register layout:
+C X0: A0  B0  A10 B10  Y0: A5  B5  A15 B15
+C Y1: A3  B3   A9  B9  X1: A4  B4  A14 B14 (Y1 swapped)
+C X2: A2  B2   A8  B8  Y2: A7  B7  A13 B13 (X2, Y2 swapped)
+C Y3: A1  B1  A11 B11  X3: A6  B6  A12 B12 (X3 swapped)
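+C
+C This layout is established by the pshufd $0x4e swaps below, which
+C exchange the 64-bit halves of Y1, X2, Y2 and X3.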
+
+	pshufd	$0x4e, Y1, Y1	C 10 11 00 01
+	pshufd	$0x4e, X2, X2
+	pshufd	$0x4e, Y2, Y2
+	pshufd	$0x4e, X3, X3
+
+	movaps	X0, T0
+	paddd	Y1, T0
+	movaps	T0, T1
+	 movaps	Y0, T2
+	pslld	$7, T0
+	 paddd	X1, T2
+	psrld	$25, T1
+	 movaps	T2, T3
+	pxor	T0, Y3
+	 pslld	$7, T2
+	pxor	T1, Y3
+	 psrld	$25, T3
+
+	movaps	Y3, T0
+	 pxor	T2, X3
+	paddd	X0, T0
+	 pxor	T3, X3
+	movaps	T0, T1
+	 movaps	X3, T2
+	pslld	$9, T0
+	 paddd	Y0, T2
+	psrld	$23, T1
+	 movaps	T2, T3
+	pxor	T0, X2
+	 pslld	$9, T2
+	pxor	T1, X2
+	 psrld	$23, T3
+
+	movaps	X2, T0
+	 pxor	T2, Y2
+	paddd	Y3, T0
+	 pxor	T3, Y2
+	movaps	T0, T1
+	 movaps	Y2, T2
+	pslld	$13, T0
+	 paddd	X3, T2
+	psrld	$19, T1
+	 movaps	T2, T3
+	pxor	T0, Y1
+	 pslld	$13, T2
+	pxor	T1, Y1
+	 psrld	$19, T3
+
+	movaps	Y1, T0
+	 pxor	T2, X1
+	paddd	X2, T0
+	 pxor	T3, X1
+	movaps	T0, T1
+	 movaps	X1, T2
+	pslld	$18, T0
+	 paddd	Y2, T2
+	psrld	$14, T1
+	 movaps	T2, T3
+	pxor	T0, X0
+	 pslld	$18, T2
+	pxor	T1, X0
+	 psrld	$14, T3
+	 pxor	T2, Y0
+	 pxor	T3, Y0
+
+	pshufd	$0x4e, Y1, Y1	C 10 11 00 01
+	pshufd	$0x4e, X2, X2
+	pshufd	$0x4e, Y2, Y2
+	pshufd	$0x4e, X3, X3
+
+	decl	XREG(COUNT)
+	jnz	.Loop
+
+	SWAP(X0, X2, M0011)
+	SWAP(X1, X3, M0011)
+	SWAP(Y0, Y2, M0011)
+	SWAP(Y1, Y3, M0011)
+
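+	C Transpose back: gather the A lanes into one output block and the
+	C B lanes into the other, then add the input words (feed-forward).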
+	movaps	X0, T0
+	punpckldq	Y3, X0	C [A0, A1, B0, B1]
+	punpckhdq	Y3, T0	C [A2, A3, B2, B3]
+	movaps	X0, Y3
+	punpcklqdq	T0, X0	C [A0, A1, A2, A3]
+	punpckhqdq	T0, Y3	C [B0, B1, B2, B3]
+
+	movups	(SRC), T0
+	paddd	T0, X0
+	paddd	T0, Y3
+
+	movaps	X1, T1
+	punpckldq	Y0, X1	C [A4, A5, B4, B5]
+	punpckhdq	Y0, T1	C [A6, A7, B6, B7]
+	movaps	X1, Y0
+	punpcklqdq	T1, X1	C [A4, A5, A6, A7]
+	punpckhqdq	T1, Y0	C [B4, B5, B6, B7]
+
+	movups	16(SRC), T1
+	paddd	T1, X1
+	paddd	T1, Y0
+
+	movaps	X2, T2
+	punpckldq	Y1, X2	C [A8, A9, B8, B9]
+	punpckhdq	Y1, T2	C [A10, A11, B10, B11]
+	movaps	X2, Y1
+	punpcklqdq	T2, X2	C [A8, A9, A10, A11]
+	punpckhqdq	T2, Y1	C [B8, B9, B10, B11]
+
+	movups	32(SRC), T2
+	paddd	T2, X2
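+	C Reuse M0011 as the constant 1; paddq increments the 64-bit block
+	C counter (words 8 and 9) in the input copy added to the second block.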
+	mov	$1, %eax
+	movd	%eax, M0011
+	paddq	M0011, T2
+	paddd	T2, Y1
+
+	movaps	X3, T3
+	punpckldq	Y2, X3	C [A12, A13, B12, B13]
+	punpckhdq	Y2, T3	C [A14, A15, B14, B15]
+	movaps	X3, Y2
+	punpcklqdq	T3, X3	C [A12, A13, A14, A15]
+	punpckhqdq	T3, Y2	C [B12, B13, B14, B15]
+
+	movups	48(SRC), T3
+	paddd	T3, X3
+	paddd	T3, Y2
+
+	movups	X0,(DST)
+	movups	X1,16(DST)
+	movups	X2,32(DST)
+	movups	X3,48(DST)
+	movups	Y3,64(DST)
+	movups	Y0,80(DST)
+	movups	Y1,96(DST)
+	movups	Y2,112(DST)
+
+	W64_EXIT(3, 13)
+	ret
+EPILOGUE(_nettle_salsa20_2core)
diff --git a/x86_64/salsa20-crypt.asm b/x86_64/salsa20-crypt.asm
deleted file mode 100644
index cc1d58ca..00000000
--- a/x86_64/salsa20-crypt.asm
+++ /dev/null
@@ -1,247 +0,0 @@
-C x86_64/salsa20-crypt.asm
-
-ifelse(<
-   Copyright (C) 2012 Niels Möller
-
-   This file is part of GNU Nettle.
-
-   GNU Nettle is free software: you can redistribute it and/or
-   modify it under the terms of either:
-
-     * the GNU Lesser General Public License as published by the Free
-       Software Foundation; either version 3 of the License, or (at your
-       option) any later version.
-
-   or
-
-     * the GNU General Public License as published by the Free
-       Software Foundation; either version 2 of the License, or (at your
-       option) any later version.
-
-   or both in parallel, as here.
-
-   GNU Nettle is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received copies of the GNU General Public License and
-   the GNU Lesser General Public License along with this program.  If
-   not, see http://www.gnu.org/licenses/.
->)
-
-define(<CTX>, <%rdi>)
-define(<LENGTH>, <%rsi>)
-define(<DST>, <%rdx>)
-define(<SRC>, <%rcx>)
-define(<T64>, <%r8>)
-define(<POS>, <%r9>)
-define(<X0>, <%xmm0>)
-define(<X1>, <%xmm1>)
-define(<X2>, <%xmm2>)
-define(<X3>, <%xmm3>)
-define(<T0>, <%xmm4>)
-define(<T1>, <%xmm5>)
-define(<M0101>, <%xmm6>)
-define(<M0110>, <%xmm7>)
-define(<M0011>, <%xmm8>)
-define(<COUNT>, <%rax>)
-
-include_src(<x86_64/salsa20.m4>)
-
-C Possible improvements:
-C 
-C Do two blocks (or more) at a time in parallel, to avoid limitations
-C due to data dependencies.
-C 
-C Avoid redoing the permutation of the input for each block (all but
-C the two counter words are constant). Could also keep the input in
-C registers.
-
-	.file "salsa20-crypt.asm"
-	
-	C salsa20_crypt(struct salsa20_ctx *ctx, size_t length,
-	C		uint8_t *dst, const uint8_t *src)
-	.text
-	ALIGN(16)
-PROLOGUE(nettle_salsa20_crypt)
-	W64_ENTRY(4, 9)	
-
-	test	LENGTH, LENGTH
-	jz	.Lend
-
-	C Load mask registers
-	mov	$-1, XREG(COUNT)
-	movd	XREG(COUNT), M0101
-	pshufd	$0x09, M0101, M0011	C 01 01 00 00
-	pshufd	$0x41, M0101, M0110	C 01 00 00 01
-	pshufd	$0x22, M0101, M0101	C 01 00 01 00
-	
-.Lblock_loop:
-	movups	(CTX), X0
-	movups	16(CTX), X1
-	movups	32(CTX), X2
-	movups	48(CTX), X3
-
-	C On input, each xmm register is one row. We start with
-	C
-	C	 0  1  2  3     C K K K
-	C	 4  5  6  7	K C I I
-	C	 8  9 10 11	B B C K
-	C	12 13 14 15	K K K C
-	C
-	C Diagrams are in little-endian order, with least significant word to
-	C the left. We rotate the columns, to get instead
-	C
-	C	 0  5 10 15	C C C C
-	C	 4  9 14  3	K B K K
-	C	 8 13  2  7	B K K I
-	C	12  1  6 11	K K I K
-	C 
-	C The original rows are now diagonals.
-	SWAP(X0, X1, M0101)
-	SWAP(X2, X3, M0101)
-	SWAP(X1, X3, M0110)
-	SWAP(X0, X2, M0011)	
-
-	movl	$10, XREG(COUNT)
-	ALIGN(16)
-.Loop:
-	QROUND(X0, X1, X2, X3)
-	C For the row operations, we first rotate the rows, to get
-	C	
-	C	0 5 10 15
-	C	3 4  9 14
-	C	2 7  8 13
-	C	1 6 11 12
-	C 
-	C Now the original rows are turned into into columns. (This
-	C SIMD hack described in djb's papers).
-
-	pshufd	$0x93, X1, X1	C	11 00 01 10 (least sign. left)
-	pshufd	$0x4e, X2, X2	C	10 11 00 01
-	pshufd	$0x39, X3, X3	C	01 10 11 00
-
-	QROUND(X0, X3, X2, X1)
-
-	C Inverse rotation of the rows
-	pshufd	$0x39, X1, X1	C	01 10 11 00
-	pshufd	$0x4e, X2, X2	C	10 11 00 01
-	pshufd	$0x93, X3, X3	C	11 00 01 10
-
-	decl	XREG(COUNT)
-	jnz	.Loop
-
-	SWAP(X0, X2, M0011)	
-	SWAP(X1, X3, M0110)
-	SWAP(X0, X1, M0101)
-	SWAP(X2, X3, M0101)
-
-	movups	(CTX), T0
-	movups	16(CTX), T1
-	paddd	T0, X0
-	paddd	T1, X1
-	movups	32(CTX), T0
-	movups	48(CTX), T1
-	paddd	T0, X2
-	paddd	T1, X3
-
-	C Increment block counter
-	incq	32(CTX)
-
-	cmp	$64, LENGTH
-	jc	.Lfinal_xor
-
-	movups	48(SRC), T1
-	pxor	T1, X3
-	movups	X3, 48(DST)
-.Lxor3:
-	movups	32(SRC), T0
-	pxor	T0, X2
-	movups	X2, 32(DST)
-.Lxor2:
-	movups	16(SRC), T1
-	pxor	T1, X1
-	movups	X1, 16(DST)
-.Lxor1:
-	movups	(SRC), T0	
-	pxor	T0, X0
-	movups	X0, (DST)
-
-	lea	64(SRC), SRC
-	lea	64(DST), DST
-	sub	$64, LENGTH
-	ja	.Lblock_loop
-.Lend:
-	W64_EXIT(4, 9)
-	ret
-
-.Lfinal_xor:
-	cmp	$32, LENGTH
-	jz	.Lxor2
-	jc	.Llt32
-	cmp	$48, LENGTH
-	jz	.Lxor3
-	jc	.Llt48
-	movaps	X3, T0
-	call	.Lpartial
-	jmp	.Lxor3
-.Llt48:
-	movaps	X2, T0
-	call	.Lpartial
-	jmp	.Lxor2
-.Llt32:
-	cmp	$16, LENGTH
-	jz	.Lxor1
-	jc	.Llt16
-	movaps	X1, T0
-	call	.Lpartial
-	jmp	.Lxor1
-.Llt16:
-	movaps	X0, T0
-	call	.Lpartial
-	jmp	.Lend
-
-.Lpartial:
-	mov	LENGTH, POS
-	and	$-16, POS
-	test	$8, LENGTH
-	jz	.Llt8
-	C This "movd" instruction should assemble to
-	C 66 49 0f 7e e0          movq   %xmm4,%r8
-	C Apparently, assemblers treat movd and movq (with the
-	C arguments we use) in the same way, except for osx, which
-	C barfs at movq.
-	movd	T0, T64
-	xor	(SRC, POS), T64
-	mov	T64, (DST, POS)
-	lea	8(POS), POS
-	pshufd	$0xee, T0, T0		C 10 11 10 11
-.Llt8:
-	C And this is also really a movq.
-	movd	T0, T64
-	test	$4, LENGTH
-	jz	.Llt4
-	mov	XREG(T64), XREG(COUNT)
-	xor	(SRC, POS), XREG(COUNT)
-	mov	XREG(COUNT), (DST, POS)
-	lea	4(POS), POS
-	shr	$32, T64
-.Llt4:
-	test	$2, LENGTH
-	jz	.Llt2
-	mov	WREG(T64), WREG(COUNT)
-	xor	(SRC, POS), WREG(COUNT)
-	mov	WREG(COUNT), (DST, POS)
-	lea	2(POS), POS
-	shr	$16, XREG(T64)
-.Llt2:
-	test	$1, LENGTH
-	jz	.Lret
-	xor	(SRC, POS), LREG(T64)
-	mov	LREG(T64), (DST, POS)
-
-.Lret:
-	ret
-
-EPILOGUE(nettle_salsa20_crypt)
-- 
GitLab