From 2cd7a85468fc3202c8a1e18f64712a0f00fd5636 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Thu, 21 Nov 2013 23:04:09 +0100
Subject: [PATCH] x86_64 assembly for poly1305.

---
 ChangeLog                    |   7 ++
 asm.m4                       |  13 ++
 configure.ac                 |   2 +-
 x86_64/poly1305-internal.asm | 229 +++++++++++++++++++++++++++++++++++
 4 files changed, 250 insertions(+), 1 deletion(-)
 create mode 100644 x86_64/poly1305-internal.asm

diff --git a/ChangeLog b/ChangeLog
index 19f015a7..08156bd3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,12 @@
 2013-11-21  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/poly1305-internal.asm: New file. Almost a factor of two
+	speedup.
+
+	* configure.ac (asm_replace_list): Added poly1305-internal.asm.
+
+	* asm.m4: Define struct offsets for 64-bit poly1305_ctx.
+
 	* poly1305.h (POLY1305_DIGEST): Pass the encrypted nonce as an
 	additional argument to poly1305_digest.
 	(struct poly1305_ctx): Introduce unions, to support either 26-bit
diff --git a/asm.m4 b/asm.m4
index 200b1361..cf38fa6e 100644
--- a/asm.m4
+++ b/asm.m4
@@ -76,4 +76,17 @@ STRUCTURE(AES)
   STRUCT(TABLE2, AES_TABLE_SIZE)
   STRUCT(TABLE3, AES_TABLE_SIZE)
 
+C For 64-bit implementation
+STRUCTURE(P1305)
+  STRUCT(R0, 8)
+  STRUCT(R1, 8)
+  STRUCT(S1, 8)
+  STRUCT(PAD, 12)
+  STRUCT(H2, 4)
+  STRUCT(H0, 8)
+  STRUCT(H1, 8)
+  STRUCT(NONCE, 16)
+  STRUCT(BLOCK, 16)
+  STRUCT(INDEX, 4)
+
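+C A hypothetical C view of the layout these offsets assume (a sketch
+C only; the real declaration lives in poly1305.h and uses unions):
+C
+C   struct poly1305_ctx {
+C     uint64_t r0, r1, s1;    /* clamped key; s1 = 5 * (r1 >> 2) */
+C     uint8_t pad[12];
+C     uint32_t h2;            /* high limb of the accumulator h */
+C     uint64_t h0, h1;        /* low limbs of h */
+C     uint8_t nonce[16];
+C     uint8_t block[16];      /* buffered partial block */
+C     uint32_t index;         /* bytes used in block */
+C   };
+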
 divert
diff --git a/configure.ac b/configure.ac
index 037fbe97..fa2ab866 100644
--- a/configure.ac
+++ b/configure.ac
@@ -259,7 +259,7 @@ fi
 # to a new object file).
 asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		arcfour-crypt.asm camellia-crypt-internal.asm \
-		md5-compress.asm memxor.asm \
+		md5-compress.asm memxor.asm poly1305-internal.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
 		sha1-compress.asm sha256-compress.asm sha512-compress.asm \
diff --git a/x86_64/poly1305-internal.asm b/x86_64/poly1305-internal.asm
new file mode 100644
index 00000000..076b30d8
--- /dev/null
+++ b/x86_64/poly1305-internal.asm
@@ -0,0 +1,229 @@
+C nettle, low-level cryptographic library
+C 
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "poly1305-internal.asm"
+
+C Registers mainly used by poly1305_block
+define(<CTX>, <%rdi>)
+define(<T0>, <%rcx>)
+define(<T1>, <%rsi>)
+define(<T2>, <%r8>)
+define(<H0>, <%r9>)
+define(<H1>, <%r10>)
+define(<H2>, <%r11>)
+	
+	C poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16])
+	.text
+	C Registers:
+	C  %rdi: ctx
+	C  %rsi: key
+	C  %r8: mask
+	ALIGN(16)
+PROLOGUE(nettle_poly1305_set_key)
+	W64_ENTRY(2,0)
+	mov	$0x0ffffffc0fffffff, %r8
+	mov	(%rsi), %rax
+	and	%r8, %rax
+	and	$-4, %r8
+	mov	%rax, (CTX)
+	mov	8(%rsi), %rax
+	and	%r8, %rax
+	mov	%rax, P1305_R1 (CTX)
+	shr	$2, %rax
+	imul	$5, %rax
+	mov	%rax, P1305_S1 (CTX)
+	xor	XREG(%rax), XREG(%rax)
+	mov	%rax, P1305_H0 (CTX)
+	mov	%rax, P1305_H1 (CTX)
+	mov	XREG(%rax), P1305_H2 (CTX)
+	mov	XREG(%rax), P1305_INDEX (CTX)
+	
+	W64_EXIT(2,0)
+	ret
+
+EPILOGUE(nettle_poly1305_set_key)
+
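+C In C terms, the key setup above amounts to (a sketch; le64 stands
+C for a little-endian 64-bit load and is not an actual nettle function):
+C
+C   r0 = le64(key)     & 0x0ffffffc0fffffff;
+C   r1 = le64(key + 8) & 0x0ffffffc0ffffffc;
+C   s1 = 5 * (r1 >> 2);   /* exact, since the low two bits of r1 are clear */
+C   h0 = h1 = h2 = index = 0;
+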
+C 64-bit multiplication mod 2^130 - 5
+C
+C (x_0 + B x_1 + B^2 x_2) * (r_0 + B r_1) =
+C       1   B   B^2 B^3
+C   x_0 r_0
+C       x_0 r_1
+C       x_1 r_0
+C           x_1 r_1
+C           x_2 r_0
+C               x_2 r_1
+C
+C Then r_1 B^2 = (r_1/4) 2^130, which is congruent to 5/4 r_1
+C (mod 2^130 - 5), and likewise r_1 B^3 is congruent to 5/4 B r_1.
+C So we get
+C
+C   x_0 r_0 + x_1 (5/4 r_1) + B (x_0 r_1 + x_1 r_0 + x_2 (5/4 r_1) + B x_2 r_0)
+
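+C Equivalently, in C (a sketch, not the literal register allocation
+C used below; u128 abbreviates unsigned __int128, and the carries into
+C h1 and h2 are left implicit):
+C
+C   u128 f0 = (u128) x0 * r0 + (u128) x1 * s1;
+C   u128 f1 = (u128) x0 * r1 + (u128) x1 * r0 + x2 * s1 + (f0 >> 64);
+C   uint64_t g2 = x2 * r0 + (uint64_t) (f1 >> 64);
+C   /* fold the part above 2^130: 2^130 == 5 (mod 2^130 - 5) */
+C   h0 = (uint64_t) f0 + 5 * (g2 >> 2);
+C   h1 = (uint64_t) f1;
+C   h2 = g2 & 3;
+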
+	C poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16])
+	
+PROLOGUE(nettle_poly1305_block)
+	mov	(%rsi), T0
+	mov	8(%rsi), T1
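+	C Each full block is processed with its 2^128 padding bit set;
+	C the constant 1 below supplies that bit as the third input limb.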
+	mov	$1,	T2
+C FIXME: Support the Windows ABI
+	C Registers:
+	C Inputs:  CTX, T0, T1, T2,
+	C Outputs: H0, H1, H2, stored into the context.
+
+C_NAME(poly1305_block):
+	add	P1305_H0 (CTX), T0
+	adc	P1305_H1 (CTX), T1
+	adc	P1305_H2 (CTX), XREG(T2)
+	mov	P1305_R0 (CTX), %rax
+	mul	T0
+	mov	%rax, H0
+	mov	%rdx, H1
+	mov	P1305_S1 (CTX), %rax	C 5/4 r1
+	mov	%rax, H2
+	mul	T1
+	imul	T2, H2
+	imul	P1305_R0 (CTX), T2
+	add	%rax, H0
+	adc	%rdx, H1
+	mov	P1305_R0 (CTX), %rax
+	mul	T1
+	add	%rax, H2
+	adc	%rdx, T2
+	mov	P1305_R1 (CTX), %rax
+	mul	T0
+	add	%rax, H2
+	adc	%rdx, T2
+	mov	T2, %rax
+	shr	$2, %rax
+	imul	$5, %rax
+	and	$3, XREG(T2)
+	add	%rax, H0
+	adc	H2, H1
+	adc	$0, XREG(T2)
+	mov	H0, P1305_H0 (CTX)
+	mov	H1, P1305_H1 (CTX)
+	mov	XREG(T2), P1305_H2 (CTX)
+	ret
+EPILOGUE(nettle_poly1305_block)
+
+	C poly1305_digest (struct poly1305_ctx *ctx,
+	C		   size_t length, uint8_t *digest,
+	C		   const uint8_t *s)
+	C Registers:
+	C   %rdi: ctx
+	C   %rsi: length
+	C   %rdx: digest
+	C   %rcx: s
+	
+PROLOGUE(nettle_poly1305_digest)
+	W64_ENTRY(4, 0)
+	mov	P1305_INDEX (CTX), XREG(%rax)
+	push	%rsi
+	push	%rdx
+	push	%rcx
+	test	XREG(%rax), XREG(%rax)
+	jz	.Lfinal
+
+	C Pad with a 1 byte.
+	C FIXME: OR the pad byte in, without storing to memory.
+	inc	XREG(%rax)	C Also clears high half
+	movb	$1, P1305_BLOCK-1 (CTX, %rax)
+	
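+	C Build a mask of the low 8*(count mod 8) bits (count in %rax,
+	C including the padding byte), to clear the bytes of the buffered
+	C block beyond the padded message.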
+	mov	XREG(%rax), XREG(%rcx)
+	mov	$1, T1
+	and	$7, XREG(%rcx)	
+	shl	$3, XREG(%rcx)
+	shl	LREG(%rcx), T1
+	dec	T1
+	mov	P1305_BLOCK (CTX), T0
+	xor	T2, T2
+	cmp	$8, XREG(%rax)
+	jc	.Lfinal_lt8
+	C	If %rax == 16, the shift count above was zero, so T1 == 0;
+	C	tweak so that we get T1 = -1 instead.
+	cmp	$16, XREG(%rax)
+	adc	$-1, T1
+	and	P1305_BLOCK+8 (CTX), T1
+	jmp	.Lfinal_block
+
+.Lfinal_lt8:
+	and	T1, T0
+	xor	T1, T1
+.Lfinal_block:
+
+	call	poly1305_block
+
+.Lfinal:
+
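+	C Bits 130 and up of the accumulator represent multiples of
+	C 2^130, which fold back into the low end as 5 * (H2 >> 2).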
+	mov	P1305_H0 (CTX), H0
+	mov	P1305_H1 (CTX), H1
+	mov	P1305_H2 (CTX), XREG(H2)
+	mov	XREG(H2), XREG(%rax)
+	shr	$2, XREG(%rax)
+	and	$3, H2
+	imul	$5, XREG(%rax)
+	add	%rax, H0
+	adc	$0, H1
+	adc	$0, XREG(H2)
+
+	C Add 5, use result if >= 2^130
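+	C Equivalently, compute h - (2^130 - 5) as (h + 5) mod 2^130,
+	C and select it when h + 5 reaches 2^130, i.e. the usual final
+	C conditional subtraction of the modulus.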
+	mov	$5, T0
+	xor	T1, T1
+	add	H0, T0
+	adc	H1, T1
+	adc	$0, XREG(H2)
+	cmp	$4, XREG(H2)
+	cmovnc	T0, H0
+	cmovnc	T1, H1
+
+	pop	%rcx
+	pop	%rdx
+	pop	%rsi
+
+	add	(%rcx), H0
+	adc	8(%rcx), H1
+
+	C Store, taking length into account
+	cmp	$8, %rsi
+	jc	.Ldigest_lt8
+	mov	H0, (%rdx)
+	jz	.Ldigest_done
+	cmp	$16, %rsi
+	jc	.Ldigest_lt16
+	mov	H1, 8(%rdx)
+	jmp	.Ldigest_done
+.Ldigest_lt16:
+	mov	H1, H0
+	add	$8, %rdx
+	sub	$8, %rsi
+.Ldigest_lt8:
+	movb	LREG(H0), (%rdx)
+	shr	$8, H0
+	inc	%rdx
+	dec	%rsi
+	jnz	.Ldigest_lt8
+.Ldigest_done:
+	xor	XREG(%rax), XREG(%rax)
+	mov	%rax, P1305_H0 (CTX)
+	mov	%rax, P1305_H1 (CTX)
+	mov	XREG(%rax), P1305_H2 (CTX)
+	mov	XREG(%rax), P1305_INDEX (CTX)
+	W64_EXIT(4, 0)
+	ret
+
-- 
GitLab