From e8ddf9577049a929adc0fbc8b7f418be9f6b8448 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 13 Mar 2013 10:02:32 +0100
Subject: [PATCH] x86_64 assembly for sha512.

---
 ChangeLog                  |   4 +
 configure.ac               |   4 +-
 x86_64/machine.m4          |   4 +
 x86_64/sha512-compress.asm | 196 +++++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 x86_64/sha512-compress.asm

diff --git a/ChangeLog b/ChangeLog
index 00837d45..074a9007 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2013-03-13  Niels Möller  <nisse@lysator.liu.se>
 
+	* configure.ac (asm_replace_list): Added sha512-compress.asm.
+	* x86_64/machine.m4 (OFFSET64): New macro.
+	* x86_64/sha512-compress.asm: New file, 20% speedup.
+
 	* sha512-compress.c (ROUND): Eliminated a temporary, analogous to
 	sha256 change below.
 
diff --git a/configure.ac b/configure.ac
index ecf38090..25695b6a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -249,7 +249,9 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		md5-compress.asm memxor.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
-		sha1-compress.asm sha256-compress.asm machine.m4"
+		sha1-compress.asm sha256-compress.asm sha512-compress.asm \
+		machine.m4"
+
 # Assembler files which generate additional object files if they are used.
 asm_optional_list=""
 
diff --git a/x86_64/machine.m4 b/x86_64/machine.m4
index b6f353c9..dc23dde8 100644
--- a/x86_64/machine.m4
+++ b/x86_64/machine.m4
@@ -2,6 +2,10 @@ C OFFSET(i)
 C Expands to 4*i, or to the empty string if i is zero
 define(<OFFSET>, <ifelse($1,0,,eval(4*$1))>)
 
+C OFFSET64(i)
+C Expands to 8*i, or to the empty string if i is zero
+define(<OFFSET64>, <ifelse($1,0,,eval(8*$1))>)
+
 dnl LREG(reg) gives the 8-bit register corresponding to the given 64-bit register.
 define(<LREG>,<ifelse(
 	$1, %rax, %al,
diff --git a/x86_64/sha512-compress.asm b/x86_64/sha512-compress.asm
new file mode 100644
index 00000000..3b0391a6
--- /dev/null
+++ b/x86_64/sha512-compress.asm
@@ -0,0 +1,196 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02111-1301, USA.
+
+	.file "sha512-compress.asm"
+define(<STATE>, <%rdi>)
+define(<INPUT>, <%rsi>)
+define(<K>, <%rdx>)
+define(<SA>, <%rax>)
+define(<SB>, <%rbx>)
+define(<SC>, <%rcx>)
+define(<SD>, <%r8>)
+define(<SE>, <%r9>)
+define(<SF>, <%r10>)
+define(<SG>, <%r11>)
+define(<SH>, <%r12>)
+define(<T0>, <%r13>)
+define(<T1>, <%rdi>)	C Overlap STATE
+define(<COUNT>, <%r14>)
+define(<W>, <%r15>)
+
+define(<EXPN>, <
+	mov	OFFSET64($1)(%rsp), W
+	mov	OFFSET64(eval(($1 + 14) % 16))(%rsp), T0
+	mov	T0, T1
+	shr	<$>6, T0
+	rol	<$>3, T1
+	xor	T1, T0
+	rol	<$>42, T1
+	xor	T1, T0
+	add	T0, W
+	mov	OFFSET64(eval(($1 + 1) % 16))(%rsp), T0
+	mov	T0, T1
+	shr	<$>7, T0
+	rol	<$>56, T1
+	xor	T1, T0
+	rol	<$>7, T1
+	xor	T1, T0
+	add	T0, W
+	add	OFFSET64(eval(($1 + 9) % 16))(%rsp), W
+	mov	W, OFFSET64($1)(%rsp)
+>)
+
+C ROUND(A,B,C,D,E,F,G,H,K)
+C
+C H += S1(E) + Choice(E,F,G) + K + W
+C D += H
+C H += S0(A) + Majority(A,B,C)
+C
+C Where
+C
+C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
+C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
+C Choice (E, F, G) = G^(E&(F^G))
+C Majority (A,B,C) = (A&B) + (C&(A^B))
+
+define(<ROUND>, <
+	mov	$5, T0
+	mov	$5, T1
+	rol	<$>23, T0
+	rol	<$>46, T1
+	xor	T0, T1
+	rol	<$>27, T0
+	xor	T0, T1
+	add	W, $8
+	add	T1, $8
+	mov	$7, T0
+	xor	$6, T0
+	and	$5, T0
+	xor	$7, T0
+	add	OFFSET64($9)(K,COUNT,8), $8
+	add	T0, $8
+	add	$8, $4
+
+	mov	$1, T0
+	mov	$1, T1
+	rol	<$>25, T0
+	rol	<$>30, T1
+	xor	T0, T1
+	rol	<$>11, T0
+	xor	T0, T1
+	add	T1, $8
+	mov	$1, T0
+	mov	$1, T1
+	and	$2, T0
+	xor	$2, T1
+	add	T0, $8
+	and	$3, T1
+	add	T1, $8
+>)
+
+define(<NOEXPN>, <
+	mov	OFFSET64($1)(INPUT, COUNT, 8), W
+	bswap	W
+	mov	W, OFFSET64($1)(%rsp, COUNT, 8)
+>)
+
+	C void
+	C _nettle_sha512_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+
+	.text
+	ALIGN(4)
+
+PROLOGUE(_nettle_sha512_compress)
+	W64_ENTRY(3, 0)
+
+	sub	$184, %rsp
+	mov	%rbx, 128(%rsp)
+	mov	STATE, 136(%rsp)	C Save state, to free a register
+	mov	%rbp, 144(%rsp)
+	mov	%r12, 152(%rsp)
+	mov	%r13, 160(%rsp)
+	mov	%r14, 168(%rsp)
+	mov	%r15, 176(%rsp)
+
+	mov	(STATE),   SA
+	mov	8(STATE),  SB
+	mov	16(STATE),  SC
+	mov	24(STATE), SD
+	mov	32(STATE), SE
+	mov	40(STATE), SF
+	mov	48(STATE), SG
+	mov	56(STATE), SH
+	xor	COUNT, COUNT
+	ALIGN(4)
+
+.Loop1:
+	NOEXPN(0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,0)
+	NOEXPN(1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG,1)
+	NOEXPN(2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF,2)
+	NOEXPN(3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE,3)
+	NOEXPN(4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD,4)
+	NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
+	NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
+	NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
+	add	$8, COUNT
+	cmp	$16, COUNT
+	jne	.Loop1
+
+.Loop2:
+	EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,0)
+	EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG,1)
+	EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF,2)
+	EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE,3)
+	EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD,4)
+	EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
+	EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
+	EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
+	EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH,8)
+	EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG,9)
+	EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF,10)
+	EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE,11)
+	EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD,12)
+	EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,13)
+	EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,14)
+	EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,15)
+	add	$16, COUNT
+	cmp	$80, COUNT
+	jne	.Loop2
+
+	mov	136(%rsp), STATE
+
+	add	SA, (STATE)
+	add	SB, 8(STATE)
+	add	SC, 16(STATE)
+	add	SD, 24(STATE)
+	add	SE, 32(STATE)
+	add	SF, 40(STATE)
+	add	SG, 48(STATE)
+	add	SH, 56(STATE)
+
+	mov	128(%rsp), %rbx
+	mov	144(%rsp), %rbp
+	mov	152(%rsp), %r12
+	mov	160(%rsp), %r13
+	mov	168(%rsp),%r14
+	mov	176(%rsp),%r15
+
+	add	$184, %rsp
+	ret
+EPILOGUE(_nettle_sha512_compress)
-- 
GitLab