From 4f4fd79d345cad9c9eb1e5fc5d264c4ff72a5863 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Sun, 23 Oct 2005 21:07:07 +0200
Subject: [PATCH] New file, almost the same as sparc/arcfour-crypt.asm.

Rev: src/nettle/sparc64/arcfour-crypt.asm:1.1
---
 sparc64/arcfour-crypt.asm | 217 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 sparc64/arcfour-crypt.asm

diff --git a/sparc64/arcfour-crypt.asm b/sparc64/arcfour-crypt.asm
new file mode 100644
index 00000000..0e407d17
--- /dev/null
+++ b/sparc64/arcfour-crypt.asm
@@ -0,0 +1,217 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-  
+C nettle, low-level cryptographic library
+C 
+C Copyright (C) 2002, 2005 Niels Möller
+C  
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C 
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C 
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+C	Define to YES to enable the more complex code that special-cases
+C	SRC and DST with compatible alignment.
+	
+define(<WITH_ALIGN>, <YES>)
+
+C	Registers
+
+define(<CTX>,	<%i0>)
+define(<LENGTH>,<%i1>)
+define(<DST>,	<%i2>)
+define(<SRC>,	<%i3>)
+
+define(<I1>,	<%i4>)
+define(<I2>,	<%i5>)
+define(<J>,	<%g1>)
+define(<SI>,	<%g2>)
+define(<SJ>,	<%g3>)
+define(<TMP>,	<%o0>)
+define(<TMP2>,	<%o1>)
+define(<N>,	<%o2>)
+define(<DATA>,	<%o3>)
+
+C	Computes the next byte of the key stream. On entry, i must
+C	already hold the index for the current access; the index for
+C	the next access is stored in ni. The resulting key stream byte
+C	is stored in res.
+C	ARCFOUR_BYTE(i, ni, res)
+define(<ARCFOUR_BYTE>, <
+	ldub	[CTX + $1], SI
+	add	$1, 1, $2
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	and	$2, 0xff, $2
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	stb	SJ, [CTX + $1]
+	ldub	[CTX + SI], $3
+>)dnl
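+
+C	For reference, ARCFOUR_BYTE(i, ni, res) corresponds roughly to
+C	the following C sketch (assuming the state array S is the first
+C	member of struct arcfour_ctx, as declared in arcfour.h):
+C
+C	  si = S[i];  ni = (i + 1) & 0xff;
+C	  j = (j + si) & 0xff;
+C	  sj = S[j];  S[j] = si;  S[i] = sj;
+C	  res = S[(si + sj) & 0xff];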
+			
+define(<FRAME_SIZE>, 192)
+
+	.file "arcfour-crypt.asm"
+
+	C arcfour_crypt(struct arcfour_ctx *ctx,
+	C               unsigned length, uint8_t *dst,
+	C               const uint8_t *src)
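+
+	C A typical call from C (sketch; the exact prototypes are
+	C declared in arcfour.h):
+	C
+	C   struct arcfour_ctx ctx;
+	C   arcfour_set_key(&ctx, key_length, key);
+	C   arcfour_crypt(&ctx, length, dst, src);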
+
+	.section	".text"
+	.align 16
+	.proc	020
+	
+PROLOGUE(nettle_arcfour_crypt)
+
+	save	%sp, -FRAME_SIZE, %sp
+	cmp	LENGTH, 0
+	be	.Lend
+	nop
+	
+	C	Load both I and J
+	lduh	[CTX + ARCFOUR_I], I1
+	and	I1, 0xff, J
+	srl	I1, 8, I1
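+	C	(i is kept in the upper byte of the halfword, j in the
+	C	lower byte)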
+
+	C	We want an even address for DST
+	andcc	DST, 1, %g0
+	add	I1, 1, I1
+	beq	.Laligned2
+	and	I1, 0xff, I1
+
+	mov	I1, I2
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I2, I1, TMP)
+	subcc	LENGTH, 1, LENGTH
+	add	SRC, 1, SRC
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
+	beq	.Ldone
+	add	DST, 1, DST
+
+.Laligned2:
+
+	cmp	LENGTH, 2
+	blu	.Lfinal1
+	C	Harmless delay slot instruction	
+	andcc	DST, 2, %g0
+	beq	.Laligned4
+	nop
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	add	SRC, 2, SRC
+	xor	DATA, TMP, DATA
+	sll	DATA, 8, DATA	
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	subcc	LENGTH, 2, LENGTH
+	or	DATA, TMP, DATA
+
+	sth	DATA, [DST]
+	beq	.Ldone
+	add	DST, 2, DST
+	
+.Laligned4:
+	cmp	LENGTH, 4
+	blu	.Lfinal2
+	C	Harmless delay slot instruction
+	srl	LENGTH, 2, N
+	
+.Loop:
+	C	Main loop, with aligned writes
+	
+	C	FIXME: Could check if SRC is aligned, and
+	C	use 32-bit reads in that case.
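+
+	C	The four output bytes are assembled most significant byte
+	C	first, so the single 32-bit store at the end of the loop
+	C	writes them to DST in memory order (SPARC is big-endian).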
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	xor	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	ldub	[SRC + 2], TMP2
+	or	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I1, I2, TMP)
+	xor	TMP2, TMP, TMP
+	ldub	[SRC + 3], TMP2
+	or	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	or	TMP, DATA, DATA
+	subcc	N, 1, N
+	add	SRC, 4, SRC
+	st	DATA, [DST]
+	bne	.Loop
+	add	DST, 4, DST
+	
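+	C	Handle any remaining 1-3 bytes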
+	andcc	LENGTH, 3, LENGTH
+	beq	.Ldone
+	nop
+
+.Lfinal2:
+	C	DST address must be 2-aligned
+	cmp	LENGTH, 2
+	blu	.Lfinal1
+	nop
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	add	SRC, 2, SRC
+	xor	DATA, TMP, DATA
+	sll	DATA, 8, DATA	
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	or	DATA, TMP, DATA
+
+	sth	DATA, [DST]
+	beq	.Ldone
+	add	DST, 2, DST
+
+.Lfinal1:
+	mov	I1, I2
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
+
+.Ldone:
+	C	Save back I and J
+	sll	I2, 8, I2
+	or	I2, J, I2
+	sth	I2, [CTX + ARCFOUR_I]
+
+.Lend:
+	ret
+	restore
+
+EPILOGUE(nettle_arcfour_crypt)
+
+C	Stats for arcfour on sellafield.lysator.liu.se (UE450, 296 MHz)
+
+C 1:	nettle-1.13 C-code
+C 2:	New assembler code (basically the same as for sparc32)
+
+C	MB/s	cycles/byte
+C 1:	3.6	77.7
+C 2:	21.8	13.0
-- 
GitLab