From 4f4fd79d345cad9c9eb1e5fc5d264c4ff72a5863 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Sun, 23 Oct 2005 21:07:07 +0200
Subject: [PATCH] New file, almost the same as sparc/arcfour-crypt.asm.

Rev: src/nettle/sparc64/arcfour-crypt.asm:1.1
---
 sparc64/arcfour-crypt.asm | 217 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 sparc64/arcfour-crypt.asm

diff --git a/sparc64/arcfour-crypt.asm b/sparc64/arcfour-crypt.asm
new file mode 100644
index 00000000..0e407d17
--- /dev/null
+++ b/sparc64/arcfour-crypt.asm
@@ -0,0 +1,217 @@
+C -*- mode: asm; asm-comment-char: ?C; -*-
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2002, 2005 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+C Define to YES, to enable the complex code to special case SRC
+C and DST with compatible alignment.
+
+define(<WITH_ALIGN>, <YES>)
+
+C Registers
+
+define(<CTX>,	<%i0>)
+define(<LENGTH>,<%i1>)
+define(<DST>,	<%i2>)
+define(<SRC>,	<%i3>)
+
+define(<I1>,	<%i4>)
+define(<I2>,	<%i5>)
+define(<J>,	<%g1>)
+define(<SI>,	<%g2>)
+define(<SJ>,	<%g3>)
+define(<TMP>,	<%o0>)
+define(<TMP2>,	<%o1>)
+define(<N>,	<%o2>)
+define(<DATA>,	<%o3>)
+
+C Computes the next byte of the key stream. As input, i must
+C already point to the index for the current access, the index
+C for the next access is stored in ni. The resulting key byte is
+C stored in res.
+C ARCFOUR_BYTE(i, ni, res)
+define(<ARCFOUR_BYTE>, <
+	ldub	[CTX + $1], SI
+	add	$1, 1, $2
+	add	J, SI, J
+	and	J, 0xff, J
+	ldub	[CTX + J], SJ
+	and	$2, 0xff, $2
+	stb	SI, [CTX + J]
+	add	SI, SJ, SI
+	and	SI, 0xff, SI
+	stb	SJ, [CTX + $1]
+	ldub	[CTX + SI], $3
+>)dnl

+define(<FRAME_SIZE>, 192)
+
+	.file "arcfour-crypt.asm"
+
+	C arcfour_crypt(struct arcfour_ctx *ctx,
+	C               unsigned length, uint8_t *dst,
+	C               const uint8_t *src)
+
+	.section	".text"
+	.align 16
+	.proc	020
+
+PROLOGUE(nettle_arcfour_crypt)
+
+	save	%sp, -FRAME_SIZE, %sp
+	cmp	LENGTH, 0
+	be	.Lend
+	nop
+
+	C Load both I and J
+	lduh	[CTX + ARCFOUR_I], I1
+	and	I1, 0xff, J
+	srl	I1, 8, I1
+
+	C We want an even address for DST
+	andcc	DST, 1, %g0
+	add	I1, 1 ,I1
+	beq	.Laligned2
+	and	I1, 0xff, I1
+
+	mov	I1, I2
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I2, I1, TMP)
+	subcc	LENGTH, 1, LENGTH
+	add	SRC, 1, SRC
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
+	beq	.Ldone
+	add	DST, 1, DST
+
+.Laligned2:
+
+	cmp	LENGTH, 2
+	blu	.Lfinal1
+	C Harmless delay slot instruction
+	andcc	DST, 2, %g0
+	beq	.Laligned4
+	nop
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	add	SRC, 2, SRC
+	xor	DATA, TMP, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	subcc	LENGTH, 2, LENGTH
+	or	DATA, TMP, DATA
+
+	sth	DATA, [DST]
+	beq	.Ldone
+	add	DST, 2, DST
+
+.Laligned4:
+	cmp	LENGTH, 4
+	blu	.Lfinal2
+	C Harmless delay slot instruction
+	srl	LENGTH, 2, N
+
+.Loop:
+	C Main loop, with aligned writes
+
+	C FIXME: Could check if SRC is aligned, and
+	C use 32-bit reads in that case.
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	xor	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	ldub	[SRC + 2], TMP2
+	or	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I1, I2, TMP)
+	xor	TMP2, TMP, TMP
+	ldub	[SRC + 3], TMP2
+	or	TMP, DATA, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	or	TMP, DATA, DATA
+	subcc	N, 1, N
+	add	SRC, 4, SRC
+	st	DATA, [DST]
+	bne	.Loop
+	add	DST, 4, DST
+
+	andcc	LENGTH, 3, LENGTH
+	beq	.Ldone
+	nop
+
+.Lfinal2:
+	C DST address must be 2-aligned
+	cmp	LENGTH, 2
+	blu	.Lfinal1
+	nop
+
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I1, I2, TMP)
+	ldub	[SRC + 1], TMP2
+	add	SRC, 2, SRC
+	xor	DATA, TMP, DATA
+	sll	DATA, 8, DATA
+
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	TMP2, TMP, TMP
+	or	DATA, TMP, DATA
+
+	sth	DATA, [DST]
+	beq	.Ldone
+	add	DST, 2, DST
+
+.Lfinal1:
+	mov	I1, I2
+	ldub	[SRC], DATA
+	ARCFOUR_BYTE(I2, I1, TMP)
+	xor	DATA, TMP, DATA
+	stb	DATA, [DST]
+
+.Ldone:
+	C Save back I and J
+	sll	I2, 8, I2
+	or	I2, J, I2
+	stuh	I2, [CTX + ARCFOUR_I]
+
+.Lend:
+	ret
+	restore
+
+EPILOGUE(nettle_arcfour_crypt)
+
+C Stats for arcfour on sellafield.lysator.liu.se (UE450, 296 MHz)
+
+C 1: nettle-1.13 C-code
+C 2: New assembler code (basically the same as for sparc32)
+
+C	MB/s	cycles/byte
+C 1:	 3.6	77.7
+C 2:	21.8	13.0
-- 
GitLab
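
The ARCFOUR_BYTE macro in this patch is the standard RC4 key-stream step (increment i, add S[i] to j, swap S[i] and S[j], emit S[S[i] + S[j]]); the unrolled .Loop body computes four such bytes and packs them into a single aligned 32-bit store, which is why the prologue first brings DST up to 2-byte and then 4-byte alignment. The C sketch below restates that per-byte logic. It assumes a context layout consistent with the asm's use of ARCFOUR_I (a 256-byte S array followed by the byte indices i and j); the struct and function names here are illustrative only, not nettle's own C implementation, and the sketch does not reproduce the aligned-store buffering.

#include <stdint.h>

/* Hypothetical mirror of the context layout the assembler assumes:
   256-byte S-box, then the two byte-sized indices i and j. */
struct arcfour_ctx_sketch
{
  uint8_t S[256];
  uint8_t i;
  uint8_t j;
};

void
arcfour_crypt_ref(struct arcfour_ctx_sketch *ctx,
                  unsigned length, uint8_t *dst, const uint8_t *src)
{
  uint8_t i = ctx->i;   /* lduh [CTX + ARCFOUR_I]: i in the high byte */
  uint8_t j = ctx->j;   /* ... and j in the low byte                  */

  while (length--)
    {
      /* One ARCFOUR_BYTE expansion. */
      i++;                            /* index kept pre-incremented in I1/I2 */
      uint8_t si = ctx->S[i];         /* ldub [CTX + i], SI                  */
      j += si;                        /* add J, SI, J; and J, 0xff, J        */
      uint8_t sj = ctx->S[j];         /* ldub [CTX + J], SJ                  */
      ctx->S[j] = si;                 /* stb SI, [CTX + J]                   */
      ctx->S[i] = sj;                 /* stb SJ, [CTX + i]                   */

      /* ldub [CTX + SI] picks the key byte; xor it into the data. */
      *dst++ = *src++ ^ ctx->S[(uint8_t)(si + sj)];
    }

  /* Save back I and J, as the .Ldone code does with a halfword store. */
  ctx->i = i;
  ctx->j = j;
}

In the assembler the i index alternates between the I1 and I2 registers, so each ARCFOUR_BYTE expansion can compute the index for the next call while the current one is still in use; in the C restatement that bookkeeping collapses to the i++ at the top of the loop.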