diff --git a/ChangeLog b/ChangeLog index 4a54c1911138df70e56d1a70a77b6e9dd7094672..e9d8c942c325f5da61be26b9828363ef42d781bf 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2007-04-05 Niels Möller <nisse@lysator.liu.se> + + * Moved in CVS tree. Also renamed directory sparc to sparc32. + 2007-02-24 Niels Möller <nisse@lysator.liu.se> * Makefile.in (clean-here): Remove .lib directory. diff --git a/Makefile.in b/Makefile.in index 3d4deb2ada1e6b240a6f7011120f03590fca13bb..efff06018ca433cc511805042a7d6d5a117f3e66 100644 --- a/Makefile.in +++ b/Makefile.in @@ -38,7 +38,7 @@ clean distclean mostlyclean maintainer-clean tags: check-here: true -# These targets aren't supported, but they are expected by the +# FIXME: Remove. These targets aren't supported, but they are expected by the # automake generated Makefiles in the lsh build. dvi installcheck uninstallcheck: true @@ -302,7 +302,7 @@ distdir: $(DISTFILES) else cp "$(srcdir)/$$f" "$(distdir)" ; \ fi ; \ done - set -e; for d in sparc x86 ; do \ + set -e; for d in sparc32 sparc64 x86 ; do \ mkdir "$(distdir)/$$d" ; \ cp $(srcdir)/$$d/*.asm $(srcdir)/$$d/*.m4 "$(distdir)/$$d" ; \ done diff --git a/configure.ac b/configure.ac index badac4ed777024a5fd9b0bd36b31a3040b3af201..8057d71041b7e5a568a69340f1339a4275c83940 100644 --- a/configure.ac +++ b/configure.ac @@ -125,7 +125,7 @@ case "$host_cpu" in #error 64-bit sparc #endif ], [], [ - asm_path=sparc + asm_path=sparc32 ], [ asm_path=sparc64 ]) diff --git a/sparc32/aes-decrypt-internal.asm b/sparc32/aes-decrypt-internal.asm new file mode 100644 index 0000000000000000000000000000000000000000..750e3d21ec35c0d70e022d308aa9ea69ecf04e3a --- /dev/null +++ b/sparc32/aes-decrypt-internal.asm @@ -0,0 +1,132 @@ +C -*- mode: asm; asm-comment-char: ?C; -*- +C nettle, low-level cryptographics library +C +C Copyright (C) 2002, 2005 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public 
License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +C MA 02111-1307, USA. + +include_src(<sparc32/aes.m4>) + +C Arguments +define(<CTX>, <%i0>) +define(<T>, <%i1>) +define(<LENGTH>,<%i2>) +define(<DST>, <%i3>) +define(<SRC>, <%i4>) + +C AES state, two copies for unrolling + +define(<W0>, <%l0>) +define(<W1>, <%l1>) +define(<W2>, <%l2>) +define(<W3>, <%l3>) + +define(<X0>, <%l4>) +define(<X1>, <%l5>) +define(<X2>, <%l6>) +define(<X3>, <%l7>) + +C %o0-%o3 are used for loop invariants T0-T3 +define(<KEY>, <%o4>) +define(<ROUND>, <%o5>) + +C %g1, %g2, %g3 are TMP1, TMP2 and TMP3 + +C The sparc32 stack frame looks like +C +C %fp - 4: OS-dependent link field +C %fp - 8: OS-dependent link field +C %fp - 104: OS register save area. 
+define(<FRAME_SIZE>, 104) + + .file "aes-decrypt-internal.asm" + + C _aes_decrypt(struct aes_context *ctx, + C const struct aes_table *T, + C unsigned length, uint8_t *dst, + C uint8_t *src) + + .section ".text" + .align 16 + .proc 020 + +PROLOGUE(_nettle_aes_decrypt) + + save %sp, -FRAME_SIZE, %sp + cmp LENGTH, 0 + be .Lend + + C Loop invariants + add T, AES_TABLE0, T0 + add T, AES_TABLE1, T1 + add T, AES_TABLE2, T2 + add T, AES_TABLE3, T3 + +.Lblock_loop: + C Read src, and add initial subkey + add CTX, AES_KEYS, KEY + AES_LOAD(0, SRC, KEY, W0) + AES_LOAD(1, SRC, KEY, W1) + AES_LOAD(2, SRC, KEY, W2) + AES_LOAD(3, SRC, KEY, W3) + + C Must be even, and includes the final round + ld [AES_NROUNDS + CTX], ROUND + add SRC, 16, SRC + add KEY, 16, KEY + + srl ROUND, 1, ROUND + C Last two rounds handled specially + sub ROUND, 1, ROUND +.Lround_loop: + C The AES_ROUND macro uses T0,... T3 + C Transform W -> X + AES_ROUND(0, W0, W3, W2, W1, KEY, X0) + AES_ROUND(1, W1, W0, W3, W2, KEY, X1) + AES_ROUND(2, W2, W1, W0, W3, KEY, X2) + AES_ROUND(3, W3, W2, W1, W0, KEY, X3) + + C Transform X -> W + AES_ROUND(4, X0, X3, X2, X1, KEY, W0) + AES_ROUND(5, X1, X0, X3, X2, KEY, W1) + AES_ROUND(6, X2, X1, X0, X3, KEY, W2) + AES_ROUND(7, X3, X2, X1, X0, KEY, W3) + + subcc ROUND, 1, ROUND + bne .Lround_loop + add KEY, 32, KEY + + C Penultimate round + AES_ROUND(0, W0, W3, W2, W1, KEY, X0) + AES_ROUND(1, W1, W0, W3, W2, KEY, X1) + AES_ROUND(2, W2, W1, W0, W3, KEY, X2) + AES_ROUND(3, W3, W2, W1, W0, KEY, X3) + + add KEY, 16, KEY + C Final round + AES_FINAL_ROUND(0, T, X0, X3, X2, X1, KEY, DST) + AES_FINAL_ROUND(1, T, X1, X0, X3, X2, KEY, DST) + AES_FINAL_ROUND(2, T, X2, X1, X0, X3, KEY, DST) + AES_FINAL_ROUND(3, T, X3, X2, X1, X0, KEY, DST) + + subcc LENGTH, 16, LENGTH + bne .Lblock_loop + add DST, 16, DST + +.Lend: + ret + restore +EPILOGUE(_nettle_aes_decrypt) diff --git a/sparc32/aes-encrypt-internal.asm b/sparc32/aes-encrypt-internal.asm new file mode 100644 index 
0000000000000000000000000000000000000000..92d6fc0e3ba49f9fc116b522d324ed9e6f811886 --- /dev/null +++ b/sparc32/aes-encrypt-internal.asm @@ -0,0 +1,156 @@ +C -*- mode: asm; asm-comment-char: ?C; -*- +C nettle, low-level cryptographics library +C +C Copyright (C) 2002, 2005 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +C MA 02111-1307, USA. + +include_src(<sparc32/aes.m4>) + +C Arguments +define(<CTX>, <%i0>) +define(<T>, <%i1>) +define(<LENGTH>,<%i2>) +define(<DST>, <%i3>) +define(<SRC>, <%i4>) + +C AES state, two copies for unrolling + +define(<W0>, <%l0>) +define(<W1>, <%l1>) +define(<W2>, <%l2>) +define(<W3>, <%l3>) + +define(<X0>, <%l4>) +define(<X1>, <%l5>) +define(<X2>, <%l6>) +define(<X3>, <%l7>) + +C %o0-%o3 are used for loop invariants T0-T3 +define(<KEY>, <%o4>) +define(<ROUND>, <%o5>) + +C %g1, %g2, %g3 are TMP1, TMP2 and TMP3 + +C I'm still slightly confused by the frame layout, specified in +C "SYSTEM V APPLICATION BINARY INTERFACE SPARC Processor Supplement". +C However, Sun's cc generates a 104 byte stack frame for a function +C with no local variables, so that should be good enough for us too. 
+ +C The sparc32 stack frame looks like +C +C %fp - 4: OS-dependent link field +C %fp - 8: OS-dependent link field +C %fp - 104: OS register save area +define(<FRAME_SIZE>, 104) + + .file "aes-encrypt-internal.asm" + + C _aes_encrypt(struct aes_context *ctx, + C const struct aes_table *T, + C unsigned length, uint8_t *dst, + C uint8_t *src) + + .section ".text" + .align 16 + .proc 020 + +PROLOGUE(_nettle_aes_encrypt) + + save %sp, -FRAME_SIZE, %sp + cmp LENGTH, 0 + be .Lend + + C Loop invariants + add T, AES_TABLE0, T0 + add T, AES_TABLE1, T1 + add T, AES_TABLE2, T2 + add T, AES_TABLE3, T3 + +.Lblock_loop: + C Read src, and add initial subkey + add CTX, AES_KEYS, KEY + AES_LOAD(0, SRC, KEY, W0) + AES_LOAD(1, SRC, KEY, W1) + AES_LOAD(2, SRC, KEY, W2) + AES_LOAD(3, SRC, KEY, W3) + + C Must be even, and includes the final round + ld [AES_NROUNDS + CTX], ROUND + add SRC, 16, SRC + add KEY, 16, KEY + + srl ROUND, 1, ROUND + C Last two rounds handled specially + sub ROUND, 1, ROUND +.Lround_loop: + C The AES_ROUND macro uses T0,... 
T3 + C Transform W -> X + AES_ROUND(0, W0, W1, W2, W3, KEY, X0) + AES_ROUND(1, W1, W2, W3, W0, KEY, X1) + AES_ROUND(2, W2, W3, W0, W1, KEY, X2) + AES_ROUND(3, W3, W0, W1, W2, KEY, X3) + + C Transform X -> W + AES_ROUND(4, X0, X1, X2, X3, KEY, W0) + AES_ROUND(5, X1, X2, X3, X0, KEY, W1) + AES_ROUND(6, X2, X3, X0, X1, KEY, W2) + AES_ROUND(7, X3, X0, X1, X2, KEY, W3) + + subcc ROUND, 1, ROUND + bne .Lround_loop + add KEY, 32, KEY + + C Penultimate round + AES_ROUND(0, W0, W1, W2, W3, KEY, X0) + AES_ROUND(1, W1, W2, W3, W0, KEY, X1) + AES_ROUND(2, W2, W3, W0, W1, KEY, X2) + AES_ROUND(3, W3, W0, W1, W2, KEY, X3) + + add KEY, 16, KEY + C Final round + AES_FINAL_ROUND(0, T, X0, X1, X2, X3, KEY, DST) + AES_FINAL_ROUND(1, T, X1, X2, X3, X0, KEY, DST) + AES_FINAL_ROUND(2, T, X2, X3, X0, X1, KEY, DST) + AES_FINAL_ROUND(3, T, X3, X0, X1, X2, KEY, DST) + + subcc LENGTH, 16, LENGTH + bne .Lblock_loop + add DST, 16, DST + +.Lend: + ret + restore +EPILOGUE(_nettle_aes_encrypt) + +C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for AES 128 + +C 1: nettle-1.13 C-code +C 2: nettle-1.13 assembler +C 3: New C-code +C 4: New assembler, first correct version +C 5: New assembler, with basic scheduling of AES_ROUND. +C 6: New assembler, with loop invariants T0-T3. +C 7: New assembler, with basic scheduling also of AES_FINAL_ROUND. 
+ +C MB/s cycles/block Code size (bytes) +C 1 1.2 1107 592 +C 2 2.3 572 1032 +C 3 2.1 627 +C 4 1.8 722 +C 5 2.6 496 +C 6 3.0 437 +C 7 3.1 415 1448 diff --git a/sparc32/aes.m4 b/sparc32/aes.m4 new file mode 100644 index 0000000000000000000000000000000000000000..05f465e014d42b4acc2bdcc998fe58ffb54fba5a --- /dev/null +++ b/sparc32/aes.m4 @@ -0,0 +1,83 @@ +C Used as temporaries by the AES macros +define(<TMP1>, <%g1>) +define(<TMP2>, <%g2>) +define(<TMP3>, <%g3>) + +C Loop invariants used by AES_ROUND +define(<T0>, <%o0>) +define(<T1>, <%o1>) +define(<T2>, <%o2>) +define(<T3>, <%o3>) + +C AES_LOAD(i, src, key, res) +define(<AES_LOAD>, < + ldub [$2 + 4*$1], $4 + ldub [$2 + 4*$1 + 1], TMP1 + ldub [$2 + 4*$1 + 2], TMP2 + sll TMP1, 8, TMP1 + + or $4, TMP1, $4 + ldub [$2 + 4*$1+3], TMP1 + sll TMP2, 16, TMP2 + or $4, TMP2, $4 + + sll TMP1, 24, TMP1 + C Get subkey + ld [$3 + 4*$1], TMP2 + or $4, TMP1, $4 + xor $4, TMP2, $4>)dnl + +C AES_ROUND(i, a, b, c, d, key, res) +C Computes one word of the AES round +C FIXME: Could use registers pointing directly to the four tables +C FIXME: Needs better instruction scheduling, and perhaps more temporaries +C Alternatively, we can use a single table and some rotations +define(<AES_ROUND>, < + and $2, 0xff, TMP1 C 0 + srl $3, 6, TMP2 C 1 + sll TMP1, 2, TMP1 C 0 + and TMP2, 0x3fc, TMP2 C 1 + ld [T0 + TMP1], $7 C 0 E0 + srl $4, 14, TMP1 C 2 + ld [T1 + TMP2], TMP2 C 1 + and TMP1, 0x3fc, TMP1 C 2 + xor $7, TMP2, $7 C 1 E1 + srl $5, 22, TMP2 C 3 + ld [T2 + TMP1], TMP1 C 2 + and TMP2, 0x3fc, TMP2 C 3 + xor $7, TMP1, $7 C 2 E2 + ld [$6 + 4*$1], TMP1 C 4 + ld [T3 + TMP2], TMP2 C 3 + xor $7, TMP1, $7 C 4 E4 + xor $7, TMP2, $7 C 3 E3 +>)dnl + +C AES_FINAL_ROUND(i, T, a, b, c, d, key, dst) +C Compute one word in the final round function. Output is converted to +C octets and stored at dst. Relies on AES_SBOX being zero. 
define(<AES_FINAL_ROUND>, < + C Load subkey + ld [$7 + 4*$1], TMP3 + + and $3, 0xff, TMP1 C 0 + srl $4, 8, TMP2 C 1 + ldub [T + TMP1], TMP1 C 0 + and TMP2, 0xff, TMP2 C 1 + xor TMP3, TMP1, TMP1 C 0 + ldub [T + TMP2], TMP2 C 1 + stb TMP1, [$8 + 4*$1] C 0 E0 + srl $5, 16, TMP1 C 2 + srl TMP3, 8, TMP3 C 1 + and TMP1, 0xff, TMP1 C 2 + xor TMP3, TMP2, TMP2 C 1 + ldub [T + TMP1], TMP1 C 2 + stb TMP2, [$8 + 4*$1 + 1] C 1 E1 + srl $6, 24, TMP2 C 3 + srl TMP3, 8, TMP3 C 2 + ldub [T + TMP2], TMP2 C 3 + xor TMP3, TMP1, TMP1 C 2 + srl TMP3, 8, TMP3 C 3 + stb TMP1, [$8 + 4*$1 + 2] C 2 E2 + xor TMP3, TMP2, TMP2 C 3 + stb TMP2, [$8 + 4*$1 + 3] C 3 E3 +>) diff --git a/sparc32/arcfour-crypt.asm b/sparc32/arcfour-crypt.asm new file mode 100644 index 0000000000000000000000000000000000000000..4d8dac948b4f172ffecc0b39a162c5483f3c9fb5 --- /dev/null +++ b/sparc32/arcfour-crypt.asm @@ -0,0 +1,230 @@ +C -*- mode: asm; asm-comment-char: ?C; -*- +C nettle, low-level cryptographics library +C +C Copyright (C) 2002, 2005 Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +C MA 02111-1307, USA. + +C Define to YES, to enable the complex code to special case SRC +C and DST with compatible alignment. 
+ +define(<WITH_ALIGN>, <YES>) + +C Registers + +define(<CTX>, <%i0>) +define(<LENGTH>,<%i1>) +define(<DST>, <%i2>) +define(<SRC>, <%i3>) + +define(<I1>, <%i4>) +define(<I2>, <%i5>) +define(<J>, <%g1>) +define(<SI>, <%g2>) +define(<SJ>, <%g3>) +define(<TMP>, <%o0>) +define(<TMP2>, <%o1>) +define(<N>, <%o2>) +define(<DATA>, <%o3>) + +C Computes the next byte of the key stream. As input, i must +C already point to the index for the current access, the index +C for the next access is stored in ni. The resulting key byte is +C stored in res. +C ARCFOUR_BYTE(i, ni, res) +define(<ARCFOUR_BYTE>, < + ldub [CTX + $1], SI + add $1, 1, $2 + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + and $2, 0xff, $2 + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + stb SJ, [CTX + $1] + ldub [CTX + SI], $3 +>)dnl + +C FIXME: Consider using the callers window +define(<FRAME_SIZE>, 104) + + .file "arcfour-crypt.asm" + + C arcfour_crypt(struct arcfour_ctx *ctx, + C unsigned length, uint8_t *dst, + C const uint8_t *src) + + .section ".text" + .align 16 + .proc 020 + +PROLOGUE(nettle_arcfour_crypt) + + save %sp, -FRAME_SIZE, %sp + cmp LENGTH, 0 + be .Lend + nop + + C Load both I and J + lduh [CTX + ARCFOUR_I], I1 + and I1, 0xff, J + srl I1, 8, I1 + + C We want an even address for DST + andcc DST, 1, %g0 + add I1, 1 ,I1 + beq .Laligned2 + and I1, 0xff, I1 + + mov I1, I2 + ldub [SRC], DATA + ARCFOUR_BYTE(I2, I1, TMP) + subcc LENGTH, 1, LENGTH + add SRC, 1, SRC + xor DATA, TMP, DATA + stb DATA, [DST] + beq .Ldone + add DST, 1, DST + +.Laligned2: + + cmp LENGTH, 2 + blu .Lfinal1 + C Harmless delay slot instruction + andcc DST, 2, %g0 + beq .Laligned4 + nop + + ldub [SRC], DATA + ARCFOUR_BYTE(I1, I2, TMP) + ldub [SRC + 1], TMP2 + add SRC, 2, SRC + xor DATA, TMP, DATA + sll DATA, 8, DATA + + ARCFOUR_BYTE(I2, I1, TMP) + xor TMP2, TMP, TMP + subcc LENGTH, 2, LENGTH + or DATA, TMP, DATA + + sth DATA, [DST] + beq .Ldone + add DST, 2, DST + +.Laligned4: + cmp LENGTH, 4 + blu .Lfinal2 + C 
Harmless delay slot instruction + srl LENGTH, 2, N + +.Loop: + C Main loop, with aligned writes + + C FIXME: Could check if SRC is aligned, and + C use 32-bit reads in that case. + + ldub [SRC], DATA + ARCFOUR_BYTE(I1, I2, TMP) + ldub [SRC + 1], TMP2 + xor TMP, DATA, DATA + sll DATA, 8, DATA + + ARCFOUR_BYTE(I2, I1, TMP) + xor TMP2, TMP, TMP + ldub [SRC + 2], TMP2 + or TMP, DATA, DATA + sll DATA, 8, DATA + + ARCFOUR_BYTE(I1, I2, TMP) + xor TMP2, TMP, TMP + ldub [SRC + 3], TMP2 + or TMP, DATA, DATA + sll DATA, 8, DATA + + ARCFOUR_BYTE(I2, I1, TMP) + xor TMP2, TMP, TMP + or TMP, DATA, DATA + subcc N, 1, N + add SRC, 4, SRC + st DATA, [DST] + bne .Loop + add DST, 4, DST + + andcc LENGTH, 3, LENGTH + beq .Ldone + nop + +.Lfinal2: + C DST address must be 2-aligned + cmp LENGTH, 2 + blu .Lfinal1 + nop + + ldub [SRC], DATA + ARCFOUR_BYTE(I1, I2, TMP) + ldub [SRC + 1], TMP2 + add SRC, 2, SRC + xor DATA, TMP, DATA + sll DATA, 8, DATA + + ARCFOUR_BYTE(I2, I1, TMP) + xor TMP2, TMP, TMP + or DATA, TMP, DATA + + sth DATA, [DST] + beq .Ldone + add DST, 2, DST + +.Lfinal1: + mov I1, I2 + ldub [SRC], DATA + ARCFOUR_BYTE(I2, I1, TMP) + xor DATA, TMP, DATA + stb DATA, [DST] + +.Ldone: + C Save back I and J + sll I2, 8, I2 + or I2, J, I2 + sth I2, [CTX + ARCFOUR_I] + +.Lend: + ret + restore + +EPILOGUE(nettle_arcfour_crypt) + +C Some stats from adriana.lysator.liu.se (SS1000E, 85 MHz), for arcfour + +C 1: nettle-1.13 C-code +C 2: First working version of the assembler code +C 3: Moved load of source byte +C 4: Better instruction scheduling +C 5: Special case SRC and DST with compatible alignment +C 6: After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI]) +C 7: Unrolled only twice, with byte-accesses +C 8: Unrolled, using 8-bit reads and aligned 32-bit writes. 
+ +C MB/s cycles/byte Code size (bytes) +C 1: 6.6 12.4 132 +C 2: 5.6 14.5 116 +C 3: 6.0 13.5 116 +C 4: 6.5 12.4 116 +C 5: 7.9 10.4 496 +C 6: 8.3 9.7 496 +C 7: 6.7 12.1 268 +C 8: 8.3 9.8 768 diff --git a/sparc32/machine.m4 b/sparc32/machine.m4 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391