diff --git a/sparc/arcfour-crypt.asm b/sparc/arcfour-crypt.asm index beadd91eb7fe67c1919e27c9a88bfe6b03e9232f..0dd9e363f18da388a733981b665779508566604b 100644 --- a/sparc/arcfour-crypt.asm +++ b/sparc/arcfour-crypt.asm @@ -18,7 +18,12 @@ C along with the nettle library; see the file COPYING.LIB. If not, write to C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, C MA 02111-1307, USA. -C Registers +C Define to YES, to enable the complex code to special case SRC +C and DST with compatible alignment. + +define(<WITH_ALIGN>, <NO>) + +C Registers define(<CTX>, <%i0>) define(<LENGTH>,<%i1>) @@ -30,9 +35,103 @@ define(<J>, <%i5>) define(<SI>, <%g1>) define(<SJ>, <%g2>) define(<TMP>, <%g3>) +define(<N>, <%o0>) +define(<WORD>, <%o1>) -C FIXME: Consider using the callers window +C Encrypts n bytes, one byte at a time. +C ARCFOUR_BYTE_LOOP(n, label): n is a register holding the byte count; it is decremented before it is tested, so it must be nonzero on entry +define(<ARCFOUR_BYTE_LOOP>, < +$2: + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + subcc $1,1,$1 + ldub [SRC], TMP + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + add SRC, 1, SRC + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + stb SJ, [CTX + I] + xor TMP, SI, TMP + stb TMP, [DST] + bne $2 + add DST, 1, DST +>)dnl + +C Encrypts 4n bytes, four at a time. Requires proper alignment of +C SRC and DST. 
+C ARCFOUR_WORD_LOOP(n, label): n is a register holding the word count; it is decremented before it is tested, so it must be nonzero on entry +define(<ARCFOUR_WORD_LOOP>, < +$2: + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + ld [SRC], WORD + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], TMP + stb SJ, [CTX + I] + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + add SRC, 4, SRC + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + sll TMP, 8, TMP + stb SJ, [CTX + I] + or TMP, SI, TMP + + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + subcc $1, 1, $1 + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + sll TMP, 8, TMP + stb SJ, [CTX + I] + or TMP, SI, TMP + + add I, 1, I + and I, 0xff, I + ldub [CTX + I], SI + C empty slot + add J, SI, J + and J, 0xff, J + ldub [CTX + J], SJ + stb SI, [CTX + J] + add SI, SJ, SI + and SI, 0xff, SI + ldub [CTX + SI], SI + sll TMP, 8, TMP + stb SJ, [CTX + I] + or TMP, SI, TMP + xor WORD, TMP, WORD + st WORD, [DST] + + bne $2 + add DST, 4, DST +>)dnl + +C FIXME: Consider using the caller's window define(<FRAME_SIZE>, 104) .file "arcfour-crypt.asm" @@ -56,26 +155,44 @@ PROLOGUE(nettle_arcfour_crypt) and I, 0xff, J srl I, 8, I -.Loop: - add I, 1, I - and I, 0xff, I - ldub [CTX + I], SI - subcc LENGTH,1,LENGTH - ldub [SRC], TMP - add J, SI, J - and J, 0xff, J - ldub [CTX + J], SJ - add SRC, 1, SRC - stb SI, [CTX + J] - add SI, SJ, SI - and SI, 0xff, SI - ldub [CTX + SI], SI - stb SJ, [CTX + I] - xor TMP, SI, TMP - stb TMP, [DST] - bne .Loop - add DST, 1, DST +ifelse(WITH_ALIGN, YES, < + C Check if SRC and DST have compatible alignment + xor SRC, DST, TMP + andcc TMP, 3, TMP + + bne .Lrest + nop + + andcc DST, 3, N + bz .Laligned + nop + + sub N, 4, N + neg N + cmp N, LENGTH + bgeu .Lrest + nop + + sub LENGTH, N, LENGTH + + ARCFOUR_BYTE_LOOP(N, .Lunalignedloop) + +.Laligned: + srl LENGTH, 2, N + cmp N, 
0 + be .Lrest + nop + + ARCFOUR_WORD_LOOP(N, .Lalignedloop) + + andcc LENGTH, 3, LENGTH + bz .Ldone + nop +>) +.Lrest: + ARCFOUR_BYTE_LOOP(LENGTH, .Loop) +.Ldone: C Save back I and J sll I, 8, I or I, J, I @@ -93,9 +210,11 @@ C 1: nettle-1.13 C-code C 2: First working version of the assembler code C 3: Moved load of source byte C 4: Better instruction scheduling +C 5: Special case SRC and DST with compatible alignment C MB/s cycles/byte Code size (bytes) C 1: 6.6 12.4 132 C 2: 5.6 14.5 116 C 3: 6.0 13.5 116 C 4: 6.5 12.4 116 +C 5: 7.9 10.4 496