Commit 4319568d authored by Niels Möller
Browse files

Tried unrolling only twice.

Rev: src/nettle/sparc/arcfour-crypt.asm:1.7
parent 379fc214
...@@ -30,107 +30,34 @@ define(<LENGTH>,<%i1>) ...@@ -30,107 +30,34 @@ define(<LENGTH>,<%i1>)
define(<DST>, <%i2>) define(<DST>, <%i2>)
define(<SRC>, <%i3>) define(<SRC>, <%i3>)
define(<I>, <%i4>) define(<I1>, <%i4>)
define(<J>, <%i5>) define(<I2>, <%i5>)
define(<SI>, <%g1>) define(<J>, <%g1>)
define(<SJ>, <%g2>) define(<SI>, <%g2>)
define(<TMP>, <%g3>) define(<SJ>, <%g3>)
define(<N>, <%o0>) define(<TMP>, <%o0>)
define(<WORD>, <%o1>) define(<N>, <%o1>)
define(<DATA>, <%o2>)
C Encrypts n bytes, one byte at a time.
C ARCFOUR_BYTE_LOOP(n, label) C Computes the next byte of the key stream. As input, i must
define(<ARCFOUR_BYTE_LOOP>, < C already point to the index for the current access, the index
$2: C for the next access is stored in ni. The resulting key byte is
add I, 1, I C stored in res.
and I, 0xff, I C ARCFOUR_BYTE(i, ni, res)
ldub [CTX + I], SI define(<ARCFOUR_BYTE>, <
subcc $1,1,$1 ldub [CTX + $1], SI
ldub [SRC], TMP add $1, 1, $2
add J, SI, J add J, SI, J
and J, 0xff, J and J, 0xff, J
ldub [CTX + J], SJ ldub [CTX + J], SJ
add SRC, 1, SRC and $2, 0xff, $2
stb SI, [CTX + J] stb SI, [CTX + J]
add SI, SJ, SI add SI, SJ, SI
and SI, 0xff, SI and SI, 0xff, SI
stb SJ, [CTX + I] stb SJ, [CTX + $1]
ldub [CTX + SI], SI ldub [CTX + SI], $3
xor TMP, SI, TMP
stb TMP, [DST]
bne $2
add DST, 1, DST
>)dnl >)dnl
C Encrypts 4n bytes, four at a time. Requires proper alignment of
C SRC and DST.
C ARCFOUR_WORD_LOOP(n, label)
define(<ARCFOUR_WORD_LOOP>, <
$2:
add I, 1, I
and I, 0xff, I
ldub [CTX + I], SI
ld [SRC], WORD
add J, SI, J
and J, 0xff, J
ldub [CTX + J], SJ
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
stb SJ, [CTX + I]
ldub [CTX + SI], TMP
add I, 1, I
and I, 0xff, I
ldub [CTX + I], SI
add SRC, 4, SRC
add J, SI, J
and J, 0xff, J
ldub [CTX + J], SJ
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
stb SJ, [CTX + I]
ldub [CTX + SI], SI
sll TMP, 8, TMP
or TMP, SI, TMP
add I, 1, I
and I, 0xff, I
ldub [CTX + I], SI
subcc $1, 1, $1
add J, SI, J
and J, 0xff, J
ldub [CTX + J], SJ
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
stb SJ, [CTX + I]
ldub [CTX + SI], SI
sll TMP, 8, TMP
or TMP, SI, TMP
add I, 1, I
and I, 0xff, I
ldub [CTX + I], SI
C empty slot
add J, SI, J
and J, 0xff, J
ldub [CTX + J], SJ
stb SI, [CTX + J]
add SI, SJ, SI
and SI, 0xff, SI
stb SJ, [CTX + I]
ldub [CTX + SI], SI
sll TMP, 8, TMP
or TMP, SI, TMP
xor WORD, TMP, WORD
st WORD, [DST]
bne $2
add DST, 4, DST
>)dnl
C FIXME: Consider using the caller's window C FIXME: Consider using the caller's window
define(<FRAME_SIZE>, 104) define(<FRAME_SIZE>, 104)
...@@ -151,52 +78,48 @@ PROLOGUE(nettle_arcfour_crypt) ...@@ -151,52 +78,48 @@ PROLOGUE(nettle_arcfour_crypt)
be .Lend be .Lend
C Load both I and J C Load both I and J
lduh [CTX + ARCFOUR_I], I lduh [CTX + ARCFOUR_I], I1
and I, 0xff, J and I1, 0xff, J
srl I, 8, I srl I1, 8, I1
ifelse(WITH_ALIGN, YES, < andcc LENGTH, 1, %g0
C Check if SRC and DST have compatible alignment beq .Loop
xor SRC, DST, TMP
andcc TMP, 3, TMP
bne .Lrest add I1, 1 ,I1
nop and I1, 0xff, I1
andcc DST, 3, N
bz .Laligned
nop
sub N, 4, N
neg N
cmp N, LENGTH
bgeu .Lrest
nop
sub LENGTH, N, LENGTH
ARCFOUR_BYTE_LOOP(N, .Lunalignedloop)
.Laligned: ARCFOUR_BYTE(I1, I2, TMP)
srl LENGTH, 2, N ldub [SRC], DATA
cmp N, 0 subcc LENGTH, 1, LENGTH
be .Lrest add SRC, 1, SRC
nop xor DATA, TMP, DATA
stb DATA, [DST]
ARCFOUR_WORD_LOOP(N, .Lalignedloop) beq .Ldone
add DST, 1, DST
andcc LENGTH, 3, LENGTH mov I2, I1
bz .Ldone .Loop:
nop ARCFOUR_BYTE(I1, I2, TMP)
>) ldub [SRC], DATA
.Lrest: add SRC, 2, SRC
ARCFOUR_BYTE_LOOP(LENGTH, .Loop) xor DATA, TMP, DATA
stb DATA, [DST]
ARCFOUR_BYTE(I2, I1, TMP)
ldub [SRC - 1], DATA
subcc LENGTH, 2, LENGTH
add DST, 2, DST
xor DATA, TMP, DATA
bne .Loop
stb DATA, [DST - 1]
mov I2, I1
.Ldone: .Ldone:
C Save back I and J C Save back I and J
sll I, 8, I sll I1, 8, I1
or I, J, I or I1, J, I1
stuh I, [CTX + ARCFOUR_I] stuh I1, [CTX + ARCFOUR_I]
.Lend: .Lend:
ret ret
...@@ -212,6 +135,7 @@ C 3: Moved load of source byte ...@@ -212,6 +135,7 @@ C 3: Moved load of source byte
C 4: Better instruction scheduling C 4: Better instruction scheduling
C 5: Special case SRC and DST with compatible alignment C 5: Special case SRC and DST with compatible alignment
C 6: After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI]) C 6: After bugfix (reorder of ld [CTX+SI+SJ] and st [CTX + SI])
C 7: Unrolled only twice, with byte-accesses
C MB/s cycles/byte Code size (bytes) C MB/s cycles/byte Code size (bytes)
C 1: 6.6 12.4 132 C 1: 6.6 12.4 132
...@@ -220,3 +144,4 @@ C 3: 6.0 13.5 116 ...@@ -220,3 +144,4 @@ C 3: 6.0 13.5 116
C 4: 6.5 12.4 116 C 4: 6.5 12.4 116
C 5: 7.9 10.4 496 C 5: 7.9 10.4 496
C 6: 8.3 9.7 496 C 6: 8.3 9.7 496
C 7: 6.7 12.1 268
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment