Skip to content
Snippets Groups Projects
Commit 644dc4fd authored by Niels Möller's avatar Niels Möller
Browse files

Improved loop logic, and unrolled

loop twice. Gave a modest speedup.

Rev: nettle/x86/arcfour-crypt.asm:1.2
parent 7dcba661
No related branches found
No related tags found
No related merge requests found
...@@ -38,48 +38,69 @@ C Input arguments: ...@@ -38,48 +38,69 @@ C Input arguments:
C src = 32(%esp) C src = 32(%esp)
C Register usage: C Register usage:
C %ebp = ctx C %ebp = ctx
C %esi = src (updated throughout loop) C %esi = src
C %edi = dst (updated throughout loop) C %edi = dst
C %edx = src + length (end of source area) C %edx = loop counter
C %eax = i C %eax = i
C %ebx = j C %ebx = j
C %cl = si C %cl = si
C %ch = sj C %ch = sj
movl 24(%esp), %edx C length movl 24(%esp), %edx C length
testl %edx,%edx
jz .Lend
movl 20(%esp), %ebp C ctx movl 20(%esp), %ebp C ctx
movl 28(%esp), %edi movl 28(%esp), %edi C dst
movl 32(%esp), %esi movl 32(%esp), %esi C src
addl %esi, %edx C Keep src + length
lea (%edx, %edi), %edi
lea (%edx, %esi), %esi
negl %edx
jnc .Lend
movzbl ARCFOUR_I (%ebp), %eax C i movzbl ARCFOUR_I (%ebp), %eax C i
movzbl ARCFOUR_J (%ebp), %ebx C j movzbl ARCFOUR_J (%ebp), %ebx C j
incb %al
sarl $1, %edx
jc .Lloop_odd
ALIGN(4)
.Lloop: .Lloop:
C incb %al movb (%ebp, %eax), %cl C si.
addb %cl, %bl
movb (%ebp, %ebx), %ch C sj
movb %ch, (%ebp, %eax) C S[i] = sj
incl %eax incl %eax
andl $0xff, %eax movzbl %al, %eax
movzbl (%ebp, %eax), %ecx C si. Clears high bytes movb %cl, (%ebp, %ebx) C S[j] = si
addb %ch, %cl
movzbl %cl, %ecx C Clear, so it can be used
C for indexing.
movb (%ebp, %ecx), %cl
xorb (%esi, %edx, 2), %cl
movb %cl, (%edi, %edx, 2)
C FIXME: Could exchange cl and ch in the second half
C and try to interleave instructions better.
.Lloop_odd:
movb (%ebp, %eax), %cl C si.
addb %cl, %bl addb %cl, %bl
C The addl andl is preferable on PPro and PII, but slows things down on AMD Duron.
C addl %ecx, %ebx
C andl $0xff, %ebx
movb (%ebp, %ebx), %ch C sj movb (%ebp, %ebx), %ch C sj
movb %ch, (%ebp, %eax) C S[i] = sj movb %ch, (%ebp, %eax) C S[i] = sj
incl %eax
movzbl %al, %eax
movb %cl, (%ebp, %ebx) C S[j] = si movb %cl, (%ebp, %ebx) C S[j] = si
addb %ch, %cl addb %ch, %cl
movzbl %cl, %ecx C Clear, so it can be used movzbl %cl, %ecx C Clear, so it can be used
C for indexing. C for indexing.
movb (%ebp, %ecx), %cl movb (%ebp, %ecx), %cl
xorb (%esi), %cl xorb 1(%esi, %edx, 2), %cl
incl %esi incl %edx
movb %cl, (%edi) movb %cl, -1(%edi, %edx, 2)
incl %edi
cmpl %esi, %edx jnz .Lloop
jne .Lloop
C .Lloop_done:
decb %al
movb %al, ARCFOUR_I (%ebp) C Store the new i and j. movb %al, ARCFOUR_I (%ebp) C Store the new i and j.
movb %bl, ARCFOUR_J (%ebp) movb %bl, ARCFOUR_J (%ebp)
.Lend: .Lend:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment