Commit 993ae2d6 authored by Niels Möller's avatar Niels Möller

Optimized ARM memxor.

parent 39c03743
2013-02-19 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Software pipelining for the aligned case.
(memxor3): Use 3-way unrolling also for aligned memxor3.
Both loops benchmarked at 7 cycles (0.58 cycles per byte), but
memxor3 seems to have a strange dependency on instruction
alignment.
2013-02-12 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
......
......@@ -40,12 +40,13 @@ define(<TNC>, <r7>)
.arm
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
.align 2
.align 4
PROLOGUE(memxor)
cmp N, #0
beq .Lmemxor_ret
push {r4, r5, r6, r7}
C FIXME: Delay push until we know how many registers we need.
push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
cmp N, #7
bcs .Lmemxor_large
......@@ -60,7 +61,7 @@ PROLOGUE(memxor)
bne .Lmemxor_bytes
.Lmemxor_done:
pop {r4,r5,r6,r7}
pop {r4,r5,r6,r7,r8,r10,r11,r14}
.Lmemxor_ret:
bx lr
......@@ -140,16 +141,42 @@ PROLOGUE(memxor)
subs N, #8
bcc .Lmemxor_same_end
ldmia SRC!, {r3, r4, r5}
C Keep address for loads in r14
mov r14, DST
ldmia r14!, {r6, r7, r8}
subs N, #12
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
bcc .Lmemxor_same_final_store
subs N, #12
ldmia r14!, {r6, r7, r8}
bcc .Lmemxor_same_wind_down
C 7 cycles per iteration, 0.58 cycles/byte
C Loopmixer could perhaps get it down to 6 cycles.
.Lmemxor_same_loop:
C 8 cycles per iteration, 0.67 cycles/byte
C r10-r12 contains values to be stored at DST
C r6-r8 contains values read from r14, in advance
ldmia SRC!, {r3, r4, r5}
ldmia DST, {r6, r7, r12}
subs N, #12
eor r3, r6
eor r4, r7
eor r5, r12
stmia DST!, {r3, r4, r5}
stmia DST!, {r10, r11, r12}
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
ldmia r14!, {r6, r7, r8}
bcs .Lmemxor_same_loop
.Lmemxor_same_wind_down:
C Wind down code
ldmia SRC!, {r3, r4, r5}
stmia DST!, {r10, r11, r12}
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
.Lmemxor_same_final_store:
stmia DST!, {r10, r11, r12}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
......@@ -312,40 +339,51 @@ PROLOGUE(memxor3)
bne .Lmemxor3_au ;
C a, b and dst all have the same alignment.
sub AP, #4
sub BP, #4
sub DST, #4
tst N, #4
it ne
subne N, #4
bne .Lmemxor3_aligned_word_loop
ldr r4, [AP], #-4
ldr r5, [BP], #-4
eor r4, r5
str r4, [DST], #-4
subs N, #8
bcc .Lmemxor3_aligned_word_end
C This loop runs at 7 cycles per iteration, but it seems to
C have a strange alignment requirement. For this speed, the
C loop started at offset 0x2ac in the object file, and all
C other offsets made it slower.
.Lmemxor3_aligned_word_loop:
ldr r4, [AP, #-4]
ldr r5, [AP], #-8
ldr r6, [BP, #-4]
ldr r7, [BP], #-8
ldmdb AP!, {r4,r5,r6}
ldmdb BP!, {r7,r8,r10}
subs N, #12
eor r4, r7
eor r5, r8
eor r6, r10
stmdb DST!, {r4, r5,r6}
bcs .Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds N, #4
bcc .Lmemxor3_aligned_lt_8
C Do 8 bytes more, leftover is in N
ldmdb AP!, {r4, r5}
ldmdb BP!, {r6, r7}
eor r4, r6
eor r5, r7
subs N, #8
str r4, [DST, #-4]
str r5, [DST], #-8
stmdb DST!, {r4,r5}
beq .Lmemxor3_done
b .Lmemxor3_bytes
bcs .Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
adds N, #8
.Lmemxor3_aligned_lt_8:
adds N, #4
bcc .Lmemxor3_aligned_lt_4
ldr r4, [AP,#-4]!
ldr r5, [BP,#-4]!
eor r4, r5
str r4, [DST,#-4]!
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds N, #4
beq .Lmemxor3_done
add AP, #4
add BP, #4
add DST, #4
b .Lmemxor3_bytes
.Lmemxor3_uu:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment