Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Dmitry Baryshkov
nettle
Commits
993ae2d6
Commit
993ae2d6
authored
Feb 19, 2013
by
Niels Möller
Browse files
Optimized ARM memxor.
parent
39c03743
Changes
2
Hide whitespace changes
Inline
Side-by-side
ChangeLog
View file @
993ae2d6
2013-02-19 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Software pipelining for the aligned case.
(memxor3): Use 3-way unrolling also for aligned memxor3.
Both loops benchmarked at 7 cycles (0.58 cycles per byte), but
memxor3 seems to have a strange dependency on instruction
alignment.
2013-02-12 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
...
...
armv7/memxor.asm
View file @
993ae2d6
...
...
@@ -40,12 +40,13 @@ define(<TNC>, <r7>)
.arm
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
.align
2
.align
4
PROLOGUE
(
memxor
)
cmp
N
,
#
0
beq
.Lmemxor_ret
push
{
r4
,
r5
,
r6
,
r7
}
C FIXME: Delay push until we know how many registers we need.
push
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
C
lr
is
the
link
register
cmp
N
,
#
7
bcs
.Lmemxor_large
...
...
@@ -60,7 +61,7 @@ PROLOGUE(memxor)
bne
.Lmemxor_bytes
.Lmemxor_done:
pop
{
r4
,
r5
,
r6
,
r7
}
pop
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
.Lmemxor_ret:
bx
lr
...
...
@@ -140,16 +141,42 @@ PROLOGUE(memxor)
subs
N
,
#
8
bcc
.Lmemxor_same_end
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
C
Keep
address
for
loads
in
r14
mov
r14
,
DS
T
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
subs
N
,
#
12
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
bcc
.Lmemxor_same_final_store
subs
N
,
#
12
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcc
.Lmemxor_same_wind_down
C
7
cycles
per
iteration
,
0.58
cycles
/
byte
C
Loopmixer
could
perhaps
get
it
down
to
6
cycles.
.Lmemxor_same_loop:
C
8
cycles
per
iteration
,
0.67
cycles
/
byte
C r10-r12 contain values to be stored at DST
C r6-r8 contain values read from r14, in advance
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
ldmia
DS
T
,
{
r6
,
r7
,
r12
}
subs
N
,
#
12
eor
r3
,
r6
eor
r4
,
r7
eor
r5
,
r12
stmia
DS
T
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcs
.Lmemxor_same_loop
.Lmemxor_same_wind_down:
C
Wind
down
code
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
.Lmemxor_same_final_store:
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
...
...
@@ -312,40 +339,51 @@ PROLOGUE(memxor3)
bne
.Lmemxor3_au
;
C a, b and dst all have the same alignment.
sub
AP
,
#
4
sub
BP
,
#
4
sub
DS
T
,
#
4
tst
N
,
#
4
it
ne
subne
N
,
#
4
bne
.Lmemxor3_aligned_word_loop
ldr
r4
,
[
AP
],
#
-
4
ldr
r5
,
[
BP
],
#
-
4
eor
r4
,
r5
str
r4
,
[
DS
T
],
#
-
4
subs
N
,
#
8
bcc
.Lmemxor3_aligned_word_end
C This loop runs at 7 cycles per iteration, but it seems to
C have a strange alignment requirement. For this speed, the
C loop started at offset 0x2ac in the object file, and all
C other offsets made it slower.
.Lmemxor3_aligned_word_loop:
ldr
r4
,
[
AP
,
#
-
4
]
ldr
r5
,
[
AP
],
#
-
8
ldr
r6
,
[
BP
,
#
-
4
]
ldr
r7
,
[
BP
],
#
-
8
ldmdb
AP
!
,
{
r4
,
r5
,
r6
}
ldmdb
BP
!
,
{
r7
,
r8
,
r10
}
subs
N
,
#
12
eor
r4
,
r7
eor
r5
,
r8
eor
r6
,
r10
stmdb
DS
T
!
,
{
r4
,
r5
,
r6
}
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_8
C Do 8 bytes more, leftover is in N
ldmdb
AP
!
,
{
r4
,
r5
}
ldmdb
BP
!
,
{
r6
,
r7
}
eor
r4
,
r6
eor
r5
,
r7
s
ubs
N
,
#
8
str
r4
,
[
DS
T
,
#
-
4
]
str
r5
,
[
DS
T
],
#
-
8
s
tmdb
DS
T
!
,
{
r4
,
r5
}
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
adds
N
,
#
8
.Lmemxor3_aligned_lt_8:
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_4
ldr
r4
,
[
AP
,
#
-
4
]
!
ldr
r5
,
[
BP
,
#
-
4
]
!
eor
r4
,
r5
str
r4
,
[
DS
T
,
#
-
4
]
!
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds
N
,
#
4
beq
.Lmemxor3_done
add
AP
,
#
4
add
BP
,
#
4
add
DS
T
,
#
4
b
.Lmemxor3_bytes
.Lmemxor3_uu:
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment