Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Nettle
nettle
Commits
993ae2d6
Commit
993ae2d6
authored
Feb 19, 2013
by
Niels Möller
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Optimized ARM memxor.
parent
39c03743
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
80 additions
and
34 deletions
+80
-34
ChangeLog
ChangeLog
+8
-0
armv7/memxor.asm
armv7/memxor.asm
+72
-34
No files found.
ChangeLog
View file @
993ae2d6
2013-02-19 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Software pipelining for the aligned case.
(memxor3): Use 3-way unrolling also for aligned memxor3.
Both loops benchmarked at 7 cycles (0.58 cycles per byte), but
memxor3 seems to have a strange dependency on instruction
alignment.
2013-02-12 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
...
...
armv7/memxor.asm
View file @
993ae2d6
...
...
@@ -40,12 +40,13 @@ define(<TNC>, <r7>)
.arm
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
.align
2
.align
4
PROLOGUE
(
memxor
)
cmp
N
,
#
0
beq
.Lmemxor_ret
push
{
r4
,
r5
,
r6
,
r7
}
C
FIXME
:
Delay
push
until
we
know
how
many
registers
we
need.
push
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
C
lr
is
the
link
register
cmp
N
,
#
7
bcs
.Lmemxor_large
...
...
@@ -60,7 +61,7 @@ PROLOGUE(memxor)
bne
.Lmemxor_bytes
.Lmemxor_done:
pop
{
r4
,
r5
,
r6
,
r7
}
pop
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
.Lmemxor_ret:
bx
lr
...
...
@@ -140,16 +141,42 @@ PROLOGUE(memxor)
subs
N
,
#
8
bcc
.Lmemxor_same_end
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
C
Keep
address
for
loads
in
r14
mov
r14
,
DS
T
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
subs
N
,
#
12
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
bcc
.Lmemxor_same_final_store
subs
N
,
#
12
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcc
.Lmemxor_same_wind_down
C
7
cycles
per
iteration
,
0.58
cycles
/
byte
C
Loopmixer
could
perhaps
get
it
down
to
6
cycles.
.Lmemxor_same_loop:
C
8
cycles
per
iteration
,
0.67
cycles
/
byte
C r10-r12 contain the values to be stored at DST
C r6-r8 contain the values read from r14, in advance
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
ldmia
DS
T
,
{
r6
,
r7
,
r12
}
subs
N
,
#
12
eor
r3
,
r6
eor
r4
,
r7
eor
r5
,
r12
stmia
DS
T
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcs
.Lmemxor_same_loop
.Lmemxor_same_wind_down:
C
Wind
down
code
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
.Lmemxor_same_final_store:
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
...
...
@@ -312,40 +339,51 @@ PROLOGUE(memxor3)
bne
.Lmemxor3_au
;
C a, b and dst all have the same alignment.
sub
AP
,
#
4
sub
BP
,
#
4
sub
DS
T
,
#
4
tst
N
,
#
4
it
ne
subne
N
,
#
4
bne
.Lmemxor3_aligned_word_loop
ldr
r4
,
[
AP
],
#
-
4
ldr
r5
,
[
BP
],
#
-
4
eor
r4
,
r5
str
r4
,
[
DS
T
],
#
-
4
subs
N
,
#
8
bcc
.Lmemxor3_aligned_word_end
C This loop runs at 7 cycles per iteration, but it seems to
C have a strange alignment requirement. For this speed, the
C loop started at offset 0x2ac in the object file, and all
C other offsets made it slower.
.Lmemxor3_aligned_word_loop:
ldr
r4
,
[
AP
,
#
-
4
]
ldr
r5
,
[
AP
],
#
-
8
ldr
r6
,
[
BP
,
#
-
4
]
ldr
r7
,
[
BP
],
#
-
8
ldmdb
AP
!
,
{
r4
,
r5
,
r6
}
ldmdb
BP
!
,
{
r7
,
r8
,
r10
}
subs
N
,
#
12
eor
r4
,
r7
eor
r5
,
r8
eor
r6
,
r10
stmdb
DS
T
!
,
{
r4
,
r5
,
r6
}
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_8
C Do 8 bytes more, leftover is in N
ldmdb
AP
!
,
{
r4
,
r5
}
ldmdb
BP
!
,
{
r6
,
r7
}
eor
r4
,
r6
eor
r5
,
r7
s
ubs
N
,
#
8
str
r4
,
[
DS
T
,
#
-
4
]
str
r5
,
[
DS
T
],
#
-
8
s
tmdb
DS
T
!
,
{
r4
,
r5
}
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
adds
N
,
#
8
.Lmemxor3_aligned_lt_8:
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_4
ldr
r4
,
[
AP
,
#
-
4
]
!
ldr
r5
,
[
BP
,
#
-
4
]
!
eor
r4
,
r5
str
r4
,
[
DS
T
,
#
-
4
]
!
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds
N
,
#
4
beq
.Lmemxor3_done
add
AP
,
#
4
add
BP
,
#
4
add
DS
T
,
#
4
b
.Lmemxor3_bytes
.Lmemxor3_uu:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment