Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Nettle
nettle
Commits
11609bf3
Commit
11609bf3
authored
Feb 19, 2013
by
Niels Möller
Browse files
Options
Browse Files
Download
Plain Diff
Merged some ARM memxor changes.
parents
2d9a849e
16d2a186
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
103 additions
and
48 deletions
+103
-48
ChangeLog
ChangeLog
+8
-0
armv7/memxor.asm
armv7/memxor.asm
+95
-48
No files found.
ChangeLog
View file @
11609bf3
2013-02-19 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Software pipelining for the aligned
case. Runs at 6 cycles (0.5 cycles per byte). Delayed push of
registers until we know how many registers we need.
(memxor3): Use 3-way unrolling also for aligned memxor3.
Runs at 8 cycles (0.67 cycles per byte)
2013-02-14 Niels Möller <nisse@lysator.liu.se>
* examples/rsa-keygen.c (uint_arg): New function.
...
...
armv7/memxor.asm
View file @
11609bf3
...
...
@@ -30,7 +30,7 @@ define(<DST>, <r0>)
define
(
<
SRC
>
,
<
r1
>
)
define
(
<
N
>
,
<
r2
>
)
define
(
<
CNT
>
,
<
r6
>
)
define
(
<
TNC
>
,
<
r
7
>
)
define
(
<
TNC
>
,
<
r
12
>
)
.syntax
unified
...
...
@@ -40,12 +40,10 @@ define(<TNC>, <r7>)
.arm
C
memxor
(
uint8_t
*
ds
t
,
const
uint8_t
*
src
,
si
ze_t
n
)
.align
2
.align
4
PROLOGUE
(
memxor
)
cmp
N
,
#
0
beq
.Lmemxor_ret
push
{
r4
,
r5
,
r6
,
r7
}
beq
.Lmemxor_done
cmp
N
,
#
7
bcs
.Lmemxor_large
...
...
@@ -53,21 +51,19 @@ PROLOGUE(memxor)
C
Si
mple
byte
loop
.Lmemxor_bytes:
ldrb
r3
,
[
SRC
],
#
+
1
ldrb
r
4
,
[
DS
T
]
eor
r3
,
r
4
ldrb
r
12
,
[
DS
T
]
eor
r3
,
r
12
strb
r3
,
[
DS
T
],
#
+
1
subs
N
,
#
1
bne
.Lmemxor_bytes
.Lmemxor_done:
pop
{
r4
,
r5
,
r6
,
r7
}
.Lmemxor_ret:
bx
lr
.Lmemxor_align_loop:
ldrb
r3
,
[
SRC
],
#
+
1
ldrb
r
4
,
[
DS
T
]
eor
r3
,
r
4
ldrb
r
12
,
[
DS
T
]
eor
r3
,
r
12
strb
r3
,
[
DS
T
],
#
+
1
sub
N
,
#
1
...
...
@@ -78,7 +74,7 @@ PROLOGUE(memxor)
C
We
have
at
least
4
byte
s
left
to
do
here.
sub
N
,
#
4
ands
CNT
,
SRC
,
#
3
ands
r3
,
SRC
,
#
3
beq
.Lmemxor_same
C
Di
fferent
al
ignment
case.
...
...
@@ -92,7 +88,9 @@ PROLOGUE(memxor)
C
With
little
-
endian
,
we
need
to
do
C
DS
T
[
i
]
^
=
(
SRC
[
i
]
>>
CNT
)
^
(
SRC
[
i
+
1
]
<<
TNC
)
lsl
CNT
,
#
3
push
{
r4
,
r5
,
r6
}
lsl
CNT
,
r3
,
#
3
bic
SRC
,
#
3
rsb
TNC
,
CNT
,
#
32
...
...
@@ -119,12 +117,15 @@ PROLOGUE(memxor)
subs
N
,
#
8
bcs
.Lmemxor_word_loop
adds
N
,
#
8
beq
.Lmemxor_done
beq
.Lmemxor_
odd_
done
C
We
have
TNC
/
8
left
-
over
byte
s
in
r4
,
high
end
lsr
r4
,
CNT
ldr
r3
,
[
DS
T
]
eor
r3
,
r4
pop
{
r4
,
r5
,
r6
}
C
Store
byte
s
,
one
by
one.
.Lmemxor_leftover:
strb
r3
,
[
DS
T
],
#
+
1
...
...
@@ -133,24 +134,55 @@ PROLOGUE(memxor)
subs
TNC
,
#
8
lsr
r3
,
#
8
bne
.Lmemxor_leftover
b
.Lmemxor_bytes
.Lmemxor_odd_done:
pop
{
r4
,
r5
,
r6
}
bx
lr
.Lmemxor_same:
push
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
C
lr
is
the
link
register
subs
N
,
#
8
bcc
.Lmemxor_same_end
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
C
Keep
address
for
loads
in
r14
mov
r14
,
DS
T
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
subs
N
,
#
12
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
bcc
.Lmemxor_same_final_store
subs
N
,
#
12
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcc
.Lmemxor_same_wind_down
C
6
cycles
per
iteration
,
0.50
cycles
/
byte
.
For
this
sp
eed
,
C
loop
starts
at
offset
0x11c
in
the
object
file.
.Lmemxor_same_loop:
C
8
cycles
per
iteration
,
0.67
cycles
/
byte
C
r10
-
r12
contains
values
to
be
stored
at
DS
T
C
r6
-
r8
contains
values
read
from
r14
,
in
advance
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
ldmia
DS
T
,
{
r6
,
r7
,
r12
}
subs
N
,
#
12
eor
r3
,
r6
eor
r4
,
r7
eor
r5
,
r12
stmia
DS
T
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcs
.Lmemxor_same_loop
.Lmemxor_same_wind_down:
C
Wind
down
code
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
.Lmemxor_same_final_store:
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
.Lmemxor_same_end:
C
We
have
0
-
11
byte
s
left
to
do
,
and
N
holds
number
of
byte
s
-
12
.
adds
N
,
#
4
...
...
@@ -161,16 +193,18 @@ PROLOGUE(memxor)
eor
r3
,
r6
eor
r4
,
r7
stmia
DS
T
!
,
{
r3
,
r4
}
pop
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
beq
.Lmemxor_done
b
.Lmemxor_bytes
.Lmemxor_same_lt_8:
pop
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
adds
N
,
#
4
bcc
.Lmemxor_same_lt_4
ldr
r3
,
[
SRC
],
#
+
4
ldr
r
4
,
[
DS
T
]
eor
r3
,
r
4
ldr
r
12
,
[
DS
T
]
eor
r3
,
r
12
str
r3
,
[
DS
T
],
#
+
4
beq
.Lmemxor_done
b
.Lmemxor_bytes
...
...
@@ -312,40 +346,53 @@ PROLOGUE(memxor3)
bne
.Lmemxor3_au
;
C
a
,
b
and
ds
t
al
l
have
the
same
al
ignment.
sub
AP
,
#
4
sub
BP
,
#
4
sub
DS
T
,
#
4
tst
N
,
#
4
it
ne
subne
N
,
#
4
bne
.Lmemxor3_aligned_word_loop
ldr
r4
,
[
AP
],
#
-
4
ldr
r5
,
[
BP
],
#
-
4
eor
r4
,
r5
str
r4
,
[
DS
T
],
#
-
4
subs
N
,
#
8
bcc
.Lmemxor3_aligned_word_end
C
This
loop
runs
at
8
cycles
per
iteration.
It
has
been
C
observed
running
at
only
7
cycles
,
for
this
sp
eed
,
the
loop
C
started
at
offset
0x2ac
in
the
object
file.
C
FIXME
:
consider
software
pipelining
,
si
milarly
to
the
memxor
C
loop.
.Lmemxor3_aligned_word_loop:
ldr
r4
,
[
AP
,
#
-
4
]
ldr
r5
,
[
AP
],
#
-
8
ldr
r6
,
[
BP
,
#
-
4
]
ldr
r7
,
[
BP
],
#
-
8
ldmdb
AP
!
,
{
r4
,
r5
,
r6
}
ldmdb
BP
!
,
{
r7
,
r8
,
r10
}
subs
N
,
#
12
eor
r4
,
r7
eor
r5
,
r8
eor
r6
,
r10
stmdb
DS
T
!
,
{
r4
,
r5
,
r6
}
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C
We
have
0
-
11
byte
s
left
to
do
,
and
N
holds
number
of
byte
s
-
12
.
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_8
C
Do
8
byte
s
more
,
leftover
is
in
N
ldmdb
AP
!
,
{
r4
,
r5
}
ldmdb
BP
!
,
{
r6
,
r7
}
eor
r4
,
r6
eor
r5
,
r7
s
ubs
N
,
#
8
str
r4
,
[
DS
T
,
#
-
4
]
str
r5
,
[
DS
T
],
#
-
8
s
tmdb
DS
T
!
,
{
r4
,
r5
}
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
adds
N
,
#
8
.Lmemxor3_aligned_lt_8:
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_4
ldr
r4
,
[
AP
,
#
-
4
]
!
ldr
r5
,
[
BP
,
#
-
4
]
!
eor
r4
,
r5
str
r4
,
[
DS
T
,
#
-
4
]
!
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds
N
,
#
4
beq
.Lmemxor3_done
add
AP
,
#
4
add
BP
,
#
4
add
DS
T
,
#
4
b
.Lmemxor3_bytes
.Lmemxor3_uu:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment