Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Open sidebar
Nettle
nettle
Commits
11609bf3
Commit
11609bf3
authored
Feb 19, 2013
by
Niels Möller
Browse files
Options
Browse Files
Download
Plain Diff
Merged some ARM memxor changes.
parents
2d9a849e
16d2a186
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
103 additions
and
48 deletions
+103
-48
ChangeLog
ChangeLog
+8
-0
armv7/memxor.asm
armv7/memxor.asm
+95
-48
No files found.
ChangeLog
View file @
11609bf3
2013-02-19 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor): Software pipelining for the aligned
case. Runs at 6 cycles (0.5 cycles per byte). Delayed push of
registers until we know how many registers we need.
(memxor3): Use 3-way unrolling also for aligned memxor3.
Runs at 8 cycles (0.67 cycles per byte)
2013-02-14 Niels Möller <nisse@lysator.liu.se>
2013-02-14 Niels Möller <nisse@lysator.liu.se>
* examples/rsa-keygen.c (uint_arg): New function.
* examples/rsa-keygen.c (uint_arg): New function.
...
...
armv7/memxor.asm
View file @
11609bf3
...
@@ -30,7 +30,7 @@ define(<DST>, <r0>)
...
@@ -30,7 +30,7 @@ define(<DST>, <r0>)
define
(
<
SRC
>
,
<
r1
>
)
define
(
<
SRC
>
,
<
r1
>
)
define
(
<
N
>
,
<
r2
>
)
define
(
<
N
>
,
<
r2
>
)
define
(
<
CNT
>
,
<
r6
>
)
define
(
<
CNT
>
,
<
r6
>
)
define
(
<
TNC
>
,
<
r
7
>
)
define
(
<
TNC
>
,
<
r
12
>
)
.syntax
unified
.syntax
unified
...
@@ -40,12 +40,10 @@ define(<TNC>, <r7>)
...
@@ -40,12 +40,10 @@ define(<TNC>, <r7>)
.arm
.arm
C
memxor
(
uint8_t
*
ds
t
,
const
uint8_t
*
src
,
si
ze_t
n
)
C
memxor
(
uint8_t
*
ds
t
,
const
uint8_t
*
src
,
si
ze_t
n
)
.align
2
.align
4
PROLOGUE
(
memxor
)
PROLOGUE
(
memxor
)
cmp
N
,
#
0
cmp
N
,
#
0
beq
.Lmemxor_ret
beq
.Lmemxor_done
push
{
r4
,
r5
,
r6
,
r7
}
cmp
N
,
#
7
cmp
N
,
#
7
bcs
.Lmemxor_large
bcs
.Lmemxor_large
...
@@ -53,21 +51,19 @@ PROLOGUE(memxor)
...
@@ -53,21 +51,19 @@ PROLOGUE(memxor)
C
Si
mple
byte
loop
C
Si
mple
byte
loop
.Lmemxor_bytes:
.Lmemxor_bytes:
ldrb
r3
,
[
SRC
],
#
+
1
ldrb
r3
,
[
SRC
],
#
+
1
ldrb
r
4
,
[
DS
T
]
ldrb
r
12
,
[
DS
T
]
eor
r3
,
r
4
eor
r3
,
r
12
strb
r3
,
[
DS
T
],
#
+
1
strb
r3
,
[
DS
T
],
#
+
1
subs
N
,
#
1
subs
N
,
#
1
bne
.Lmemxor_bytes
bne
.Lmemxor_bytes
.Lmemxor_done:
.Lmemxor_done:
pop
{
r4
,
r5
,
r6
,
r7
}
.Lmemxor_ret:
bx
lr
bx
lr
.Lmemxor_align_loop:
.Lmemxor_align_loop:
ldrb
r3
,
[
SRC
],
#
+
1
ldrb
r3
,
[
SRC
],
#
+
1
ldrb
r
4
,
[
DS
T
]
ldrb
r
12
,
[
DS
T
]
eor
r3
,
r
4
eor
r3
,
r
12
strb
r3
,
[
DS
T
],
#
+
1
strb
r3
,
[
DS
T
],
#
+
1
sub
N
,
#
1
sub
N
,
#
1
...
@@ -78,7 +74,7 @@ PROLOGUE(memxor)
...
@@ -78,7 +74,7 @@ PROLOGUE(memxor)
C
We
have
at
least
4
byte
s
left
to
do
here.
C
We
have
at
least
4
byte
s
left
to
do
here.
sub
N
,
#
4
sub
N
,
#
4
ands
CNT
,
SRC
,
#
3
ands
r3
,
SRC
,
#
3
beq
.Lmemxor_same
beq
.Lmemxor_same
C
Di
fferent
al
ignment
case.
C
Di
fferent
al
ignment
case.
...
@@ -92,7 +88,9 @@ PROLOGUE(memxor)
...
@@ -92,7 +88,9 @@ PROLOGUE(memxor)
C
With
little
-
endian
,
we
need
to
do
C
With
little
-
endian
,
we
need
to
do
C
DS
T
[
i
]
^
=
(
SRC
[
i
]
>>
CNT
)
^
(
SRC
[
i
+
1
]
<<
TNC
)
C
DS
T
[
i
]
^
=
(
SRC
[
i
]
>>
CNT
)
^
(
SRC
[
i
+
1
]
<<
TNC
)
lsl
CNT
,
#
3
push
{
r4
,
r5
,
r6
}
lsl
CNT
,
r3
,
#
3
bic
SRC
,
#
3
bic
SRC
,
#
3
rsb
TNC
,
CNT
,
#
32
rsb
TNC
,
CNT
,
#
32
...
@@ -119,12 +117,15 @@ PROLOGUE(memxor)
...
@@ -119,12 +117,15 @@ PROLOGUE(memxor)
subs
N
,
#
8
subs
N
,
#
8
bcs
.Lmemxor_word_loop
bcs
.Lmemxor_word_loop
adds
N
,
#
8
adds
N
,
#
8
beq
.Lmemxor_done
beq
.Lmemxor_
odd_
done
C
We
have
TNC
/
8
left
-
over
byte
s
in
r4
,
high
end
C
We
have
TNC
/
8
left
-
over
byte
s
in
r4
,
high
end
lsr
r4
,
CNT
lsr
r4
,
CNT
ldr
r3
,
[
DS
T
]
ldr
r3
,
[
DS
T
]
eor
r3
,
r4
eor
r3
,
r4
pop
{
r4
,
r5
,
r6
}
C
Store
byte
s
,
one
by
one.
C
Store
byte
s
,
one
by
one.
.Lmemxor_leftover:
.Lmemxor_leftover:
strb
r3
,
[
DS
T
],
#
+
1
strb
r3
,
[
DS
T
],
#
+
1
...
@@ -133,23 +134,54 @@ PROLOGUE(memxor)
...
@@ -133,23 +134,54 @@ PROLOGUE(memxor)
subs
TNC
,
#
8
subs
TNC
,
#
8
lsr
r3
,
#
8
lsr
r3
,
#
8
bne
.Lmemxor_leftover
bne
.Lmemxor_leftover
b
.Lmemxor_bytes
b
.Lmemxor_bytes
.Lmemxor_odd_done:
pop
{
r4
,
r5
,
r6
}
bx
lr
.Lmemxor_same:
.Lmemxor_same:
push
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
C
lr
is
the
link
register
subs
N
,
#
8
subs
N
,
#
8
bcc
.Lmemxor_same_end
bcc
.Lmemxor_same_end
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
C
Keep
address
for
loads
in
r14
mov
r14
,
DS
T
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
subs
N
,
#
12
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
bcc
.Lmemxor_same_final_store
subs
N
,
#
12
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcc
.Lmemxor_same_wind_down
C
6
cycles
per
iteration
,
0.50
cycles
/
byte
.
For
this
sp
eed
,
C
loop
starts
at
offset
0x11c
in
the
object
file.
.Lmemxor_same_loop:
.Lmemxor_same_loop:
C
8
cycles
per
iteration
,
0.67
cycles
/
byte
C
r10
-
r12
contains
values
to
be
stored
at
DS
T
C
r6
-
r8
contains
values
read
from
r14
,
in
advance
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
ldmia
DS
T
,
{
r6
,
r7
,
r12
}
subs
N
,
#
12
subs
N
,
#
12
eor
r3
,
r6
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r4
,
r7
eor
r10
,
r3
,
r6
eor
r5
,
r12
eor
r11
,
r4
,
r7
stmia
DS
T
!
,
{
r3
,
r4
,
r5
}
eor
r12
,
r5
,
r8
ldmia
r14
!
,
{
r6
,
r7
,
r8
}
bcs
.Lmemxor_same_loop
bcs
.Lmemxor_same_loop
.Lmemxor_same_wind_down:
C
Wind
down
code
ldmia
SRC
!
,
{
r3
,
r4
,
r5
}
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
eor
r10
,
r3
,
r6
eor
r11
,
r4
,
r7
eor
r12
,
r5
,
r8
.Lmemxor_same_final_store:
stmia
DS
T
!
,
{
r10
,
r11
,
r12
}
.Lmemxor_same_end:
.Lmemxor_same_end:
C
We
have
0
-
11
byte
s
left
to
do
,
and
N
holds
number
of
byte
s
-
12
.
C
We
have
0
-
11
byte
s
left
to
do
,
and
N
holds
number
of
byte
s
-
12
.
...
@@ -161,16 +193,18 @@ PROLOGUE(memxor)
...
@@ -161,16 +193,18 @@ PROLOGUE(memxor)
eor
r3
,
r6
eor
r3
,
r6
eor
r4
,
r7
eor
r4
,
r7
stmia
DS
T
!
,
{
r3
,
r4
}
stmia
DS
T
!
,
{
r3
,
r4
}
pop
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
beq
.Lmemxor_done
beq
.Lmemxor_done
b
.Lmemxor_bytes
b
.Lmemxor_bytes
.Lmemxor_same_lt_8:
.Lmemxor_same_lt_8:
pop
{
r4
,
r5
,
r6
,
r7
,
r8
,
r10
,
r11
,
r14
}
adds
N
,
#
4
adds
N
,
#
4
bcc
.Lmemxor_same_lt_4
bcc
.Lmemxor_same_lt_4
ldr
r3
,
[
SRC
],
#
+
4
ldr
r3
,
[
SRC
],
#
+
4
ldr
r
4
,
[
DS
T
]
ldr
r
12
,
[
DS
T
]
eor
r3
,
r
4
eor
r3
,
r
12
str
r3
,
[
DS
T
],
#
+
4
str
r3
,
[
DS
T
],
#
+
4
beq
.Lmemxor_done
beq
.Lmemxor_done
b
.Lmemxor_bytes
b
.Lmemxor_bytes
...
@@ -312,40 +346,53 @@ PROLOGUE(memxor3)
...
@@ -312,40 +346,53 @@ PROLOGUE(memxor3)
bne
.Lmemxor3_au
;
bne
.Lmemxor3_au
;
C
a
,
b
and
ds
t
al
l
have
the
same
al
ignment.
C
a
,
b
and
ds
t
al
l
have
the
same
al
ignment.
sub
AP
,
#
4
sub
BP
,
#
4
sub
DS
T
,
#
4
tst
N
,
#
4
it
ne
subne
N
,
#
4
bne
.Lmemxor3_aligned_word_loop
ldr
r4
,
[
AP
],
#
-
4
ldr
r5
,
[
BP
],
#
-
4
eor
r4
,
r5
str
r4
,
[
DS
T
],
#
-
4
subs
N
,
#
8
subs
N
,
#
8
bcc
.Lmemxor3_aligned_word_end
bcc
.Lmemxor3_aligned_word_end
C
This
loop
runs
at
8
cycles
per
iteration.
It
has
been
C
observed
running
at
only
7
cycles
,
for
this
sp
eed
,
the
loop
C
started
at
offset
0x2ac
in
the
object
file.
C
FIXME
:
consider
software
pipelining
,
si
milarly
to
the
memxor
C
loop.
.Lmemxor3_aligned_word_loop:
.Lmemxor3_aligned_word_loop:
ldr
r4
,
[
AP
,
#
-
4
]
ldmdb
AP
!
,
{
r4
,
r5
,
r6
}
ldr
r5
,
[
AP
],
#
-
8
ldmdb
BP
!
,
{
r7
,
r8
,
r10
}
ldr
r6
,
[
BP
,
#
-
4
]
subs
N
,
#
12
ldr
r7
,
[
BP
],
#
-
8
eor
r4
,
r7
eor
r5
,
r8
eor
r6
,
r10
stmdb
DS
T
!
,
{
r4
,
r5
,
r6
}
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C
We
have
0
-
11
byte
s
left
to
do
,
and
N
holds
number
of
byte
s
-
12
.
adds
N
,
#
4
bcc
.Lmemxor3_aligned_lt_8
C
Do
8
byte
s
more
,
leftover
is
in
N
ldmdb
AP
!
,
{
r4
,
r5
}
ldmdb
BP
!
,
{
r6
,
r7
}
eor
r4
,
r6
eor
r4
,
r6
eor
r5
,
r7
eor
r5
,
r7
s
ubs
N
,
#
8
s
tmdb
DS
T
!
,
{
r4
,
r5
}
str
r4
,
[
DS
T
,
#
-
4
]
beq
.Lmemxor3_done
str
r5
,
[
DS
T
],
#
-
8
b
.Lmemxor3_bytes
bcs
.Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_lt_8:
.Lmemxor3_aligned_word_end:
adds
N
,
#
4
adds
N
,
#
8
bcc
.Lmemxor3_aligned_lt_4
ldr
r4
,
[
AP
,
#
-
4
]
!
ldr
r5
,
[
BP
,
#
-
4
]
!
eor
r4
,
r5
str
r4
,
[
DS
T
,
#
-
4
]
!
beq
.Lmemxor3_done
b
.Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds
N
,
#
4
beq
.Lmemxor3_done
beq
.Lmemxor3_done
add
AP
,
#
4
add
BP
,
#
4
add
DS
T
,
#
4
b
.Lmemxor3_bytes
b
.Lmemxor3_bytes
.Lmemxor3_uu:
.Lmemxor3_uu:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment