Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Nettle
nettle
Commits
38d507fa
Commit
38d507fa
authored
Apr 15, 2013
by
Niels Möller
Browse files
ARM assembly for umac_nh_n.
parent
83ce4b71
Changes
2
Hide whitespace changes
Inline
Side-by-side
ChangeLog
View file @
38d507fa
2013-04-15 Niels Möller <nisse@lysator.liu.se>
* armv7/umac-nh-n.asm: New file. 2.0-2.3 time speedup.
* testsuite/umac-test.c (test_align): Fixed memory leak.
2013-04-12 Niels Möller <nisse@lysator.liu.se>
...
...
armv7/umac-nh-n.asm
0 → 100644
View file @
38d507fa
C
nettle
,
low
-
level
cryptographics
library
C
C
Copyright
(
C
)
2013
Niels
M
ö
ller
C
C
The
nettle
library
is
free
software
; you can redistribute it and/or modify
C
it
under
the
terms
of
the
GNU
Lesser
General
Public
License
as
published
by
C
the
Free
Software
Foundation
; either version 2.1 of the License, or (at your
C
option
)
any
later
version.
C
C
The
nettle
library
is
di
stributed
in
the
hope
that
it
will
be
useful
,
but
C
WITHOUT
ANY
WARRANTY
; without even the implied warranty of MERCHANTABILITY
C
or
FITNESS
FOR
A
PARTICULAR
PURPOSE.
See
the
GNU
Lesser
General
Public
C
License
for
more
details.
C
C
You
should
have
received
a
copy
of
the
GNU
Lesser
General
Public
License
C
al
ong
with
the
nettle
library
; see the file COPYING.LIB. If not, write to
C
the
Free
Software
Foundation
,
Inc.
,
51
Franklin
Street
,
Fifth
Floor
,
Boston
,
C
MA
02111
-
1301
,
USA.
.file
"
umac
-
nh.asm
"
.fpu
neon
define
(
<
OUT
>
,
<
r0
>
)
define
(
<
ITERS
>
,
<
r1
>
)
define
(
<
KEY
>
,
<
r2
>
)
define
(
<
LENGTH
>
,
<
r3
>
)
define
(
<
MSG
>
,
<
r12
>
)
define
(
<
SHIFT
>
,
<
r14
>
)
define
(
<
QA
>
,
<
q0
>
)
define
(
<
QB
>
,
<
q1
>
)
define
(
<
QY0
>
,
<
q3
>
)
C
Accumulates
for
the
first
two
operations.
define
(
<
DM
>
,
<
d4
>
)
define
(
<
QY1
>
,
<
q4
>
)
C
Used
for
3
and
4
iterations.
define
(
<
QC
>
,
<
q5
>
)
define
(
<
QD
>
,
<
q6
>
)
define
(
<
QLEFT
>
,
<
q8
>
)
define
(
<
QRIGHT
>
,
<
q9
>
)
define
(
<
QT0
>
,
<
q10
>
)
define
(
<
QT1
>
,
<
q11
>
)
define
(
<
QT2
>
,
<
q12
>
)
define
(
<
QK0
>
,
<
q13
>
)
define
(
<
QK1
>
,
<
q14
>
)
define
(
<
QK2
>
,
<
q15
>
)
C
FIXME
:
Try
permuting
subkeys
using
vld4
,
vzip
or
si
milar.
.text
.align
3
PROLOGUE
(
_nettle_umac_nh_n
)
ldr
MSG
,
[
sp
]
str
lr
,
[
sp
,
#
-
4
]
!
C
Setup
for
64
-
bit
al
igned
reads
ands
SHIFT
,
MSG
,
#
7
and
MSG
,
MSG
,
#
-
8
vld1.8
{
DM
}
,
[
MSG
:
64
]
addne
MSG
,
MSG
,
#
8
addeq
SHIFT
,
SHIFT
,
#
8
C
FIXME
:
Combine
as
rsb
?
lsl
SHIFT
,
SHIFT
,
#
3
neg
SHIFT
,
SHIFT
C
Right
shift
in
QRIGHT
(
both
halves
)
vmov.i32
D0REG
(
QRIGHT
)[
0
],
SHIFT
vmov.32
D1REG
(
QRIGHT
),
D0REG
(
QRIGHT
)
add
SHIFT
,
SHIFT
,
#
64
vmov.i32
D0REG
(
QLEFT
)[
0
],
SHIFT
vmov.32
D1REG
(
QLEFT
),
D0REG
(
QLEFT
)
cmp
r1
,
#
3
vmov.i64
QY0
,
#
0
vshl.u64
DM
,
DM
,
D0REG
(
QRIGHT
)
bcc
.Lnh2
beq
.Lnh3
.Lnh4:
C
Permute
key
word
s
,
so
we
in
each
iteration
have
them
in
order
C
C
P0
:
[
0
,
4
,
1
,
5
]
P1
:
[
2
,
6
,
3
,
7
]
P2
:
[
4
,
8
,
5
,
9
]
P3
:
[
6
,
10
,
7
,
11
]
C
P4
:
[
8
,
12
,
9
,
13
]
P5
:
[
10
,
14
,
11
,
15
]
P6
:
[
12
,
16
,
13
,
17
]
P7
:
[
14
,
18
,
15
,
19
]
C
C
Al
so
arrange
the
message
word
s
,
so
we
get
them
as
C
M0
:
[
0
,
0
,
1
,
1
]
M1
:
[
2
,
2
,
3
,
3
]
M2
:
[
4
,
4
,
5
,
5
]
M3
:
[
6
,
6
,
7
,
7
]
C
M4
:
[
8
,
8
,
9
,
9
]
M5
:
[
10
,
10
,
11
,
11
]
M6
:
[
12
,
12
,
13
,
13
]
M7
:
[
14
,
14
,
15
,
15
]
C
C
Then
,
accumulate
Y0
(
first
two
"iters"
)
using
C
C
Y0
+
=
(
M0
+
P0
)
*
(
M2
+
P2
)
+
(
M1
+
P1
)
*
(
M3
+
P3
)
C
Y1
+
=
(
M0
+
P4
)
*
(
M2
+
P6
)
+
(
M1
+
P5
)
*
(
M3
+
P7
)
C
C
Next
iteration
is
then
C
C
Y0
+
=
(
M4
+
P4
)
*
(
M6
+
P6
)
+
(
M5
+
P5
)
*
(
M7
+
P7
)
C
Y1
+
=
(
M4
+
P6
)
*
(
M6
+
P8
)
+
(
M5
+
P7
)
*
(
M7
+
P11
)
C
C
So
we
can
reuse
P4
,
P5
,
P6
,
P7
from
the
previous
iteration.
C
How
to
for
in
registers?
We
need
4
Q
regs
for
P0
-
P3
,
and
one
C
more
for
the
last
read
key.
We
need
at
least
two
regiters
C
for
the
message
(
QA
and
QB
,
more
if
we
want
to
expand
only
C
once
)
.
For
the
Y0
update
,
we
can
let
the
factors
overwrite
C
P0
-
P3
,
and
for
the
Y1
update
,
we
can
overwrite
M0
-
M3.
vpush
{
q4
,
q5
,
q6
}
vld1.32
{
QK0
,
QK1
}
,
[
KEY
]
!
vld1.32
{
QK2
}
,
[
KEY
]
!
vmov
QT0
,
QK1
vmov
QT1
,
QK2
C
Permute
keys.
QK2
us
untouched
,
permuted
subkeys
put
in
QK0
,
QK1
,
QT0
,
QT1
vtrn.32
QK0
,
QK1
C
Gives
us
[
0
,
4
,
2
,
6
]
and
[
1
,
5
,
3
,
7
]
vswp
D1REG
(
QK0
),
D0REG
(
QK1
)
C
Gives
us
[
0
,
4
,
1
,
5
]
and
[
2
,
6
,
3
,
7
]
vtrn.32
QT0
,
QT1
C
Gives
us
[
4
,
8
,
6
,
10
]
and
[
5
,
9
,
7
,
11
]
vswp
D1REG
(
QT0
),
D0REG
(
QT1
)
C
Gives
us
[
4
,
8
,
5
,
9
]
and
[
6
,
10
,
7
,
11
]
vmov.i64
QY1
,
#
0
.Loop4:
C
Set
m
[
i
]
<--
m
[
i
-
1
]
>>
RSHIFT
+
m
[
i
]
<<
LSHIFT
vld1.8
{
QA
,
QB
}
,
[
MSG
:
64
]
!
vshl.u64
QC
,
QA
,
QRIGHT
vshl.u64
QD
,
QB
,
QRIGHT
vshl.u64
QA
,
QA
,
QLEFT
vshl.u64
QB
,
QB
,
QLEFT
veor
D0REG
(
QA
),
D0REG
(
QA
),
DM
veor
D1REG
(
QA
),
D1REG
(
QA
),
D0REG
(
QC
)
veor
D0REG
(
QB
),
D0REG
(
QB
),
D1REG
(
QC
)
veor
D1REG
(
QB
),
D1REG
(
QB
),
D0REG
(
QD
)
vmov
DM
,
D1REG
(
QD
)
C
Explode
message
(
too
bad
there
'
s
no
vadd
with
scalar
)
vdup.32
D1REG
(
QD
),
D1REG
(
QB
)[
1
]
vdup.32
D0REG
(
QD
),
D1REG
(
QB
)[
0
]
vdup.32
D1REG
(
QC
),
D0REG
(
QB
)[
1
]
vdup.32
D0REG
(
QC
),
D0REG
(
QB
)[
0
]
vdup.32
D1REG
(
QB
),
D1REG
(
QA
)[
1
]
vdup.32
D0REG
(
QB
),
D1REG
(
QA
)[
0
]
vdup.32
D1REG
(
QA
),
D0REG
(
QA
)[
1
]
vdup.32
D0REG
(
QA
),
D0REG
(
QA
)[
0
]
vadd.i32
QK0
,
QK0
,
QA
vadd.i32
QK1
,
QK1
,
QB
vadd.i32
QT0
,
QT0
,
QC
vadd.i32
QT1
,
QT1
,
QD
vmlal.u32
QY0
,
D0REG
(
QK0
),
D0REG
(
QT0
)
vmlal.u32
QY0
,
D1REG
(
QK0
),
D1REG
(
QT0
)
vmlal.u32
QY0
,
D0REG
(
QK1
),
D0REG
(
QT1
)
vmlal.u32
QY0
,
D1REG
(
QK1
),
D1REG
(
QT1
)
C
Next
4
subkeys
vld1.32
{
QT0
,
QT1
}
,
[
KEY
]
!
vmov
QK0
,
QK2
vmov
QK1
,
QT0
vmov
QK2
,
QT1
C
Save
vtrn.32
QK0
,
QK1
C
Gives
us
[
8
,
12
,
10
,
14
]
and
[
9
,
13
,
11
,
15
]
vswp
D1REG
(
QK0
),
D0REG
(
QK1
)
C
Gives
us
[
8
,
12
,
9
,
13
]
and
[
10
,
14
,
11
,
15
]
vtrn.32
QT0
,
QT1
C
Gives
us
[
12
,
16
,
14
,
18
]
and
[
13
,
17
,
15
,
19
]
vswp
D1REG
(
QT0
),
D0REG
(
QT1
)
C
Gives
us
[
12
,
16
,
13
,
17
]
and
[
14
,
18
,
15
,
19
]
vadd.i32
QA
,
QA
,
QK0
vadd.i32
QB
,
QB
,
QK1
vadd.i32
QC
,
QC
,
QT0
vadd.i32
QD
,
QD
,
QT1
subs
LENGTH
,
LENGTH
,
#
32
vmlal.u32
QY1
,
D0REG
(
QA
),
D0REG
(
QC
)
vmlal.u32
QY1
,
D1REG
(
QA
),
D1REG
(
QC
)
vmlal.u32
QY1
,
D0REG
(
QB
),
D0REG
(
QD
)
vmlal.u32
QY1
,
D1REG
(
QB
),
D1REG
(
QD
)
bhi
.Loop4
vst1.64
{
QY0
,
QY1
}
,
[
OUT
]
vpop
{
q4
,
q5
,
q6
}
ldr
pc
,
[
sp
],
#
+
4
.Lnh3:
vpush
{
q4
}
vld1.32
{
QK0
,
QK1
}
,
[
KEY
]
!
vmov.i64
QY1
,
#
0
.Loop3:
C
Set
m
[
i
]
<--
m
[
i
-
1
]
>>
RSHIFT
+
m
[
i
]
<<
LSHIFT
vld1.8
{
QA
,
QB
}
,
[
MSG
:
64
]
!
vshl.u64
QT0
,
QA
,
QRIGHT
vshl.u64
QT1
,
QB
,
QRIGHT
vshl.u64
QA
,
QA
,
QLEFT
vshl.u64
QB
,
QB
,
QLEFT
veor
D0REG
(
QA
),
D0REG
(
QA
),
DM
veor
D1REG
(
QA
),
D1REG
(
QA
),
D0REG
(
QT0
)
veor
D0REG
(
QB
),
D0REG
(
QB
),
D1REG
(
QT0
)
veor
D1REG
(
QB
),
D1REG
(
QB
),
D0REG
(
QT1
)
vmov
DM
,
D1REG
(
QT1
)
vld1.32
{
QK2
}
,
[
KEY
]
!
C
Construct
factors
,
with
low
half
corresponding
to
first
iteration
,
C
and
high
half
corresponding
to
the
second
iteration.
vmov
QT0
,
QK1
vtrn.32
QK0
,
QT0
C
Gives
us
[
0
,
4
,
2
,
6
]
and
[
1
,
5
,
3
,
7
]
vswp
D1REG
(
QK0
),
D0REG
(
QT0
)
C
Gives
us
[
0
,
4
,
1
,
5
]
and
[
2
,
6
,
3
,
7
]
vdup.32
D0REG
(
QT1
),
D0REG
(
QA
)[
0
]
vdup.32
D1REG
(
QT1
),
D0REG
(
QA
)[
1
]
vadd.i32
QT1
,
QT1
,
QK0
vmov
QK0
,
QK2
C
Save
for
next
iteration
vtrn.32
QK1
,
QK2
C
Gives
us
[
4
,
8
,
2
,
1
]
and
[
1
,
5
,
3
,
7
]
vswp
D1REG
(
QK1
),
D0REG
(
QK2
)
C
Gives
us
[
4
,
8
,
1
,
5
]
and
[
2
,
1
,
3
,
7
]
vdup.32
D0REG
(
QT2
),
D0REG
(
QB
)[
0
]
vdup.32
D1REG
(
QT2
),
D0REG
(
QB
)[
1
]
vadd.i32
QK1
,
QK1
,
QT2
vmlal.u32
QY0
,
D0REG
(
QT1
),
D0REG
(
QK1
)
vmlal.u32
QY0
,
D1REG
(
QT1
),
D1REG
(
QK1
)
vdup.32
D0REG
(
QT1
),
D1REG
(
QA
)[
0
]
vdup.32
D1REG
(
QT1
),
D1REG
(
QA
)[
1
]
vadd.i32
QT0
,
QT0
,
QT1
vdup.32
D0REG
(
QT1
),
D1REG
(
QB
)[
0
]
vdup.32
D1REG
(
QT1
),
D1REG
(
QB
)[
1
]
vadd.i32
QK2
,
QK2
,
QT1
vmlal.u32
QY0
,
D0REG
(
QT0
),
D0REG
(
QK2
)
vmlal.u32
QY0
,
D1REG
(
QT0
),
D1REG
(
QK2
)
vld1.32
{
QK1
}
,
[
KEY
]
!
vadd.i32
QA
,
QA
,
QK0
vadd.i32
QB
,
QB
,
QK1
subs
LENGTH
,
LENGTH
,
#
32
vmlal.u32
QY1
,
D0REG
(
QA
),
D0REG
(
QB
)
vmlal.u32
QY1
,
D1REG
(
QA
),
D1REG
(
QB
)
bhi
.Loop3
vadd.i64
D0REG
(
QY1
),
D0REG
(
QY1
),
D1REG
(
QY1
)
vst1.64
{
D0REG
(
QY0
),
D1REG
(
QY0
),
D0REG
(
QY1
)
}
,
[
OUT
]
vpop
{
q4
}
ldr
pc
,
[
sp
],
#
+
4
.Lnh2:
vld1.32
{
QK0
}
,
[
KEY
]
!
.Loop2:
C
Set
m
[
i
]
<--
m
[
i
-
1
]
>>
RSHIFT
+
m
[
i
]
<<
LSHIFT
vld1.8
{
QA
,
QB
}
,
[
MSG
:
64
]
!
vshl.u64
QT0
,
QA
,
QRIGHT
vshl.u64
QT1
,
QB
,
QRIGHT
vshl.u64
QA
,
QA
,
QLEFT
vshl.u64
QB
,
QB
,
QLEFT
veor
D0REG
(
QA
),
D0REG
(
QA
),
DM
veor
D1REG
(
QA
),
D1REG
(
QA
),
D0REG
(
QT0
)
veor
D0REG
(
QB
),
D0REG
(
QB
),
D1REG
(
QT0
)
veor
D1REG
(
QB
),
D1REG
(
QB
),
D0REG
(
QT1
)
vmov
DM
,
D1REG
(
QT1
)
vld1.32
{
QK1
,
QK2
}
,
[
KEY
]
!
C
Construct
factors
,
with
low
half
corresponding
to
first
iteration
,
C
and
high
half
corresponding
to
the
second
iteration.
vmov
QT0
,
QK1
vtrn.32
QK0
,
QT0
C
Gives
us
[
0
,
4
,
2
,
6
]
and
[
1
,
5
,
3
,
7
]
vswp
D1REG
(
QK0
),
D0REG
(
QT0
)
C
Gives
us
[
0
,
4
,
1
,
5
]
and
[
2
,
6
,
3
,
7
]
vdup.32
D0REG
(
QT1
),
D0REG
(
QA
)[
0
]
vdup.32
D1REG
(
QT1
),
D0REG
(
QA
)[
1
]
vadd.i32
QT1
,
QT1
,
QK0
vmov
QK0
,
QK2
C
Save
for
next
iteration
vtrn.32
QK1
,
QK2
C
Gives
us
[
4
,
8
,
6
,
10
]
and
[
5
,
9
,
7
,
11
]
vswp
D1REG
(
QK1
),
D0REG
(
QK2
)
C
Gives
us
[
4
,
8
,
5
,
9
]
and
[
6
,
10
,
7
,
11
]
vdup.32
D0REG
(
QT2
),
D0REG
(
QB
)[
0
]
vdup.32
D1REG
(
QT2
),
D0REG
(
QB
)[
1
]
vadd.i32
QK1
,
QK1
,
QT2
vmlal.u32
QY0
,
D0REG
(
QT1
),
D0REG
(
QK1
)
vmlal.u32
QY0
,
D1REG
(
QT1
),
D1REG
(
QK1
)
vdup.32
D0REG
(
QT1
),
D1REG
(
QA
)[
0
]
vdup.32
D1REG
(
QT1
),
D1REG
(
QA
)[
1
]
vadd.i32
QT0
,
QT0
,
QT1
vdup.32
D0REG
(
QT1
),
D1REG
(
QB
)[
0
]
vdup.32
D1REG
(
QT1
),
D1REG
(
QB
)[
1
]
vadd.i32
QK2
,
QK2
,
QT1
subs
LENGTH
,
LENGTH
,
#
32
vmlal.u32
QY0
,
D0REG
(
QT0
),
D0REG
(
QK2
)
vmlal.u32
QY0
,
D1REG
(
QT0
),
D1REG
(
QK2
)
bhi
.Loop2
vst1.64
{
QY0
}
,
[
OUT
]
.Lend:
ldr
pc
,
[
sp
],
#
+
4
EPILOGUE
(
_nettle_umac_nh_n
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment