Skip to content
Snippets Groups Projects
Commit 8c4ef180 authored by Niels Möller
Browse files

arm: Micro optimize neon implementation of salsa20 and chacha

parent db9b66e0
No related branches found
No related tags found
No related merge requests found
2020-06-25 Niels Möller <nisse@lysator.liu.se>
* arm/neon/chacha-core-internal.asm (QROUND): Micro optimize
rotations, using the vsra.u32 instruction. Gives 10% speedup,
benchmarked on Cortex-A5. Suggested by Torbjörn Granlund.
* arm/neon/salsa20-core-internal.asm (QROUND): Likewise.
* x86_64/chacha-core-internal.asm (QROUND): Fix use of macro
arguments. Spotted by Torbjörn Granlund.
......
...@@ -54,28 +54,24 @@ define(<QROUND>, < ...@@ -54,28 +54,24 @@ define(<QROUND>, <
C x2 += x3, x1 ^= x2, x1 lrot 7 C x2 += x3, x1 ^= x2, x1 lrot 7
vadd.i32 $1, $1, $2 vadd.i32 $1, $1, $2
veor $4, $4, $1 veor T0, $4, $1
vshl.i32 T0, $4, #16 vshl.i32 $4, T0, #16
vshr.u32 $4, $4, #16 vsra.u32 $4, T0, #16
veor $4, $4, T0
vadd.i32 $3, $3, $4 vadd.i32 $3, $3, $4
veor $2, $2, $3 veor T0, $2, $3
vshl.i32 T0, $2, #12 vshl.i32 $2, T0, #12
vshr.u32 $2, $2, #20 vsra.u32 $2, T0, #20
veor $2, $2, T0
vadd.i32 $1, $1, $2 vadd.i32 $1, $1, $2
veor $4, $4, $1 veor T0, $4, $1
vshl.i32 T0, $4, #8 vshl.i32 $4, T0, #8
vshr.u32 $4, $4, #24 vsra.u32 $4, T0, #24
veor $4, $4, T0
vadd.i32 $3, $3, $4 vadd.i32 $3, $3, $4
veor $2, $2, $3 veor T0, $2, $3
vshl.i32 T0, $2, #7 vshl.i32 $2, T0, #7
vshr.u32 $2, $2, #25 vsra.u32 $2, T0, #25
veor $2, $2, T0
>) >)
.text .text
......
...@@ -53,26 +53,22 @@ define(<S3>, <q15>) ...@@ -53,26 +53,22 @@ define(<S3>, <q15>)
define(<QROUND>, < define(<QROUND>, <
vadd.i32 T0, $1, $4 vadd.i32 T0, $1, $4
vshl.i32 T1, T0, #7 vshl.i32 T1, T0, #7
vshr.u32 T0, T0, #25 vsra.u32 T1, T0, #25
veor $2, $2, T0
veor $2, $2, T1 veor $2, $2, T1
vadd.i32 T0, $1, $2 vadd.i32 T0, $1, $2
vshl.i32 T1, T0, #9 vshl.i32 T1, T0, #9
vshr.u32 T0, T0, #23 vsra.u32 T1, T0, #23
veor $3, $3, T0
veor $3, $3, T1 veor $3, $3, T1
vadd.i32 T0, $2, $3 vadd.i32 T0, $2, $3
vshl.i32 T1, T0, #13 vshl.i32 T1, T0, #13
vshr.u32 T0, T0, #19 vsra.u32 T1, T0, #19
veor $4, $4, T0
veor $4, $4, T1 veor $4, $4, T1
vadd.i32 T0, $3, $4 vadd.i32 T0, $3, $4
vshl.i32 T1, T0, #18 vshl.i32 T1, T0, #18
vshr.u32 T0, T0, #14 vsra.u32 T1, T0, #14
veor $1, $1, T0
veor $1, $1, T1 veor $1, $1, T1
>) >)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment