Commit 8c4ef180 authored by Niels Möller
Browse files

arm: Micro optimize neon implementation of salsa20 and chacha

parent db9b66e0
2020-06-25 Niels Möller <nisse@lysator.liu.se>
* arm/neon/chacha-core-internal.asm (QROUND): Micro optimize
rotations, using the vsra.u32 instruction. Gives 10% speedup,
benchmarked on Cortex-A5. Suggested by Torbjörn Granlund.
* arm/neon/salsa20-core-internal.asm (QROUND): Likewise.
* x86_64/chacha-core-internal.asm (QROUND): Fix use of macro
arguments. Spotted by Torbjörn Granlund.
......
......@@ -54,28 +54,24 @@ define(<QROUND>, <
C NOTE(review): this is a flattened diff hunk; the diff markers were lost,
C so the removed and the added instructions of each step are interleaved.
C The old/new labels below are inferred from the changelog, which says the
C new code uses vsra.u32, and from the hunk header, 28 to 24 lines;
C confirm against upstream before assembling anything from this text.
C QROUND works on four vector registers $1..$4 and uses T0 as scratch.
C Each step is an add, an xor, and a left rotation of every 32-bit lane,
C by 16, 12, 8 and 7 bits in turn.
C x2 += x3, x1 ^= x2, x1 lrot 7
C NOTE(review): the comment above does not match the first step shown
C below, which is $1 += $2, $4 ^= $1, $4 lrot 16; presumably stale, verify.
C Step 1: $1 += $2
vadd.i32 $1, $1, $2
C old: $4 ^= $1, then T0 = $4 shl 16, $4 = $4 shr 16, $4 ^= T0
veor $4, $4, $1
vshl.i32 T0, $4, #16
vshr.u32 $4, $4, #16
veor $4, $4, T0
C new: T0 = $4 ^ $1, then $4 = T0 shl 16, $4 += T0 shr 16
C The two shifted values share no set bits, so the accumulate done by
C vsra gives the same lanes as the xor in the old sequence.
veor T0, $4, $1
vshl.i32 $4, T0, #16
vsra.u32 $4, T0, #16
C Step 2: $3 += $4
vadd.i32 $3, $3, $4
C old: $2 ^= $3, then T0 = $2 shl 12, $2 = $2 shr 20, $2 ^= T0
veor $2, $2, $3
vshl.i32 T0, $2, #12
vshr.u32 $2, $2, #20
veor $2, $2, T0
C new: T0 = $2 ^ $3, then $2 = T0 shl 12, $2 += T0 shr 20
veor T0, $2, $3
vshl.i32 $2, T0, #12
vsra.u32 $2, T0, #20
C Step 3: $1 += $2
vadd.i32 $1, $1, $2
C old: $4 ^= $1, then T0 = $4 shl 8, $4 = $4 shr 24, $4 ^= T0
veor $4, $4, $1
vshl.i32 T0, $4, #8
vshr.u32 $4, $4, #24
veor $4, $4, T0
C new: T0 = $4 ^ $1, then $4 = T0 shl 8, $4 += T0 shr 24
veor T0, $4, $1
vshl.i32 $4, T0, #8
vsra.u32 $4, T0, #24
C Step 4: $3 += $4
vadd.i32 $3, $3, $4
C old: $2 ^= $3, then T0 = $2 shl 7, $2 = $2 shr 25, $2 ^= T0
veor $2, $2, $3
vshl.i32 T0, $2, #7
vshr.u32 $2, $2, #25
veor $2, $2, T0
C new: T0 = $2 ^ $3, then $2 = T0 shl 7, $2 += T0 shr 25
veor T0, $2, $3
vshl.i32 $2, T0, #7
vsra.u32 $2, T0, #25
>)
.text
......
......@@ -53,26 +53,22 @@ define(<S3>, <q15>)
define(<QROUND>, <
C NOTE(review): this is a flattened diff hunk; the diff markers were lost,
C so the removed and the added instructions of each step are interleaved.
C The old/new labels below are inferred from the changelog, which says the
C new code uses vsra.u32, and from the hunk header, 26 to 22 lines, which
C implies one line fewer per step; confirm against upstream.
C QROUND works on four vector registers $1..$4; T0 and T1 are scratch.
C Each step adds two registers into T0, rotates every 32-bit lane of the
C sum left by 7, 9, 13 or 18 bits, and xors the rotated sum into a third
C register.
C Step 1: T0 = $1 + $4, T1 = T0 shl 7
vadd.i32 T0, $1, $4
vshl.i32 T1, T0, #7
C old: T0 = T0 shr 25, $2 ^= T0
vshr.u32 T0, T0, #25
veor $2, $2, T0
C new: T1 += T0 shr 25, which leaves T1 = T0 lrot 7
vsra.u32 T1, T0, #25
C presumably unchanged context: $2 ^= T1
veor $2, $2, T1
C Step 2: T0 = $1 + $2, T1 = T0 shl 9
vadd.i32 T0, $1, $2
vshl.i32 T1, T0, #9
C old: T0 = T0 shr 23, $3 ^= T0
vshr.u32 T0, T0, #23
veor $3, $3, T0
C new: T1 += T0 shr 23, which leaves T1 = T0 lrot 9
vsra.u32 T1, T0, #23
C presumably unchanged context: $3 ^= T1
veor $3, $3, T1
C Step 3: T0 = $2 + $3, T1 = T0 shl 13
vadd.i32 T0, $2, $3
vshl.i32 T1, T0, #13
C old: T0 = T0 shr 19, $4 ^= T0
vshr.u32 T0, T0, #19
veor $4, $4, T0
C new: T1 += T0 shr 19, which leaves T1 = T0 lrot 13
vsra.u32 T1, T0, #19
C presumably unchanged context: $4 ^= T1
veor $4, $4, T1
C Step 4: T0 = $3 + $4, T1 = T0 shl 18
vadd.i32 T0, $3, $4
vshl.i32 T1, T0, #18
C old: T0 = T0 shr 14, $1 ^= T0
vshr.u32 T0, T0, #14
veor $1, $1, T0
C new: T1 += T0 shr 14, which leaves T1 = T0 lrot 18
vsra.u32 T1, T0, #14
C presumably unchanged context: $1 ^= T1
veor $1, $1, T1
>)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment