Commit 23cd4cf7 authored by Niels Möller

arm/v6: AES micro-optimization.

parent 8136e192
......@@ -2,6 +2,7 @@
* arm/aes.m4 (AES_ENCRYPT_ROUND, AES_DECRYPT): Moved macros to the
files using them.
(AES_FINAL_ROUND): Eliminated one uxtb instruction.
* arm/v6/aes-encrypt-internal.asm: Use ALIGN macro. Use 16-byte
alignment for loops.
......
......@@ -33,9 +33,8 @@ define(<AES_FINAL_ROUND>, <
uxtb T0, $3, ror #16
ldrb T0, [TABLE, T0]
eor $6, $6, T0, lsl #16
uxtb T0, $4, ror #24
ldrb T0, [TABLE, T0]
ldrb T0, [TABLE, $4, lsr #24]
eor $6, $6, T0, lsl #24
ldr T0, [$5], #+4
eor $6, T0
eor $6, $6, T0
>)
......@@ -19,10 +19,6 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
C Benchmarked at 785, 914, 1051 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes. Unclear why it is slower
C than _aes_encrypt.
define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
......@@ -119,7 +115,7 @@ PROLOGUE(_nettle_aes_decrypt)
push {r4,r5,r6,r7,r8,r10,r11,lr}
nop C For some mysterious reason, taking out this nop
C slows this function down on Cortex-A9.
C slows this function down by 10(!) % on Cortex-A9.
ALIGN(16)
.Lblock_loop:
mov KEY, CTX
......
......@@ -19,7 +19,7 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
C Benchmarked at 693, 824, 950 cycles/block on cortex A9,
C Benchmarked at 680, 818, 929 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment