Commit d6fadad8 authored by Niels Möller

arm: Adapted AES assembly to new interface.

parent 7192dce7
--- a/ChangeLog
+++ b/ChangeLog
+2013-05-22  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/v6/aes-encrypt-internal.asm: Adapted to new interface.
+	Unfortunately, 4% slowdown on Cortex-A9, for unknown reason.
+	* arm/v6/aes-decrypt-internal.asm: Likewise.
+	* arm/aes-encrypt-internal.asm: Adapted to new interface.
+	* arm/aes-decrypt-internal.asm: Likewise.
+
 2013-05-21  Niels Möller  <nisse@lysator.liu.se>
 
 	* sparc32/aes-encrypt-internal.asm: Adapted to new interface.
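
For reference, the "new interface" referred to above replaces the old argument list, in which the assembly fetched the round count and subkeys from the AES context, with an explicit round count and subkey pointer. A sketch of the resulting declarations, transcribed from the comment headers in the diffs below (the return type, the const qualifiers and the header they would live in, presumably Nettle's aes-internal.h, are assumptions, not spelled out in this commit):

    #include <stddef.h>     /* size_t */
    #include <stdint.h>     /* uint32_t, uint8_t */

    struct aes_table;       /* the lookup tables passed as T */

    /* Old: _aes_encrypt(struct aes_context *ctx, const struct aes_table *T,
       size_t length, uint8_t *dst, uint8_t *src); likewise for decrypt. */

    void
    _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
                        const struct aes_table *T,
                        size_t length, uint8_t *dst, uint8_t *src);

    void
    _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
                        const struct aes_table *T,
                        size_t length, uint8_t *dst, uint8_t *src);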
--- a/arm/aes-decrypt-internal.asm
+++ b/arm/aes-decrypt-internal.asm
@@ -19,26 +19,32 @@ C MA 02111-1301, USA.
 include_src(<arm/aes.m4>)
 
-C	define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>)	C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
 define(<X1>, <r3>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
-define(<MASK>, <r0>)	C Overlaps CTX input
-define(<CTX>, <[sp]>)
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
 
 define(<AES_DECRYPT_ROUND>, <
@@ -103,29 +109,30 @@ define(<AES_DECRYPT_ROUND>, <
 	.file "aes-decrypt-internal.asm"
 
-	C _aes_decrypt(struct aes_context *ctx,
+	C _aes_decrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_decrypt)
-	teq	LENGTH, #0
+	teq	PARAM_LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
 
-	push	{r0, r4,r5,r6,r7,r8,r10,r11,lr}
+	push	{r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 
 	mov	MASK, #0x3fc
 	ALIGN(16)
 .Lblock_loop:
-	ldr	KEY, CTX
-	ldr	ROUND, [KEY, #+AES_NROUNDS]
-	AES_LOAD(SRC,KEY,W0)
-	AES_LOAD(SRC,KEY,W1)
-	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
+	ldr	X0, FRAME_SRC	C Use X0 as SRC pointer
+	ldm	sp, {COUNT, KEY}
+
+	AES_LOAD(X0,KEY,W0)
+	AES_LOAD(X0,KEY,W1)
+	AES_LOAD(X0,KEY,W2)
+	AES_LOAD(X0,KEY,W3)
+	str	X0, FRAME_SRC
 
-	push	{LENGTH, DST, SRC}
 	add	TABLE, TABLE, #AES_TABLE0
 	b	.Lentry
@@ -135,31 +142,35 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C Transform W -> X
 	AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
-	lsr	ROUND, MASK, #2	C Put the needed mask in the unused ROUND register
+	lsr	COUNT, MASK, #2	C Put the needed mask in the unused COUNT register
 	sub	TABLE, TABLE, #AES_TABLE0
 	C Final round
-	AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, ROUND)
-	AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, ROUND)
-	AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, ROUND)
-	AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, ROUND)
+	AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, COUNT)
+	AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, COUNT)
+	AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, COUNT)
+	AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, COUNT)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	X0, FRAME_DST
+	ldr	X1, FRAME_LENGTH
 
-	AES_STORE(DST,W0)
-	AES_STORE(DST,W1)
-	AES_STORE(DST,W2)
-	AES_STORE(DST,W3)
+	AES_STORE(X0,W0)
+	AES_STORE(X0,W1)
+	AES_STORE(X0,W2)
+	AES_STORE(X0,W3)
+
+	subs	X1, X1, #16
+	str	X0, FRAME_DST
+	str	X1, FRAME_LENGTH
 
-	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
-	add	sp, sp, #4	C Drop saved r0
+	add	sp, sp, #12	C Drop saved r0, r1, r3
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 .Lend:
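
The old arm/aes-decrypt-internal.asm kept the context pointer in a stack slot (CTX at [sp]); the new code instead spills the r0/r1/r3 parameters at the top of the frame and reaches the caller's dst/src arguments above the saved registers. A sketch of where the FRAME_* offsets used above come from, assuming the AAPCS layout after the push (the enum and its name are illustrative, not from the source):

    /* After push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}: 11 words (44 bytes).
       dst and src, which the caller passed on the stack at [sp] and [sp, #4],
       therefore end up 44 bytes further up. */
    enum arm_aes_frame_offset {       /* hypothetical name */
      FRAME_ROUNDS = 0,               /* spilled r0 */
      FRAME_KEYS   = 4,               /* spilled r1 */
      FRAME_LENGTH = 8,               /* spilled r3 */
      /* 8 callee-saved words: r4-r8, r10, r11, lr */
      FRAME_DST    = 3*4 + 8*4,       /* 44 */
      FRAME_SRC    = FRAME_DST + 4    /* 48 */
    };

The arm/v6 files further down spill only r0 and r1, which is why their FRAME_DST and FRAME_SRC sit at #+40 and #+44 instead.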
--- a/arm/aes-encrypt-internal.asm
+++ b/arm/aes-encrypt-internal.asm
@@ -19,32 +19,38 @@ C MA 02111-1301, USA.
 include_src(<arm/aes.m4>)
 
-C Benchmarked at at 725, 930, 990 cycles/block on cortex A9,
+C Benchmarked at at 725, 815, 990 cycles/block on cortex A9,
 C for 128, 192 and 256 bit key sizes.
 
 C Possible improvements: More efficient load and store with
 C aligned accesses. Better scheduling.
 
-C	define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>)	C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
 define(<X1>, <r3>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
-define(<MASK>, <r0>)	C Overlaps CTX input
-define(<CTX>, <[sp]>)
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
 
 C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
@@ -112,29 +118,30 @@ define(<AES_ENCRYPT_ROUND>, <
 	.file "aes-encrypt-internal.asm"
 
-	C _aes_encrypt(struct aes_context *ctx,
+	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_encrypt)
-	teq	LENGTH, #0
+	teq	PARAM_LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
 
-	push	{r0, r4,r5,r6,r7,r8,r10,r11,lr}
+	push	{r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 
 	mov	MASK, #0x3fc
 	ALIGN(16)
 .Lblock_loop:
-	ldr	KEY, CTX
-	ldr	ROUND, [KEY, #+AES_NROUNDS]
-	AES_LOAD(SRC,KEY,W0)
-	AES_LOAD(SRC,KEY,W1)
-	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
+	ldr	X0, FRAME_SRC	C Use X0 as SRC pointer
+	ldm	sp, {COUNT, KEY}
+
+	AES_LOAD(X0,KEY,W0)
+	AES_LOAD(X0,KEY,W1)
+	AES_LOAD(X0,KEY,W2)
+	AES_LOAD(X0,KEY,W3)
+	str	X0, FRAME_SRC
 
-	push	{LENGTH, DST, SRC}
 	add	TABLE, TABLE, #AES_TABLE0
 	b	.Lentry
@@ -144,31 +151,35 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C Transform W -> X
 	AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
-	lsr	ROUND, MASK, #2	C Put the needed mask in the unused ROUND register
+	lsr	COUNT, MASK, #2	C Put the needed mask in the unused COUNT register
 	sub	TABLE, TABLE, #AES_TABLE0
 	C Final round
-	AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, ROUND)
-	AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, ROUND)
-	AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, ROUND)
-	AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, ROUND)
+	AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, COUNT)
+	AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, COUNT)
+	AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, COUNT)
+	AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, COUNT)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	X0, FRAME_DST
+	ldr	X1, FRAME_LENGTH
 
-	AES_STORE(DST,W0)
-	AES_STORE(DST,W1)
-	AES_STORE(DST,W2)
-	AES_STORE(DST,W3)
+	AES_STORE(X0,W0)
+	AES_STORE(X0,W1)
+	AES_STORE(X0,W2)
+	AES_STORE(X0,W3)
+
+	subs	X1, X1, #16
+	str	X0, FRAME_DST
+	str	X1, FRAME_LENGTH
 
-	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
-	add	sp, sp, #4	C Drop saved r0
+	add	sp, sp, #12	C Drop saved r0, r1, r3
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 .Lend:
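
The cycle counts in the header comment above are per 16-byte AES block; dividing the updated figures out (plain arithmetic on the quoted numbers, not a new measurement) gives roughly:

    725 / 16 ≈ 45 cycles per byte  (128-bit key)
    815 / 16 ≈ 51 cycles per byte  (192-bit key)
    990 / 16 ≈ 62 cycles per byte  (256-bit key)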
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -19,25 +19,33 @@ C MA 02111-1301, USA.
 include_src(<arm/aes.m4>)
 
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>)	C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
+define(<FRAME_ROUNDS>>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <%r12>)	C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
 
 C AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
 define(<AES_DECRYPT_ROUND>, <
@@ -102,7 +110,7 @@ define(<AES_DECRYPT_ROUND>, <
 	.file "aes-decrypt-internal.asm"
 
-	C _aes_decrypt(struct aes_context *ctx,
+	C _aes_decrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
@@ -111,22 +119,23 @@ define(<AES_DECRYPT_ROUND>, <
 PROLOGUE(_nettle_aes_decrypt)
 	teq	LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
 
-	push	{r4,r5,r6,r7,r8,r10,r11,lr}
-	nop	C For some mysterious reason, taking out this nop
-	C slows this function down by 10(!) % on Cortex-A9.
+	ldr	SRC, [sp, #+4]
+
+	push	{r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
 	ALIGN(16)
 .Lblock_loop:
-	mov	KEY, CTX
+	ldm	sp, {COUNT, KEY}
+	add	TABLE, TABLE, #AES_TABLE0
+
 	AES_LOAD(SRC,KEY,W0)
 	AES_LOAD(SRC,KEY,W1)
 	AES_LOAD(SRC,KEY,W2)
 	AES_LOAD(SRC,KEY,W3)
 
-	push	{LENGTH, DST, SRC}
-	ldr	ROUND, [CTX, #+AES_NROUNDS]
-	add	TABLE, TABLE, #AES_TABLE0
+	str	SRC, FRAME_SRC
 
 	b	.Lentry
 	ALIGN(16)
@@ -135,29 +144,34 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 	C Transform W -> X
 	AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 	bne	.Lround_loop
 
 	sub	TABLE, TABLE, #AES_TABLE0
 	C Final round
+	ldr	DST, FRAME_DST
 	AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
 	AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
 	AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
 	AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	SRC, FRAME_SRC
 
 	AES_STORE(DST,W0)
 	AES_STORE(DST,W1)
 	AES_STORE(DST,W2)
 	AES_STORE(DST,W3)
+	str	DST, FRAME_DST
 
 	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
+	add	sp, sp, #8	C Drop saved r0, r1
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 .Lend:
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
@@ -19,31 +19,39 @@ C MA 02111-1301, USA.
 include_src(<arm/aes.m4>)
 
-C Benchmarked at at 680, 818, 929 cycles/block on cortex A9,
+C Benchmarked at at 706, 870, 963 cycles/block on cortex A9,
 C for 128, 192 and 256 bit key sizes.
 
 C Possible improvements: More efficient load and store with
 C aligned accesses. Better scheduling.
 
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>)	C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
+define(<FRAME_ROUNDS>>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <%r12>)	C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
 
 C 53 instr.
 C It's tempting to use eor with rotation, but that's slower.
@@ -110,7 +118,7 @@ define(<AES_ENCRYPT_ROUND>, <
 	.file "aes-encrypt-internal.asm"
 
-	C _aes_encrypt(struct aes_context *ctx,
+	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
@@ -119,20 +127,23 @@ define(<AES_ENCRYPT_ROUND>, <
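
The roughly 4% slowdown that the ChangeLog entry reports for arm/v6/aes-encrypt-internal.asm matches the benchmark update in this file's header comment; taking the ratios of the quoted before/after figures:

    706 / 680 ≈ 1.04  (128-bit key)
    870 / 818 ≈ 1.06  (192-bit key)
    963 / 929 ≈ 1.04  (256-bit key)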