Commit d6fadad8 authored by Niels Möller

arm: Adapted AES assembly to new interface.

parent 7192dce7
2013-05-22 Niels Möller <nisse@lysator.liu.se>
* arm/v6/aes-encrypt-internal.asm: Adapted to new interface.
Unfortunately, 4% slowdown on Cortex-A9, for unknown reason.
* arm/v6/aes-decrypt-internal.asm: Likewise.
* arm/aes-encrypt-internal.asm: Adapted to new interface.
* arm/aes-decrypt-internal.asm: Likewise.
2013-05-21 Niels Möller <nisse@lysator.liu.se>
* sparc32/aes-encrypt-internal.asm: Adapted to new interface.
......
......@@ -19,26 +19,32 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
C define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
define(<DST>, <r3>)
define(<SRC>, <r12>)
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_KEYS>, <r1>)
define(<TABLE>, <r2>)
define(<PARAM_LENGTH>, <r3>)
C On stack: DST, SRC
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
define(<KEY>, <r10>)
define(<ROUND>, <r11>)
define(<COUNT>, <r10>)
define(<KEY>, <r11>)
define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
define(<MASK>, <r0>) C Overlaps inputs, except TABLE
define(<X0>, <r1>)
define(<X1>, <r3>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
define(<MASK>, <r0>) C Overlaps CTX input
define(<CTX>, <[sp]>)
define(<FRAME_ROUNDS>, <[sp]>)
define(<FRAME_KEYS>, <[sp, #+4]>)
define(<FRAME_LENGTH>, <[sp, #+8]>)
C 8 saved registers
define(<FRAME_DST>, <[sp, #+44]>)
define(<FRAME_SRC>, <[sp, #+48]>)
define(<AES_DECRYPT_ROUND>, <
......@@ -103,29 +109,30 @@ define(<AES_DECRYPT_ROUND>, <
.file "aes-decrypt-internal.asm"
C _aes_decrypt(struct aes_context *ctx,
C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
teq PARAM_LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r0, r4,r5,r6,r7,r8,r10,r11,lr}
push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
mov MASK, #0x3fc
ALIGN(16)
.Lblock_loop:
ldr KEY, CTX
ldr ROUND, [KEY, #+AES_NROUNDS]
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
push {LENGTH, DST, SRC}
ldr X0, FRAME_SRC C Use X0 as SRC pointer
ldm sp, {COUNT, KEY}
AES_LOAD(X0,KEY,W0)
AES_LOAD(X0,KEY,W1)
AES_LOAD(X0,KEY,W2)
AES_LOAD(X0,KEY,W3)
str X0, FRAME_SRC
add TABLE, TABLE, #AES_TABLE0
b .Lentry
......@@ -135,31 +142,35 @@ PROLOGUE(_nettle_aes_decrypt)
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs ROUND, ROUND,#2
subs COUNT, COUNT,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
lsr ROUND, MASK, #2 C Put the needed mask in the unused ROUND register
lsr COUNT, MASK, #2 C Put the needed mask in the unused COUNT register
sub TABLE, TABLE, #AES_TABLE0
C Final round
AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, ROUND)
AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, ROUND)
AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, ROUND)
AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, ROUND)
AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, COUNT)
AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, COUNT)
AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, COUNT)
AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, COUNT)
pop {LENGTH, DST, SRC}
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
ldr X0, FRAME_DST
ldr X1, FRAME_LENGTH
AES_STORE(X0,W0)
AES_STORE(X0,W1)
AES_STORE(X0,W2)
AES_STORE(X0,W3)
subs X1, X1, #16
str X0, FRAME_DST
str X1, FRAME_LENGTH
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #4 C Drop saved r0
add sp, sp, #12 C Drop saved r0, r1, r3
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
......
......@@ -19,32 +19,38 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
C Benchmarked at 725, 930, 990 cycles/block on cortex A9,
C Benchmarked at 725, 815, 990 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
C define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
define(<DST>, <r3>)
define(<SRC>, <r12>)
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_KEYS>, <r1>)
define(<TABLE>, <r2>)
define(<PARAM_LENGTH>, <r3>)
C On stack: DST, SRC
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
define(<KEY>, <r10>)
define(<ROUND>, <r11>)
define(<COUNT>, <r10>)
define(<KEY>, <r11>)
define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
define(<MASK>, <r0>) C Overlaps inputs, except TABLE
define(<X0>, <r1>)
define(<X1>, <r3>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
define(<MASK>, <r0>) C Overlaps CTX input
define(<CTX>, <[sp]>)
define(<FRAME_ROUNDS>, <[sp]>)
define(<FRAME_KEYS>, <[sp, #+4]>)
define(<FRAME_LENGTH>, <[sp, #+8]>)
C 8 saved registers
define(<FRAME_DST>, <[sp, #+44]>)
define(<FRAME_SRC>, <[sp, #+48]>)
C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
......@@ -112,29 +118,30 @@ define(<AES_ENCRYPT_ROUND>, <
.file "aes-encrypt-internal.asm"
C _aes_encrypt(struct aes_context *ctx,
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
teq PARAM_LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r0, r4,r5,r6,r7,r8,r10,r11,lr}
push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
mov MASK, #0x3fc
ALIGN(16)
.Lblock_loop:
ldr KEY, CTX
ldr ROUND, [KEY, #+AES_NROUNDS]
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
push {LENGTH, DST, SRC}
ldr X0, FRAME_SRC C Use X0 as SRC pointer
ldm sp, {COUNT, KEY}
AES_LOAD(X0,KEY,W0)
AES_LOAD(X0,KEY,W1)
AES_LOAD(X0,KEY,W2)
AES_LOAD(X0,KEY,W3)
str X0, FRAME_SRC
add TABLE, TABLE, #AES_TABLE0
b .Lentry
......@@ -144,31 +151,35 @@ PROLOGUE(_nettle_aes_encrypt)
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs ROUND, ROUND,#2
subs COUNT, COUNT,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
lsr ROUND, MASK, #2 C Put the needed mask in the unused ROUND register
lsr COUNT, MASK, #2 C Put the needed mask in the unused COUNT register
sub TABLE, TABLE, #AES_TABLE0
C Final round
AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, ROUND)
AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, ROUND)
AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, ROUND)
AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, ROUND)
AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, COUNT)
AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, COUNT)
AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, COUNT)
AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, COUNT)
pop {LENGTH, DST, SRC}
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
ldr X0, FRAME_DST
ldr X1, FRAME_LENGTH
AES_STORE(X0,W0)
AES_STORE(X0,W1)
AES_STORE(X0,W2)
AES_STORE(X0,W3)
subs X1, X1, #16
str X0, FRAME_DST
str X1, FRAME_LENGTH
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #4 C Drop saved r0
add sp, sp, #12 C Drop saved r0, r1, r3
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
......
......@@ -19,25 +19,33 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
define(<DST>, <r3>)
define(<SRC>, <r12>)
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_KEYS>, <r1>)
define(<TABLE>, <r2>)
define(<LENGTH>, <r3>)
C On stack: DST, SRC
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
define(<KEY>, <r10>)
define(<ROUND>, <r11>)
define(<COUNT>, <r10>)
define(<KEY>, <r11>)
define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
define(<X1>, <r3>)
define(<X0>, <r0>) C Overlaps PARAM_ROUNDS and PARAM_KEYS
define(<X1>, <r1>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
C Stack frame after the entry push of r0, r1 and 8 callee-saved registers.
C Offsets are relative to sp. The saved r0 and r1 hold PARAM_ROUNDS and
C PARAM_KEYS. DST and SRC are the caller's stack arguments, found above
C the 10 pushed words (40 bytes).
define(<FRAME_ROUNDS>, <[sp]>)
define(<FRAME_KEYS>, <[sp, #+4]>)
C 8 saved registers
define(<FRAME_DST>, <[sp, #+40]>)
define(<FRAME_SRC>, <[sp, #+44]>)
define(<SRC>, <%r12>) C Overlap registers used in inner loop.
define(<DST>, <COUNT>)
C AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(<AES_DECRYPT_ROUND>, <
......@@ -102,7 +110,7 @@ define(<AES_DECRYPT_ROUND>, <
.file "aes-decrypt-internal.asm"
C _aes_decrypt(struct aes_context *ctx,
C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
......@@ -111,22 +119,23 @@ define(<AES_DECRYPT_ROUND>, <
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r4,r5,r6,r7,r8,r10,r11,lr}
nop C For some mysterious reason, taking out this nop
C slows this function down by 10(!) % on Cortex-A9.
ldr SRC, [sp, #+4]
push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
ALIGN(16)
.Lblock_loop:
mov KEY, CTX
ldm sp, {COUNT, KEY}
add TABLE, TABLE, #AES_TABLE0
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
push {LENGTH, DST, SRC}
ldr ROUND, [CTX, #+AES_NROUNDS]
add TABLE, TABLE, #AES_TABLE0
str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
......@@ -135,29 +144,34 @@ PROLOGUE(_nettle_aes_decrypt)
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs ROUND, ROUND,#2
subs COUNT, COUNT,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
ldr DST, FRAME_DST
AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
pop {LENGTH, DST, SRC}
ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
......
......@@ -19,31 +19,39 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
C Benchmarked at 680, 818, 929 cycles/block on cortex A9,
C Benchmarked at 706, 870, 963 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
define(<DST>, <r3>)
define(<SRC>, <r12>)
define(<PARAM_ROUNDS>, <r0>)
define(<PARAM_KEYS>, <r1>)
define(<TABLE>, <r2>)
define(<LENGTH>, <r3>)
C On stack: DST, SRC
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
define(<KEY>, <r10>)
define(<ROUND>, <r11>)
define(<COUNT>, <r10>)
define(<KEY>, <r11>)
define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
define(<X1>, <r3>)
define(<X0>, <r0>) C Overlaps PARAM_ROUNDS and PARAM_KEYS
define(<X1>, <r1>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
C Stack frame after the entry push of r0, r1 and 8 callee-saved registers.
C Offsets are relative to sp. The saved r0 and r1 hold PARAM_ROUNDS and
C PARAM_KEYS. DST and SRC are the caller's stack arguments, found above
C the 10 pushed words (40 bytes).
define(<FRAME_ROUNDS>, <[sp]>)
define(<FRAME_KEYS>, <[sp, #+4]>)
C 8 saved registers
define(<FRAME_DST>, <[sp, #+40]>)
define(<FRAME_SRC>, <[sp, #+44]>)
define(<SRC>, <%r12>) C Overlap registers used in inner loop.
define(<DST>, <COUNT>)
C 53 instr.
C It's tempting to use eor with rotation, but that's slower.
......@@ -110,7 +118,7 @@ define(<AES_ENCRYPT_ROUND>, <
.file "aes-encrypt-internal.asm"
C _aes_encrypt(struct aes_context *ctx,
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
......@@ -119,20 +127,23 @@ define(<AES_ENCRYPT_ROUND>, <
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r4,r5,r6,r7,r8,r10,r11,lr}
ldr SRC, [sp, #+4]
push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
ALIGN(16)
.Lblock_loop:
mov KEY, CTX
ldm sp, {COUNT, KEY}
add TABLE, TABLE, #AES_TABLE0
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
push {LENGTH, DST, SRC}
ldr ROUND, [CTX, #+AES_NROUNDS]
add TABLE, TABLE, #AES_TABLE0
str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
......@@ -141,29 +152,34 @@ PROLOGUE(_nettle_aes_encrypt)
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs ROUND, ROUND,#2
subs COUNT, COUNT,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
ldr DST, FRAME_DST
AES_FINAL_ROUND_V6(X0, X1, X2, X3, KEY, W0)
AES_FINAL_ROUND_V6(X1, X2, X3, X0, KEY, W1)
AES_FINAL_ROUND_V6(X2, X3, X0, X1, KEY, W2)
AES_FINAL_ROUND_V6(X3, X0, X1, X2, KEY, W3)
pop {LENGTH, DST, SRC}
ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment