Commit 4ac1b5f0 authored by Niels Möller's avatar Niels Möller

ARM assembly for AES.

parent 32f3ba18
2013-03-08 Niels Möller <nisse@lysator.liu.se>
* armv7/aes-decrypt-internal.asm: New file, 15% speedup.
* armv7/aes-encrypt-internal.asm: New file, 25% speedup.
* armv7/aes.m4: New file.
2013-03-07 Niels Möller <nisse@lysator.liu.se>
* gmp-glue.c (mpz_limbs_cmp): Don't use PTR and SIZ macros.
......
C nettle, low-level cryptographics library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
include_src(<armv7/aes.m4>)
C Benchmarked at at 785, 914, 1051 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes. Unclear why it is slower
C than _aes_encrypt.
define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
define(<DST>, <r3>)
define(<SRC>, <r12>)
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
define(<KEY>, <r10>)
define(<ROUND>, <r11>)
define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
define(<X1>, <r3>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
.file "aes-decrypt-internal.asm"
C _aes_decrypt(struct aes_context *ctx,
C const struct aes_table *T,
C unsigned length, uint8_t *dst,
C uint8_t *src)
.text
.align 2
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r4,r5,r6,r7,r8,r10,r11,lr}
.Lblock_loop:
mov KEY, CTX
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
push {LENGTH, DST, SRC}
ldr ROUND, [CTX, #+AES_NROUNDS]
add TABLE, TABLE, #AES_TABLE0
b .Lentry
.align 2
.Lround_loop:
C Transform X -> W
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs ROUND, ROUND,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
AES_FINAL_ROUND(X0, X3, X2, X1, KEY, W0)
AES_FINAL_ROUND(X1, X0, X3, X2, KEY, W1)
AES_FINAL_ROUND(X2, X1, X0, X3, KEY, W2)
AES_FINAL_ROUND(X3, X2, X1, X0, KEY, W3)
pop {LENGTH, DST, SRC}
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
bx lr
EPILOGUE(_nettle_aes_decrypt)
C nettle, low-level cryptographics library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
include_src(<armv7/aes.m4>)
C Benchmarked at at 693, 824, 950 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
define(<DST>, <r3>)
define(<SRC>, <r12>)
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
define(<KEY>, <r10>)
define(<ROUND>, <r11>)
define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
define(<X1>, <r3>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
.file "aes-encrypt-internal.asm"
C _aes_encrypt(struct aes_context *ctx,
C const struct aes_table *T,
C unsigned length, uint8_t *dst,
C uint8_t *src)
.text
.align 2
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp]
push {r4,r5,r6,r7,r8,r10,r11,lr}
.Lblock_loop:
mov KEY, CTX
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
push {LENGTH, DST, SRC}
ldr ROUND, [CTX, #+AES_NROUNDS]
add TABLE, TABLE, #AES_TABLE0
b .Lentry
.align 2
.Lround_loop:
C Transform X -> W
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs ROUND, ROUND,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
AES_FINAL_ROUND(X0, X1, X2, X3, KEY, W0)
AES_FINAL_ROUND(X1, X2, X3, X0, KEY, W1)
AES_FINAL_ROUND(X2, X3, X0, X1, KEY, W2)
AES_FINAL_ROUND(X3, X0, X1, X2, KEY, W3)
pop {LENGTH, DST, SRC}
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
bx lr
EPILOGUE(_nettle_aes_encrypt)
C Loads one word, and adds it to the subkey. Uses T0
C AES_LOAD(SRC, KEY, REG)
define(<AES_LOAD>, <
ldrb $3, [$1], #+1
ldrb T0, [$1], #+1
orr $3, T0, lsl #8
ldrb T0, [$1], #+1
orr $3, T0, lsl #16
ldrb T0, [$1], #+1
orr $3, T0, lsl #24
ldr T0, [$2], #+4
eor $3, T0
>)
C Stores one word. Destroys input.
C AES_STORE(DST, X)
define(<AES_STORE>, <
strb $2, [$1], #+1
ror $2, $2, #8
strb $2, [$1], #+1
ror $2, $2, #8
strb $2, [$1], #+1
ror $2, $2, #8
strb $2, [$1], #+1
>)
C 53 instr.
C It's tempting to use eor with rotation, but that's slower.
C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(<AES_ENCRYPT_ROUND>, <
uxtb T0, $1
ldr $5, [TABLE, T0, lsl #2]
uxtb T0, $2
ldr $6, [TABLE, T0, lsl #2]
uxtb T0, $3
ldr $7, [TABLE, T0, lsl #2]
uxtb T0, $4
ldr $8, [TABLE, T0, lsl #2]
uxtb T0, $2, ror #8
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $3, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $4, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $1, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $3, ror #16
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $4, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $1, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $2, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $4, ror #24
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $1, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $2, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $3, ror #24
ldr T0, [TABLE, T0, lsl #2]
ldm $9!, {$1,$2,$3,$4}
eor $8, $8, T0
sub TABLE, TABLE, #3072
eor $5, $5, $1
eor $6, $6, $2
eor $7, $7, $3
eor $8, $8, $4
>)
define(<AES_DECRYPT_ROUND>, <
uxtb T0, $1
ldr $5, [TABLE, T0, lsl #2]
uxtb T0, $2
ldr $6, [TABLE, T0, lsl #2]
uxtb T0, $3
ldr $7, [TABLE, T0, lsl #2]
uxtb T0, $4
ldr $8, [TABLE, T0, lsl #2]
uxtb T0, $4, ror #8
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $1, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $2, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $3, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $3, ror #16
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $4, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $1, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $2, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $2, ror #24
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $3, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $4, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $1, ror #24
ldr T0, [TABLE, T0, lsl #2]
ldm $9!, {$1,$2,$3,$4}
eor $8, $8, T0
sub TABLE, TABLE, #3072
eor $5, $5, $1
eor $6, $6, $2
eor $7, $7, $3
eor $8, $8, $4
>)
C AES_FINAL_ROUND(a,b,c,d,key,res)
define(<AES_FINAL_ROUND>, <
uxtb T0, $1
ldrb $6, [TABLE, T0]
uxtb T0, $2, ror #8
ldrb T0, [TABLE, T0]
eor $6, $6, T0, lsl #8
uxtb T0, $3, ror #16
ldrb T0, [TABLE, T0]
eor $6, $6, T0, lsl #16
uxtb T0, $4, ror #24
ldrb T0, [TABLE, T0]
eor $6, $6, T0, lsl #24
ldr T0, [$5], #+4
eor $6, T0
>)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment