Commit fc5801a7 authored by Niels Möller

Setup for using ARM assembly. Initial memxor for ARM.

parent d56b4410
2013-02-06 Niels Möller <nisse@lysator.liu.se>
* armv7/README: New file.
* armv7/machine.m4: New (empty) file.
* armv7/memxor.asm: Initial assembly implementation.
* config.m4.in: Substitute ASM_TYPE_PROGBITS as TYPE_PROGBITS.
* config.make.in: Added .s to the suffix list.
@@ -17,6 +21,7 @@
* configure.ac (ASM_TYPE_PROGBITS): New substituted variable, set in the same way
as ASM_TYPE_FUNCTION.
(ASM_MARK_NOEXEC_STACK): Use TYPE_PROGBITS.
(asm_path): Set up asm_path for armv7.
* asm.m4: Use changecom to disable m4 quoting. Use divert to
suppress output.
Currently, code in this directory is written for ARM Cortex-A9.

ABI: gnueabi(hf) (not depending on the floating-point conventions)
Registers       May be          Argument
                clobbered       number

r0              Y               1
r1              Y               2
r2              Y               3
r3              Y               4
r4              N
r5              N
r6              N
r7              N
r8              N
r9 (sl)
r10             N
r11             N
r12 (ip)        Y
r13 (sp)
r14 (lr)
r15 (pc)

q0 (d0, d1)     Y               1 (for "hf" ABI)
q1 (d2, d3)     Y               2
q2 (d4, d5)     Y               3
q3 (d6, d7)     Y               4
q4 (d8, d9)     N
q5 (d10, d11)   N
q6 (d12, d13)   N
q7 (d14, d15)   N
q8 (d16, d17)   Y
q9 (d18, d19)   Y
q10 (d20, d21)  Y
q11 (d22, d23)  Y
q12 (d24, d25)  Y
q13 (d26, d27)  Y
q14 (d28, d29)  Y
q15 (d30, d31)  Y
C -*- mode: asm; asm-comment-char: ?C; -*-
C nettle, low-level cryptographic library
C
C Copyright (C) 2013, Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
C Register usage:
define(<DST>, <r0>)
define(<SRC>, <r1>)
define(<N>, <r2>)
define(<CNT>, <r6>)
define(<TNC>, <r7>)
.syntax unified
.file "memxor.asm"
.text
.arm
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
.align 2
PROLOGUE(memxor)
cmp N, #0
beq .Lmemxor_ret
push {r4, r5, r6, r7}
cmp N, #7
bcs .Lmemxor_large
C Simple byte loop
.Lmemxor_bytes:
ldrb r3, [SRC], #+1
ldrb r4, [DST]
eor r3, r4
strb r3, [DST], #+1
subs N, #1
bne .Lmemxor_bytes
.Lmemxor_done:
pop {r4,r5,r6,r7}
.Lmemxor_ret:
bx lr
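C Byte-wise loop, used by .Lmemxor_large below to align DST
C to a word boundary.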
.Lmemxor_align_loop:
ldrb r3, [SRC], #+1
ldrb r4, [DST]
eor r3, r4
strb r3, [DST], #+1
sub N, #1
.Lmemxor_large:
tst DST, #3
bne .Lmemxor_align_loop
C We have at least 4 bytes left to do here.
sub N, #4
ands CNT, SRC, #3
beq .Lmemxor_same
C Different alignment case.
C v original SRC
C +-------+------+
C |SRC |SRC+4 |
C +---+---+------+
C |DST |
C +-------+
C
C With little-endian, we need to do
C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
lsl CNT, #3
bic SRC, #3
rsb TNC, CNT, #32
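C CNT is now the bit offset of SRC within its word, TNC = 32 - CNT,
C and SRC is rounded down to a word boundary.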
ldr r4, [SRC], #+4
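C Select the loop entry point: if bit 2 of N is clear, the IT block
C primes r5 and enters at .Lmemxor_odd; otherwise it reduces N and
C falls into the two-word loop.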
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor_odd
.Lmemxor_word_loop:
ldr r5, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r4, lsr CNT
eor r3, r3, r5, lsl TNC
str r3, [DST], #+4
.Lmemxor_odd:
ldr r4, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r5, lsr CNT
eor r3, r3, r4, lsl TNC
str r3, [DST], #+4
subs N, #8
bcs .Lmemxor_word_loop
adds N, #8
beq .Lmemxor_done
C We have TNC/8 left-over bytes in r4, high end
lsr r4, CNT
ldr r3, [DST]
eor r3, r4
C Store bytes, one by one.
.Lmemxor_leftover:
strb r3, [DST], #+1
subs N, #1
beq .Lmemxor_done
subs TNC, #8
lsr r3, #8
bne .Lmemxor_leftover
b .Lmemxor_bytes
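C SRC and DST are both word-aligned.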
.Lmemxor_same:
ldr r3, [SRC], #+4
ldr r4, [DST]
eor r3, r4
str r3, [DST], #+4
subs N, #4
bcs .Lmemxor_same
adds N, #4
beq .Lmemxor_done
b .Lmemxor_bytes
EPILOGUE(memxor)
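For reference, a hedged C sketch of the word loop above (illustrative only, not
part of the commit; the helper name is hypothetical). It assumes a little-endian
target, a word-aligned dst and an unaligned src, and, like the assembly, it reads
src in whole aligned words, so it may touch a few bytes past src + n within the
final word.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical sketch of .Lmemxor_word_loop: little-endian, dst
   word-aligned, src NOT word-aligned (cnt != 0; the aligned case is
   the plain loop at .Lmemxor_same). */
static void
memxor_unaligned_sketch(uint8_t *dst, const uint8_t *src, size_t n)
{
  unsigned cnt = 8 * ((uintptr_t) src & 3);   /* bit offset: 8, 16 or 24 */
  unsigned tnc = 32 - cnt;
  const uint32_t *sw = (const uint32_t *)((uintptr_t) src & ~(uintptr_t) 3);
  uint32_t *dw = (uint32_t *) dst;
  uint32_t lo = *sw++;        /* aligned word containing src[0] */

  while (n >= 4)
    {
      uint32_t hi = *sw++;
      /* DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC) */
      *dw++ ^= (lo >> cnt) ^ (hi << tnc);
      lo = hi;
      n -= 4;
    }
  /* The remaining n < 4 bytes are handled one at a time. */
}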
define(<DST>, <r0>)
define(<AP>, <r1>)
define(<BP>, <r2>)
define(<N>, <r3>)
undefine(<CNT>)
undefine(<TNC>)
C Temporaries r4-r7
define(<ACNT>, <r8>)
define(<ATNC>, <r10>)
define(<BCNT>, <r11>)
define(<BTNC>, <r12>)
C memxor3(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
.align 2
PROLOGUE(memxor3)
cmp N, #0
beq .Lmemxor3_ret
push {r4,r5,r6,r7,r8,r10,r11}
cmp N, #7
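C memxor3 works from the end of the areas, so advance all three
C pointers past the data. The adds do not touch the flags, so the
C bcs below still tests the cmp against 7.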
add AP, N
add BP, N
add DST, N
bcs .Lmemxor3_large
C Simple byte loop
.Lmemxor3_bytes:
ldrb r4, [AP, #-1]!
ldrb r5, [BP, #-1]!
eor r4, r5
strb r4, [DST, #-1]!
subs N, #1
bne .Lmemxor3_bytes
.Lmemxor3_done:
pop {r4,r5,r6,r7,r8,r10,r11}
.Lmemxor3_ret:
bx lr
.Lmemxor3_align_loop:
ldrb r4, [AP, #-1]!
ldrb r5, [BP, #-1]!
eor r5, r4
strb r5, [DST, #-1]!
sub N, #1
.Lmemxor3_large:
tst DST, #3
bne .Lmemxor3_align_loop
C We have at least 4 bytes left to do here.
sub N, #4
ands ACNT, AP, #3
lsl ACNT, #3
beq .Lmemxor3_a_aligned
ands BCNT, BP, #3
lsl BCNT, #3
bne .Lmemxor3_uu
C A is unaligned but B is aligned; swap them, so that AP is
C always the aligned pointer.
mov r4, AP
mov AP, BP
mov BP, r4
mov BCNT, ACNT
.Lmemxor3_au:
C FIXME: Switch roles of A and B
C AP is aligned, BP is not
C v original SRC
C +-------+------+
C |SRC-4 |SRC |
C +---+---+------+
C |DST-4 |
C +-------+
C
C With little-endian, we need to do
C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
rsb BTNC, BCNT, #32
bic BP, #3
ldr r4, [BP]
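C Same loop entry selection as in memxor above.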
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor3_au_odd
.Lmemxor3_au_loop:
ldr r5, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r4, lsl BTNC
eor r6, r6, r5, lsr BCNT
str r6, [DST, #-4]!
.Lmemxor3_au_odd:
ldr r4, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r5, lsl BTNC
eor r6, r6, r4, lsr BCNT
str r6, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_au_loop
adds N, #8
beq .Lmemxor3_done
C Leftover bytes in r4, low end
ldr r5, [AP, #-4]
C FIXME: Do this with a single shift/rotate?
lsr r5, BTNC
eor r4, r5
ror r4, BCNT
.Lmemxor3_au_leftover:
C Store a byte at a time
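C (ror #24 moves the most significant byte down to bits 0-7)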
ror r4, #24
strb r4, [DST, #-1]!
subs N, #1
beq .Lmemxor3_done
subs BCNT, #8
sub AP, #1
bne .Lmemxor3_au_leftover
b .Lmemxor3_bytes
.Lmemxor3_a_aligned:
ands BCNT, BP, #3
lsl BCNT, #3
bne .Lmemxor3_au
C a, b and dst all have the same alignment.
.Lmemxor3_aligned_word_loop:
ldr r4, [AP, #-4]!
ldr r5, [BP, #-4]!
eor r4, r5
str r4, [DST, #-4]!
subs N, #4
bcs .Lmemxor3_aligned_word_loop
adds N, #4
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_uu:
cmp ACNT, BCNT
bic AP, #3
bic BP, #3
rsb ATNC, ACNT, #32
bne .Lmemxor3_uud
C AP and BP are unaligned in the same way
ldr r4, [AP]
ldr r6, [BP]
eor r4, r6
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor3_uu_odd
.Lmemxor3_uu_loop:
ldr r5, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r5, r6
lsl r4, ATNC
eor r4, r4, r5, lsr ACNT
str r4, [DST, #-4]!
.Lmemxor3_uu_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r4, r6
lsl r5, ATNC
eor r5, r5, r4, lsr ACNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uu_loop
adds N, #8
beq .Lmemxor3_done
C Leftover bytes in r4, low end
ror r4, ACNT
.Lmemxor3_uu_leftover:
ror r4, #24
strb r4, [DST, #-1]!
subs N, #1
beq .Lmemxor3_done
subs ACNT, #8
bne .Lmemxor3_uu_leftover
b .Lmemxor3_bytes
.Lmemxor3_uud:
C Both AP and BP unaligned, and in different ways
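C Each destination word combines four shifted pieces, two from
C A and two from B; hence the lsl and three eors per word below.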
rsb BTNC, BCNT, #32
ldr r4, [AP]
ldr r6, [BP]
tst N, #4
ittet eq
moveq r5, r4
moveq r7, r6
subne N, #4
beq .Lmemxor3_uud_odd
.Lmemxor3_uud_loop:
ldr r5, [AP, #-4]!
ldr r7, [BP, #-4]!
lsl r4, ATNC
eor r4, r4, r6, lsl BTNC
eor r4, r4, r5, lsr ACNT
eor r4, r4, r7, lsr BCNT
str r4, [DST, #-4]!
.Lmemxor3_uud_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
lsl r5, ATNC
eor r5, r5, r7, lsl BTNC
eor r5, r5, r4, lsr ACNT
eor r5, r5, r6, lsr BCNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uud_loop
adds N, #8
beq .Lmemxor3_done
C FIXME: More clever left-over handling? For now, just adjust pointers.
add AP, AP, ACNT, lsr #3
add BP, BP, BCNT, lsr #3
b .Lmemxor3_bytes
EPILOGUE(memxor3)
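Similarly, a hedged C sketch of memxor3's overall structure (illustrative only,
not part of the commit; the helper name is hypothetical). The real routine above
additionally keeps word-wise operation when a or b is unaligned, using the same
shifting technique as memxor; this sketch simply falls back to bytes in that case.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical sketch: walk backwards from the end of all three
   areas, word-wise while dst, a and b are all word-aligned. */
static void
memxor3_sketch(uint8_t *dst, const uint8_t *a, const uint8_t *b, size_t n)
{
  dst += n; a += n; b += n;   /* point just past the end */

  while (n >= 4
         && !(((uintptr_t) dst | (uintptr_t) a | (uintptr_t) b) & 3))
    {
      dst -= 4; a -= 4; b -= 4; n -= 4;
      *(uint32_t *) dst = *(const uint32_t *) a ^ *(const uint32_t *) b;
    }
  while (n--)
    *--dst = *--a ^ *--b;
}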
@@ -229,6 +229,9 @@ if test "x$enable_assembler" = xyes ; then
asm_path=sparc32
fi
;;
armv7l*)
asm_path=armv7
;;
*)
enable_assembler=no
;;
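The armv7l pattern matches the host CPU string reported for 32-bit ARMv7
GNU/Linux systems; other ARM variants are not matched here and fall through
to the generic C implementations.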