Commit 38d507fa authored by Niels Möller's avatar Niels Möller

ARM assembly for umac_nh_n.

parent 83ce4b71
2013-04-15 Niels Möller <nisse@lysator.liu.se>
* armv7/umac-nh-n.asm: New file. 2.0-2.3 time speedup.
* testsuite/umac-test.c (test_align): Fixed memory leak.
2013-04-12 Niels Möller <nisse@lysator.liu.se>
......
C nettle, low-level cryptographics library
C
C Copyright (C) 2013 Niels Möller
C
C The nettle library is free software; you can redistribute it and/or modify
C it under the terms of the GNU Lesser General Public License as published by
C the Free Software Foundation; either version 2.1 of the License, or (at your
C option) any later version.
C
C The nettle library is distributed in the hope that it will be useful, but
C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
C License for more details.
C
C You should have received a copy of the GNU Lesser General Public License
C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
.file "umac-nh.asm"
.fpu neon
define(<OUT>, <r0>)
define(<ITERS>, <r1>)
define(<KEY>, <r2>)
define(<LENGTH>, <r3>)
define(<MSG>, <r12>)
define(<SHIFT>, <r14>)
define(<QA>, <q0>)
define(<QB>, <q1>)
define(<QY0>, <q3>) C Accumulates for the first two operations.
define(<DM>, <d4>)
define(<QY1>, <q4>) C Used for 3 and 4 iterations.
define(<QC>, <q5>)
define(<QD>, <q6>)
define(<QLEFT>, <q8>)
define(<QRIGHT>, <q9>)
define(<QT0>, <q10>)
define(<QT1>, <q11>)
define(<QT2>, <q12>)
define(<QK0>, <q13>)
define(<QK1>, <q14>)
define(<QK2>, <q15>)
C FIXME: Try permuting subkeys using vld4, vzip or similar.
.text
.align 3
PROLOGUE(_nettle_umac_nh_n)
ldr MSG, [sp]
str lr, [sp, #-4]!
C Setup for 64-bit aligned reads
ands SHIFT, MSG, #7
and MSG, MSG, #-8
vld1.8 {DM}, [MSG :64]
addne MSG, MSG, #8
addeq SHIFT, SHIFT, #8
C FIXME: Combine as rsb ?
lsl SHIFT, SHIFT, #3
neg SHIFT, SHIFT
C Right shift in QRIGHT (both halves)
vmov.i32 D0REG(QRIGHT)[0], SHIFT
vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
add SHIFT, SHIFT, #64
vmov.i32 D0REG(QLEFT)[0], SHIFT
vmov.32 D1REG(QLEFT), D0REG(QLEFT)
cmp r1, #3
vmov.i64 QY0, #0
vshl.u64 DM, DM, D0REG(QRIGHT)
bcc .Lnh2
beq .Lnh3
.Lnh4:
C Permute key words, so we in each iteration have them in order
C
C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
C
C Also arrange the message words, so we get them as
C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
C
C Then, accumulate Y0 (first two "iters") using
C
C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
C
C Next iteration is then
C
C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7)
C Y1 += (M4+P6) * (M6+P8) + (M5+P7) * (M7 + P11)
C
C So we can reuse P4, P5, P6, P7 from the previous iteration.
C How to for in registers? We need 4 Q regs for P0-P3, and one
C more for the last read key. We need at least two regiters
C for the message (QA and QB, more if we want to expand only
C once). For the Y0 update, we can let the factors overwrite
C P0-P3, and for the Y1 update, we can overwrite M0-M3.
vpush {q4,q5,q6}
vld1.32 {QK0,QK1}, [KEY]!
vld1.32 {QK2}, [KEY]!
vmov QT0, QK1
vmov QT1, QK2
C Permute keys. QK2 us untouched, permuted subkeys put in QK0,QK1,QT0,QT1
vtrn.32 QK0, QK1 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QK1) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vtrn.32 QT0, QT1 C Gives us [4,8,6,10] and [5 ,9,7,11]
vswp D1REG(QT0), D0REG(QT1) C Gives us [4,8,5, 9] and [6,10,7,11]
vmov.i64 QY1, #0
.Loop4:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QC, QA, QRIGHT
vshl.u64 QD, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QC)
veor D0REG(QB), D0REG(QB), D1REG(QC)
veor D1REG(QB), D1REG(QB), D0REG(QD)
vmov DM, D1REG(QD)
C Explode message (too bad there's no vadd with scalar)
vdup.32 D1REG(QD), D1REG(QB)[1]
vdup.32 D0REG(QD), D1REG(QB)[0]
vdup.32 D1REG(QC), D0REG(QB)[1]
vdup.32 D0REG(QC), D0REG(QB)[0]
vdup.32 D1REG(QB), D1REG(QA)[1]
vdup.32 D0REG(QB), D1REG(QA)[0]
vdup.32 D1REG(QA), D0REG(QA)[1]
vdup.32 D0REG(QA), D0REG(QA)[0]
vadd.i32 QK0, QK0, QA
vadd.i32 QK1, QK1, QB
vadd.i32 QT0, QT0, QC
vadd.i32 QT1, QT1, QD
vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
C Next 4 subkeys
vld1.32 {QT0,QT1}, [KEY]!
vmov QK0, QK2
vmov QK1, QT0
vmov QK2, QT1 C Save
vtrn.32 QK0, QK1 C Gives us [8,12,10,14] and [9,13,11,15]
vswp D1REG(QK0), D0REG(QK1) C Gives us [8,12,9,13] and [10,14,11,15]
vtrn.32 QT0, QT1 C Gives us [12,16,14,18] and [13,17,15,19]
vswp D1REG(QT0), D0REG(QT1) C Gives us [12,16,13,17] and [14,18,15,19]
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
vadd.i32 QC, QC, QT0
vadd.i32 QD, QD, QT1
subs LENGTH, LENGTH, #32
vmlal.u32 QY1, D0REG(QA), D0REG(QC)
vmlal.u32 QY1, D1REG(QA), D1REG(QC)
vmlal.u32 QY1, D0REG(QB), D0REG(QD)
vmlal.u32 QY1, D1REG(QB), D1REG(QD)
bhi .Loop4
vst1.64 {QY0, QY1}, [OUT]
vpop {q4,q5,q6}
ldr pc, [sp], #+4
.Lnh3:
vpush {q4}
vld1.32 {QK0,QK1}, [KEY]!
vmov.i64 QY1, #0
.Loop3:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.32 {QK2}, [KEY]!
C Construct factors, with low half corresponding to first iteration,
C and high half corresponding to the second iteration.
vmov QT0, QK1
vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vdup.32 D0REG(QT1), D0REG(QA)[0]
vdup.32 D1REG(QT1), D0REG(QA)[1]
vadd.i32 QT1, QT1, QK0
vmov QK0, QK2 C Save for next iteration
vtrn.32 QK1, QK2 C Gives us [4, 8, 2, 1] and [1, 5, 3, 7]
vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 1, 5] and [2, 1, 3, 7]
vdup.32 D0REG(QT2), D0REG(QB)[0]
vdup.32 D1REG(QT2), D0REG(QB)[1]
vadd.i32 QK1, QK1, QT2
vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
vdup.32 D0REG(QT1), D1REG(QA)[0]
vdup.32 D1REG(QT1), D1REG(QA)[1]
vadd.i32 QT0, QT0, QT1
vdup.32 D0REG(QT1), D1REG(QB)[0]
vdup.32 D1REG(QT1), D1REG(QB)[1]
vadd.i32 QK2, QK2, QT1
vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
vld1.32 {QK1}, [KEY]!
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
subs LENGTH, LENGTH, #32
vmlal.u32 QY1, D0REG(QA), D0REG(QB)
vmlal.u32 QY1, D1REG(QA), D1REG(QB)
bhi .Loop3
vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
vpop {q4}
ldr pc, [sp], #+4
.Lnh2:
vld1.32 {QK0}, [KEY]!
.Loop2:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.32 {QK1,QK2}, [KEY]!
C Construct factors, with low half corresponding to first iteration,
C and high half corresponding to the second iteration.
vmov QT0, QK1
vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vdup.32 D0REG(QT1), D0REG(QA)[0]
vdup.32 D1REG(QT1), D0REG(QA)[1]
vadd.i32 QT1, QT1, QK0
vmov QK0, QK2 C Save for next iteration
vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
vdup.32 D0REG(QT2), D0REG(QB)[0]
vdup.32 D1REG(QT2), D0REG(QB)[1]
vadd.i32 QK1, QK1, QT2
vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
vdup.32 D0REG(QT1), D1REG(QA)[0]
vdup.32 D1REG(QT1), D1REG(QA)[1]
vadd.i32 QT0, QT0, QT1
vdup.32 D0REG(QT1), D1REG(QB)[0]
vdup.32 D1REG(QT1), D1REG(QB)[1]
vadd.i32 QK2, QK2, QT1
subs LENGTH, LENGTH, #32
vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
bhi .Loop2
vst1.64 {QY0}, [OUT]
.Lend:
ldr pc, [sp], #+4
EPILOGUE(_nettle_umac_nh_n)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment