Compare revisions
Showing 2869 additions and 0 deletions
C arm/fat/aes-encrypt-internal.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
define(`fat_transform', `$1_arm')
include_src(`arm/aes-encrypt-internal.asm')
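C The fat_transform define above rewrites the symbol that PROLOGUE
C emits (here adding an `_arm' suffix), so this generic-ARM variant can
C be linked next to other variants of the same routine and selected at
C run time.  As a rough, hypothetical C sketch of that kind of runtime
C dispatch (names invented for illustration, not nettle's actual fat
C machinery):
C
C   typedef void crypt_func(void *ctx, size_t length,
C                           uint8_t *dst, const uint8_t *src);
C   extern crypt_func _aes_encrypt_arm, _aes_encrypt_neon;
C   static crypt_func *aes_encrypt_vec;
C
C   static void fat_init(void)
C   {
C     /* probe CPU features once, then call through the pointer */
C     aes_encrypt_vec = cpu_has_neon () ? &_aes_encrypt_neon
C                                       : &_aes_encrypt_arm;
C   }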
C arm/fat/chacha-3core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_fat_chacha_3core) picked up by configure
include_src(`arm/neon/chacha-3core.asm')
C arm/fat/salsa20-2core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_fat_salsa20_2core) picked up by configure
include_src(`arm/neon/salsa20-2core.asm')
C arm/fat/sha1-compress-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(nettle_sha1_compress) picked up by configure
define(`fat_transform', `_$1_armv6')
include_src(`arm/v6/sha1-compress.asm')
C arm/fat/sha256-compress-n-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_sha256_compress_n) picked up by configure
define(`fat_transform', `$1_armv6')
include_src(`arm/v6/sha256-compress-n.asm')
C arm/fat/sha3-permute-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_sha3_permute) picked up by configure
define(`fat_transform', `_$1_neon')
include_src(`arm/neon/sha3-permute.asm')
C arm/fat/sha512-compress-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_sha512_compress) picked up by configure
define(`fat_transform', `$1_neon')
include_src(`arm/neon/sha512-compress.asm')
C arm/fat/umac-nh-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_umac_nh) picked up by configure
define(`fat_transform', `$1_neon')
include_src(`arm/neon/umac-nh.asm')
C arm/fat/umac-nh-n-2.asm
ifelse(`
Copyright (C) 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(_nettle_umac_nh_n) picked up by configure
define(`fat_transform', `$1_neon')
include_src(`arm/neon/umac-nh-n.asm')
define(`QREG', `ifelse(
$1, d0, q0,
$1, d2, q1,
$1, d4, q2,
$1, d6, q3,
$1, d8, q4,
$1, d10, q5,
$1, d12, q6,
$1, d14, q7,
$1, d16, q8,
$1, d18, q9,
$1, d20, q10,
$1, d22, q11,
$1, d24, q12,
$1, d26, q13,
$1, d28, q14,
$1, d30, q15,
`NO REGISTER')')dnl
define(`D0REG', `ifelse(
$1, q0, d0,
$1, q1, d2,
$1, q2, d4,
$1, q3, d6,
$1, q4, d8,
$1, q5, d10,
$1, q6, d12,
$1, q7, d14,
$1, q8, d16,
$1, q9, d18,
$1, q10, d20,
$1, q11, d22,
$1, q12, d24,
$1, q13, d26,
$1, q14, d28,
$1, q15, d30,
`NO REGISTER')')dnl
define(`D1REG', `ifelse(
$1, q0, d1,
$1, q1, d3,
$1, q2, d5,
$1, q3, d7,
$1, q4, d9,
$1, q5, d11,
$1, q6, d13,
$1, q7, d15,
$1, q8, d17,
$1, q9, d19,
$1, q10, d21,
$1, q11, d23,
$1, q12, d25,
$1, q13, d27,
$1, q14, d29,
$1, q15, d31,
`NO REGISTER')')dnl
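C On 32-bit ARM NEON, quad register qN overlaps the doubleword pair
C d(2N) (low half) and d(2N+1) (high half).  The macros above map
C between the two views, e.g. QREG(d4) expands to q2, while D0REG(q2)
C and D1REG(q2) expand to d4 and d5.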
C arm/memxor.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Possible speedups:
C
C The ldm instruction can load two registers per cycle,
C if the address is two-word aligned. Or three registers in two
C cycles, regardless of alignment.
C Register usage:
define(`DST', `r0')
define(`SRC', `r1')
define(`N', `r2')
define(`CNT', `r6')
define(`TNC', `r12')
C little-endian and big-endian need to shift in different directions for
C alignment correction
define(`S0ADJ', IF_LE(`lsr', `lsl'))
define(`S1ADJ', IF_LE(`lsl', `lsr'))
.syntax unified
.file "memxor.asm"
.text
.arm
C memxor(void *dst, const void *src, size_t n)
.align 4
PROLOGUE(nettle_memxor)
cmp N, #0
beq .Lmemxor_done
cmp N, #7
bcs .Lmemxor_large
C Simple byte loop
.Lmemxor_bytes:
ldrb r3, [SRC], #+1
ldrb r12, [DST]
eor r3, r12
strb r3, [DST], #+1
subs N, #1
bne .Lmemxor_bytes
.Lmemxor_done:
bx lr
.Lmemxor_align_loop:
ldrb r3, [SRC], #+1
ldrb r12, [DST]
eor r3, r12
strb r3, [DST], #+1
sub N, #1
.Lmemxor_large:
tst DST, #3
bne .Lmemxor_align_loop
C We have at least 4 bytes left to do here.
sub N, #4
ands r3, SRC, #3
beq .Lmemxor_same
C Different alignment case.
C v original SRC
C +-------+------+
C |SRC |SRC+4 |
C +---+---+------+
C |DST |
C +-------+
C
C With little-endian, we need to do
C DST[i] ^= (SRC[i] >> CNT) ^ (SRC[i+1] << TNC)
C With big-endian, we need to do
C DST[i] ^= (SRC[i] << CNT) ^ (SRC[i+1] >> TNC)
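C As a C reference for the word loop below (little-endian case,
C assuming 0 < CNT < 32, TNC = 32 - CNT, and srcw pointing at SRC
C rounded down to word alignment):
C
C   uint32_t prev = srcw[0];
C   for (size_t i = 0; i < nwords; i++)
C     {
C       uint32_t next = srcw[i + 1];
C       dst[i] ^= (prev >> CNT) ^ (next << TNC);
C       prev = next;
C     }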
push {r4,r5,r6}
lsl CNT, r3, #3
bic SRC, #3
rsb TNC, CNT, #32
ldr r4, [SRC], #+4
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor_odd
.Lmemxor_word_loop:
ldr r5, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r4, S0ADJ CNT
eor r3, r3, r5, S1ADJ TNC
str r3, [DST], #+4
.Lmemxor_odd:
ldr r4, [SRC], #+4
ldr r3, [DST]
eor r3, r3, r5, S0ADJ CNT
eor r3, r3, r4, S1ADJ TNC
str r3, [DST], #+4
subs N, #8
bcs .Lmemxor_word_loop
adds N, #8
beq .Lmemxor_odd_done
C We have TNC/8 left-over bytes in r4, high end on LE and low end on
C BE, excess bits to be discarded by alignment adjustment at the other end.
S0ADJ r4, CNT
C now byte-aligned at low end on LE and high end on BE
ldr r3, [DST]
eor r3, r4
pop {r4,r5,r6}
C Store bytes, one by one.
.Lmemxor_leftover:
C bring uppermost byte down for saving while preserving lower ones
IF_BE(` ror r3, #24')
strb r3, [DST], #+1
subs N, #1
beq .Lmemxor_done
subs TNC, #8
C bring down next byte, no need to preserve
IF_LE(` lsr r3, #8')
bne .Lmemxor_leftover
b .Lmemxor_bytes
.Lmemxor_odd_done:
pop {r4,r5,r6}
bx lr
.Lmemxor_same:
push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
subs N, #8
bcc .Lmemxor_same_end
ldmia SRC!, {r3, r4, r5}
C Keep address for loads in r14
mov r14, DST
ldmia r14!, {r6, r7, r8}
subs N, #12
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
bcc .Lmemxor_same_final_store
subs N, #12
ldmia r14!, {r6, r7, r8}
bcc .Lmemxor_same_wind_down
C 6 cycles per iteration, 0.50 cycles/byte. For this speed,
C loop starts at offset 0x11c in the object file.
.Lmemxor_same_loop:
C r10-r12 contains values to be stored at DST
C r6-r8 contains values read from r14, in advance
ldmia SRC!, {r3, r4, r5}
subs N, #12
stmia DST!, {r10, r11, r12}
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
ldmia r14!, {r6, r7, r8}
bcs .Lmemxor_same_loop
.Lmemxor_same_wind_down:
C Wind down code
ldmia SRC!, {r3, r4, r5}
stmia DST!, {r10, r11, r12}
eor r10, r3, r6
eor r11, r4, r7
eor r12, r5, r8
.Lmemxor_same_final_store:
stmia DST!, {r10, r11, r12}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds N, #4
bcc .Lmemxor_same_lt_8
C Do 8 bytes more, leftover is in N
ldmia SRC!, {r3, r4}
ldmia DST, {r6, r7}
eor r3, r6
eor r4, r7
stmia DST!, {r3, r4}
pop {r4,r5,r6,r7,r8,r10,r11,r14}
beq .Lmemxor_done
b .Lmemxor_bytes
.Lmemxor_same_lt_8:
pop {r4,r5,r6,r7,r8,r10,r11,r14}
adds N, #4
bcc .Lmemxor_same_lt_4
ldr r3, [SRC], #+4
ldr r12, [DST]
eor r3, r12
str r3, [DST], #+4
beq .Lmemxor_done
b .Lmemxor_bytes
.Lmemxor_same_lt_4:
adds N, #4
beq .Lmemxor_done
b .Lmemxor_bytes
EPILOGUE(nettle_memxor)
C arm/memxor3.asm
ifelse(`
Copyright (C) 2013, 2015 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Possible speedups:
C
C The ldm instruction can load two registers per cycle,
C if the address is two-word aligned. Or three registers in two
C cycles, regardless of alignment.
C Register usage:
define(`DST', `r0')
define(`AP', `r1')
define(`BP', `r2')
define(`N', `r3')
C Temporaries r4-r7
define(`ACNT', `r8')
define(`ATNC', `r10')
define(`BCNT', `r11')
define(`BTNC', `r12')
C little-endian and big-endian need to shift in different directions for
C alignment correction
define(`S0ADJ', IF_LE(`lsr', `lsl'))
define(`S1ADJ', IF_LE(`lsl', `lsr'))
.syntax unified
.file "memxor3.asm"
.text
.arm
C memxor3(void *dst, const void *a, const void *b, size_t n)
.align 2
PROLOGUE(nettle_memxor3)
cmp N, #0
beq .Lmemxor3_ret
push {r4,r5,r6,r7,r8,r10,r11}
cmp N, #7
add AP, N
add BP, N
add DST, N
bcs .Lmemxor3_large
C Simple byte loop
.Lmemxor3_bytes:
ldrb r4, [AP, #-1]!
ldrb r5, [BP, #-1]!
eor r4, r5
strb r4, [DST, #-1]!
subs N, #1
bne .Lmemxor3_bytes
.Lmemxor3_done:
pop {r4,r5,r6,r7,r8,r10,r11}
.Lmemxor3_ret:
bx lr
.Lmemxor3_align_loop:
ldrb r4, [AP, #-1]!
ldrb r5, [BP, #-1]!
eor r5, r4
strb r5, [DST, #-1]!
sub N, #1
.Lmemxor3_large:
tst DST, #3
bne .Lmemxor3_align_loop
C We have at least 4 bytes left to do here.
sub N, #4
ands ACNT, AP, #3
lsl ACNT, #3
beq .Lmemxor3_a_aligned
ands BCNT, BP, #3
lsl BCNT, #3
bne .Lmemxor3_uu
C Swap
mov r4, AP
mov AP, BP
mov BP, r4
.Lmemxor3_au:
C NOTE: We have the relevant shift count in ACNT, not BCNT
C AP is aligned, BP is not
C v original SRC
C +-------+------+
C |SRC-4 |SRC |
C +---+---+------+
C |DST-4 |
C +-------+
C
C With little-endian, we need to do
C DST[i-1] ^= (SRC[i-1] >> CNT) ^ (SRC[i] << TNC)
C With big-endian, we need to do
C DST[i-1] ^= (SRC[i-1] << CNT) ^ (SRC[i] >> TNC)
rsb ATNC, ACNT, #32
bic BP, #3
ldr r4, [BP]
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor3_au_odd
.Lmemxor3_au_loop:
ldr r5, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r4, S1ADJ ATNC
eor r6, r6, r5, S0ADJ ACNT
str r6, [DST, #-4]!
.Lmemxor3_au_odd:
ldr r4, [BP, #-4]!
ldr r6, [AP, #-4]!
eor r6, r6, r5, S1ADJ ATNC
eor r6, r6, r4, S0ADJ ACNT
str r6, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_au_loop
adds N, #8
beq .Lmemxor3_done
C Leftover bytes in r4, low end on LE and high end on BE before
C preparatory alignment correction
ldr r5, [AP, #-4]
eor r4, r5, r4, S1ADJ ATNC
C now byte-aligned in high end on LE and low end on BE because we're
C working downwards in saving the very first bytes of the buffer
.Lmemxor3_au_leftover:
C Store a byte at a time
C bring uppermost byte down for saving while preserving lower ones
IF_LE(` ror r4, #24')
strb r4, [DST, #-1]!
subs N, #1
beq .Lmemxor3_done
subs ACNT, #8
C bring down next byte, no need to preserve
IF_BE(` lsr r4, #8')
sub AP, #1
bne .Lmemxor3_au_leftover
b .Lmemxor3_bytes
.Lmemxor3_a_aligned:
ands ACNT, BP, #3
lsl ACNT, #3
bne .Lmemxor3_au
C a, b and dst all have the same alignment.
subs N, #8
bcc .Lmemxor3_aligned_word_end
C This loop runs at 8 cycles per iteration. It has been
C observed running at only 7 cycles, for this speed, the loop
C started at offset 0x2ac in the object file.
C FIXME: consider software pipelining, similarly to the memxor
C loop.
.Lmemxor3_aligned_word_loop:
ldmdb AP!, {r4,r5,r6}
ldmdb BP!, {r7,r8,r10}
subs N, #12
eor r4, r7
eor r5, r8
eor r6, r10
stmdb DST!, {r4, r5,r6}
bcs .Lmemxor3_aligned_word_loop
.Lmemxor3_aligned_word_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
adds N, #4
bcc .Lmemxor3_aligned_lt_8
C Do 8 bytes more, leftover is in N
ldmdb AP!, {r4, r5}
ldmdb BP!, {r6, r7}
eor r4, r6
eor r5, r7
stmdb DST!, {r4,r5}
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_aligned_lt_8:
adds N, #4
bcc .Lmemxor3_aligned_lt_4
ldr r4, [AP,#-4]!
ldr r5, [BP,#-4]!
eor r4, r5
str r4, [DST,#-4]!
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_aligned_lt_4:
adds N, #4
beq .Lmemxor3_done
b .Lmemxor3_bytes
.Lmemxor3_uu:
cmp ACNT, BCNT
bic AP, #3
bic BP, #3
rsb ATNC, ACNT, #32
bne .Lmemxor3_uud
C AP and BP are unaligned in the same way
ldr r4, [AP]
ldr r6, [BP]
eor r4, r6
tst N, #4
itet eq
moveq r5, r4
subne N, #4
beq .Lmemxor3_uu_odd
.Lmemxor3_uu_loop:
ldr r5, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r5, r6
S1ADJ r4, ATNC
eor r4, r4, r5, S0ADJ ACNT
str r4, [DST, #-4]!
.Lmemxor3_uu_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
eor r4, r6
S1ADJ r5, ATNC
eor r5, r5, r4, S0ADJ ACNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uu_loop
adds N, #8
beq .Lmemxor3_done
C Leftover bytes in r4, low end on LE and high end on BE before
C preparatory alignment correction
IF_LE(` ror r4, ACNT')
IF_BE(` ror r4, ATNC')
C now byte-aligned in high end on LE and low end on BE because we're
C working downwards in saving the very first bytes of the buffer
.Lmemxor3_uu_leftover:
C bring uppermost byte down for saving while preserving lower ones
IF_LE(` ror r4, #24')
strb r4, [DST, #-1]!
subs N, #1
beq .Lmemxor3_done
subs ACNT, #8
C bring down next byte, no need to preserve
IF_BE(` lsr r4, #8')
bne .Lmemxor3_uu_leftover
b .Lmemxor3_bytes
.Lmemxor3_uud:
C Both AP and BP unaligned, and in different ways
rsb BTNC, BCNT, #32
ldr r4, [AP]
ldr r6, [BP]
tst N, #4
ittet eq
moveq r5, r4
moveq r7, r6
subne N, #4
beq .Lmemxor3_uud_odd
.Lmemxor3_uud_loop:
ldr r5, [AP, #-4]!
ldr r7, [BP, #-4]!
S1ADJ r4, ATNC
eor r4, r4, r6, S1ADJ BTNC
eor r4, r4, r5, S0ADJ ACNT
eor r4, r4, r7, S0ADJ BCNT
str r4, [DST, #-4]!
.Lmemxor3_uud_odd:
ldr r4, [AP, #-4]!
ldr r6, [BP, #-4]!
S1ADJ r5, ATNC
eor r5, r5, r7, S1ADJ BTNC
eor r5, r5, r4, S0ADJ ACNT
eor r5, r5, r6, S0ADJ BCNT
str r5, [DST, #-4]!
subs N, #8
bcs .Lmemxor3_uud_loop
adds N, #8
beq .Lmemxor3_done
C FIXME: More clever left-over handling? For now, just adjust pointers.
add AP, AP, ACNT, lsr #3
add BP, BP, BCNT, lsr #3
b .Lmemxor3_bytes
EPILOGUE(nettle_memxor3)
C arm/neon/chacha-3core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "chacha-3core.asm"
.fpu neon
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
define(`SRCp32', `r3')
C State, X, Y and Z representing consecutive blocks
define(`X0', `q0')
define(`X1', `q1')
define(`X2', `q2')
define(`X3', `q3')
define(`Y0', `q8')
define(`Y1', `q9')
define(`Y2', `q10')
define(`Y3', `q11')
define(`Z0', `q12')
define(`Z1', `q13')
define(`Z2', `q14')
define(`Z3', `q15')
define(`T0', `q4')
define(`T1', `q5')
define(`T2', `q6')
define(`T3', `q7')
.text
.align 4
.Lcount1:
.int 1,0,0,0
C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
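C The loop below runs the standard ChaCha quarter-round on whole rows
C of the 4x4 state, for three blocks at once; the vshl/vsri pairs and
C vrev32.16 implement the rotations.  Per 32-bit word, in plain C:
C
C   #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
C   a += b; d ^= a; d = ROTL32(d, 16);
C   c += d; b ^= c; b = ROTL32(b, 12);
C   a += b; d ^= a; d = ROTL32(d, 8);
C   c += d; b ^= c; b = ROTL32(b, 7);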
PROLOGUE(_nettle_chacha_3core)
C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
add SRCp32, SRC, #32
vld1.32 {X0,X1}, [SRC]
vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
vld1.32 {Z3}, [r12]
vadd.i64 Y3, X3, Z3 C Increment 64-bit counter
vadd.i64 Z3, Y3, Z3
.Lshared_entry:
vmov Y0, X0
vmov Z0, X0
vmov Y1, X1
vmov Z1, X1
vmov Y2, X2
vmov Z2, X2
C Save initial values for the words including the counters.
vmov T2, Y3
vmov T3, Z3
.Loop:
C Interleave three blocks. Note that with this scheduling,
C only two temporaries, T0 and T1, are needed.
vadd.i32 X0, X0, X1
veor X3, X3, X0
vadd.i32 Y0, Y0, Y1
vrev32.16 X3, X3 C lrot 16
veor Y3, Y3, Y0
vadd.i32 Z0, Z0, Z1
vadd.i32 X2, X2, X3
vrev32.16 Y3, Y3 C lrot 16
veor Z3, Z3, Z0
veor T0, X1, X2
vadd.i32 Y2, Y2, Y3
vrev32.16 Z3, Z3 C lrot 16
vshl.i32 X1, T0, #12
veor T1, Y1, Y2
vadd.i32 Z2, Z2, Z3
vsri.u32 X1, T0, #20
vshl.i32 Y1, T1, #12
veor T0, Z1, Z2
vadd.i32 X0, X0, X1
vsri.u32 Y1, T1, #20
vshl.i32 Z1, T0, #12
veor T1, X3, X0
vadd.i32 Y0, Y0, Y1
vsri.u32 Z1, T0, #20
vshl.i32 X3, T1, #8
veor T0, Y3, Y0
vadd.i32 Z0, Z0, Z1
vsri.u32 X3, T1, #24
vshl.i32 Y3, T0, #8
veor T1, Z3, Z0
vadd.i32 X2, X2, X3
vsri.u32 Y3, T0, #24
vext.32 X3, X3, X3, #3
vshl.i32 Z3, T1, #8
veor T0, X1, X2
vadd.i32 Y2, Y2, Y3
vsri.u32 Z3, T1, #24
vext.32 Y3, Y3, Y3, #3
vshl.i32 X1, T0, #7
veor T1, Y1, Y2
vadd.i32 Z2, Z2, Z3
vsri.u32 X1, T0, #25
vshl.i32 Y1, T1, #7
veor T0, Z1, Z2
vext.32 X1, X1, X1, #1
vsri.u32 Y1, T1, #25
vshl.i32 Z1, T0, #7
vext.32 Y2, Y2, Y2, #2
vext.32 Y1, Y1, Y1, #1
vsri.u32 Z1, T0, #25
vext.32 X2, X2, X2, #2
C Second QROUND
vadd.i32 X0, X0, X1
vext.32 Z2, Z2, Z2, #2
vext.32 Z1, Z1, Z1, #1
veor X3, X3, X0
vadd.i32 Y0, Y0, Y1
vext.32 Z3, Z3, Z3, #3
vrev32.16 X3, X3 C lrot 16
veor Y3, Y3, Y0
vadd.i32 Z0, Z0, Z1
vadd.i32 X2, X2, X3
vrev32.16 Y3, Y3 C lrot 16
veor Z3, Z3, Z0
veor T0, X1, X2
vadd.i32 Y2, Y2, Y3
vrev32.16 Z3, Z3 C lrot 16
vshl.i32 X1, T0, #12
veor T1, Y1, Y2
vadd.i32 Z2, Z2, Z3
vsri.u32 X1, T0, #20
vshl.i32 Y1, T1, #12
veor T0, Z1, Z2
vadd.i32 X0, X0, X1
vsri.u32 Y1, T1, #20
vshl.i32 Z1, T0, #12
veor T1, X3, X0
vadd.i32 Y0, Y0, Y1
vsri.u32 Z1, T0, #20
vshl.i32 X3, T1, #8
veor T0, Y3, Y0
vadd.i32 Z0, Z0, Z1
vsri.u32 X3, T1, #24
vshl.i32 Y3, T0, #8
veor T1, Z3, Z0
vadd.i32 X2, X2, X3
vsri.u32 Y3, T0, #24
vext.32 X3, X3, X3, #1
vshl.i32 Z3, T1, #8
veor T0, X1, X2
vext.32 X2, X2, X2, #2
vadd.i32 Y2, Y2, Y3
vext.32 Y3, Y3, Y3, #1
vsri.u32 Z3, T1, #24
vshl.i32 X1, T0, #7
veor T1, Y1, Y2
vext.32 Y2, Y2, Y2, #2
vadd.i32 Z2, Z2, Z3
vext.32 Z3, Z3, Z3, #1
vsri.u32 X1, T0, #25
vshl.i32 Y1, T1, #7
veor T0, Z1, Z2
vext.32 Z2, Z2, Z2, #2
vext.32 X1, X1, X1, #3
vsri.u32 Y1, T1, #25
vshl.i32 Z1, T0, #7
vext.32 Y1, Y1, Y1, #3
vsri.u32 Z1, T0, #25
subs ROUNDS, ROUNDS, #2
vext.32 Z1, Z1, Z1, #3
bhi .Loop
C Add updated counters
vadd.i32 Y3, Y3, T2
vadd.i32 Z3, Z3, T3
vld1.32 {T0,T1}, [SRC]
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
C vst1.8 because caller expects results little-endian
C interleave loads, calculations and stores to save cycles on stores
C use vstm when little-endian for some additional speedup
IF_BE(` vst1.8 {X0,X1}, [DST]!')
vld1.32 {T2,T3}, [SRCp32]
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
IF_BE(` vst1.8 {X2,X3}, [DST]!')
IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
vadd.i32 Y0, Y0, T0
vadd.i32 Y1, Y1, T1
IF_BE(` vst1.8 {Y0,Y1}, [DST]!')
vadd.i32 Y2, Y2, T2
IF_BE(` vst1.8 {Y2,Y3}, [DST]!')
IF_LE(` vstmia DST!, {Y0,Y1,Y2,Y3}')
vadd.i32 Z0, Z0, T0
vadd.i32 Z1, Z1, T1
IF_BE(` vst1.8 {Z0,Z1}, [DST]!')
vadd.i32 Z2, Z2, T2
vpop {q4,q5,q6,q7}
IF_BE(` vst1.8 {Z2,Z3}, [DST]')
IF_LE(` vstm DST, {Z0,Z1,Z2,Z3}')
bx lr
EPILOGUE(_nettle_chacha_3core)
PROLOGUE(_nettle_chacha_3core32)
add SRCp32, SRC, #32
vld1.32 {X0,X1}, [SRC]
vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
vld1.32 {Z3}, [r12]
vadd.i32 Y3, X3, Z3 C Increment 32-bit counter
vadd.i32 Z3, Y3, Z3
b .Lshared_entry
EPILOGUE(_nettle_chacha_3core32)
C arm/neon/salsa20-2core.asm
ifelse(`
Copyright (C) 2020 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "salsa20-2core.asm"
.fpu neon
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
define(`SRCp32', `r3')
C State, even elements in X, odd elements in Y
define(`X0', `q0')
define(`X1', `q1')
define(`X2', `q2')
define(`X3', `q3')
define(`Y0', `q8')
define(`Y1', `q9')
define(`Y2', `q10')
define(`Y3', `q11')
define(`T0', `q12')
define(`T1', `q13')
define(`T2', `q14')
define(`T3', `q15')
.text
.align 4
.Lcount1:
.int 1,0,0,0
C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
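C The main loop below applies the standard Salsa20 quarter-round to
C four state words at a time, for two blocks at once; the vshl/vsri
C pairs implement the rotations by 7, 9, 13 and 18.  On the first
C column, in plain C:
C
C   #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
C   x4  ^= ROTL32(x0  + x12,  7);
C   x8  ^= ROTL32(x4  + x0,   9);
C   x12 ^= ROTL32(x8  + x4,  13);
C   x0  ^= ROTL32(x12 + x8,  18);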
PROLOGUE(_nettle_salsa20_2core)
C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
add SRCp32, SRC, #32
vld1.32 {X0,X1}, [SRC]
vld1.32 {X2,X3}, [SRCp32]
adr r12, .Lcount1
vmov Y3, X0
vld1.32 {Y1}, [r12]
vmov Y0, X1
vadd.i64 Y1, Y1, X2 C Increment counter
vmov Y2, X3
vtrn.32 X0, Y3 C X0: 0 0 2 2 Y3: 1 1 3 3
vtrn.32 X1, Y0 C X1: 4 4 6 6 Y0: 5 5 7 7
vtrn.32 X2, Y1 C X2: 8 8 10 10 Y1: 9 9 11 11
vtrn.32 X3, Y2 C X3: 12 12 14 14 Y2: 13 13 15 15
C Swap, to get
C X0: 0 10 Y0: 5 15
C X1: 4 14 Y1: 9 3
C X2: 8 2 Y2: 13 7
C X3: 12 6 Y3: 1 11
vswp D1REG(X0), D1REG(X2)
vswp D1REG(X1), D1REG(X3)
vswp D1REG(Y0), D1REG(Y2)
vswp D1REG(Y1), D1REG(Y3)
.Loop:
C Register layout (A is first block, B is second block)
C
C X0: A0 B0 A10 B10 Y0: A5 B5 A15 B15
C X1: A4 B4 A14 B14 Y1: A9 B9 A3 B3
C X2: A8 B8 A2 B2 Y2: A13 B13 A7 B7
C X3: A12 B12 A6 B6 Y3: A1 B1 A11 B11
vadd.i32 T0, X0, X3
vshl.i32 T1, T0, #7
vadd.i32 T2, Y0, Y3
vsri.u32 T1, T0, #25
vshl.i32 T3, T2, #7
veor X1, X1, T1
vsri.u32 T3, T2, #25
vadd.i32 T0, X1, X0
veor Y1, Y1, T3
vshl.i32 T1, T0, #9
vadd.i32 T2, Y1, Y0
vsri.u32 T1, T0, #23
vshl.i32 T3, T2, #9
veor X2, X2, T1
vsri.u32 T3, T2, #23
vadd.i32 T0, X2, X1
veor Y2, Y2, T3
vshl.i32 T1, T0, #13
vadd.i32 T2, Y2, Y1
vsri.u32 T1, T0, #19
vshl.i32 T3, T2, #13
veor X3, X3, T1
vsri.u32 T3, T2, #19
vadd.i32 T0, X3, X2
veor Y3, Y3, T3
vshl.i32 T1, T0, #18
vadd.i32 T2, Y3, Y2
vext.32 Y1, Y1, Y1, #2
vsri.u32 T1, T0, #14
vshl.i32 T3, T2, #18
vext.32 Y2, Y2, Y2, #2
veor X0, X0, T1
vsri.u32 T3, T2, #14
vext.32 X3, X3, X3, #2
veor Y0, Y0, T3
C Register layout:
C X0: A0 B0 A10 B10 Y0: A5 B5 A15 B15
C Y1: A3 B3 A9 B9 X1: A4 B4 A14 B14 (Y1 swapped)
C X2: A2 B2 A8 B8 Y2: A7 B7 A13 B13 (X2, Y2 swapped)
C Y3: A1 B1 A11 B11 X3: A6 B6 A12 B12 (X3 swapped)
vadd.i32 T0, X0, Y1
vext.32 X2, X2, X2, #2
vshl.i32 T1, T0, #7
vadd.i32 T2, Y0, X1
vsri.u32 T1, T0, #25
vshl.i32 T3, T2, #7
veor Y3, Y3, T1
vsri.u32 T3, T2, #25
vadd.i32 T0, Y3, X0
veor X3, X3, T3
vshl.i32 T1, T0, #9
vadd.i32 T2, X3, Y0
vsri.u32 T1, T0, #23
vshl.i32 T3, T2, #9
veor X2, X2, T1
vsri.u32 T3, T2, #23
vadd.i32 T0, X2, Y3
veor Y2, Y2, T3
vshl.i32 T1, T0, #13
vadd.i32 T2, Y2, X3
vsri.u32 T1, T0, #19
vshl.i32 T3, T2, #13
veor Y1, Y1, T1
vsri.u32 T3, T2, #19
vadd.i32 T0, Y1, X2
veor X1, X1, T3
vext.32 X2, X2, X2, #2
vshl.i32 T1, T0, #18
vadd.i32 T2, X1, Y2
vext.32 Y1, Y1, Y1, #2
vsri.u32 T1, T0, #14
subs ROUNDS, ROUNDS, #2
vshl.i32 T3, T2, #18
vext.32 X3, X3, X3, #2
veor X0, X0, T1
vsri.u32 T3, T2, #14
vext.32 Y2, Y2, Y2, #2
veor Y0, Y0, T3
bhi .Loop
C Inverse swaps and transpositions
vswp D1REG(X0), D1REG(X2)
vswp D1REG(X1), D1REG(X3)
vswp D1REG(Y0), D1REG(Y2)
vswp D1REG(Y1), D1REG(Y3)
vld1.32 {T0,T1}, [SRC]
vld1.32 {T2,T3}, [SRCp32]
vtrn.32 X0, Y3
vtrn.32 X1, Y0
vtrn.32 X2, Y1
vtrn.32 X3, Y2
C Add in the original context
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
C vst1.8 because caller expects results little-endian
C interleave loads, calculations and stores to save cycles on stores
C use vstm when little-endian for some additional speedup
IF_BE(` vst1.8 {X0,X1}, [DST]!')
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
IF_BE(` vst1.8 {X2,X3}, [DST]!')
IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
vld1.32 {X0}, [r12]
vadd.i32 T0, T0, Y3
vadd.i64 T2, T2, X0
vadd.i32 T1, T1, Y0
IF_BE(` vst1.8 {T0,T1}, [DST]!')
vadd.i32 T2, T2, Y1
vadd.i32 T3, T3, Y2
IF_BE(` vst1.8 {T2,T3}, [DST]')
IF_LE(` vstm DST, {T0,T1,T2,T3}')
bx lr
EPILOGUE(_nettle_salsa20_2core)
C arm/neon/sha3-permute.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "sha3-permute.asm"
.fpu neon
define(`CTX', `r0')
define(`COUNT', `r1')
define(`RC', `r2')
C First column
define(`A0', `d0')
define(`A5', `d2')
define(`A10', `d3')
define(`A15', `d4')
define(`A20', `d5')
define(`A1', `d6')
define(`A2', `d7')
define(`A3', `d8')
define(`A4', `d9')
define(`A6', `d16')
define(`A7', `d17')
define(`A8', `d18')
define(`A9', `d19')
define(`A11', `d20')
define(`A12', `d21')
define(`A13', `d22')
define(`A14', `d23')
define(`A16', `d24')
define(`A17', `d25')
define(`A18', `d26')
define(`A19', `d27')
define(`A21', `d28')
define(`A22', `d29')
define(`A23', `d30')
define(`A24', `d31')
define(`T0', `d10')
define(`T1', `d11')
define(`C0', `d1')
define(`C1', `d12')
define(`C2', `d13')
define(`C3', `d14')
define(`C4', `d15')
C ROL(DST, SRC, COUNT)
C Must have SRC != DST
define(`ROL', `
vshr.u64 $1, $2, #eval(64-$3)
vsli.i64 $1, $2, #$3
')
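C In C terms, ROL(DST, SRC, COUNT) is a 64-bit rotate left:
C
C   static inline uint64_t rol64(uint64_t x, unsigned n)
C   {
C     /* assumes 0 < n < 64 */
C     return (x << n) | (x >> (64 - n));
C   }
C
C vshr.u64 writes the high bits and vsli.i64 shifts in the low bits
C without clobbering them, hence the SRC != DST requirement.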
C sha3_permute(struct sha3_ctx *ctx)
.text
.align 3
.Lrc:
.quad 0x0000000000000001
.quad 0x0000000000008082
.quad 0x800000000000808A
.quad 0x8000000080008000
.quad 0x000000000000808B
.quad 0x0000000080000001
.quad 0x8000000080008081
.quad 0x8000000000008009
.quad 0x000000000000008A
.quad 0x0000000000000088
.quad 0x0000000080008009
.quad 0x000000008000000A
.quad 0x000000008000808B
.quad 0x800000000000008B
.quad 0x8000000000008089
.quad 0x8000000000008003
.quad 0x8000000000008002
.quad 0x8000000000000080
.quad 0x000000000000800A
.quad 0x800000008000000A
.quad 0x8000000080008081
.quad 0x8000000000008080
.quad 0x0000000080000001
.quad 0x8000000080008008
PROLOGUE(nettle_sha3_permute)
vpush {d8-d15}
vld1.64 {A0}, [CTX]!
vldm CTX!, {A1,A2,A3,A4}
vld1.64 {A5}, [CTX]!
vldm CTX!, {A6,A7,A8,A9}
vld1.64 {A10}, [CTX]!
vldm CTX!, {A11,A12,A13,A14}
vld1.64 {A15}, [CTX]!
vldm CTX!, {A16,A17,A18,A19}
vld1.64 {A20}, [CTX]!
vldm CTX, {A21,A22,A23,A24}
sub CTX, CTX, #168
mov COUNT, #24
adr RC, .Lrc
.align 3
.Loop:
veor QREG(T0), QREG(A5), QREG(A15)
veor C0, A0, T0
veor C0, C0, T1
veor QREG(C1), QREG(A1), QREG(A6)
veor QREG(C1), QREG(C1), QREG(A11)
veor QREG(C1), QREG(C1), QREG(A16)
veor QREG(C1), QREG(C1), QREG(A21)
veor QREG(C3), QREG(A3), QREG(A8)
veor QREG(C3), QREG(C3), QREG(A13)
veor QREG(C3), QREG(C3), QREG(A18)
veor QREG(C3), QREG(C3), QREG(A23)
C D0 = C4 ^ (C1 <<< 1)
C NOTE: Using ROL macro (and vsli) is slightly slower.
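C This is the theta step of Keccak: with column parities
C C[x] = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4], each lane gets
C
C   D[x] = C[(x + 4) % 5] ^ rol64(C[(x + 1) % 5], 1);
C   A[x,y] ^= D[x];
C
C Here D0 is built with explicit vshl/vshr instead of the ROL macro.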
vshl.i64 T0, C1, #1
vshr.u64 T1, C1, #63
veor T0, T0, C4
veor T0, T0, T1
vmov T1, T0
veor A0, A0, T0
veor QREG(A5), QREG(A5), QREG(T0)
veor QREG(A15), QREG(A15), QREG(T0)
C D1 = C0 ^ (C2 <<< 1)
C D2 = C1 ^ (C3 <<< 1)
ROL(T0, C2, 1)
ROL(T1, C3, 1)
veor T0, T0, C0
veor T1, T1, C1
veor QREG(A1), QREG(A1), QREG(T0)
veor QREG(A6), QREG(A6), QREG(T0)
veor QREG(A11), QREG(A11), QREG(T0)
veor QREG(A16), QREG(A16), QREG(T0)
veor QREG(A21), QREG(A21), QREG(T0)
C D3 = C2 ^ (C4 <<< 1)
C D4 = C3 ^ (C0 <<< 1)
ROL(T0, C4, 1)
ROL(T1, C0, 1)
veor T0, T0, C2
veor T1, T1, C3
veor QREG(A3), QREG(A3), QREG(T0)
veor QREG(A8), QREG(A8), QREG(T0)
veor QREG(A13), QREG(A13), QREG(T0)
veor QREG(A18), QREG(A18), QREG(T0)
veor QREG(A23), QREG(A23), QREG(T0)
ROL( T0, A1, 1)
ROL( A1, A6, 44)
ROL( A6, A9, 20)
ROL( A9, A22, 61)
ROL(A22, A14, 39)
ROL(A14, A20, 18)
ROL(A20, A2, 62)
ROL( A2, A12, 43)
ROL(A12, A13, 25)
ROL(A13, A19, 8)
ROL(A19, A23, 56)
ROL(A23, A15, 41)
ROL(A15, A4, 27)
ROL( A4, A24, 14)
ROL(A24, A21, 2)
ROL(A21, A8, 55)
ROL( A8, A16, 45)
ROL(A16, A5, 36)
ROL( A5, A3, 28)
ROL( A3, A18, 21)
ROL(A18, A17, 15)
ROL(A17, A11, 10)
ROL(A11, A7, 6)
ROL( A7, A10, 3)
C New A10 value left in T0
vbic C0, A2, A1
vbic C1, A3, A2
vbic C2, A4, A3
vbic C3, A0, A4
vbic C4, A1, A0
veor A0, A0, C0
vld1.64 {C0}, [RC :64]!
veor QREG(A1), QREG(A1), QREG(C1)
veor QREG(A3), QREG(A3), QREG(C3)
veor A0, A0, C0
vbic C0, A7, A6
vbic C1, A8, A7
vbic C2, A9, A8
vbic C3, A5, A9
vbic C4, A6, A5
veor A5, A5, C0
veor QREG(A6), QREG(A6), QREG(C1)
veor QREG(A8), QREG(A8), QREG(C3)
vbic C0, A12, A11
vbic C1, A13, A12
vbic C2, A14, A13
vbic C3, T0, A14
vbic C4, A11, T0
veor A10, T0, C0
veor QREG(A11), QREG(A11), QREG(C1)
veor QREG(A13), QREG(A13), QREG(C3)
vbic C0, A17, A16
vbic C1, A18, A17
vbic C2, A19, A18
vbic C3, A15, A19
vbic C4, A16, A15
veor A15, A15, C0
veor QREG(A16), QREG(A16), QREG(C1)
veor QREG(A18), QREG(A18), QREG(C3)
vbic C0, A22, A21
vbic C1, A23, A22
vbic C2, A24, A23
vbic C3, A20, A24
vbic C4, A21, A20
subs COUNT, COUNT, #1
veor A20, A20, C0
veor QREG(A21), QREG(A21), QREG(C1)
veor QREG(A23), QREG(A23), QREG(C3)
bne .Loop
vst1.64 {A0}, [CTX]!
vstm CTX!, {A1,A2,A3,A4}
vst1.64 {A5}, [CTX]!
vstm CTX!, {A6,A7,A8,A9}
vst1.64 {A10}, [CTX]!
vstm CTX!, {A11,A12,A13,A14}
vst1.64 {A15}, [CTX]!
vstm CTX!, {A16,A17,A18,A19}
vst1.64 {A20}, [CTX]!
vstm CTX, {A21,A22,A23,A24}
vpop {d8-d15}
bx lr
EPILOGUE(nettle_sha3_permute)
C arm/neon/sha512-compress.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "sha512-compress.asm"
.fpu neon
define(`STATE', `r0')
define(`INPUT', `r1')
define(`K', `r2')
define(`COUNT', `r3')
define(`SHIFT', `r12')
define(`SA', `d0')
define(`SB', `d1')
define(`SC', `d2')
define(`SD', `d3')
define(`SE', `d4')
define(`SF', `d5')
define(`SG', `d6')
define(`SH', `d7')
define(`QSAB', `q0')
define(`QSCD', `q1')
define(`QSEF', `q2')
define(`QSGH', `q3')
C d8-d15 are callee-save
define(`DT0', `d8')
define(`DT1', `d9')
define(`QT01', `q4')
define(`DT2', `d10')
define(`DT3', `d11')
define(`QT23', `q5')
define(`DT4', `d12')
define(`DT5', `d13')
define(`QT45', `q6')
C Used only when reading the input, can overlap with state
define(`DT6', `d0')
define(`DT7', `d1')
define(`QT67', `q0')
define(`DW0', `d16')
define(`DW1', `d17')
define(`DW2', `d18')
define(`DW3', `d19')
define(`DW4', `d20')
define(`DW5', `d21')
define(`DW6', `d22')
define(`DW7', `d23')
define(`DW8', `d24')
define(`DW9', `d25')
define(`DW10', `d26')
define(`DW11', `d27')
define(`DW12', `d28')
define(`DW13', `d29')
define(`DW14', `d30')
define(`DW15', `d31')
define(`QW0001', `q8')
define(`QW0203', `q9')
define(`QW0405', `q10')
define(`QW0607', `q11')
define(`QW0809', `q12')
define(`QW1011', `q13')
define(`QW1213', `q14')
define(`QW1415', `q15')
define(`EXPAND_ME', `$1')
define(`W', `EXPAND_ME(`DW'eval(($1) % 16))')
C If x = W(i+14), y = W(i+1), we xor in parallel
C
C x << 45 y << 63
C x >> 19 y >> 1
C x << 3 y << 56
C x >> 61 y >> 8
C xor x >> 6 y >> 7
C -----------------------------
C DT0 DT1
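C In C, with the shift pairs above folded into rotates, this is the
C usual SHA-512 schedule step on a 16-word circular buffer:
C
C   #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
C   uint64_t x = w[(i + 14) % 16], y = w[(i + 1) % 16];
C   uint64_t s1 = ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6);
C   uint64_t s0 = ROTR64(y, 1) ^ ROTR64(y, 8) ^ (y >> 7);
C   w[i % 16] += w[(i + 9) % 16] + s0 + s1;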
define(`EXPN', `
vshl.i64 DT0, W($1+14), #45
vshl.i64 DT1, W($1 + 1), #63
vshr.u64 DT2, W($1+14), #19
vshr.u64 DT3, W($1 + 1), #1
vshl.i64 DT4, W($1+14), #3
vshl.i64 DT5, W($1 + 1), #56
veor.i64 QT01, QT01, QT23
vshr.u64 DT2, W($1+14), #61
vshr.u64 DT3, W($1 + 1), #8
veor.i64 QT01, QT01, QT45
vshr.u64 DT4, W($1+14), #6
vshr.u64 DT5, W($1 + 1), #7
veor.i64 QT01, QT01, QT23
vadd.i64 W($1), W($1), W($1 + 9)
veor.i64 QT01, QT01, QT45
vadd.i64 W($1), W($1), DT0
vadd.i64 W($1), W($1), DT1
')
C ROUND(A,B,C,D,E,F,G,H,i)
C
C H += S1(E) + Choice(E,F,G) + K + W
C D += H
C H += S0(A) + Majority(A,B,C)
C
C Where
C
C S1(E) = E<<<50 ^ E<<<46 ^ E<<<23
C S0(A) = A<<<36 ^ A<<<30 ^ A<<<25
C Choice (E, F, G) = G^(E&(F^G))
C Majority (A,B,C) = (A&B) + (C&(A^B))
C Do S1 and S0 in parallel
C
C e << 50 a << 36
C e >> 14 a >> 28
C e << 46 a << 30
C e >> 18 a >> 34
C e << 23 a << 25
C xor e >> 41 a >> 39
C ----------------------------
C DT0 DT1
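C One round in plain C, with the same rotating assignment of the
C variables a-h that the ROUND invocations below use:
C
C   #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
C   uint64_t S1 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41);
C   uint64_t S0 = ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39);
C   uint64_t ch = g ^ (e & (f ^ g));
C   uint64_t maj = (a & b) + (c & (a ^ b));  /* terms never overlap */
C   h += S1 + ch + k[i] + w[i % 16];
C   d += h;
C   h += S0 + maj;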
define(`ROUND', `
vshl.i64 DT0, $5, #50
vshl.i64 DT1, $1, #36
vshr.u64 DT2, $5, #14
vshr.u64 DT3, $1, #28
vshl.i64 DT4, $5, #46
vshl.i64 DT5, $1, #30
veor QT01, QT01, QT23
vshr.u64 DT2, $5, #18
vshr.u64 DT3, $1, #34
veor QT01, QT01, QT45
vshl.i64 DT4, $5, #23
vshl.i64 DT5, $1, #25
veor QT01, QT01, QT23
vshr.u64 DT2, $5, #41
vshr.u64 DT3, $1, #39
veor QT01, QT01, QT45
veor DT4, $6, $7
veor DT5, $1, $2
vand DT4, DT4, $5
vand DT5, DT5, $3
veor DT4, DT4, $7
veor QT01, QT01, QT23
vand DT2, $1, $2
vldr DT3, [K,#eval(8*$9)]
vadd.i64 $8, $8, W($9)
vadd.i64 QT01, QT01, QT45
vadd.i64 $8, $8, DT3
vadd.i64 $8, $8, DT0
vadd.i64 DT1, DT1, DT2
vadd.i64 $4, $4, $8
vadd.i64 $8, $8, DT1
')
C void
C _nettle_sha512_compress(uint64_t *state, const uint8_t *input, const uint64_t *k)
.text
.align 2
PROLOGUE(_nettle_sha512_compress)
vpush {d8,d9,d10,d11,d12,d13}
ands SHIFT, INPUT, #7
and INPUT, INPUT, #-8
vld1.8 {DT5}, [INPUT :64]
addne INPUT, INPUT, #8
addeq SHIFT, SHIFT, #8
lsl SHIFT, SHIFT, #3
C Put right shift in DT0 and DT1, aka QT01
neg SHIFT, SHIFT
vmov.i32 DT0, #0
vmov.32 DT0[0], SHIFT
vmov DT1, DT0
C Put left shift in DT2 and DT3, aka QT23
add SHIFT, SHIFT, #64
vmov.i32 DT2, #0
vmov.32 DT2[0], SHIFT
vmov DT3, DT2
vshl.u64 DT5, DT5, DT0
C Set w[i] <-- w[i-1] >> RSHIFT + w[i] << LSHIFT
vld1.8 {W(0),W(1),W(2),W(3)}, [INPUT :64]!
vshl.u64 QT67, QW0001, QT01 C Right shift
vshl.u64 QW0001, QW0001, QT23 C Left shift
veor W(0), W(0), DT5
veor W(1), W(1), DT6
vrev64.8 QW0001, QW0001
vshl.u64 QT45, QW0203, QT01 C Right shift
vshl.u64 QW0203, QW0203, QT23 C Left shift
veor W(2), W(2), DT7
veor W(3), W(3), DT4
vrev64.8 QW0203, QW0203
vld1.8 {W(4),W(5),W(6),W(7)}, [INPUT :64]!
vshl.u64 QT67, QW0405, QT01 C Right shift
vshl.u64 QW0405, QW0405, QT23 C Left shift
veor W(4), W(4), DT5
veor W(5), W(5), DT6
vrev64.8 QW0405, QW0405
vshl.u64 QT45, QW0607, QT01 C Right shift
vshl.u64 QW0607, QW0607, QT23 C Left shift
veor W(6), W(6), DT7
veor W(7), W(7), DT4
vrev64.8 QW0607, QW0607
vld1.8 {W(8),W(9),W(10),W(11)}, [INPUT :64]!
vshl.u64 QT67, QW0809, QT01 C Right shift
vshl.u64 QW0809, QW0809, QT23 C Left shift
veor W(8), W(8), DT5
veor W(9), W(9), DT6
vrev64.8 QW0809, QW0809
vshl.u64 QT45, QW1011, QT01 C Right shift
vshl.u64 QW1011, QW1011, QT23 C Left shift
veor W(10), W(10), DT7
veor W(11), W(11), DT4
vrev64.8 QW1011, QW1011
vld1.8 {W(12),W(13),W(14),W(15)}, [INPUT :64]!
vshl.u64 QT67, QW1213, QT01 C Right shift
vshl.u64 QW1213, QW1213, QT23 C Left shift
veor W(12), W(12), DT5
veor W(13), W(13), DT6
vrev64.8 QW1213, QW1213
vshl.u64 QT45, QW1415, QT01 C Right shift
vshl.u64 QW1415, QW1415, QT23 C Left shift
veor W(14), W(14), DT7
veor W(15), W(15), DT4
vrev64.8 QW1415, QW1415
vldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
add K, K, #128
mov COUNT, #4
.Loop:
EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 0)
EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 1)
EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 2)
EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 3)
EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 4)
EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 5)
EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 6)
EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 7)
EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH, 8)
EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG, 9)
EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF, 10)
EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE, 11)
EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD, 12)
EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC, 13)
EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB, 14)
subs COUNT, COUNT, #1
EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA, 15)
add K, K, #128
bne .Loop
vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
vadd.i64 QSAB, QSAB, QW0001
vadd.i64 QSCD, QSCD, QW0203
vst1.64 {SA,SB,SC,SD}, [STATE]!
vld1.64 {DW0, DW1, DW2, DW3}, [STATE]
vadd.i64 QSEF, QSEF, QW0001
vadd.i64 QSGH, QSGH, QW0203
vst1.64 {SE,SF,SG,SH}, [STATE]!
vpop {d8,d9,d10,d11,d12,d13}
bx lr
EPILOGUE(_nettle_sha512_compress)
divert(-1)
define shastate
p/x $d0.u64
p/x $d1.u64
p/x $d2.u64
p/x $d3.u64
p/x $d4.u64
p/x $d5.u64
p/x $d6.u64
p/x $d7.u64
end
C arm/neon/umac-nh-n.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "umac-nh-n.asm"
.fpu neon
define(`OUT', `r0')
define(`ITERS', `r1')
define(`KEY', `r2')
define(`LENGTH', `r3')
define(`MSG', `r12')
define(`SHIFT', `r14')
define(`QA', `q0')
define(`QB', `q1')
define(`QY0', `q3') C Accumulates for the first two iterations.
define(`DM', `d4')
define(`QY1', `q4') C Used for 3 and 4 iterations.
define(`QC', `q5')
define(`QD', `q6')
define(`QLEFT', `q8')
define(`QRIGHT', `q9')
define(`QT0', `q10')
define(`QT1', `q11')
define(`QT2', `q12')
define(`QK0', `q13')
define(`QK1', `q14')
define(`QK2', `q15')
C FIXME: Try permuting subkeys using vld4, vzip or similar.
.text
.align 3
PROLOGUE(_nettle_umac_nh_n)
ldr MSG, [sp]
str lr, [sp, #-4]!
C Setup for 64-bit aligned reads
ands SHIFT, MSG, #7
and MSG, MSG, #-8
vld1.8 {DM}, [MSG :64]
addne MSG, MSG, #8
addeq SHIFT, SHIFT, #8
C FIXME: Combine as rsb ?
lsl SHIFT, SHIFT, #3
neg SHIFT, SHIFT
C Right shift in QRIGHT (both halves)
vmov.i32 D0REG(QRIGHT)[0], SHIFT
vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
add SHIFT, SHIFT, #64
vmov.i32 D0REG(QLEFT)[0], SHIFT
vmov.32 D1REG(QLEFT), D0REG(QLEFT)
cmp r1, #3
vmov.i64 QY0, #0
vshl.u64 DM, DM, D0REG(QRIGHT)
bcc .Lnh2
beq .Lnh3
.Lnh4:
C Permute key words, so we in each iteration have them in order
C
C P0: [0, 4,1, 5] P1: [ 2, 6, 3, 7] P2: [ 4, 8, 5, 9] P3: [ 6,10, 7,11]
C P4: [8,12,9,13] P5: [10,14,11,15] P6: [12,16,13,17] P7: [14,18,15,19]
C
C Also arrange the message words, so we get them as
C M0: [0,0,1,1] M1: [ 2, 2, 3, 3] M2: [ 4, 4, 5, 5] M3: [ 6, 6, 7, 7]
C M4: [8,8,9,9] M5: [10,10,11,11] M6: [12,12,13,13] M7: [14,14,15,15]
C
C Then, accumulate Y0 (first two "iters") using
C
C Y0 += (M0+P0) * (M2+P2) + (M1+P1) * (M3+P3)
C Y1 += (M0+P4) * (M2+P6) + (M1+P5) * (M3+P7)
C
C Next iteration is then
C
C Y0 += (M4+P4) * (M6+P6) + (M5+P5) * (M7 + P7)
C Y1 += (M4+P8) * (M6+P10) + (M5+P9) * (M7+P11)
C
C So we can reuse P4, P5, P6, P7 from the previous iteration.
C How to fit this in registers? We need 4 Q regs for P0-P3, and one
C more for the last read key. We need at least two registers
C for the message (QA and QB, more if we want to expand only
C once). For the Y0 update, we can let the factors overwrite
C P0-P3, and for the Y1 update, we can overwrite M0-M3.
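C As a scalar C reference, the vmlal accumulation below computes UMAC
C NH with four Toeplitz iterations, the key offset by 4 words per
C iteration (the key array is 12 words longer than the message):
C
C   uint64_t y[4] = {0, 0, 0, 0};
C   for (size_t i = 0; i < msg_words; i += 8)
C     for (unsigned j = 0; j < 4; j++)
C       for (unsigned k = 0; k < 4; k++)
C         y[j] += (uint64_t) (uint32_t) (msg[i+k] + key[i + 4*j + k])
C               * (uint32_t) (msg[i+k+4] + key[i + 4*j + k + 4]);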
vpush {q4,q5,q6}
vld1.32 {QK0,QK1}, [KEY]!
vld1.32 {QK2}, [KEY]!
vmov QT0, QK1
vmov QT1, QK2
C Permute keys. QK2 is untouched, permuted subkeys are put in QK0,QK1,QT0,QT1
vtrn.32 QK0, QK1 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QK1) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vtrn.32 QT0, QT1 C Gives us [4,8,6,10] and [5 ,9,7,11]
vswp D1REG(QT0), D0REG(QT1) C Gives us [4,8,5, 9] and [6,10,7,11]
vmov.i64 QY1, #0
.Loop4:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QC, QA, QRIGHT
vshl.u64 QD, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QC)
veor D0REG(QB), D0REG(QB), D1REG(QC)
veor D1REG(QB), D1REG(QB), D0REG(QD)
vmov DM, D1REG(QD)
C Explode message (too bad there's no vadd with scalar)
vdup.32 D1REG(QD), D1REG(QB)[1]
vdup.32 D0REG(QD), D1REG(QB)[0]
vdup.32 D1REG(QC), D0REG(QB)[1]
vdup.32 D0REG(QC), D0REG(QB)[0]
vdup.32 D1REG(QB), D1REG(QA)[1]
vdup.32 D0REG(QB), D1REG(QA)[0]
vdup.32 D1REG(QA), D0REG(QA)[1]
vdup.32 D0REG(QA), D0REG(QA)[0]
vadd.i32 QK0, QK0, QA
vadd.i32 QK1, QK1, QB
vadd.i32 QT0, QT0, QC
vadd.i32 QT1, QT1, QD
vmlal.u32 QY0, D0REG(QK0), D0REG(QT0)
vmlal.u32 QY0, D1REG(QK0), D1REG(QT0)
vmlal.u32 QY0, D0REG(QK1), D0REG(QT1)
vmlal.u32 QY0, D1REG(QK1), D1REG(QT1)
C Next 4 subkeys
vld1.32 {QT0,QT1}, [KEY]!
vmov QK0, QK2
vmov QK1, QT0
vmov QK2, QT1 C Save
vtrn.32 QK0, QK1 C Gives us [8,12,10,14] and [9,13,11,15]
vswp D1REG(QK0), D0REG(QK1) C Gives us [8,12,9,13] and [10,14,11,15]
vtrn.32 QT0, QT1 C Gives us [12,16,14,18] and [13,17,15,19]
vswp D1REG(QT0), D0REG(QT1) C Gives us [12,16,13,17] and [14,18,15,19]
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
vadd.i32 QC, QC, QT0
vadd.i32 QD, QD, QT1
subs LENGTH, LENGTH, #32
vmlal.u32 QY1, D0REG(QA), D0REG(QC)
vmlal.u32 QY1, D1REG(QA), D1REG(QC)
vmlal.u32 QY1, D0REG(QB), D0REG(QD)
vmlal.u32 QY1, D1REG(QB), D1REG(QD)
bhi .Loop4
vst1.64 {QY0, QY1}, [OUT]
vpop {q4,q5,q6}
ldr pc, [sp], #+4
.Lnh3:
vpush {q4}
vld1.32 {QK0,QK1}, [KEY]!
vmov.i64 QY1, #0
.Loop3:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.32 {QK2}, [KEY]!
C Construct factors, with low half corresponding to first iteration,
C and high half corresponding to the second iteration.
vmov QT0, QK1
vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vdup.32 D0REG(QT1), D0REG(QA)[0]
vdup.32 D1REG(QT1), D0REG(QA)[1]
vadd.i32 QT1, QT1, QK0
vmov QK0, QK2 C Save for next iteration
vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
vdup.32 D0REG(QT2), D0REG(QB)[0]
vdup.32 D1REG(QT2), D0REG(QB)[1]
vadd.i32 QK1, QK1, QT2
vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
vdup.32 D0REG(QT1), D1REG(QA)[0]
vdup.32 D1REG(QT1), D1REG(QA)[1]
vadd.i32 QT0, QT0, QT1
vdup.32 D0REG(QT1), D1REG(QB)[0]
vdup.32 D1REG(QT1), D1REG(QB)[1]
vadd.i32 QK2, QK2, QT1
vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
vld1.32 {QK1}, [KEY]!
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
subs LENGTH, LENGTH, #32
vmlal.u32 QY1, D0REG(QA), D0REG(QB)
vmlal.u32 QY1, D1REG(QA), D1REG(QB)
bhi .Loop3
vadd.i64 D0REG(QY1), D0REG(QY1), D1REG(QY1)
vst1.64 {D0REG(QY0), D1REG(QY0), D0REG(QY1)}, [OUT]
vpop {q4}
ldr pc, [sp], #+4
.Lnh2:
vld1.32 {QK0}, [KEY]!
.Loop2:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.32 {QK1,QK2}, [KEY]!
C Construct factors, with low half corresponding to first iteration,
C and high half corresponding to the second iteration.
vmov QT0, QK1
vtrn.32 QK0, QT0 C Gives us [0, 4, 2, 6] and [1, 5, 3, 7]
vswp D1REG(QK0), D0REG(QT0) C Gives us [0, 4, 1, 5] and [2, 6, 3, 7]
vdup.32 D0REG(QT1), D0REG(QA)[0]
vdup.32 D1REG(QT1), D0REG(QA)[1]
vadd.i32 QT1, QT1, QK0
vmov QK0, QK2 C Save for next iteration
vtrn.32 QK1, QK2 C Gives us [4, 8, 6, 10] and [5, 9, 7, 11]
vswp D1REG(QK1), D0REG(QK2) C Gives us [4, 8, 5, 9] and [6, 10, 7, 11]
vdup.32 D0REG(QT2), D0REG(QB)[0]
vdup.32 D1REG(QT2), D0REG(QB)[1]
vadd.i32 QK1, QK1, QT2
vmlal.u32 QY0, D0REG(QT1), D0REG(QK1)
vmlal.u32 QY0, D1REG(QT1), D1REG(QK1)
vdup.32 D0REG(QT1), D1REG(QA)[0]
vdup.32 D1REG(QT1), D1REG(QA)[1]
vadd.i32 QT0, QT0, QT1
vdup.32 D0REG(QT1), D1REG(QB)[0]
vdup.32 D1REG(QT1), D1REG(QB)[1]
vadd.i32 QK2, QK2, QT1
subs LENGTH, LENGTH, #32
vmlal.u32 QY0, D0REG(QT0), D0REG(QK2)
vmlal.u32 QY0, D1REG(QT0), D1REG(QK2)
bhi .Loop2
vst1.64 {QY0}, [OUT]
.Lend:
ldr pc, [sp], #+4
EPILOGUE(_nettle_umac_nh_n)
C arm/neon/umac-nh.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "umac-nh.asm"
.fpu neon
define(`KEY', `r0')
define(`LENGTH', `r1')
define(`MSG', `r2')
define(`SHIFT', `r3')
define(`QA', `q0')
define(`QB', `q1')
define(`DM', `d16')
define(`QLEFT', `q9')
define(`QRIGHT', `q10')
define(`QY', `q11')
define(`QT0', `q12')
define(`QT1', `q13')
define(`QK0', `q14')
define(`QK1', `q15')
.text
.align 3
PROLOGUE(_nettle_umac_nh)
C Setup for 64-bit aligned reads
ands SHIFT, MSG, #7
and MSG, MSG, #-8
vld1.8 {DM}, [MSG :64]
addne MSG, MSG, #8
addeq SHIFT, SHIFT, #8
C FIXME: Combine as rsb ?
lsl SHIFT, SHIFT, #3
neg SHIFT, SHIFT
C Right shift in QRIGHT (both halves)
vmov.i32 D0REG(QRIGHT)[0], SHIFT
vmov.32 D1REG(QRIGHT), D0REG(QRIGHT)
add SHIFT, SHIFT, #64
vmov.i32 D0REG(QLEFT)[0], SHIFT
vmov.32 D1REG(QLEFT), D0REG(QLEFT)
vmov.i64 QY, #0
vshl.u64 DM, DM, D0REG(QRIGHT)
.Loop:
C Set m[i] <-- m[i-1] >> RSHIFT + m[i] << LSHIFT
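C In C, for a pointer misaligned by a = msg & 7, with p the 8-byte
C aligned base, rshift = 8*a and lshift = 64 - rshift, this is roughly
C (little-endian, 0 < rshift < 64; the already-aligned case is handled
C by the addeq/addne setup above):
C
C   uint64_t prev = p[0] >> rshift;
C   for (size_t i = 0; i < n; i++)
C     {
C       uint64_t cur = p[i + 1];
C       m[i] = prev ^ (cur << lshift);
C       prev = cur >> rshift;
C     }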
vld1.8 {QA, QB}, [MSG :64]!
vshl.u64 QT0, QA, QRIGHT
vshl.u64 QT1, QB, QRIGHT
vshl.u64 QA, QA, QLEFT
vshl.u64 QB, QB, QLEFT
veor D0REG(QA), D0REG(QA), DM
veor D1REG(QA), D1REG(QA), D0REG(QT0)
veor D0REG(QB), D0REG(QB), D1REG(QT0)
veor D1REG(QB), D1REG(QB), D0REG(QT1)
vmov DM, D1REG(QT1)
vld1.i32 {QK0, QK1}, [KEY]!
vadd.i32 QA, QA, QK0
vadd.i32 QB, QB, QK1
subs LENGTH, LENGTH, #32
vmlal.u32 QY, D0REG(QA), D0REG(QB)
vmlal.u32 QY, D1REG(QA), D1REG(QB)
bhi .Loop
vadd.i64 D0REG(QY), D0REG(QY), D1REG(QY)
C The return value must respect the word order mandated by the AAPCS
IF_LE(` vmov r0, r1, D0REG(QY)')
IF_BE(` vmov r1, r0, D0REG(QY)')
bx lr
EPILOGUE(_nettle_umac_nh)
C arm/v6/aes-decrypt-internal.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.arch armv6
include_src(`arm/aes.m4')
define(`PARAM_ROUNDS', `r0')
define(`PARAM_KEYS', `r1')
define(`TABLE', `r2')
define(`LENGTH', `r3')
C On stack: DST, SRC
define(`W0', `r4')
define(`W1', `r5')
define(`W2', `r6')
define(`W3', `r7')
define(`T0', `r8')
define(`COUNT', `r10')
define(`KEY', `r11')
define(`X0', `r0') C Overlaps PARAM_ROUNDS and PARAM_KEYS
define(`X1', `r1')
define(`X2', `r12')
define(`X3', `r14') C lr
define(`FRAME_ROUNDS', `[sp]')
define(`FRAME_KEYS', `[sp, #+4]')
C 8 saved registers
define(`FRAME_DST', `[sp, #+40]')
define(`FRAME_SRC', `[sp, #+44]')
define(`SRC', `r12') C Overlap registers used in inner loop.
define(`DST', `COUNT')
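C Stack frame after the initial push {r0,r1, r4-r8,r10,r11,lr}:
C rounds and keys are saved at [sp] and [sp, #+4] so they can be
C reloaded for each block, while the dst and src arguments from the
C caller end up at [sp, #+40] and [sp, #+44] (40 bytes of saved
C registers in between).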
C AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(`AES_DECRYPT_ROUND', `
uxtb T0, $1
ldr $5, [TABLE, T0, lsl #2]
uxtb T0, $2
ldr $6, [TABLE, T0, lsl #2]
uxtb T0, $3
ldr $7, [TABLE, T0, lsl #2]
uxtb T0, $4
ldr $8, [TABLE, T0, lsl #2]
uxtb T0, $4, ror #8
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $1, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $2, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $3, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $3, ror #16
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $4, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $1, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $2, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $2, ror #24
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $3, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $4, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $1, ror #24
ldr T0, [TABLE, T0, lsl #2]
ldm $9, {$1,$2,$3,$4}
eor $8, $8, T0
sub TABLE, TABLE, #3072
eor $5, $5, $1
eor $6, $6, $2
sub $9, $9, #16
eor $7, $7, $3
eor $8, $8, $4
')
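C As a rough C sketch of one invocation (byte(w,n) being a
C hypothetical helper returning byte n of the 32-bit word w):
C
C   w0 = T0[byte(x0,0)] ^ T1[byte(x3,1)] ^ T2[byte(x2,2)] ^ T3[byte(x1,3)] ^ k[0];
C   w1 = T0[byte(x1,0)] ^ T1[byte(x0,1)] ^ T2[byte(x3,2)] ^ T3[byte(x2,3)] ^ k[1];
C   w2 = T0[byte(x2,0)] ^ T1[byte(x1,1)] ^ T2[byte(x0,2)] ^ T3[byte(x3,3)] ^ k[2];
C   w3 = T0[byte(x3,0)] ^ T1[byte(x2,1)] ^ T2[byte(x1,2)] ^ T3[byte(x0,3)] ^ k[3];
C
C where T0..T3 are the four 1 KB tables at TABLE, TABLE+1024, ...,
C and k[] is the round key read from the key pointer argument, which
C is then stepped back by 16 bytes for the next round key.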
.file "aes-decrypt-internal.asm"
C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp, #+4]
push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
ALIGN(16)
.Lblock_loop:
ldm sp, {COUNT, KEY}
add TABLE, TABLE, #AES_TABLE0
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD_INCR(SRC,KEY,W3, -28)
str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
.Lround_loop:
C Transform X -> W
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs COUNT, COUNT,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
ldr DST, FRAME_DST
AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
bx lr
EPILOGUE(_nettle_aes_decrypt)
C arm/v6/aes-encrypt-internal.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.arch armv6
include_src(`arm/aes.m4')
C Benchmarked at 706, 870, 963 cycles/block on Cortex-A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
define(`PARAM_ROUNDS', `r0')
define(`PARAM_KEYS', `r1')
define(`TABLE', `r2')
define(`LENGTH', `r3')
C On stack: DST, SRC
define(`W0', `r4')
define(`W1', `r5')
define(`W2', `r6')
define(`W3', `r7')
define(`T0', `r8')
define(`COUNT', `r10')
define(`KEY', `r11')
define(`X0', `r0') C Overlaps PARAM_ROUNDS and PARAM_KEYS
define(`X1', `r1')
define(`X2', `r12')
define(`X3', `r14') C lr
define(`FRAME_ROUNDS', `[sp]')
define(`FRAME_KEYS', `[sp, #+4]')
C 8 saved registers
define(`FRAME_DST', `[sp, #+40]')
define(`FRAME_SRC', `[sp, #+44]')
define(`SRC', `r12') C Overlap registers used in inner loop.
define(`DST', `COUNT')
C 53 instr.
C It's tempting to use eor with rotation, but that's slower.
C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(`AES_ENCRYPT_ROUND', `
uxtb T0, $1
ldr $5, [TABLE, T0, lsl #2]
uxtb T0, $2
ldr $6, [TABLE, T0, lsl #2]
uxtb T0, $3
ldr $7, [TABLE, T0, lsl #2]
uxtb T0, $4
ldr $8, [TABLE, T0, lsl #2]
uxtb T0, $2, ror #8
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $3, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $4, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $1, ror #8
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $3, ror #16
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $4, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $1, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $2, ror #16
ldr T0, [TABLE, T0, lsl #2]
eor $8, $8, T0
uxtb T0, $4, ror #24
add TABLE, TABLE, #1024
ldr T0, [TABLE, T0, lsl #2]
eor $5, $5, T0
uxtb T0, $1, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $6, $6, T0
uxtb T0, $2, ror #24
ldr T0, [TABLE, T0, lsl #2]
eor $7, $7, T0
uxtb T0, $3, ror #24
ldr T0, [TABLE, T0, lsl #2]
ldm $9!, {$1,$2,$3,$4}
eor $8, $8, T0
sub TABLE, TABLE, #3072
eor $5, $5, $1
eor $6, $6, $2
eor $7, $7, $3
eor $8, $8, $4
')
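C As a rough C sketch of one invocation (byte(w,n) being a
C hypothetical helper returning byte n of the 32-bit word w):
C
C   w0 = T0[byte(x0,0)] ^ T1[byte(x1,1)] ^ T2[byte(x2,2)] ^ T3[byte(x3,3)] ^ k[0];
C   w1 = T0[byte(x1,0)] ^ T1[byte(x2,1)] ^ T2[byte(x3,2)] ^ T3[byte(x0,3)] ^ k[1];
C   w2 = T0[byte(x2,0)] ^ T1[byte(x3,1)] ^ T2[byte(x0,2)] ^ T3[byte(x1,3)] ^ k[2];
C   w3 = T0[byte(x3,0)] ^ T1[byte(x0,1)] ^ T2[byte(x1,2)] ^ T3[byte(x2,3)] ^ k[3];
C
C where T0..T3 are the four 1 KB tables at TABLE, TABLE+1024, ...,
C and k[] is the round key loaded from the key pointer argument,
C which advances by 16 bytes (ldm with writeback).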
.file "aes-encrypt-internal.asm"
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
beq .Lend
ldr SRC, [sp, #+4]
push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
ALIGN(16)
.Lblock_loop:
ldm sp, {COUNT, KEY}
add TABLE, TABLE, #AES_TABLE0
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
.Lround_loop:
C Transform X -> W
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
subs COUNT, COUNT,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
C Final round
ldr DST, FRAME_DST
AES_FINAL_ROUND_V6(X0, X1, X2, X3, KEY, W0)
AES_FINAL_ROUND_V6(X1, X2, X3, X0, KEY, W1)
AES_FINAL_ROUND_V6(X2, X3, X0, X1, KEY, W2)
AES_FINAL_ROUND_V6(X3, X0, X1, X2, KEY, W3)
ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
bx lr
EPILOGUE(_nettle_aes_encrypt)