C arm/v6/sha1-compress.asm
ifelse(`
Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "sha1-compress.asm"
.arch armv6
define(`STATE', `r0')
define(`INPUT', `r1')
define(`SA', `r2')
define(`SB', `r3')
define(`SC', `r4')
define(`SD', `r5')
define(`SE', `r6')
define(`T0', `r7')
define(`SHIFT', `r8')
define(`WPREV', `r10')
define(`W', `r12')
define(`K', `lr')
C FIXME: Could avoid a mov with even and odd variants.
define(`LOAD', `
ldr T0, [INPUT], #+4
sel W, WPREV, T0
ror W, W, SHIFT
mov WPREV, T0
IF_LE(` rev W, W')
str W, [SP,#eval(4*$1)]
')
define(`EXPN', `
ldr W, [sp, #+eval(4*$1)]
ldr T0, [sp, #+eval(4*(($1 + 2) % 16))]
eor W, W, T0
ldr T0, [sp, #+eval(4*(($1 + 8) % 16))]
eor W, W, T0
ldr T0, [sp, #+eval(4*(($1 + 13) % 16))]
eor W, W, T0
ror W, W, #31
str W, [sp, #+eval(4*$1)]
')
C F1(B,C,D) = D^(B&(C^D))
C ROUND1(A,B,C,D,E)
define(`ROUND1', `
eor T0, $3, $4
add $5, $5, K
and T0, T0, $2
add $5, $5, $1, ror #27
eor T0, T0, $4
add $5, $5, W
ror $2, $2, #2
add $5, $5, T0
')
C F2(B,C,D) = B^C^D
define(`ROUND2', `
eor T0, $2, $4
add $5, $5, K
eor T0, T0, $3
add $5, $5, $1, ror #27
add $5, $5, W
ror $2, $2, #2
add $5, $5, T0
')
C F3(B,C,D) = (B&C) | (D & (B|C)) = (B & (C ^ D)) + (C & D)
define(`ROUND3', `
eor T0, $3, $4
add $5, $5, K
and T0, T0, $2
add $5, $5, $1, ror #27
add $5, $5, T0
add $5, $5, W
and T0, $3, $4
ror $2, $2, #2
add $5, $5, T0
')
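The ROUND1/ROUND3 macros rely on the function rewrites given in the comments. A minimal C check of the majority identity used by ROUND3 (a sketch; f1/f3 are illustrative names, not nettle functions):

#include <assert.h>
#include <stdint.h>

/* C models of the round functions commented above. */
static uint32_t f1(uint32_t b, uint32_t c, uint32_t d)
{ return d ^ (b & (c ^ d)); }           /* choice, as in ROUND1 */
static uint32_t f3(uint32_t b, uint32_t c, uint32_t d)
{ return (b & c) | (d & (b | c)); }     /* majority, as in ROUND3 */

int main(void)
{
  /* (B&C)|(D&(B|C)) = (B&(C^D)) + (C&D): one bit per operand suffices,
     since all operations act bitwise and the '+' never carries (the
     two terms cannot both have the same bit set). */
  for (uint32_t b = 0; b < 2; b++)
    for (uint32_t c = 0; c < 2; c++)
      for (uint32_t d = 0; d < 2; d++)
        assert(f3(b, c, d) == ((b & (c ^ d)) + (c & d)));
  (void)f1;
  return 0;
}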
C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
.text
.align 2
.LK1:
.int 0x5A827999
.LK2:
.int 0x6ED9EBA1
.LK3:
.int 0x8F1BBCDC
PROLOGUE(nettle_sha1_compress)
push {r4,r5,r6,r7,r8,r10,lr}
sub sp, sp, #64
C Sets SHIFT to 8*low bits of input pointer. Sets up GE flags
C as follows, corresponding to bytes to be used from WPREV
C SHIFT 0 8 16 24
C CPSR.GE 0000 1110 1100 1000
ands SHIFT, INPUT, #3
and INPUT, INPUT, $-4
ldr WPREV, [INPUT]
addne INPUT, INPUT, #4 C Unaligned input
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
IF_LE(` lsl W, T0, SHIFT')
IF_BE(` lsr W, T0, SHIFT')
uadd8 T0, T0, W C Sets APSR.GE bits
C on BE rotate right by 32-SHIFT bits
C because there is no rotate left
IF_BE(` rsb SHIFT, SHIFT, #32')
ldr K, .LK1
ldm STATE, {SA,SB,SC,SD,SE}
LOAD( 0) ROUND1(SA, SB, SC, SD, SE)
LOAD( 1) ROUND1(SE, SA, SB, SC, SD)
LOAD( 2) ROUND1(SD, SE, SA, SB, SC)
LOAD( 3) ROUND1(SC, SD, SE, SA, SB)
LOAD( 4) ROUND1(SB, SC, SD, SE, SA)
LOAD( 5) ROUND1(SA, SB, SC, SD, SE)
LOAD( 6) ROUND1(SE, SA, SB, SC, SD)
LOAD( 7) ROUND1(SD, SE, SA, SB, SC)
LOAD( 8) ROUND1(SC, SD, SE, SA, SB)
LOAD( 9) ROUND1(SB, SC, SD, SE, SA)
LOAD(10) ROUND1(SA, SB, SC, SD, SE)
LOAD(11) ROUND1(SE, SA, SB, SC, SD)
LOAD(12) ROUND1(SD, SE, SA, SB, SC)
LOAD(13) ROUND1(SC, SD, SE, SA, SB)
LOAD(14) ROUND1(SB, SC, SD, SE, SA)
LOAD(15) ROUND1(SA, SB, SC, SD, SE)
EXPN( 0) ROUND1(SE, SA, SB, SC, SD)
EXPN( 1) ROUND1(SD, SE, SA, SB, SC)
EXPN( 2) ROUND1(SC, SD, SE, SA, SB)
EXPN( 3) ROUND1(SB, SC, SD, SE, SA)
ldr K, .LK2
EXPN( 4) ROUND2(SA, SB, SC, SD, SE)
EXPN( 5) ROUND2(SE, SA, SB, SC, SD)
EXPN( 6) ROUND2(SD, SE, SA, SB, SC)
EXPN( 7) ROUND2(SC, SD, SE, SA, SB)
EXPN( 8) ROUND2(SB, SC, SD, SE, SA)
EXPN( 9) ROUND2(SA, SB, SC, SD, SE)
EXPN(10) ROUND2(SE, SA, SB, SC, SD)
EXPN(11) ROUND2(SD, SE, SA, SB, SC)
EXPN(12) ROUND2(SC, SD, SE, SA, SB)
EXPN(13) ROUND2(SB, SC, SD, SE, SA)
EXPN(14) ROUND2(SA, SB, SC, SD, SE)
EXPN(15) ROUND2(SE, SA, SB, SC, SD)
EXPN( 0) ROUND2(SD, SE, SA, SB, SC)
EXPN( 1) ROUND2(SC, SD, SE, SA, SB)
EXPN( 2) ROUND2(SB, SC, SD, SE, SA)
EXPN( 3) ROUND2(SA, SB, SC, SD, SE)
EXPN( 4) ROUND2(SE, SA, SB, SC, SD)
EXPN( 5) ROUND2(SD, SE, SA, SB, SC)
EXPN( 6) ROUND2(SC, SD, SE, SA, SB)
EXPN( 7) ROUND2(SB, SC, SD, SE, SA)
ldr K, .LK3
EXPN( 8) ROUND3(SA, SB, SC, SD, SE)
EXPN( 9) ROUND3(SE, SA, SB, SC, SD)
EXPN(10) ROUND3(SD, SE, SA, SB, SC)
EXPN(11) ROUND3(SC, SD, SE, SA, SB)
EXPN(12) ROUND3(SB, SC, SD, SE, SA)
EXPN(13) ROUND3(SA, SB, SC, SD, SE)
EXPN(14) ROUND3(SE, SA, SB, SC, SD)
EXPN(15) ROUND3(SD, SE, SA, SB, SC)
EXPN( 0) ROUND3(SC, SD, SE, SA, SB)
EXPN( 1) ROUND3(SB, SC, SD, SE, SA)
EXPN( 2) ROUND3(SA, SB, SC, SD, SE)
EXPN( 3) ROUND3(SE, SA, SB, SC, SD)
EXPN( 4) ROUND3(SD, SE, SA, SB, SC)
EXPN( 5) ROUND3(SC, SD, SE, SA, SB)
EXPN( 6) ROUND3(SB, SC, SD, SE, SA)
EXPN( 7) ROUND3(SA, SB, SC, SD, SE)
EXPN( 8) ROUND3(SE, SA, SB, SC, SD)
EXPN( 9) ROUND3(SD, SE, SA, SB, SC)
EXPN(10) ROUND3(SC, SD, SE, SA, SB)
EXPN(11) ROUND3(SB, SC, SD, SE, SA)
ldr K, .LK4
EXPN(12) ROUND2(SA, SB, SC, SD, SE)
EXPN(13) ROUND2(SE, SA, SB, SC, SD)
EXPN(14) ROUND2(SD, SE, SA, SB, SC)
EXPN(15) ROUND2(SC, SD, SE, SA, SB)
EXPN( 0) ROUND2(SB, SC, SD, SE, SA)
EXPN( 1) ROUND2(SA, SB, SC, SD, SE)
EXPN( 2) ROUND2(SE, SA, SB, SC, SD)
EXPN( 3) ROUND2(SD, SE, SA, SB, SC)
EXPN( 4) ROUND2(SC, SD, SE, SA, SB)
EXPN( 5) ROUND2(SB, SC, SD, SE, SA)
EXPN( 6) ROUND2(SA, SB, SC, SD, SE)
EXPN( 7) ROUND2(SE, SA, SB, SC, SD)
EXPN( 8) ROUND2(SD, SE, SA, SB, SC)
EXPN( 9) ROUND2(SC, SD, SE, SA, SB)
EXPN(10) ROUND2(SB, SC, SD, SE, SA)
EXPN(11) ROUND2(SA, SB, SC, SD, SE)
EXPN(12) ROUND2(SE, SA, SB, SC, SD)
EXPN(13) ROUND2(SD, SE, SA, SB, SC)
EXPN(14) ROUND2(SC, SD, SE, SA, SB)
EXPN(15) ROUND2(SB, SC, SD, SE, SA)
C Use registers we no longer need.
ldm STATE, {INPUT,T0,SHIFT,W,K}
add SA, SA, INPUT
add SB, SB, T0
add SC, SC, SHIFT
add SD, SD, W
add SE, SE, K
add sp, sp, #64
stm STATE, {SA,SB,SC,SD,SE}
pop {r4,r5,r6,r7,r8,r10,pc}
EPILOGUE(nettle_sha1_compress)
.LK4:
.int 0xCA62C1D6
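The SHIFT/GE setup at the top of the function above reads unaligned input through aligned word loads, combining adjacent words with sel and ror. A little-endian C model of the combining step (a sketch with an illustrative name; it omits the rev byte swap the assembly does for SHA-1's big-endian words):

#include <stdint.h>

/* Round the pointer down to a word boundary, then merge bytes of two
   adjacent aligned words; shift = 8 * (p & 3) as in the assembly. */
static uint32_t load_word(const uint8_t *p)
{
  uintptr_t a = (uintptr_t)p;
  const uint32_t *w = (const uint32_t *)(a & ~(uintptr_t)3);
  unsigned shift = 8 * (unsigned)(a & 3);
  if (shift == 0)
    return w[0];
  /* sel picks bytes from the two words under the GE flags set up once
     at entry; the ror by SHIFT then orders them.  In C the pair of
     instructions amounts to a funnel shift. */
  return (w[0] >> shift) | (w[1] << (32 - shift));
}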
C arm/v6/sha256-compress-n.asm
ifelse(`
Copyright (C) 2013, 2022 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "sha256-compress-n.asm"
.arch armv6
define(`STATE', `r0')
define(`K', `r1')
define(`BLOCKS', `r2')
define(`INPUT', `r3')
define(`SA', `r2') C Overlap BLOCKS
define(`SB', `r4')
define(`SC', `r5')
define(`SD', `r6')
define(`SE', `r7')
define(`SF', `r8')
define(`SG', `r10')
define(`SH', `r11')
define(`T0', `r12')
define(`T1', `r3') C Overlap INPUT
define(`COUNT', `r0') C Overlap STATE
define(`W', `r14')
C Used for data load. Must not clobber STATE (r0), K (r1) or INPUT (r3)
define(`I0', `r2')
define(`I1', `r4')
define(`I2', `r5')
define(`I3', `r6')
define(`I4', `r7')
define(`DST', `r8')
define(`SHIFT', `r10')
define(`ILEFT', `r11')
define(`EXPN', `
ldr W, [sp, #+eval(4*$1)]
ldr T0, [sp, #+eval(4*(($1 + 14) % 16))]
ror T1, T0, #17
eor T1, T1, T0, ror #19
eor T1, T1, T0, lsr #10
add W, W, T1
ldr T0, [sp, #+eval(4*(($1 + 9) % 16))]
add W, W, T0
ldr T0, [sp, #+eval(4*(($1 + 1) % 16))]
ror T1, T0, #7
eor T1, T1, T0, ror #18
eor T1, T1, T0, lsr #3
add W, W, T1
str W, [sp, #+eval(4*$1)]
')
C ROUND(A,B,C,D,E,F,G,H)
C
C H += S1(E) + Choice(E,F,G) + K + W
C D += H
C H += S0(A) + Majority(A,B,C)
C
C Where
C
C S1(E) = E<<<26 ^ E<<<21 ^ E<<<7
C S0(A) = A<<<30 ^ A<<<19 ^ A<<<10
C Choice (E, F, G) = G^(E&(F^G))
C Majority (A,B,C) = (A&B) + (C&(A^B))
define(`ROUND', `
ror T0, $5, #6
eor T0, T0, $5, ror #11
eor T0, T0, $5, ror #25
add $8, $8, T0
eor T0, $6, $7
and T0, T0, $5
eor T0, T0, $7
add $8,$8, T0
ldr T0, [K], #+4
add $8, $8, W
add $8, $8, T0
add $4, $4, $8
ror T0, $1, #2
eor T0, T0, $1, ror #13
eor T0, T0, $1, ror #22
add $8, $8, T0
and T0, $1, $2
add $8, $8, T0
eor T0, $1, $2
and T0, T0, $3
add $8, $8, T0
')
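The comment block above pins down the whole scalar computation; as a cross-reference, here are the round and the EXPN expansion in C (sketch only; the left rotates written <<< r in the comment appear as right rotates by 32-r):

#include <stdint.h>

static uint32_t ror32(uint32_t x, unsigned n)
{ return (x >> n) | (x << (32 - n)); }

/* h += S1(e) + Choice(e,f,g) + k + w;  d += h;
   h += S0(a) + Majority(a,b,c). */
static void sha256_round(uint32_t a, uint32_t b, uint32_t c, uint32_t *d,
                         uint32_t e, uint32_t f, uint32_t g, uint32_t *h,
                         uint32_t k, uint32_t w)
{
  uint32_t s1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
  uint32_t ch = g ^ (e & (f ^ g));
  *h += s1 + ch + k + w;
  *d += *h;
  uint32_t s0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
  uint32_t maj = (a & b) + (c & (a ^ b));
  *h += s0 + maj;
}

/* Message expansion matching EXPN; the assembly keeps the 16-word
   circular buffer on the stack. */
static uint32_t sha256_expand(uint32_t w[16], unsigned i)
{
  uint32_t x = w[(i + 14) % 16], y = w[(i + 1) % 16];
  uint32_t s1 = ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
  uint32_t s0 = ror32(y, 7) ^ ror32(y, 18) ^ (y >> 3);
  return w[i] += s1 + w[(i + 9) % 16] + s0;
}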
define(`NOEXPN', `
ldr W, [sp, + $1]
add $1, $1, #4
')
.text
.align 2
define(`SHIFT_OFFSET', 64)
define(`INPUT_OFFSET', 68)
define(`I0_OFFSET', 72)
define(`STATE_OFFSET', 76)
define(`K_OFFSET', 80)
define(`BLOCKS_OFFSET', 84)
C const uint8_t *
C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
C size_t blocks, const uint8_t *input)
PROLOGUE(_nettle_sha256_compress_n)
cmp BLOCKS, #0
bne .Lwork
mov r0, INPUT
bx lr
.Lwork:
C Also save STATE (r0), K (r1) and BLOCKS (r2)
push {r0,r1,r2,r4,r5,r6,r7,r8,r10,r11,r12,r14}
sub sp, sp, #STATE_OFFSET
C Load data up front, since we don't have enough registers
C to load and shift on-the-fly
ands SHIFT, INPUT, #3
and INPUT, INPUT, $-4
ldr I0, [INPUT]
addne INPUT, INPUT, #4
lsl SHIFT, SHIFT, #3
mov T0, #0
movne T0, #-1
IF_LE(` lsl I1, T0, SHIFT')
IF_BE(` lsr I1, T0, SHIFT')
uadd8 T0, T0, I1 C Sets APSR.GE bits
C on BE rotate right by 32-SHIFT bits
C because there is no rotate left
IF_BE(` rsb SHIFT, SHIFT, #32')
str SHIFT, [sp, #SHIFT_OFFSET]
.Loop_block:
mov DST, sp
mov ILEFT, #4
.Lcopy:
ldm INPUT!, {I1,I2,I3,I4}
sel I0, I0, I1
ror I0, I0, SHIFT
IF_LE(` rev I0, I0')
sel I1, I1, I2
ror I1, I1, SHIFT
IF_LE(` rev I1, I1')
sel I2, I2, I3
ror I2, I2, SHIFT
IF_LE(` rev I2, I2')
sel I3, I3, I4
ror I3, I3, SHIFT
IF_LE(` rev I3, I3')
subs ILEFT, ILEFT, #1
stm DST!, {I0,I1,I2,I3}
mov I0, I4
bne .Lcopy
str INPUT, [sp, #INPUT_OFFSET]
str I0, [sp, #I0_OFFSET]
C Process block, with input at sp, expanded on the fly
ldm STATE, {SA,SB,SC,SD,SE,SF,SG,SH}
mov COUNT,#0
.Loop1:
NOEXPN(COUNT) ROUND(SA,SB,SC,SD,SE,SF,SG,SH)
NOEXPN(COUNT) ROUND(SH,SA,SB,SC,SD,SE,SF,SG)
NOEXPN(COUNT) ROUND(SG,SH,SA,SB,SC,SD,SE,SF)
NOEXPN(COUNT) ROUND(SF,SG,SH,SA,SB,SC,SD,SE)
NOEXPN(COUNT) ROUND(SE,SF,SG,SH,SA,SB,SC,SD)
NOEXPN(COUNT) ROUND(SD,SE,SF,SG,SH,SA,SB,SC)
NOEXPN(COUNT) ROUND(SC,SD,SE,SF,SG,SH,SA,SB)
NOEXPN(COUNT) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
cmp COUNT,#64
bne .Loop1
mov COUNT, #3
.Loop2:
EXPN( 0) ROUND(SA,SB,SC,SD,SE,SF,SG,SH)
EXPN( 1) ROUND(SH,SA,SB,SC,SD,SE,SF,SG)
EXPN( 2) ROUND(SG,SH,SA,SB,SC,SD,SE,SF)
EXPN( 3) ROUND(SF,SG,SH,SA,SB,SC,SD,SE)
EXPN( 4) ROUND(SE,SF,SG,SH,SA,SB,SC,SD)
EXPN( 5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC)
EXPN( 6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB)
EXPN( 7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
EXPN( 8) ROUND(SA,SB,SC,SD,SE,SF,SG,SH)
EXPN( 9) ROUND(SH,SA,SB,SC,SD,SE,SF,SG)
EXPN(10) ROUND(SG,SH,SA,SB,SC,SD,SE,SF)
EXPN(11) ROUND(SF,SG,SH,SA,SB,SC,SD,SE)
EXPN(12) ROUND(SE,SF,SG,SH,SA,SB,SC,SD)
EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC)
EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB)
subs COUNT, COUNT, #1
EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA)
bne .Loop2
ldr STATE, [sp, #STATE_OFFSET]
C Use registers we no longer need.
ldm STATE, {K, T1, T0, W}
add SA, SA, K
add SB, SB, T1
add SC, SC, T0
add SD, SD, W
stm STATE!, {SA,SB,SC,SD}
ldm STATE, {K, T1, T0, W}
add SE, SE, K
add SF, SF, T1
add SG, SG, T0
add SH, SH, W
stm STATE, {SE,SF,SG,SH}
sub STATE, STATE, #16
ldr BLOCKS, [sp, #BLOCKS_OFFSET]
subs BLOCKS, BLOCKS, #1
str BLOCKS, [sp, #BLOCKS_OFFSET]
ldr SHIFT, [sp, #SHIFT_OFFSET]
ldr K, [sp, #K_OFFSET]
ldr INPUT, [sp, #INPUT_OFFSET]
ldr I0, [sp, #I0_OFFSET]
bne .Loop_block
C Restore input pointer adjustment
IF_BE(` rsbs SHIFT, SHIFT, #32')
IF_LE(` cmp SHIFT, #0')
subne INPUT, INPUT, #4
orr r0, INPUT, SHIFT, lsr #3
C Discard saved STATE, K and BLOCKS.
add sp, sp, #STATE_OFFSET + 12
pop {r4,r5,r6,r7,r8,r10,r11,r12,pc}
EPILOGUE(_nettle_sha256_compress_n)
General-purpose Registers[1]
There are thirty-one 64-bit general-purpose (integer) registers visible to
the A64 instruction set; these are labeled r0-r30. In a 64-bit context these
registers are normally referred to using the names x0-x30; in a 32-bit context
the registers are specified by using w0-w30. Additionally, a stack-pointer
register, SP, can be used with a restricted number of instructions.
The first eight registers, r0-r7, are used to pass argument values into
a subroutine and to return result values from a function.
Software developers creating platform-independent code are advised to avoid
using r18 if at all possible. Most compilers provide a mechanism to prevent
specific registers from being used for general allocation; portable hand-coded
assembler should avoid it entirely. It should not be assumed that treating the
register as callee-saved will be sufficient to satisfy the requirements of the
platform. Virtualization code must, of course, treat the register as it would
any other resource provided to the virtual machine.
A subroutine invocation must preserve the contents of the registers r19-r29
and SP. All 64 bits of each value stored in r19-r29 must be preserved, even
when using the ILP32 data model.
SIMD and Floating-Point Registers[1]
Unlike in AArch32, in AArch64 the 128-bit and 64-bit views of a SIMD and
Floating-Point register do not overlap multiple registers in a narrower view,
so q1, d1 and s1 all refer to the same entry in the register bank.
The first eight registers, v0-v7, are used to pass argument values into
a subroutine and to return result values from a function. They may also
be used to hold intermediate values within a routine (but, in general,
only between subroutine calls).
Registers v8-v15 must be preserved by a callee across subroutine calls;
the remaining registers (v0-v7, v16-v31) do not need to be preserved
(or should be preserved by the caller). Additionally, only the bottom 64 bits
of each value stored in v8-v15 need to be preserved.
Endianness
Like arm, aarch64 can run with little-endian or big-endian memory
accesses. Endianness is handled exclusively on load and store operations.
Register layout and operation behaviour is identical in both modes.
When writing SIMD code, endianness interaction with vector loads and stores may
exhibit seemingly unintuitive behaviour, particularly when mixing normal and
vector load/store operations.
See [2] for a good overview, particularly into the pitfalls of using
ldr/str vs. ld1/st1.
For example, ld1 {v1.2d,v2.2d},[x0] will load v1 and v2 with elements of a
one-dimensional vector from consecutive memory locations. So v1.d[0] will be
read from x0+0, v1.d[1] from x0+8 (bytes) and v2.d[0] from x0+16 and v2.d[1]
from x0+24. That'll be the same in LE and BE mode because it is the structure
of the vector prescribed by the load operation. Endianness will be applied to
the individual doublewords but the order in which they're loaded from memory
and in which they're put into d[0] and d[1] won't change.
Another way is to explicitly load a vector of bytes using ld1 {v1.16b,
v2.16b},[x0]. This will load x0+0 into v1.b[0], x0+1 (byte) into v1.b[1] and so
forth. This load (or store) is endianness-neutral and behaves identically in
LE and BE mode.
Care must however be taken when switching views onto the registers: d[0] is
mapped onto b[0] through b[7] and b[0] will be the least significant byte in
d[0] and b[7] will be MSB. This layout is also the same in both memory
endianness modes. ld1 {v1.8b}, however, will always load a vector of eight
bytes as consecutive bytes from memory into b[0] through b[7]. When accessed
through d[0] this will only appear as the expected doubleword-sized number if
it was indeed stored little-endian in memory.
Something similar happens when loading a vector of doublewords (ld1
{v1.2d},[x0]) and then accessing individual bytes of it. Bytes will only be at
the expected indices if the doublewords are indeed stored in memory in the
current memory endianness. It is therefore most intuitive to use the
appropriate vector element width for the data being loaded or stored, so that
the necessary endianness correction is applied.
Finally, ldr/str are not vector operations. When used to load a 128-bit
quadword, they will apply endianness to the whole quadword. Therefore
particular care must be taken if the loaded data is then to be regarded as
elements of e.g. a doubleword vector. Indices may appear reversed on
big-endian systems (because they are).
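A short intrinsics program makes the distinction concrete (a sketch assuming <arm_neon.h> on an AArch64 compiler; the memcpy merely keeps the model free of aliasing concerns):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  uint8_t buf[16];
  for (int i = 0; i < 16; i++)
    buf[i] = (uint8_t)i;

  /* ld1 {v.16b}: endianness-neutral, lane 0 is buf[0] on LE and BE. */
  uint8x16_t bytes = vld1q_u8(buf);

  /* ld1 {v.2d}: each doubleword element is assembled according to the
     current memory endianness. */
  uint64_t tmp[2];
  memcpy(tmp, buf, 16);
  uint64x2_t dwords = vld1q_u64(tmp);

  printf("b[0] = %u\n", vgetq_lane_u8(bytes, 0));     /* 0 everywhere */
  /* 0706050403020100 on little-endian, 0001020304050607 on big-endian */
  printf("d[0] = %016llx\n",
         (unsigned long long)vgetq_lane_u64(dwords, 0));
  return 0;
}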
Hardware-accelerated SHA Instructions
The SHA-optimized cores are implemented using the SHA hashing instructions
added to AArch64 by the crypto extensions. The repository [3] illustrates the
use of those instructions for optimizing SHA hashing functions.
[1] https://github.com/ARM-software/abi-aa/releases/download/2020Q4/aapcs64.pdf
[2] https://llvm.org/docs/BigEndianNEON.html
[3] https://github.com/noloader/SHA-Intrinsics
C arm64/chacha-2core.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Torbjörn Granlund
Copyright (C) 2022 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Register usage:
C Arguments
define(`DST', `x0')
define(`SRC', `x1')
define(`ROUNDS', `x2')
C Working state
define(`ROT24', `v0')
define(`T0', `v16')
C State, even elements in X, odd elements in Y
define(`X0', `v17')
define(`X1', `v18')
define(`X2', `v19')
define(`X3', `v20')
define(`Y0', `v21')
define(`Y1', `v22')
define(`Y2', `v23')
define(`Y3', `v24')
C Original input state
define(`S0', `v25')
define(`S1', `v26')
define(`S2', `v27')
define(`S3', `v28')
define(`S3p1', `v29')
define(`TMP0', `v30')
define(`TMP1', `v31')
C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_2core)
eor X1.16b, X1.16b, X1.16b
mov w3, #1
mov X1.s[0], w3
add x3, SRC, #48
ld1 {X3.4s}, [x3]
add Y3.4s, X3.4s, X1.4s C Increment 64-bit counter for second block
cmhi Y3.4s, X3.4s, Y3.4s C All-ones mask in lane 0 on carry-out
ext Y3.16b, Y3.16b, Y3.16b, #12 C Rotate mask into lane 1
sub Y3.4s, X1.4s, Y3.4s C X1 - mask = {1, carry, 0, 0}
.Lshared_entry:
adr x3, .Lrot24
ld1 {ROT24.4s},[x3]
add Y3.4s, Y3.4s, X3.4s
C Load state
ld1 {X0.4s,X1.4s,X2.4s}, [SRC]
mov S0.16b, X0.16b
mov S1.16b, X1.16b
mov S2.16b, X2.16b
mov S3.16b, X3.16b
mov S3p1.16b, Y3.16b
trn2 Y0.4s, X0.4s, X0.4s C 1 1 3 3
trn1 X0.4s, X0.4s, X0.4s C 0 0 2 2
trn2 Y1.4s, X1.4s, X1.4s C 5 5 7 7
trn1 X1.4s, X1.4s, X1.4s C 4 4 6 6
trn2 Y2.4s, X2.4s, X2.4s C 9 9 11 11
trn1 X2.4s, X2.4s, X2.4s C 8 8 10 10
trn2 Y3.4s, X3.4s, S3p1.4s C 13 13 15 15
trn1 X3.4s, X3.4s, S3p1.4s C 12 12 14 14
.Loop:
C Register layout (A is first block, B is second block)
C
C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7
C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11
C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15
add X0.4s, X0.4s, X1.4s
add Y0.4s, Y0.4s, Y1.4s
eor X3.16b, X3.16b, X0.16b
eor Y3.16b, Y3.16b, Y0.16b
rev32 X3.8h, X3.8h
rev32 Y3.8h, Y3.8h
add X2.4s, X2.4s, X3.4s
add Y2.4s, Y2.4s, Y3.4s
eor TMP0.16b, X1.16b, X2.16b
eor TMP1.16b, Y1.16b, Y2.16b
ushr X1.4s, TMP0.4s, #20
ushr Y1.4s, TMP1.4s, #20
sli X1.4s, TMP0.4s, #12
sli Y1.4s, TMP1.4s, #12
add X0.4s, X0.4s, X1.4s
add Y0.4s, Y0.4s, Y1.4s
eor X3.16b, X3.16b, X0.16b
eor Y3.16b, Y3.16b, Y0.16b
tbl X3.16b, {X3.16b}, ROT24.16b
tbl Y3.16b, {Y3.16b}, ROT24.16b
add X2.4s, X2.4s, X3.4s
add Y2.4s, Y2.4s, Y3.4s
eor TMP0.16b, X1.16b, X2.16b
eor TMP1.16b, Y1.16b, Y2.16b
ushr X1.4s, TMP0.4s, #25
ushr Y1.4s, TMP1.4s, #25
sli X1.4s, TMP0.4s, #7
sli Y1.4s, TMP1.4s, #7
ext X1.16b, X1.16b, X1.16b, #8
ext X2.16b, X2.16b, X2.16b, #8
ext Y2.16b, Y2.16b, Y2.16b, #8
ext Y3.16b, Y3.16b, Y3.16b, #8
C Register layout:
C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
C Y1: A5 B5 A7 B7 X1: A6 B6 A4 B4 (X1 swapped)
C X2: A10 B10 A8 B8 Y2: A11 B11 A9 B9 (X2, Y2 swapped)
C Y3: A15 B15 A13 B13 X3: A12 B12 A14 B14 (Y3 swapped)
add X0.4s, X0.4s, Y1.4s
add Y0.4s, Y0.4s, X1.4s
eor Y3.16b, Y3.16b, X0.16b
eor X3.16b, X3.16b, Y0.16b
rev32 Y3.8h, Y3.8h
rev32 X3.8h, X3.8h
add X2.4s, X2.4s, Y3.4s
add Y2.4s, Y2.4s, X3.4s
eor TMP0.16b, Y1.16b, X2.16b
eor TMP1.16b, X1.16b, Y2.16b
ushr Y1.4s, TMP0.4s, #20
ushr X1.4s, TMP1.4s, #20
sli Y1.4s, TMP0.4s, #12
sli X1.4s, TMP1.4s, #12
add X0.4s, X0.4s, Y1.4s
add Y0.4s, Y0.4s, X1.4s
eor Y3.16b, Y3.16b, X0.16b
eor X3.16b, X3.16b, Y0.16b
tbl Y3.16b, {Y3.16b}, ROT24.16b
tbl X3.16b, {X3.16b}, ROT24.16b
add X2.4s, X2.4s, Y3.4s
add Y2.4s, Y2.4s, X3.4s
eor TMP0.16b, Y1.16b, X2.16b
eor TMP1.16b, X1.16b, Y2.16b
ushr Y1.4s, TMP0.4s, #25
ushr X1.4s, TMP1.4s, #25
sli Y1.4s, TMP0.4s, #7
sli X1.4s, TMP1.4s, #7
ext X1.16b, X1.16b, X1.16b, #8
ext X2.16b, X2.16b, X2.16b, #8
ext Y2.16b, Y2.16b, Y2.16b, #8
ext Y3.16b, Y3.16b, Y3.16b, #8
subs ROUNDS, ROUNDS, #2
b.ne .Loop
trn1 T0.4s, X0.4s, Y0.4s
trn2 Y0.4s, X0.4s, Y0.4s
trn1 X0.4s, X1.4s, Y1.4s
trn2 Y1.4s, X1.4s, Y1.4s
trn1 X1.4s, X2.4s, Y2.4s
trn2 Y2.4s, X2.4s, Y2.4s
trn1 X2.4s, X3.4s, Y3.4s
trn2 Y3.4s, X3.4s, Y3.4s
add T0.4s, T0.4s, S0.4s
add Y0.4s, Y0.4s, S0.4s
add X0.4s, X0.4s, S1.4s
add Y1.4s, Y1.4s, S1.4s
add X1.4s, X1.4s, S2.4s
add Y2.4s, Y2.4s, S2.4s
add X2.4s, X2.4s, S3.4s
add Y3.4s, Y3.4s, S3p1.4s
st1 {T0.16b,X0.16b,X1.16b,X2.16b}, [DST], #64
st1 {Y0.16b,Y1.16b,Y2.16b,Y3.16b}, [DST]
ret
EPILOGUE(_nettle_chacha_2core)
PROLOGUE(_nettle_chacha_2core32)
eor Y3.16b, Y3.16b, Y3.16b C {0,0,...,0}
mov w3, #1
mov Y3.s[0], w3 C {1,0,...,0}
add x3, SRC, #48
ld1 {X3.4s}, [x3]
b .Lshared_entry
EPILOGUE(_nettle_chacha_2core32)
.align 4
.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
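The prologue of _nettle_chacha_2core above derives the second block's counter by adding one and folding in the cmhi carry mask. A C model of the intended arithmetic (a sketch, assuming state words 12 and 13 hold the low and high counter halves, as in ChaCha's usual layout):

#include <stdint.h>

/* 64-bit counter increment for the second block: cmhi produces an
   all-ones mask exactly when the low word wrapped, i.e. when
   (c0 + 1) < c0 unsigned. */
static void second_block_counter(const uint32_t src[16], uint32_t ctr[2])
{
  uint32_t c0 = src[12], c1 = src[13];
  ctr[0] = c0 + 1;
  ctr[1] = c1 + (ctr[0] < c0);   /* carry into the high word */
}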
C arm64/chacha-4core.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Torbjörn Granlund
Copyright (C) 2022 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Register usage:
C Arguments
define(`DST', `x0')
define(`SRC', `x1')
define(`ROUNDS', `x2')
C Working state
C During the loop, used to save the original values for last 4 words
C of each block. Also used as temporaries for transpose.
define(`T0', `v0')
define(`T1', `v1')
define(`T2', `v2')
define(`T3', `v3')
define(`TMP0', `v4')
define(`TMP1', `v5')
define(`TMP2', `v6')
define(`TMP3', `v7')
define(`ROT24', `v8')
C P1 and P2 map a state index to its register. This works around POSIX m4's
C lack of multi-digit argument references ($10 and up) inside the QR macro.
C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
define(`P1',
`ifelse($1, 0, v16, $1, 1, v17, $1, 2, v18, $1, 3, v19, $1, 4, v20, $1, 5, v21, $1, 6, v22, $1, 7, v23, $1, 8, v24, $1, 9, v25, $1, 10, v26, $1, 11, v27, $1, 12, v28, $1, 13, v29, $1, 14, v30, $1, 15, v31)')
define(`P2',
`ifelse($1, 0, v16, $1, 1, v21, $1, 2, v26, $1, 3, v31, $1, 4, v20, $1, 5, v25, $1, 6, v30, $1, 7, v19, $1, 8, v24, $1, 9, v29, $1, 10, v18, $1, 11, v23, $1, 12, v28, $1, 13, v17, $1, 14, v22, $1, 15, v27)')
C Main loop for round
define(`QR',`
add $1(0).4s, $1(0).4s, $1(1).4s
add $1(4).4s, $1(4).4s, $1(5).4s
add $1(8).4s, $1(8).4s, $1(9).4s
add $1(12).4s, $1(12).4s, $1(13).4s
eor $1(3).16b, $1(3).16b, $1(0).16b
eor $1(7).16b, $1(7).16b, $1(4).16b
eor $1(11).16b, $1(11).16b, $1(8).16b
eor $1(15).16b, $1(15).16b, $1(12).16b
rev32 $1(3).8h, $1(3).8h
rev32 $1(7).8h, $1(7).8h
rev32 $1(11).8h, $1(11).8h
rev32 $1(15).8h, $1(15).8h
add $1(2).4s, $1(2).4s, $1(3).4s
add $1(6).4s, $1(6).4s, $1(7).4s
add $1(10).4s, $1(10).4s, $1(11).4s
add $1(14).4s, $1(14).4s, $1(15).4s
eor TMP0.16b, $1(1).16b, $1(2).16b
eor TMP1.16b, $1(5).16b, $1(6).16b
eor TMP2.16b, $1(9).16b, $1(10).16b
eor TMP3.16b, $1(13).16b, $1(14).16b
ushr $1(1).4s, TMP0.4s, #20
ushr $1(5).4s, TMP1.4s, #20
ushr $1(9).4s, TMP2.4s, #20
ushr $1(13).4s, TMP3.4s, #20
sli $1(1).4s, TMP0.4s, #12
sli $1(5).4s, TMP1.4s, #12
sli $1(9).4s, TMP2.4s, #12
sli $1(13).4s, TMP3.4s, #12
add $1(0).4s, $1(0).4s, $1(1).4s
add $1(4).4s, $1(4).4s, $1(5).4s
add $1(8).4s, $1(8).4s, $1(9).4s
add $1(12).4s, $1(12).4s, $1(13).4s
eor $1(3).16b, $1(3).16b, $1(0).16b
eor $1(7).16b, $1(7).16b, $1(4).16b
eor $1(11).16b, $1(11).16b, $1(8).16b
eor $1(15).16b, $1(15).16b, $1(12).16b
tbl $1(3).16b, {$1(3).16b}, ROT24.16b
tbl $1(7).16b, {$1(7).16b}, ROT24.16b
tbl $1(11).16b, {$1(11).16b}, ROT24.16b
tbl $1(15).16b, {$1(15).16b}, ROT24.16b
add $1(2).4s, $1(2).4s, $1(3).4s
add $1(6).4s, $1(6).4s, $1(7).4s
add $1(10).4s, $1(10).4s, $1(11).4s
add $1(14).4s, $1(14).4s, $1(15).4s
eor TMP0.16b, $1(1).16b, $1(2).16b
eor TMP1.16b, $1(5).16b, $1(6).16b
eor TMP2.16b, $1(9).16b, $1(10).16b
eor TMP3.16b, $1(13).16b, $1(14).16b
ushr $1(1).4s, TMP0.4s, #25
ushr $1(5).4s, TMP1.4s, #25
ushr $1(9).4s, TMP2.4s, #25
ushr $1(13).4s, TMP3.4s, #25
sli $1(1).4s, TMP0.4s, #7
sli $1(5).4s, TMP1.4s, #7
sli $1(9).4s, TMP2.4s, #7
sli $1(13).4s, TMP3.4s, #7
')
define(`TRANSPOSE',`
zip1 T0.4s, $1.4s, $3.4s C A0 A2 B0 B2
zip1 T1.4s, $2.4s, $4.4s C A1 A3 B1 B3
zip2 T2.4s, $1.4s, $3.4s C C0 C2 D0 D2
zip2 T3.4s, $2.4s, $4.4s C C1 C3 D1 D3
zip1 $1.4s, T0.4s, T1.4s C A0 A1 A2 A3
zip2 $2.4s, T0.4s, T1.4s C B0 B1 B2 B3
zip1 $3.4s, T2.4s, T3.4s C C0 C1 C2 C3
zip2 $4.4s, T2.4s, T3.4s C D0 D1 D2 D3
')
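TRANSPOSE is a 4x4 word transpose built from two rounds of interleaves. A C model of the zip1/zip2 element semantics and of the macro (illustrative names; in the assembly the four rows live in SIMD registers):

#include <stdint.h>

/* Element semantics of zip1/zip2 on .4s operands. */
static void zip1_4s(const uint32_t a[4], const uint32_t b[4], uint32_t r[4])
{ r[0] = a[0]; r[1] = b[0]; r[2] = a[1]; r[3] = b[1]; }
static void zip2_4s(const uint32_t a[4], const uint32_t b[4], uint32_t r[4])
{ r[0] = a[2]; r[1] = b[2]; r[2] = a[3]; r[3] = b[3]; }

/* Two interleave rounds transpose the 4x4 matrix, exactly as the macro
   does with T0-T3 as temporaries. */
static void transpose4(uint32_t v1[4], uint32_t v2[4],
                       uint32_t v3[4], uint32_t v4[4])
{
  uint32_t t0[4], t1[4], t2[4], t3[4];
  zip1_4s(v1, v3, t0); zip1_4s(v2, v4, t1);
  zip2_4s(v1, v3, t2); zip2_4s(v2, v4, t3);
  zip1_4s(t0, t1, v1); zip2_4s(t0, t1, v2);
  zip1_4s(t2, t3, v3); zip2_4s(t2, t3, v4);
}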
C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_4core)
mov w3, #1
dup TMP2.4s, w3 C Apply counter carries
.Lshared_entry:
C Save callee-save register d8. Only the bottom 64 bits of v8-v15 are
C callee-saved, so stashing its low half in a scratch GP register suffices.
fmov x3, d8
adr x4, .Lcnts
ld1 {TMP3.4s,ROT24.4s},[x4]
C Load state and splat
ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [SRC]
dup v20.4s, v16.s[1]
dup v24.4s, v16.s[2]
dup v28.4s, v16.s[3]
dup v16.4s, v16.s[0]
dup v21.4s, v17.s[1]
dup v25.4s, v17.s[2]
dup v29.4s, v17.s[3]
dup v17.4s, v17.s[0]
dup v22.4s, v18.s[1]
dup v26.4s, v18.s[2]
dup v30.4s, v18.s[3]
dup v18.4s, v18.s[0]
dup v23.4s, v19.s[1]
dup v27.4s, v19.s[2]
dup v31.4s, v19.s[3]
dup v19.4s, v19.s[0]
add v19.4s, v19.4s, TMP3.4s C low adds
cmhi TMP1.4s, TMP3.4s, v19.4s C compute carry-out
and TMP1.16b, TMP1.16b, TMP2.16b C discard carries for 32-bit counter variant
add v23.4s, v23.4s, TMP1.4s C apply carries
C Save all 4x4 of the last words.
mov T0.16b, v19.16b
mov T1.16b, v23.16b
mov T2.16b, v27.16b
mov T3.16b, v31.16b
.Loop:
QR(`P1')
QR(`P2')
subs ROUNDS, ROUNDS, #2
b.ne .Loop
C Add in saved original words, including counters, before
C transpose.
add v19.4s, v19.4s, T0.4s
add v23.4s, v23.4s, T1.4s
add v27.4s, v27.4s, T2.4s
add v31.4s, v31.4s, T3.4s
TRANSPOSE(v16, v20,v24, v28)
TRANSPOSE(v17, v21, v25, v29)
TRANSPOSE(v18, v22, v26, v30)
TRANSPOSE(v19, v23, v27, v31)
ld1 {T0.4s,T1.4s,T2.4s}, [SRC]
add v16.4s, v16.4s, T0.4s
add v20.4s, v20.4s, T0.4s
add v24.4s, v24.4s, T0.4s
add v28.4s, v28.4s, T0.4s
add v17.4s, v17.4s, T1.4s
add v21.4s, v21.4s, T1.4s
add v25.4s, v25.4s, T1.4s
add v29.4s, v29.4s, T1.4s
add v18.4s, v18.4s, T2.4s
add v22.4s, v22.4s, T2.4s
add v26.4s, v26.4s, T2.4s
add v30.4s, v30.4s, T2.4s
st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [DST], #64
st1 {v20.16b,v21.16b,v22.16b,v23.16b}, [DST], #64
st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [DST], #64
st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [DST]
C Restore callee-save register d8
fmov d8, x3
ret
EPILOGUE(_nettle_chacha_4core)
PROLOGUE(_nettle_chacha_4core32)
eor TMP2.16b, TMP2.16b, TMP2.16b C Ignore counter carries
b .Lshared_entry
EPILOGUE(_nettle_chacha_4core32)
.align 4
.Lcnts: .long 0,1,2,3 C increments
.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
C arm64/chacha-core-internal.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Torbjörn Granlund
Copyright (C) 2022 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C Register usage:
C Arguments
define(`DST', `x0')
define(`SRC', `x1')
define(`ROUNDS', `x2')
C Working state
define(`X0', `v0')
define(`X1', `v1')
define(`X2', `v2')
define(`X3', `v3')
C Original input state
define(`S0', `v4')
define(`S1', `v5')
define(`S2', `v6')
define(`S3', `v7')
define(`ROT24', `v16')
define(`TMP', `v17')
C QROUND(X0, X1, X2, X3)
define(`QROUND', `
C x0 += x1, x3 ^= x0, x3 lrot 16
C x2 += x3, x1 ^= x2, x1 lrot 12
C x0 += x1, x3 ^= x0, x3 lrot 8
C x2 += x3, x1 ^= x2, x1 lrot 7
add $1.4s, $1.4s, $2.4s
eor $4.16b, $4.16b, $1.16b
rev32 $4.8h, $4.8h
add $3.4s, $3.4s, $4.4s
eor TMP.16b, $2.16b, $3.16b
ushr $2.4s, TMP.4s, #20
sli $2.4s, TMP.4s, #12
add $1.4s, $1.4s, $2.4s
eor $4.16b, $4.16b, $1.16b
tbl $4.16b, {$4.16b}, ROT24.16b
add $3.4s, $3.4s, $4.4s
eor TMP.16b, $2.16b, $3.16b
ushr $2.4s, TMP.4s, #25
sli $2.4s, TMP.4s, #7
')
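For reference, the quarter round from the QROUND comment as plain C. The vector code realizes the 16-bit rotate with rev32 on halfwords, the 8-bit rotate with the ROT24 tbl shuffle, and the 12- and 7-bit rotates with ushr/sli pairs:

#include <stdint.h>

static uint32_t rol32(uint32_t x, unsigned n)
{ return (x << n) | (x >> (32 - n)); }

static void quarter_round(uint32_t *a, uint32_t *b,
                          uint32_t *c, uint32_t *d)
{
  *a += *b; *d ^= *a; *d = rol32(*d, 16);
  *c += *d; *b ^= *c; *b = rol32(*b, 12);
  *a += *b; *d ^= *a; *d = rol32(*d, 8);
  *c += *d; *b ^= *c; *b = rol32(*b, 7);
}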
.text
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_core)
adr x3, .Lrot24
ld1 {ROT24.4s},[x3]
ld1 {X0.4s,X1.4s,X2.4s,X3.4s}, [SRC]
mov S0.16b, X0.16b
mov S1.16b, X1.16b
mov S2.16b, X2.16b
mov S3.16b, X3.16b
.Loop:
QROUND(X0, X1, X2, X3)
C Rotate rows, to get
C 0 1 2 3
C 5 6 7 4 <<< 1
C 10 11 8 9 <<< 2
C 15 12 13 14 <<< 3
ext X1.16b, X1.16b, X1.16b, #4
ext X2.16b, X2.16b, X2.16b, #8
ext X3.16b, X3.16b, X3.16b, #12
QROUND(X0, X1, X2, X3)
ext X1.16b, X1.16b, X1.16b, #12
ext X2.16b, X2.16b, X2.16b, #8
ext X3.16b, X3.16b, X3.16b, #4
subs ROUNDS, ROUNDS, #2
b.ne .Loop
add X0.4s, X0.4s, S0.4s
add X1.4s, X1.4s, S1.4s
add X2.4s, X2.4s, S2.4s
add X3.4s, X3.4s, S3.4s
st1 {X0.16b,X1.16b,X2.16b,X3.16b}, [DST]
ret
EPILOGUE(_nettle_chacha_core)
.align 4
.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
C arm64/crypto/aes128-decrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "aes128-decrypt.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`KEYS', `x0')
define(`LENGTH', `x1')
define(`DST', `x2')
define(`SRC', `x3')
define(`S0', `v0')
define(`S1', `v1')
define(`S2', `v2')
define(`S3', `v3')
define(`K0', `v16')
define(`K1', `v17')
define(`K2', `v18')
define(`K3', `v19')
define(`K4', `v20')
define(`K5', `v21')
define(`K6', `v22')
define(`K7', `v23')
define(`K8', `v24')
define(`K9', `v25')
define(`K10', `v26')
C void
C aes128_decrypt(const struct aes128_ctx *ctx,
C size_t length, uint8_t *dst,
C const uint8_t *src)
PROLOGUE(nettle_aes128_decrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ld1 {K8.4s,K9.4s,K10.4s},[KEYS]
ands x4,LENGTH,#-64
b.eq L1B
L4B_loop:
ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
AESD_ROUND_4B(S0,S1,S2,S3,K10)
AESD_ROUND_4B(S0,S1,S2,S3,K9)
AESD_ROUND_4B(S0,S1,S2,S3,K8)
AESD_ROUND_4B(S0,S1,S2,S3,K7)
AESD_ROUND_4B(S0,S1,S2,S3,K6)
AESD_ROUND_4B(S0,S1,S2,S3,K5)
AESD_ROUND_4B(S0,S1,S2,S3,K4)
AESD_ROUND_4B(S0,S1,S2,S3,K3)
AESD_ROUND_4B(S0,S1,S2,S3,K2)
AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0)
st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
subs x4,x4,#64
b.ne L4B_loop
and LENGTH,LENGTH,#63
L1B:
cbz LENGTH,Ldone
L1B_loop:
ld1 {S0.16b},[SRC],#16
AESD_ROUND_1B(S0,K10)
AESD_ROUND_1B(S0,K9)
AESD_ROUND_1B(S0,K8)
AESD_ROUND_1B(S0,K7)
AESD_ROUND_1B(S0,K6)
AESD_ROUND_1B(S0,K5)
AESD_ROUND_1B(S0,K4)
AESD_ROUND_1B(S0,K3)
AESD_ROUND_1B(S0,K2)
AESD_LAST_ROUND_1B(S0,K1,K0)
st1 {S0.16b},[DST],#16
subs LENGTH,LENGTH,#16
b.ne L1B_loop
Ldone:
ret
EPILOGUE(nettle_aes128_decrypt)
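The control flow above recurs in all six AES functions in this series: a main loop over four blocks, then a one-block tail. A C sketch of just the length dispatch (process4/process1 are hypothetical stand-ins for the unrolled AESD/AESE sequences; the nettle API guarantees length is a multiple of 16):

#include <stddef.h>
#include <stdint.h>

static void aes_blocks(size_t length, uint8_t *dst, const uint8_t *src,
                       void (*process4)(uint8_t *, const uint8_t *),
                       void (*process1)(uint8_t *, const uint8_t *))
{
  size_t bulk = length & ~(size_t)63;  /* ands x4,LENGTH,#-64 */
  size_t i;
  for (i = 0; i < bulk; i += 64)       /* L4B_loop */
    process4(dst + i, src + i);
  for (; i < length; i += 16)          /* L1B_loop */
    process1(dst + i, src + i);
}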
C arm64/crypto/aes128-encrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "aes128-encrypt.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`KEYS', `x0')
define(`LENGTH', `x1')
define(`DST', `x2')
define(`SRC', `x3')
define(`S0', `v0')
define(`S1', `v1')
define(`S2', `v2')
define(`S3', `v3')
define(`K0', `v16')
define(`K1', `v17')
define(`K2', `v18')
define(`K3', `v19')
define(`K4', `v20')
define(`K5', `v21')
define(`K6', `v22')
define(`K7', `v23')
define(`K8', `v24')
define(`K9', `v25')
define(`K10', `v26')
C void
C aes128_encrypt(const struct aes128_ctx *ctx,
C size_t length, uint8_t *dst,
C const uint8_t *src)
PROLOGUE(nettle_aes128_encrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ld1 {K8.4s,K9.4s,K10.4s},[KEYS]
ands x4,LENGTH,#-64
b.eq L1B
L4B_loop:
ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
AESE_ROUND_4B(S0,S1,S2,S3,K0)
AESE_ROUND_4B(S0,S1,S2,S3,K1)
AESE_ROUND_4B(S0,S1,S2,S3,K2)
AESE_ROUND_4B(S0,S1,S2,S3,K3)
AESE_ROUND_4B(S0,S1,S2,S3,K4)
AESE_ROUND_4B(S0,S1,S2,S3,K5)
AESE_ROUND_4B(S0,S1,S2,S3,K6)
AESE_ROUND_4B(S0,S1,S2,S3,K7)
AESE_ROUND_4B(S0,S1,S2,S3,K8)
AESE_LAST_ROUND_4B(S0,S1,S2,S3,K9,K10)
st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
subs x4,x4,#64
b.ne L4B_loop
and LENGTH,LENGTH,#63
L1B:
cbz LENGTH,Ldone
L1B_loop:
ld1 {S0.16b},[SRC],#16
AESE_ROUND_1B(S0,K0)
AESE_ROUND_1B(S0,K1)
AESE_ROUND_1B(S0,K2)
AESE_ROUND_1B(S0,K3)
AESE_ROUND_1B(S0,K4)
AESE_ROUND_1B(S0,K5)
AESE_ROUND_1B(S0,K6)
AESE_ROUND_1B(S0,K7)
AESE_ROUND_1B(S0,K8)
AESE_LAST_ROUND_1B(S0,K9,K10)
st1 {S0.16b},[DST],#16
subs LENGTH,LENGTH,#16
b.ne L1B_loop
Ldone:
ret
EPILOGUE(nettle_aes128_encrypt)
C arm64/crypto/aes192-decrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "aes192-decrypt.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`KEYS', `x0')
define(`LENGTH', `x1')
define(`DST', `x2')
define(`SRC', `x3')
define(`S0', `v0')
define(`S1', `v1')
define(`S2', `v2')
define(`S3', `v3')
define(`K0', `v16')
define(`K1', `v17')
define(`K2', `v18')
define(`K3', `v19')
define(`K4', `v20')
define(`K5', `v21')
define(`K6', `v22')
define(`K7', `v23')
define(`K8', `v24')
define(`K9', `v25')
define(`K10', `v26')
define(`K11', `v27')
define(`K12', `v28')
C void
C aes192_decrypt(const struct aes192_ctx *ctx,
C size_t length, uint8_t *dst,
C const uint8_t *src)
PROLOGUE(nettle_aes192_decrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ld1 {K8.4s,K9.4s,K10.4s,K11.4s},[KEYS],#64
ld1 {K12.4s},[KEYS]
ands x4,LENGTH,#-64
b.eq L1B
L4B_loop:
ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
AESD_ROUND_4B(S0,S1,S2,S3,K12)
AESD_ROUND_4B(S0,S1,S2,S3,K11)
AESD_ROUND_4B(S0,S1,S2,S3,K10)
AESD_ROUND_4B(S0,S1,S2,S3,K9)
AESD_ROUND_4B(S0,S1,S2,S3,K8)
AESD_ROUND_4B(S0,S1,S2,S3,K7)
AESD_ROUND_4B(S0,S1,S2,S3,K6)
AESD_ROUND_4B(S0,S1,S2,S3,K5)
AESD_ROUND_4B(S0,S1,S2,S3,K4)
AESD_ROUND_4B(S0,S1,S2,S3,K3)
AESD_ROUND_4B(S0,S1,S2,S3,K2)
AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0)
st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
subs x4,x4,#64
b.ne L4B_loop
and LENGTH,LENGTH,#63
L1B:
cbz LENGTH,Ldone
L1B_loop:
ld1 {S0.16b},[SRC],#16
AESD_ROUND_1B(S0,K12)
AESD_ROUND_1B(S0,K11)
AESD_ROUND_1B(S0,K10)
AESD_ROUND_1B(S0,K9)
AESD_ROUND_1B(S0,K8)
AESD_ROUND_1B(S0,K7)
AESD_ROUND_1B(S0,K6)
AESD_ROUND_1B(S0,K5)
AESD_ROUND_1B(S0,K4)
AESD_ROUND_1B(S0,K3)
AESD_ROUND_1B(S0,K2)
AESD_LAST_ROUND_1B(S0,K1,K0)
st1 {S0.16b},[DST],#16
subs LENGTH,LENGTH,#16
b.ne L1B_loop
Ldone:
ret
EPILOGUE(nettle_aes192_decrypt)
C arm64/crypto/aes192-encrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "aes192-encrypt.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`KEYS', `x0')
define(`LENGTH', `x1')
define(`DST', `x2')
define(`SRC', `x3')
define(`S0', `v0')
define(`S1', `v1')
define(`S2', `v2')
define(`S3', `v3')
define(`K0', `v16')
define(`K1', `v17')
define(`K2', `v18')
define(`K3', `v19')
define(`K4', `v20')
define(`K5', `v21')
define(`K6', `v22')
define(`K7', `v23')
define(`K8', `v24')
define(`K9', `v25')
define(`K10', `v26')
define(`K11', `v27')
define(`K12', `v28')
C void
C aes192_encrypt(const struct aes192_ctx *ctx,
C size_t length, uint8_t *dst,
C const uint8_t *src)
PROLOGUE(nettle_aes192_encrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ld1 {K8.4s,K9.4s,K10.4s,K11.4s},[KEYS],#64
ld1 {K12.4s},[KEYS]
ands x4,LENGTH,#-64
b.eq L1B
L4B_loop:
ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
AESE_ROUND_4B(S0,S1,S2,S3,K0)
AESE_ROUND_4B(S0,S1,S2,S3,K1)
AESE_ROUND_4B(S0,S1,S2,S3,K2)
AESE_ROUND_4B(S0,S1,S2,S3,K3)
AESE_ROUND_4B(S0,S1,S2,S3,K4)
AESE_ROUND_4B(S0,S1,S2,S3,K5)
AESE_ROUND_4B(S0,S1,S2,S3,K6)
AESE_ROUND_4B(S0,S1,S2,S3,K7)
AESE_ROUND_4B(S0,S1,S2,S3,K8)
AESE_ROUND_4B(S0,S1,S2,S3,K9)
AESE_ROUND_4B(S0,S1,S2,S3,K10)
AESE_LAST_ROUND_4B(S0,S1,S2,S3,K11,K12)
st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
subs x4,x4,#64
b.ne L4B_loop
and LENGTH,LENGTH,#63
L1B:
cbz LENGTH,Ldone
L1B_loop:
ld1 {S0.16b},[SRC],#16
AESE_ROUND_1B(S0,K0)
AESE_ROUND_1B(S0,K1)
AESE_ROUND_1B(S0,K2)
AESE_ROUND_1B(S0,K3)
AESE_ROUND_1B(S0,K4)
AESE_ROUND_1B(S0,K5)
AESE_ROUND_1B(S0,K6)
AESE_ROUND_1B(S0,K7)
AESE_ROUND_1B(S0,K8)
AESE_ROUND_1B(S0,K9)
AESE_ROUND_1B(S0,K10)
AESE_LAST_ROUND_1B(S0,K11,K12)
st1 {S0.16b},[DST],#16
subs LENGTH,LENGTH,#16
b.ne L1B_loop
Ldone:
ret
EPILOGUE(nettle_aes192_encrypt)
C arm64/crypto/aes256-decrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "aes256-decrypt.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`KEYS', `x0')
define(`LENGTH', `x1')
define(`DST', `x2')
define(`SRC', `x3')
define(`S0', `v0')
define(`S1', `v1')
define(`S2', `v2')
define(`S3', `v3')
define(`K0', `v16')
define(`K1', `v17')
define(`K2', `v18')
define(`K3', `v19')
define(`K4', `v20')
define(`K5', `v21')
define(`K6', `v22')
define(`K7', `v23')
define(`K8', `v24')
define(`K9', `v25')
define(`K10', `v26')
define(`K11', `v27')
define(`K12', `v28')
define(`K13', `v29')
define(`K14', `v30')
C void
C aes256_decrypt(const struct aes256_ctx *ctx,
C size_t length, uint8_t *dst,
C const uint8_t *src)
PROLOGUE(nettle_aes256_decrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ld1 {K8.4s,K9.4s,K10.4s,K11.4s},[KEYS],#64
ld1 {K12.4s,K13.4s,K14.4s},[KEYS]
ands x4,LENGTH,#-64
b.eq L1B
L4B_loop:
ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
AESD_ROUND_4B(S0,S1,S2,S3,K14)
AESD_ROUND_4B(S0,S1,S2,S3,K13)
AESD_ROUND_4B(S0,S1,S2,S3,K12)
AESD_ROUND_4B(S0,S1,S2,S3,K11)
AESD_ROUND_4B(S0,S1,S2,S3,K10)
AESD_ROUND_4B(S0,S1,S2,S3,K9)
AESD_ROUND_4B(S0,S1,S2,S3,K8)
AESD_ROUND_4B(S0,S1,S2,S3,K7)
AESD_ROUND_4B(S0,S1,S2,S3,K6)
AESD_ROUND_4B(S0,S1,S2,S3,K5)
AESD_ROUND_4B(S0,S1,S2,S3,K4)
AESD_ROUND_4B(S0,S1,S2,S3,K3)
AESD_ROUND_4B(S0,S1,S2,S3,K2)
AESD_LAST_ROUND_4B(S0,S1,S2,S3,K1,K0)
st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
subs x4,x4,#64
b.ne L4B_loop
and LENGTH,LENGTH,#63
L1B:
cbz LENGTH,Ldone
L1B_loop:
ld1 {S0.16b},[SRC],#16
AESD_ROUND_1B(S0,K14)
AESD_ROUND_1B(S0,K13)
AESD_ROUND_1B(S0,K12)
AESD_ROUND_1B(S0,K11)
AESD_ROUND_1B(S0,K10)
AESD_ROUND_1B(S0,K9)
AESD_ROUND_1B(S0,K8)
AESD_ROUND_1B(S0,K7)
AESD_ROUND_1B(S0,K6)
AESD_ROUND_1B(S0,K5)
AESD_ROUND_1B(S0,K4)
AESD_ROUND_1B(S0,K3)
AESD_ROUND_1B(S0,K2)
AESD_LAST_ROUND_1B(S0,K1,K0)
st1 {S0.16b},[DST],#16
subs LENGTH,LENGTH,#16
b.ne L1B_loop
Ldone:
ret
EPILOGUE(nettle_aes256_decrypt)
C arm64/crypto/aes256-encrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "aes256-encrypt.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`KEYS', `x0')
define(`LENGTH', `x1')
define(`DST', `x2')
define(`SRC', `x3')
define(`S0', `v0')
define(`S1', `v1')
define(`S2', `v2')
define(`S3', `v3')
define(`K0', `v16')
define(`K1', `v17')
define(`K2', `v18')
define(`K3', `v19')
define(`K4', `v20')
define(`K5', `v21')
define(`K6', `v22')
define(`K7', `v23')
define(`K8', `v24')
define(`K9', `v25')
define(`K10', `v26')
define(`K11', `v27')
define(`K12', `v28')
define(`K13', `v29')
define(`K14', `v30')
C void
C aes256_encrypt(const struct aes256_ctx *ctx,
C size_t length, uint8_t *dst,
C const uint8_t *src)
PROLOGUE(nettle_aes256_encrypt)
ld1 {K0.4s,K1.4s,K2.4s,K3.4s},[KEYS],#64
ld1 {K4.4s,K5.4s,K6.4s,K7.4s},[KEYS],#64
ld1 {K8.4s,K9.4s,K10.4s,K11.4s},[KEYS],#64
ld1 {K12.4s,K13.4s,K14.4s},[KEYS]
ands x4,LENGTH,#-64
b.eq L1B
L4B_loop:
ld1 {S0.16b,S1.16b,S2.16b,S3.16b},[SRC],#64
AESE_ROUND_4B(S0,S1,S2,S3,K0)
AESE_ROUND_4B(S0,S1,S2,S3,K1)
AESE_ROUND_4B(S0,S1,S2,S3,K2)
AESE_ROUND_4B(S0,S1,S2,S3,K3)
AESE_ROUND_4B(S0,S1,S2,S3,K4)
AESE_ROUND_4B(S0,S1,S2,S3,K5)
AESE_ROUND_4B(S0,S1,S2,S3,K6)
AESE_ROUND_4B(S0,S1,S2,S3,K7)
AESE_ROUND_4B(S0,S1,S2,S3,K8)
AESE_ROUND_4B(S0,S1,S2,S3,K9)
AESE_ROUND_4B(S0,S1,S2,S3,K10)
AESE_ROUND_4B(S0,S1,S2,S3,K11)
AESE_ROUND_4B(S0,S1,S2,S3,K12)
AESE_LAST_ROUND_4B(S0,S1,S2,S3,K13,K14)
st1 {S0.16b,S1.16b,S2.16b,S3.16b},[DST],#64
subs x4,x4,#64
b.ne L4B_loop
and LENGTH,LENGTH,#63
L1B:
cbz LENGTH,Ldone
L1B_loop:
ld1 {S0.16b},[SRC],#16
AESE_ROUND_1B(S0,K0)
AESE_ROUND_1B(S0,K1)
AESE_ROUND_1B(S0,K2)
AESE_ROUND_1B(S0,K3)
AESE_ROUND_1B(S0,K4)
AESE_ROUND_1B(S0,K5)
AESE_ROUND_1B(S0,K6)
AESE_ROUND_1B(S0,K7)
AESE_ROUND_1B(S0,K8)
AESE_ROUND_1B(S0,K9)
AESE_ROUND_1B(S0,K10)
AESE_ROUND_1B(S0,K11)
AESE_ROUND_1B(S0,K12)
AESE_LAST_ROUND_1B(S0,K13,K14)
st1 {S0.16b},[DST],#16
subs LENGTH,LENGTH,#16
b.ne L1B_loop
Ldone:
ret
EPILOGUE(nettle_aes256_encrypt)
C arm64/crypto/ghash-set-key.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Mamone Tarsha
Copyright (C) 2021 Michael Weiser
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "ghash-set-key.asm"
.arch armv8-a+crypto
.text
C common SIMD register usage:
define(`POLY', `v6')
C temporary register that assists the reduction procedure
define(`T', `v7')
C permanent register that holds the 16-byte result of pmull
define(`F', `v16')
C permanent register that holds the 16-byte result of pmull2,
C its value is immediately accumulated into the 'F' register
define(`F1', `v17')
C permanent register that holds the 16-byte result of pmull
define(`R', `v18')
C permanent register that holds the 16-byte result of pmull2,
C its value is immediately accumulated into the 'R' register
define(`R1', `v19')
C common macros:
C long multiply of six 64-bit polynomials and sum
C R = (in.l × param1.l) + (in.h × param1.h)
C F = (in.l × param2.l) + (in.h × param2.h)
C PMUL(in, param1, param2)
define(`PMUL', m4_assert_numargs(3)`
pmull F.1q,$3.1d,$1.1d
pmull2 F1.1q,$3.2d,$1.2d
pmull R.1q,$2.1d,$1.1d
pmull2 R1.1q,$2.2d,$1.2d
eor F.16b,F.16b,F1.16b
eor R.16b,R.16b,R1.16b
')
C Reduce 'R' and 'F' values to 128-bit output
C REDUCTION(out)
define(`REDUCTION', m4_assert_numargs(1)`
pmull T.1q,F.1d,POLY.1d
eor R.16b,R.16b,T.16b
ext R.16b,R.16b,R.16b,#8
eor $1.16b,F.16b,R.16b
')
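PMUL builds on pmull/pmull2, which carry-less multiply the low and the high 64-bit halves of their operands. A bit-serial C reference of that primitive (for illustration only; it is not constant time):

#include <stdint.h>

/* Carry-less 64x64 -> 128 bit multiply, the operation behind
   pmull/pmull2. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
  uint64_t h = 0, l = 0;
  for (unsigned i = 0; i < 64; i++)
    if ((b >> i) & 1) {
      l ^= a << i;
      if (i)
        h ^= a >> (64 - i);
    }
  *hi = h;
  *lo = l;
}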
C void _ghash_set_key (struct gcm_key *ctx, const union nettle_block16 *key)
C This function populates the gcm table with the following layout:
C *******************************************************************************
C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
C | |
C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
C | |
C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
C | |
C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
C *******************************************************************************
C Register usage:
define(`CTX', `x0')
define(`KEY', `x1')
define(`EMSB', `v0')
define(`B', `v1')
define(`H', `v2')
define(`H2', `v3')
define(`H3', `v4')
define(`H4', `v5')
define(`Hp', `v20')
define(`Hl', `v21')
define(`Hm', `v22')
define(`H1M', `v23')
define(`H1L', `v24')
define(`H2M', `v25')
define(`H2L', `v26')
define(`H3M', `v27')
define(`H3L', `v28')
define(`H4M', `v29')
define(`H4L', `v30')
C PMUL_PARAM(in, param1, param2)
define(`PMUL_PARAM', m4_assert_numargs(3)`
pmull2 Hp.1q,$1.2d,POLY.2d
eor Hm.16b,$1.16b,Hp.16b
ext $2.16b,Hm.16b,$1.16b,#8
ext $3.16b,$1.16b,Hm.16b,#8
ext $2.16b,$2.16b,$2.16b,#8
')
PROLOGUE(_nettle_ghash_set_key)
ld1 {H.2d},[KEY]
C we treat data as big-endian doublewords for processing. Since there is no
C endianness-neutral MSB-first load operation we need to restore our desired
C byte order on little-endian systems. The same holds true for DATA below
C but not our own internal precalculated CTX (see below).
IF_LE(`
rev64 H.16b,H.16b
')
C --- calculate H = H × x mod R(X); R(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
dup EMSB.16b,H.b[7]
mov x1,#0xC200000000000000
mov x2,#1
mov POLY.d[0],x1
mov POLY.d[1],x2
sshr EMSB.16b,EMSB.16b,#7
and EMSB.16b,EMSB.16b,POLY.16b
ushr B.2d,H.2d,#63
and B.16b,B.16b,POLY.16b
ext B.16b,B.16b,B.16b,#8
shl H.2d,H.2d,#1
orr H.16b,H.16b,B.16b
eor H.16b,H.16b,EMSB.16b
dup POLY.2d,POLY.d[0]
C --- calculate H^2 = H × H ---
PMUL_PARAM(H,H1M,H1L)
PMUL(H,H1M,H1L)
REDUCTION(H2)
PMUL_PARAM(H2,H2M,H2L)
C we store to the table as doubleword-vectors in current memory endianness
C because it's our own strictly internal data structure and what ghash_update
C can most naturally use
st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[CTX],#64
C --- calculate H^3 = H^1 × H^2 ---
PMUL(H2,H1M,H1L)
REDUCTION(H3)
PMUL_PARAM(H3,H3M,H3L)
C --- calculate H^4 = H^2 × H^2 ---
PMUL(H2,H2M,H2L)
REDUCTION(H4)
PMUL_PARAM(H4,H4M,H4L)
st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[CTX]
ret
EPILOGUE(_nettle_ghash_set_key)
C arm64/crypto/ghash-update.asm
ifelse(`
Copyright (C) 2020 Niels Möller and Mamone Tarsha
Copyright (C) 2021 Michael Weiser
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
.file "ghash-update.asm"
.arch armv8-a+crypto
.text
C common SIMD register usage:
define(`POLY', `v6')
C temporary register that assists the reduction procedure
define(`T', `v7')
C permanent register that holds the 16-byte result of pmull
define(`F', `v16')
C permanent register that holds the 16-byte result of pmull2,
C its value is immediately accumulated into the 'F' register
define(`F1', `v17')
C permanent register that holds the 16-byte result of pmull
define(`R', `v18')
C permanent register that holds the 16-byte result of pmull2,
C its value is immediately accumulated into the 'R' register
define(`R1', `v19')
C common macros:
C long multiply of six 64-bit polynomials and sum
C R = (in.l × param1.l) + (in.h × param1.h)
C F = (in.l × param2.l) + (in.h × param2.h)
C PMUL(in, param1, param2)
define(`PMUL', m4_assert_numargs(3)`
pmull F.1q,$3.1d,$1.1d
pmull2 F1.1q,$3.2d,$1.2d
pmull R.1q,$2.1d,$1.1d
pmull2 R1.1q,$2.2d,$1.2d
eor F.16b,F.16b,F1.16b
eor R.16b,R.16b,R1.16b
')
C Reduce 'R' and 'F' values to 128-bit output
C REDUCTION(out)
define(`REDUCTION', m4_assert_numargs(1)`
pmull T.1q,F.1d,POLY.1d
eor R.16b,R.16b,T.16b
ext R.16b,R.16b,R.16b,#8
eor $1.16b,F.16b,R.16b
')
C register usage:
define(`CTX', `x0')
define(`X', `x1')
define(`BLOCKS', `x2')
define(`DATA', `x3')
define(`D', `v0')
define(`C0', `v1')
define(`C1', `v2')
define(`C2', `v3')
define(`C3', `v4')
define(`R2', `v20')
define(`F2', `v21')
define(`R3', `v22')
define(`F3', `v23')
define(`H1M', `v24')
define(`H1L', `v25')
define(`H2M', `v26')
define(`H2L', `v27')
define(`H3M', `v28')
define(`H3L', `v29')
define(`H4M', `v30')
define(`H4L', `v31')
C PMUL_SUM(in, param1, param2)
define(`PMUL_SUM', m4_assert_numargs(3)`
pmull F2.1q,$3.1d,$1.1d
pmull2 F3.1q,$3.2d,$1.2d
pmull R2.1q,$2.1d,$1.1d
pmull2 R3.1q,$2.2d,$1.2d
eor F2.16b,F2.16b,F3.16b
eor R2.16b,R2.16b,R3.16b
eor F.16b,F.16b,F2.16b
eor R.16b,R.16b,R2.16b
')
C const uint8_t *_ghash_update (const struct gcm_key *key,
C union nettle_block16 *x,
C size_t blocks, const uint8_t *data)
PROLOGUE(_nettle_ghash_update)
mov x4,#0xC200000000000000
mov POLY.d[0],x4
ld1 {D.2d},[X]
IF_LE(`
rev64 D.16b,D.16b
')
ands x4,BLOCKS,#-4
b.eq L1_block
add x5,CTX,#64
ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[CTX]
ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
L4_blocks_loop:
ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
IF_LE(`
rev64 C0.16b,C0.16b
rev64 C1.16b,C1.16b
rev64 C2.16b,C2.16b
rev64 C3.16b,C3.16b
')
eor C0.16b,C0.16b,D.16b
PMUL(C1,H3M,H3L)
PMUL_SUM(C2,H2M,H2L)
PMUL_SUM(C3,H1M,H1L)
PMUL_SUM(C0,H4M,H4L)
REDUCTION(D)
subs x4,x4,#4
b.ne L4_blocks_loop
L1_block:
ands BLOCKS,BLOCKS,#3
b.eq Lghash_done
ld1 {H1M.2d,H1L.2d},[CTX]
L1_block_loop:
ld1 {C0.2d},[DATA],#16
IF_LE(`
rev64 C0.16b,C0.16b
')
eor C0.16b,C0.16b,D.16b
PMUL(C0,H1M,H1L)
REDUCTION(D)
subs BLOCKS, BLOCKS, #1
b.ne L1_block_loop
Lghash_done:
IF_LE(`
rev64 D.16b,D.16b
')
st1 {D.2d},[X]
mov x0, DATA
ret
EPILOGUE(_nettle_ghash_update)
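Seen from C, the function above is two loops: L4_blocks_loop consumes the multiple-of-four prefix of the input using the precomputed key powers H^1..H^4, L1_block_loop the remaining one to three blocks using H^1 alone, and the advanced data pointer is returned. A structural sketch of that control flow, with hypothetical ghash_mul4/ghash_mul1 helpers standing in for the PMUL/PMUL_SUM/REDUCTION sequences:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical helpers, not Nettle functions. */
void ghash_mul4(const void *key, uint8_t *x, const uint8_t *data);
void ghash_mul1(const void *key, uint8_t *x, const uint8_t *data);

const uint8_t *
ghash_update_model(const void *key, uint8_t *x,
                   size_t blocks, const uint8_t *data)
{
  size_t n;
  /* Four blocks at a time, using H^1..H^4 (L4_blocks_loop). */
  for (n = blocks & ~(size_t) 3; n > 0; n -= 4, data += 64)
    ghash_mul4(key, x, data);
  /* Remaining blocks one at a time, using H^1 (L1_block_loop). */
  for (n = blocks & 3; n > 0; n--, data += 16)
    ghash_mul1(key, x, data);
  return data;                  /* matches "mov x0, DATA" before ret */
}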
C arm64/crypto/sha1-compress.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C This implementation uses the SHA-1 instructions of the Armv8
C crypto extension.
C SHA1C: SHA1 hash update (choose)
C SHA1H: SHA1 fixed rotate
C SHA1M: SHA1 hash update (majority)
C SHA1P: SHA1 hash update (parity)
C SHA1SU0: SHA1 schedule update 0
C SHA1SU1: SHA1 schedule update 1
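These instructions are also exposed to C as ACLE intrinsics in <arm_neon.h>. As a reading aid for the scheduled code below, here is a sketch of rounds 0-7 written with those intrinsics (build with -march=armv8-a+crypto); it illustrates the instruction pattern and is not code used by Nettle.

#include <arm_neon.h>

/* Rounds 0-7: each group adds the round constant, derives the next
   E with SHA1H (rol(A,30)), runs four rounds with SHA1C ("choose"),
   and advances the message schedule with SHA1SU0/SHA1SU1. */
uint32x4_t
sha1_rounds_0_7(uint32x4_t abcd, uint32_t e0, uint32_t *e_out,
                uint32x4_t *msg0, uint32x4_t *msg1,
                uint32x4_t msg2, uint32x4_t msg3, uint32x4_t k0)
{
  /* Rounds 0-3 */
  uint32x4_t wk = vaddq_u32(*msg0, k0);          /* TMP = MSG0 + K */
  uint32_t e1 = vsha1h_u32(vgetq_lane_u32(abcd, 0));  /* sha1h */
  abcd = vsha1cq_u32(abcd, e0, wk);              /* sha1c */
  *msg0 = vsha1su0q_u32(*msg0, *msg1, msg2);     /* sha1su0 */

  /* Rounds 4-7: same pattern, the E values ping-pong. */
  wk = vaddq_u32(*msg1, k0);
  *e_out = vsha1h_u32(vgetq_lane_u32(abcd, 0));
  abcd = vsha1cq_u32(abcd, e1, wk);
  *msg0 = vsha1su1q_u32(*msg0, msg3);            /* sha1su1 */
  *msg1 = vsha1su0q_u32(*msg1, msg2, msg3);
  return abcd;
}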
.file "sha1-compress.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`STATE', `x0')
define(`INPUT', `x1')
define(`CONST0', `v0')
define(`CONST1', `v1')
define(`CONST2', `v2')
define(`CONST3', `v3')
define(`MSG0', `v4')
define(`MSG1', `v5')
define(`MSG2', `v6')
define(`MSG3', `v7')
define(`ABCD', `v16')
define(`ABCD_SAVED', `v17')
define(`E0', `v18')
define(`E0_SAVED', `v19')
define(`E1', `v20')
define(`TMP', `v21')
C void nettle_sha1_compress(uint32_t *state, const uint8_t *input)
PROLOGUE(nettle_sha1_compress)
C Initialize constants
mov w2,#0x7999
movk w2,#0x5A82,lsl #16
dup CONST0.4s,w2
mov w2,#0xEBA1
movk w2,#0x6ED9,lsl #16
dup CONST1.4s,w2
mov w2,#0xBCDC
movk w2,#0x8F1B,lsl #16
dup CONST2.4s,w2
mov w2,#0xC1D6
movk w2,#0xCA62,lsl #16
dup CONST3.4s,w2
C Load state
add x2,STATE,#16
movi E0.4s,#0
ld1 {ABCD.4s},[STATE]
ld1 {E0.s}[0],[x2]
C Save state
mov ABCD_SAVED.16b,ABCD.16b
mov E0_SAVED.16b,E0.16b
C Load message
ld1 {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT]
C Reverse for little endian
rev32 MSG0.16b,MSG0.16b
rev32 MSG1.16b,MSG1.16b
rev32 MSG2.16b,MSG2.16b
rev32 MSG3.16b,MSG3.16b
C Rounds 0-3
add TMP.4s,MSG0.4s,CONST0.4s
sha1h SFP(E1),SFP(ABCD)
sha1c QFP(ABCD),SFP(E0),TMP.4s
sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
C Rounds 4-7
add TMP.4s,MSG1.4s,CONST0.4s
sha1h SFP(E0),SFP(ABCD)
sha1c QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG0.4s,MSG3.4s
sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
C Rounds 8-11
add TMP.4s,MSG2.4s,CONST0.4s
sha1h SFP(E1),SFP(ABCD)
sha1c QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG1.4s,MSG0.4s
sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
C Rounds 12-15
add TMP.4s,MSG3.4s,CONST0.4s
sha1h SFP(E0),SFP(ABCD)
sha1c QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG2.4s,MSG1.4s
sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
C Rounds 16-19
add TMP.4s,MSG0.4s,CONST0.4s
sha1h SFP(E1),SFP(ABCD)
sha1c QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG3.4s,MSG2.4s
sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
C Rounds 20-23
add TMP.4s,MSG1.4s,CONST1.4s
sha1h SFP(E0),SFP(ABCD)
sha1p QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG0.4s,MSG3.4s
sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
C Rounds 24-27
add TMP.4s,MSG2.4s,CONST1.4s
sha1h SFP(E1),SFP(ABCD)
sha1p QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG1.4s,MSG0.4s
sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
C Rounds 28-31
add TMP.4s,MSG3.4s,CONST1.4s
sha1h SFP(E0),SFP(ABCD)
sha1p QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG2.4s,MSG1.4s
sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
C Rounds 32-35
add TMP.4s,MSG0.4s,CONST1.4s
sha1h SFP(E1),SFP(ABCD)
sha1p QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG3.4s,MSG2.4s
sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
C Rounds 36-39
add TMP.4s,MSG1.4s,CONST1.4s
sha1h SFP(E0),SFP(ABCD)
sha1p QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG0.4s,MSG3.4s
sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
C Rounds 40-43
add TMP.4s,MSG2.4s,CONST2.4s
sha1h SFP(E1),SFP(ABCD)
sha1m QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG1.4s,MSG0.4s
sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
C Rounds 44-47
add TMP.4s,MSG3.4s,CONST2.4s
sha1h SFP(E0),SFP(ABCD)
sha1m QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG2.4s,MSG1.4s
sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
C Rounds 48-51
add TMP.4s,MSG0.4s,CONST2.4s
sha1h SFP(E1),SFP(ABCD)
sha1m QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG3.4s,MSG2.4s
sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
C Rounds 52-55
add TMP.4s,MSG1.4s,CONST2.4s
sha1h SFP(E0),SFP(ABCD)
sha1m QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG0.4s,MSG3.4s
sha1su0 MSG1.4s,MSG2.4s,MSG3.4s
C Rounds 56-59
add TMP.4s,MSG2.4s,CONST2.4s
sha1h SFP(E1),SFP(ABCD)
sha1m QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG1.4s,MSG0.4s
sha1su0 MSG2.4s,MSG3.4s,MSG0.4s
C Rounds 60-63
add TMP.4s,MSG3.4s,CONST3.4s
sha1h SFP(E0),SFP(ABCD)
sha1p QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG2.4s,MSG1.4s
sha1su0 MSG3.4s,MSG0.4s,MSG1.4s
C Rounds 64-67
add TMP.4s,MSG0.4s,CONST3.4s
sha1h SFP(E1),SFP(ABCD)
sha1p QFP(ABCD),SFP(E0),TMP.4s
sha1su1 MSG3.4s,MSG2.4s
sha1su0 MSG0.4s,MSG1.4s,MSG2.4s
C Rounds 68-71
add TMP.4s,MSG1.4s,CONST3.4s
sha1h SFP(E0),SFP(ABCD)
sha1p QFP(ABCD),SFP(E1),TMP.4s
sha1su1 MSG0.4s,MSG3.4s
C Rounds 72-75
add TMP.4s,MSG2.4s,CONST3.4s
sha1h SFP(E1),SFP(ABCD)
sha1p QFP(ABCD),SFP(E0),TMP.4s
C Rounds 76-79
add TMP.4s,MSG3.4s,CONST3.4s
sha1h SFP(E0),SFP(ABCD)
sha1p QFP(ABCD),SFP(E1),TMP.4s
C Combine state
add E0.4s,E0.4s,E0_SAVED.4s
add ABCD.4s,ABCD.4s,ABCD_SAVED.4s
C Store state
st1 {ABCD.4s},[STATE]
st1 {E0.s}[0],[x2]
ret
EPILOGUE(nettle_sha1_compress)
C arm64/crypto/sha256-compress-n.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
C This implementation uses the SHA-256 instructions of the Armv8
C crypto extension.
C SHA256H: SHA256 hash update (part 1)
C SHA256H2: SHA256 hash update (part 2)
C SHA256SU0: SHA256 schedule update 0
C SHA256SU1: SHA256 schedule update 1
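As with SHA-1, these instructions have matching ACLE intrinsics in <arm_neon.h>. The sketch below shows one four-round group in C (build with -march=armv8-a+crypto), mirroring the pattern repeated sixteen times in the code that follows; it is an illustration, not code used by Nettle.

#include <arm_neon.h>

/* One four-round group: STATE0/STATE1 hold ABCD/EFGH; the saved
   copy of STATE0 plays the role of TMP in the assembly below. */
void
sha256_four_rounds(uint32x4_t *state0, uint32x4_t *state1,
                   uint32x4_t *msg0, uint32x4_t msg1,
                   uint32x4_t msg2, uint32x4_t msg3,
                   uint32x4_t k)
{
  uint32x4_t wk = vaddq_u32(*msg0, k);    /* W[i..i+3] + K[i..i+3] */
  uint32x4_t abcd = *state0;              /* saved, as TMP */
  *msg0 = vsha256su0q_u32(*msg0, msg1);   /* sha256su0 */
  *state0 = vsha256hq_u32(*state0, *state1, wk);   /* sha256h */
  *state1 = vsha256h2q_u32(*state1, abcd, wk);     /* sha256h2 */
  *msg0 = vsha256su1q_u32(*msg0, msg2, msg3);      /* sha256su1 */
}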
.file "sha256-compress-n.asm"
.arch armv8-a+crypto
.text
C Register usage:
define(`STATE', `x0')
define(`K', `x1')
define(`BLOCKS', `x2')
define(`INPUT', `x3')
define(`MSG0', `v0')
define(`MSG1', `v1')
define(`MSG2', `v2')
define(`MSG3', `v3')
define(`STATE0', `v4')
define(`STATE1', `v5')
define(`CONST', `v6')
define(`TMP', `v7')
define(`STATE0_SAVED', `v16')
define(`STATE1_SAVED', `v17')
C const uint8_t *
C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
C size_t blocks, const uint8_t *input)
PROLOGUE(_nettle_sha256_compress_n)
cbz BLOCKS, .Lend
C Load state
ld1 {STATE0.4s,STATE1.4s},[STATE]
.Loop:
C Save state
mov STATE0_SAVED.16b,STATE0.16b
mov STATE1_SAVED.16b,STATE1.16b
C Load message
ld1 {MSG0.16b,MSG1.16b,MSG2.16b,MSG3.16b},[INPUT],#64
C Reverse for little endian
rev32 MSG0.16b,MSG0.16b
rev32 MSG1.16b,MSG1.16b
rev32 MSG2.16b,MSG2.16b
rev32 MSG3.16b,MSG3.16b
C Rounds 0-3
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG0.4s,CONST.4s
sha256su0 MSG0.4s,MSG1.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(STATE0_SAVED),CONST.4s
sha256su1 MSG0.4s,MSG2.4s,MSG3.4s
C Rounds 4-7
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG1.4s,CONST.4s
sha256su0 MSG1.4s,MSG2.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG1.4s,MSG3.4s,MSG0.4s
C Rounds 8-11
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG2.4s,CONST.4s
sha256su0 MSG2.4s,MSG3.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG2.4s,MSG0.4s,MSG1.4s
C Rounds 12-15
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG3.4s,CONST.4s
sha256su0 MSG3.4s,MSG0.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG3.4s,MSG1.4s,MSG2.4s
C Rounds 16-19
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG0.4s,CONST.4s
sha256su0 MSG0.4s,MSG1.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG0.4s,MSG2.4s,MSG3.4s
C Rounds 20-23
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG1.4s,CONST.4s
sha256su0 MSG1.4s,MSG2.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG1.4s,MSG3.4s,MSG0.4s
C Rounds 24-27
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG2.4s,CONST.4s
sha256su0 MSG2.4s,MSG3.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG2.4s,MSG0.4s,MSG1.4s
C Rounds 28-31
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG3.4s,CONST.4s
sha256su0 MSG3.4s,MSG0.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG3.4s,MSG1.4s,MSG2.4s
C Rounds 32-35
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG0.4s,CONST.4s
sha256su0 MSG0.4s,MSG1.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG0.4s,MSG2.4s,MSG3.4s
C Rounds 36-39
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG1.4s,CONST.4s
sha256su0 MSG1.4s,MSG2.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG1.4s,MSG3.4s,MSG0.4s
C Rounds 40-43
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG2.4s,CONST.4s
sha256su0 MSG2.4s,MSG3.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG2.4s,MSG0.4s,MSG1.4s
C Rounds 44-47
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG3.4s,CONST.4s
sha256su0 MSG3.4s,MSG0.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
sha256su1 MSG3.4s,MSG1.4s,MSG2.4s
C Rounds 48-51
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG0.4s,CONST.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
C Rounds 52-55
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG1.4s,CONST.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
C Rounds 56-59
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K],#16
add CONST.4s,MSG2.4s,CONST.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
C Rounds 60-63
mov TMP.16b,STATE0.16b
ld1 {CONST.4s},[K]
add CONST.4s,MSG3.4s,CONST.4s
sha256h QFP(STATE0),QFP(STATE1),CONST.4s
sha256h2 QFP(STATE1),QFP(TMP),CONST.4s
C Combine state
add STATE0.4s,STATE0.4s,STATE0_SAVED.4s
add STATE1.4s,STATE1.4s,STATE1_SAVED.4s
subs BLOCKS, BLOCKS, #1
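C Rewind K: the fifteen post-incremented loads above advanced it
C by 15*16 = 240 bytes (the final load does not post-increment)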
sub K, K, #240
b.ne .Loop
C Store state
st1 {STATE0.4s,STATE1.4s},[STATE]
.Lend:
mov x0, INPUT
ret
EPILOGUE(_nettle_sha256_compress_n)
C arm64/fat/aes128-decrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(nettle_aes128_decrypt) picked up by configure
define(`fat_transform', `_$1_arm64')
include_src(`arm64/crypto/aes128-decrypt.asm')
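With fat_transform defined as `_$1_arm64', the PROLOGUE in the included file emits the symbol _nettle_aes128_decrypt_arm64 rather than nettle_aes128_decrypt, leaving the public name free to be bound at run time. Below is a minimal sketch of that dispatch pattern with hypothetical names; Nettle's actual selection logic lives in its fat setup code (fat-arm64.c in the source tree), and the HWCAP probe shown in the comment is one possible detection mechanism, not necessarily the one used.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins for the generic C implementation and for
   the assembly entry point renamed by fat_transform. */
static void
aes128_decrypt_c(size_t length, uint8_t *dst, const uint8_t *src)
{ /* portable fallback would go here */ (void) length; (void) dst; (void) src; }

static void
aes128_decrypt_arm64(size_t length, uint8_t *dst, const uint8_t *src)
{ /* would be the renamed asm symbol */ (void) length; (void) dst; (void) src; }

/* Function pointer bound once at startup, e.g. after checking
   getauxval(AT_HWCAP) & HWCAP_AES on Linux. */
static void (*aes128_decrypt_vec)(size_t, uint8_t *, const uint8_t *)
  = aes128_decrypt_c;

void
fat_init(int cpu_has_aes)
{
  if (cpu_has_aes)
    aes128_decrypt_vec = aes128_decrypt_arm64;
}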
C arm64/fat/aes128-encrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(nettle_aes128_encrypt) picked up by configure
define(`fat_transform', `_$1_arm64')
include_src(`arm64/crypto/aes128-encrypt.asm')
C arm64/fat/aes192-decrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(nettle_aes192_decrypt) picked up by configure
define(`fat_transform', `_$1_arm64')
include_src(`arm64/crypto/aes192-decrypt.asm')
C arm64/fat/aes192-encrypt.asm
ifelse(`
Copyright (C) 2021 Mamone Tarsha
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or
modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received copies of the GNU General Public License and
the GNU Lesser General Public License along with this program. If
not, see http://www.gnu.org/licenses/.
')
dnl PROLOGUE(nettle_aes192_encrypt) picked up by configure
define(`fat_transform', `_$1_arm64')
include_src(`arm64/crypto/aes192-encrypt.asm')