Commit db9b8594 authored by Niels Möller's avatar Niels Möller

Unroll x86_64 aesni loops.

parent 0a679090
2018-01-03 Niels Möller <nisse@lysator.liu.se>
* x86_64/aesni/aes-encrypt-internal.asm: Read subkeys into xmm
registers before the block loop, and completely unroll the round
loop.
* x86_64/aesni/aes-decrypt-internal.asm: Likewise.
2017-11-19 Niels Möller <nisse@lysator.liu.se>
* Released nettle-3.4.
......
......@@ -2,7 +2,7 @@ C x86_64/aesni/aes-decrypt-internal.asm
ifelse(<
Copyright (C) 2015 Niels Möller
Copyright (C) 2015, 2018 Niels Möller
This file is part of GNU Nettle.
......@@ -39,15 +39,22 @@ define(<LENGTH>,<%rcx>)
C Destination and source pointers (arguments 5 and 6).
define(<DST>, <%r8>)
define(<SRC>, <%r9>)
C Round counter
define(<CNT>, <%rdx>)
C Subkey pointer
define(<KEY>, <%rax>)
dnl aesdec %xmm1, %xmm0
dnl NOTE(review): hand-encoded AES-NI instructions, presumably kept for
dnl assemblers that lack the aesdec/aesdeclast mnemonics -- confirm they
dnl are still referenced before removing.
define(<AESDEC>, <.byte 0x66, 0x0f, 0x38, 0xde, 0xc1>)
dnl aesdeclast %xmm1, %xmm0
define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
C All subkeys are loaded into xmm registers once, before the block loop.
C KEY10/KEY11 are used only for 12 and 14 rounds, KEY12/KEY13 only for
C 14 rounds; KEYLAST always holds the final subkey.
define(<KEY0>, <%xmm0>)
define(<KEY1>, <%xmm1>)
define(<KEY2>, <%xmm2>)
define(<KEY3>, <%xmm3>)
define(<KEY4>, <%xmm4>)
define(<KEY5>, <%xmm5>)
define(<KEY6>, <%xmm6>)
define(<KEY7>, <%xmm7>)
define(<KEY8>, <%xmm8>)
define(<KEY9>, <%xmm9>)
define(<KEY10>, <%xmm10>)
define(<KEY11>, <%xmm11>)
define(<KEY12>, <%xmm12>)
define(<KEY13>, <%xmm13>)
define(<KEYLAST>, <%xmm14>)
C The 16-byte data block currently being processed.
define(<BLOCK>, <%xmm15>)
	.file "aes-decrypt-internal.asm"
......@@ -58,43 +65,70 @@ define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
.text
ALIGN(16)
C Decrypt LENGTH bytes (a whole number of 16-byte blocks) from SRC to
C DST, using the expanded subkeys at KEYS. ROUNDS is 10, 12 or 14.
C All subkeys are read into xmm registers up front and the round loop
C is fully unrolled; branches on ROUNDS select 10, 12 or 14 rounds.
PROLOGUE(_nettle_aes_decrypt)
	C 6 integer arguments; 16 xmm registers used, so on W64 the
	C callee-saved xmm6-xmm15 are saved and restored.
	W64_ENTRY(6, 16)
	shr	$4, LENGTH		C Byte count -> block count
	test	LENGTH, LENGTH
	jz	.Lend

	C Read all subkeys into registers before the block loop.
	movups	(KEYS), KEY0
	movups	16(KEYS), KEY1
	movups	32(KEYS), KEY2
	movups	48(KEYS), KEY3
	movups	64(KEYS), KEY4
	movups	80(KEYS), KEY5
	movups	96(KEYS), KEY6
	movups	112(KEYS), KEY7
	movups	128(KEYS), KEY8
	movups	144(KEYS), KEY9
	lea	160(KEYS), KEYS
	sub	$10, XREG(ROUNDS)	C Also clears high half
	je	.Lkey_last		C ROUNDS = 0: AES-128, 10 rounds
	movups	(KEYS), KEY10
	movups	16(KEYS), KEY11
	C Advance KEYS by 16*ROUNDS; lea scale is limited to 8, so add
	C 8*ROUNDS twice.
	lea	(KEYS, ROUNDS, 8), KEYS
	lea	(KEYS, ROUNDS, 8), KEYS
	cmpl	$2, XREG(ROUNDS)
	je	.Lkey_last		C ROUNDS = 2: AES-192, 12 rounds
	movups	-32(KEYS), KEY12
	movups	-16(KEYS), KEY13
.Lkey_last:
	movups	(KEYS), KEYLAST

	C Fully unrolled rounds; ROUNDS is now 0, 2 or 4 and selects
	C how many of the optional rounds are executed per block.
.Lblock_loop:
	movups	(SRC), BLOCK
	pxor	KEY0, BLOCK
	aesdec	KEY1, BLOCK
	aesdec	KEY2, BLOCK
	aesdec	KEY3, BLOCK
	aesdec	KEY4, BLOCK
	aesdec	KEY5, BLOCK
	aesdec	KEY6, BLOCK
	aesdec	KEY7, BLOCK
	aesdec	KEY8, BLOCK
	aesdec	KEY9, BLOCK
	testl	XREG(ROUNDS), XREG(ROUNDS)
	je	.Lblock_end
	aesdec	KEY10, BLOCK
	aesdec	KEY11, BLOCK
	cmpl	$2, XREG(ROUNDS)
	je	.Lblock_end
	aesdec	KEY12, BLOCK
	aesdec	KEY13, BLOCK
.Lblock_end:
	aesdeclast	KEYLAST, BLOCK
	movups	BLOCK, (DST)
	add	$16, SRC
	add	$16, DST
	dec	LENGTH
	jnz	.Lblock_loop
.Lend:
	W64_EXIT(6, 16)
	ret
EPILOGUE(_nettle_aes_decrypt)
......@@ -2,7 +2,7 @@ C x86_64/aesni/aes-encrypt-internal.asm
ifelse(<
Copyright (C) 2015 Niels Möller
Copyright (C) 2015, 2018 Niels Möller
This file is part of GNU Nettle.
......@@ -39,16 +39,23 @@ define(<LENGTH>,<%rcx>)
C Destination and source pointers (arguments 5 and 6).
define(<DST>, <%r8>)
define(<SRC>, <%r9>)
C Round counter
define(<CNT>, <%rdx>)
C Subkey pointer
define(<KEY>, <%rax>)
C All subkeys are loaded into xmm registers once, before the block loop.
C KEY10/KEY11 are used only for 12 and 14 rounds, KEY12/KEY13 only for
C 14 rounds; KEYLAST always holds the final subkey.
define(<KEY0>, <%xmm0>)
define(<KEY1>, <%xmm1>)
define(<KEY2>, <%xmm2>)
define(<KEY3>, <%xmm3>)
define(<KEY4>, <%xmm4>)
define(<KEY5>, <%xmm5>)
define(<KEY6>, <%xmm6>)
define(<KEY7>, <%xmm7>)
define(<KEY8>, <%xmm8>)
define(<KEY9>, <%xmm9>)
define(<KEY10>, <%xmm10>)
define(<KEY11>, <%xmm11>)
define(<KEY12>, <%xmm12>)
define(<KEY13>, <%xmm13>)
define(<KEYLAST>, <%xmm14>)
C The 16-byte data block currently being processed.
define(<BLOCK>, <%xmm15>)
dnl aesenc %xmm1, %xmm0
dnl NOTE(review): hand-encoded AES-NI instructions, presumably kept for
dnl assemblers that lack the aesenc/aesenclast mnemonics -- confirm they
dnl are still referenced before removing.
define(<AESENC>, <.byte 0x66, 0x0f, 0x38, 0xdc, 0xc1>)
dnl aesenclast %xmm1, %xmm0
define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
	.file "aes-encrypt-internal.asm"
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
......@@ -58,43 +65,70 @@ define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
.text
ALIGN(16)
C Encrypt LENGTH bytes (a whole number of 16-byte blocks) from SRC to
C DST, using the expanded subkeys at KEYS. ROUNDS is 10, 12 or 14.
C All subkeys are read into xmm registers up front and the round loop
C is fully unrolled; branches on ROUNDS select 10, 12 or 14 rounds.
PROLOGUE(_nettle_aes_encrypt)
	C 6 integer arguments; 16 xmm registers used, so on W64 the
	C callee-saved xmm6-xmm15 are saved and restored.
	W64_ENTRY(6, 16)
	shr	$4, LENGTH		C Byte count -> block count
	test	LENGTH, LENGTH
	jz	.Lend

	C Read all subkeys into registers before the block loop.
	movups	(KEYS), KEY0
	movups	16(KEYS), KEY1
	movups	32(KEYS), KEY2
	movups	48(KEYS), KEY3
	movups	64(KEYS), KEY4
	movups	80(KEYS), KEY5
	movups	96(KEYS), KEY6
	movups	112(KEYS), KEY7
	movups	128(KEYS), KEY8
	movups	144(KEYS), KEY9
	lea	160(KEYS), KEYS
	sub	$10, XREG(ROUNDS)	C Also clears high half
	je	.Lkey_last		C ROUNDS = 0: AES-128, 10 rounds
	movups	(KEYS), KEY10
	movups	16(KEYS), KEY11
	C Advance KEYS by 16*ROUNDS; lea scale is limited to 8, so add
	C 8*ROUNDS twice.
	lea	(KEYS, ROUNDS, 8), KEYS
	lea	(KEYS, ROUNDS, 8), KEYS
	cmpl	$2, XREG(ROUNDS)
	je	.Lkey_last		C ROUNDS = 2: AES-192, 12 rounds
	movups	-32(KEYS), KEY12
	movups	-16(KEYS), KEY13
.Lkey_last:
	movups	(KEYS), KEYLAST

	C Fully unrolled rounds; ROUNDS is now 0, 2 or 4 and selects
	C how many of the optional rounds are executed per block.
.Lblock_loop:
	movups	(SRC), BLOCK
	pxor	KEY0, BLOCK
	aesenc	KEY1, BLOCK
	aesenc	KEY2, BLOCK
	aesenc	KEY3, BLOCK
	aesenc	KEY4, BLOCK
	aesenc	KEY5, BLOCK
	aesenc	KEY6, BLOCK
	aesenc	KEY7, BLOCK
	aesenc	KEY8, BLOCK
	aesenc	KEY9, BLOCK
	testl	XREG(ROUNDS), XREG(ROUNDS)
	je	.Lblock_end
	aesenc	KEY10, BLOCK
	aesenc	KEY11, BLOCK
	cmpl	$2, XREG(ROUNDS)
	je	.Lblock_end
	aesenc	KEY12, BLOCK
	aesenc	KEY13, BLOCK
.Lblock_end:
	aesenclast	KEYLAST, BLOCK
	movups	BLOCK, (DST)
	add	$16, SRC
	add	$16, DST
	dec	LENGTH
	jnz	.Lblock_loop
.Lend:
	W64_EXIT(6, 16)
	ret
EPILOGUE(_nettle_aes_encrypt)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment