Commit db9b8594 authored by Niels Möller's avatar Niels Möller

Unroll x86_64 aesni loops.

parent 0a679090
2018-01-03 Niels Möller <nisse@lysator.liu.se>
* x86_64/aesni/aes-encrypt-internal.asm: Read subkeys into xmm
registers before the block loop, and completely unroll the round
loop.
* x86_64/aesni/aes-decrypt-internal.asm: Likewise.
2017-11-19 Niels Möller <nisse@lysator.liu.se>
* Released nettle-3.4.
......
...@@ -2,7 +2,7 @@ C x86_64/aesni/aes-decrypt-internal.asm ...@@ -2,7 +2,7 @@ C x86_64/aesni/aes-decrypt-internal.asm
ifelse(< ifelse(<
Copyright (C) 2015 Niels Möller Copyright (C) 2015, 2018 Niels Möller
This file is part of GNU Nettle. This file is part of GNU Nettle.
...@@ -39,15 +39,22 @@ define(<LENGTH>,<%rcx>) ...@@ -39,15 +39,22 @@ define(<LENGTH>,<%rcx>)
define(<DST>, <%r8>) define(<DST>, <%r8>)
define(<SRC>, <%r9>) define(<SRC>, <%r9>)
C Round counter define(<KEY0>, <%xmm0>)
define(<CNT>, <%rdx>) define(<KEY1>, <%xmm1>)
C Subkey pointer define(<KEY2>, <%xmm2>)
define(<KEY>, <%rax>) define(<KEY3>, <%xmm3>)
define(<KEY4>, <%xmm4>)
dnl aesdec %xmm1, %xmm0 define(<KEY5>, <%xmm5>)
define(<AESDEC>, <.byte 0x66, 0x0f, 0x38, 0xde, 0xc1>) define(<KEY6>, <%xmm6>)
dnl aesdeclast %xmm1, %xmm0 define(<KEY7>, <%xmm7>)
define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>) define(<KEY8>, <%xmm8>)
define(<KEY9>, <%xmm9>)
define(<KEY10>, <%xmm10>)
define(<KEY11>, <%xmm11>)
define(<KEY12>, <%xmm12>)
define(<KEY13>, <%xmm13>)
define(<KEYLAST>, <%xmm14>)
define(<BLOCK>, <%xmm15>)
.file "aes-decrypt-internal.asm" .file "aes-decrypt-internal.asm"
...@@ -58,43 +65,70 @@ define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>) ...@@ -58,43 +65,70 @@ define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
.text .text
ALIGN(16) ALIGN(16)
PROLOGUE(_nettle_aes_decrypt) PROLOGUE(_nettle_aes_decrypt)
W64_ENTRY(6, 2) W64_ENTRY(6, 16)
shr $4, LENGTH shr $4, LENGTH
test LENGTH, LENGTH test LENGTH, LENGTH
jz .Lend jz .Lend
decl XREG(ROUNDS) movups (KEYS), KEY0
movups 16(KEYS), KEY1
movups 32(KEYS), KEY2
movups 48(KEYS), KEY3
movups 64(KEYS), KEY4
movups 80(KEYS), KEY5
movups 96(KEYS), KEY6
movups 112(KEYS), KEY7
movups 128(KEYS), KEY8
movups 144(KEYS), KEY9
lea 160(KEYS), KEYS
sub $10, XREG(ROUNDS) C Also clears high half
je .Lkey_last
movups (KEYS), KEY10
movups 16(KEYS), KEY11
lea (KEYS, ROUNDS, 8), KEYS
lea (KEYS, ROUNDS, 8), KEYS
cmpl $2, XREG(ROUNDS)
je .Lkey_last
movups -32(KEYS), KEY12
movups -16(KEYS), KEY13
.Lkey_last:
movups (KEYS), KEYLAST
.Lblock_loop: .Lblock_loop:
mov ROUNDS, CNT movups (SRC), BLOCK
mov KEYS, KEY pxor KEY0, BLOCK
movups (SRC), %xmm0 aesdec KEY1, BLOCK
C FIXME: Better alignment of subkeys, so we can use movaps. aesdec KEY2, BLOCK
movups (KEY), %xmm1 aesdec KEY3, BLOCK
pxor %xmm1, %xmm0 aesdec KEY4, BLOCK
aesdec KEY5, BLOCK
C FIXME: Could use some unrolling. Also all subkeys fit in aesdec KEY6, BLOCK
C registers, so they could be loaded once (on W64 we would aesdec KEY7, BLOCK
C need to save and restore some xmm registers, though). aesdec KEY8, BLOCK
aesdec KEY9, BLOCK
.Lround_loop: testl XREG(ROUNDS), XREG(ROUNDS)
add $16, KEY je .Lblock_end
aesdec KEY10, BLOCK
movups (KEY), %xmm1 aesdec KEY11, BLOCK
AESDEC C %xmm1, %xmm0 cmpl $2, XREG(ROUNDS)
decl XREG(CNT) je .Lblock_end
jnz .Lround_loop
aesdec KEY12, BLOCK
movups 16(KEY), %xmm1 aesdec KEY13, BLOCK
AESDECLAST C %xmm1, %xmm0
.Lblock_end:
movups %xmm0, (DST) aesdeclast KEYLAST, BLOCK
movups BLOCK, (DST)
add $16, SRC add $16, SRC
add $16, DST add $16, DST
dec LENGTH dec LENGTH
jnz .Lblock_loop jnz .Lblock_loop
.Lend: .Lend:
W64_EXIT(6, 2) W64_EXIT(6, 16)
ret ret
EPILOGUE(_nettle_aes_decrypt) EPILOGUE(_nettle_aes_decrypt)
...@@ -2,7 +2,7 @@ C x86_64/aesni/aes-encrypt-internal.asm ...@@ -2,7 +2,7 @@ C x86_64/aesni/aes-encrypt-internal.asm
ifelse(< ifelse(<
Copyright (C) 2015 Niels Möller Copyright (C) 2015, 2018 Niels Möller
This file is part of GNU Nettle. This file is part of GNU Nettle.
...@@ -39,16 +39,23 @@ define(<LENGTH>,<%rcx>) ...@@ -39,16 +39,23 @@ define(<LENGTH>,<%rcx>)
define(<DST>, <%r8>) define(<DST>, <%r8>)
define(<SRC>, <%r9>) define(<SRC>, <%r9>)
C Round counter define(<KEY0>, <%xmm0>)
define(<CNT>, <%rdx>) define(<KEY1>, <%xmm1>)
C Subkey pointer define(<KEY2>, <%xmm2>)
define(<KEY>, <%rax>) define(<KEY3>, <%xmm3>)
define(<KEY4>, <%xmm4>)
define(<KEY5>, <%xmm5>)
define(<KEY6>, <%xmm6>)
define(<KEY7>, <%xmm7>)
define(<KEY8>, <%xmm8>)
define(<KEY9>, <%xmm9>)
define(<KEY10>, <%xmm10>)
define(<KEY11>, <%xmm11>)
define(<KEY12>, <%xmm12>)
define(<KEY13>, <%xmm13>)
define(<KEYLAST>, <%xmm14>)
define(<BLOCK>, <%xmm15>)
dnl aesenc %xmm1, %xmm0
define(<AESENC>, <.byte 0x66, 0x0f, 0x38, 0xdc, 0xc1>)
dnl aesenclast %xmm1, %xmm0
define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
.file "aes-encrypt-internal.asm" .file "aes-encrypt-internal.asm"
C _aes_encrypt(unsigned rounds, const uint32_t *keys, C _aes_encrypt(unsigned rounds, const uint32_t *keys,
...@@ -58,43 +65,70 @@ define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>) ...@@ -58,43 +65,70 @@ define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
.text .text
ALIGN(16) ALIGN(16)
PROLOGUE(_nettle_aes_encrypt) PROLOGUE(_nettle_aes_encrypt)
W64_ENTRY(6, 2) W64_ENTRY(6, 16)
shr $4, LENGTH shr $4, LENGTH
test LENGTH, LENGTH test LENGTH, LENGTH
jz .Lend jz .Lend
decl XREG(ROUNDS) movups (KEYS), KEY0
movups 16(KEYS), KEY1
movups 32(KEYS), KEY2
movups 48(KEYS), KEY3
movups 64(KEYS), KEY4
movups 80(KEYS), KEY5
movups 96(KEYS), KEY6
movups 112(KEYS), KEY7
movups 128(KEYS), KEY8
movups 144(KEYS), KEY9
lea 160(KEYS), KEYS
sub $10, XREG(ROUNDS) C Also clears high half
je .Lkey_last
movups (KEYS), KEY10
movups 16(KEYS), KEY11
lea (KEYS, ROUNDS, 8), KEYS
lea (KEYS, ROUNDS, 8), KEYS
cmpl $2, XREG(ROUNDS)
je .Lkey_last
movups -32(KEYS), KEY12
movups -16(KEYS), KEY13
.Lkey_last:
movups (KEYS), KEYLAST
.Lblock_loop: .Lblock_loop:
mov ROUNDS, CNT movups (SRC), BLOCK
mov KEYS, KEY pxor KEY0, BLOCK
movups (SRC), %xmm0 aesenc KEY1, BLOCK
C FIXME: Better alignment of subkeys, so we can use movaps. aesenc KEY2, BLOCK
movups (KEY), %xmm1 aesenc KEY3, BLOCK
pxor %xmm1, %xmm0 aesenc KEY4, BLOCK
aesenc KEY5, BLOCK
C FIXME: Could use some unrolling. Also all subkeys fit in aesenc KEY6, BLOCK
C registers, so they could be loaded once (on W64 we would aesenc KEY7, BLOCK
C need to save and restore some xmm registers, though). aesenc KEY8, BLOCK
aesenc KEY9, BLOCK
.Lround_loop: testl XREG(ROUNDS), XREG(ROUNDS)
add $16, KEY je .Lblock_end
aesenc KEY10, BLOCK
movups (KEY), %xmm1 aesenc KEY11, BLOCK
AESENC C %xmm1, %xmm0 cmpl $2, XREG(ROUNDS)
decl XREG(CNT) je .Lblock_end
jnz .Lround_loop
aesenc KEY12, BLOCK
movups 16(KEY), %xmm1 aesenc KEY13, BLOCK
AESENCLAST C %xmm1, %xmm0
.Lblock_end:
movups %xmm0, (DST) aesenclast KEYLAST, BLOCK
movups BLOCK, (DST)
add $16, SRC add $16, SRC
add $16, DST add $16, DST
dec LENGTH dec LENGTH
jnz .Lblock_loop jnz .Lblock_loop
.Lend: .Lend:
W64_EXIT(6, 2) W64_EXIT(6, 16)
ret ret
EPILOGUE(_nettle_aes_encrypt) EPILOGUE(_nettle_aes_encrypt)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment