diff --git a/x86_64/aes-decrypt-internal.asm b/x86_64/aes-decrypt-internal.asm index d3a48f26ea298ed08197c1d7eeea242c13dad780..1e9d700baeee8c42f67f0dd02afe9d5db324a7d6 100644 --- a/x86_64/aes-decrypt-internal.asm +++ b/x86_64/aes-decrypt-internal.asm @@ -24,28 +24,25 @@ C Register usage: C AES state, use two of them define(<SA>,<%eax>) define(<SB>,<%ebx>) -define(<SC>,<%ebp>) -define(<SD>,<%r9d>) +define(<SC>,<%ecx>) +define(<SD>,<%edx>) define(<TA>,<%r10d>) define(<TB>,<%r11d>) define(<TC>,<%r12d>) -define(<TD>,<%r13d>) define(<CTX>, <%rdi>) define(<TABLE>, <%rsi>) -define(<LENGTH>,<%edx>) C Length is only 32 bits -define(<DST>, <%rcx>) +define(<PARAM_LENGTH>,<%edx>) C Length is only 32 bits +define(<PARAM_DST>, <%rcx>) define(<SRC>, <%r8>) +define(<DST>, <%r9>) define(<KEY>,<%r14>) define(<COUNT>, <%r15d>) +define(<BLOCK_COUNT>, <%r13d>) -C Put the outer loop counter on the stack, and reuse the LENGTH -C register as a temporary. - -define(<FRAME_COUNT>, <(%rsp)>) -define(<TMP>,<%rdx>) +define(<TMP>,<%rbp>) .file "aes-decrypt-internal.asm" @@ -56,7 +53,7 @@ define(<TMP>,<%rdx>) .text ALIGN(4) PROLOGUE(_nettle_aes_decrypt) - test LENGTH, LENGTH + test PARAM_LENGTH, PARAM_LENGTH jz .Lend C save all registers that need to be saved @@ -67,11 +64,9 @@ PROLOGUE(_nettle_aes_decrypt) push %r14 push %r15 - C Allocates 4 bytes more than we need, for nicer alignment. 
- sub $8, %rsp - - shrl $4, LENGTH - movl LENGTH, FRAME_COUNT + mov PARAM_DST, DST + movl PARAM_LENGTH, BLOCK_COUNT + shrl $4, BLOCK_COUNT .Lblock_loop: mov CTX,KEY @@ -95,19 +90,19 @@ PROLOGUE(_nettle_aes_decrypt) AES_ROUND(TABLE, SC,SB,SA,SD, TC, TMP) xorl 8(KEY),TC - AES_ROUND(TABLE, SD,SC,SB,SA, TD, TMP) - xorl 12(KEY),TD + AES_ROUND(TABLE, SD,SC,SB,SA, SD, TMP) + xorl 12(KEY),SD - AES_ROUND(TABLE, TA,TD,TC,TB, SA, TMP) + AES_ROUND(TABLE, TA,SD,TC,TB, SA, TMP) xorl 16(KEY), SA - AES_ROUND(TABLE, TB,TA,TD,TC, SB, TMP) + AES_ROUND(TABLE, TB,TA,SD,TC, SB, TMP) xorl 20(KEY),SB - AES_ROUND(TABLE, TC,TB,TA,TD, SC, TMP) + AES_ROUND(TABLE, TC,TB,TA,SD, SC, TMP) xorl 24(KEY),SC - AES_ROUND(TABLE, TD,TC,TB,TA, SD, TMP) + AES_ROUND(TABLE, SD,TC,TB,TA, SD, TMP) xorl 28(KEY),SD add $32,KEY C point to next key @@ -125,13 +120,13 @@ PROLOGUE(_nettle_aes_decrypt) AES_ROUND(TABLE, SC,SB,SA,SD, TC, TMP) xorl 8(KEY),TC - AES_ROUND(TABLE, SD,SC,SB,SA, TD, TMP) - xorl 12(KEY),TD + AES_ROUND(TABLE, SD,SC,SB,SA, SD, TMP) + xorl 12(KEY),SD - AES_FINAL_ROUND(TA,TD,TC,TB, TABLE, SA, TMP) - AES_FINAL_ROUND(TB,TA,TD,TC, TABLE, SB, TMP) - AES_FINAL_ROUND(TC,TB,TA,TD, TABLE, SC, TMP) - AES_FINAL_ROUND(TD,TC,TB,TA, TABLE, SD, TMP) + AES_FINAL_ROUND(TA,SD,TC,TB, TABLE, SA, TMP) + AES_FINAL_ROUND(TB,TA,SD,TC, TABLE, SB, TMP) + AES_FINAL_ROUND(TC,TB,TA,SD, TABLE, SC, TMP) + AES_FINAL_ROUND(SD,TC,TB,TA, TABLE, SD, TMP) C Inverse S-box substitution mov $3, COUNT @@ -145,11 +140,10 @@ PROLOGUE(_nettle_aes_decrypt) AES_STORE(SA,SB,SC,SD, KEY, DST) add $16, DST - decl FRAME_COUNT + decl BLOCK_COUNT jnz .Lblock_loop - add $8, %rsp pop %r15 pop %r14 pop %r13 diff --git a/x86_64/aes-encrypt-internal.asm b/x86_64/aes-encrypt-internal.asm index e89dc2bac2d1c2bddfc3ba63eb9dd22635541a25..839d3341ffa8289ba5dba34d7116498390389262 100644 --- a/x86_64/aes-encrypt-internal.asm +++ b/x86_64/aes-encrypt-internal.asm @@ -1,4 +1,3 @@ -C -*- mode: asm; asm-comment-char: ?C; -*- C nettle, low-level cryptographics 
library C C Copyright (C) 2001, 2002, 2005, 2008 Rafael R. Sevilla, Niels Möller @@ -25,28 +24,25 @@ C Register usage: C AES state, use two of them define(<SA>,<%eax>) define(<SB>,<%ebx>) -define(<SC>,<%ebp>) -define(<SD>,<%r9d>) +define(<SC>,<%ecx>) +define(<SD>,<%edx>) define(<TA>,<%r10d>) define(<TB>,<%r11d>) define(<TC>,<%r12d>) -define(<TD>,<%r13d>) define(<CTX>, <%rdi>) define(<TABLE>, <%rsi>) -define(<LENGTH>,<%edx>) C Length is only 32 bits -define(<DST>, <%rcx>) +define(<PARAM_LENGTH>,<%edx>) C Length is only 32 bits +define(<PARAM_DST>, <%rcx>) define(<SRC>, <%r8>) +define(<DST>, <%r9>) define(<KEY>,<%r14>) define(<COUNT>, <%r15d>) +define(<BLOCK_COUNT>, <%r13d>) -C Put the outer loop counter on the stack, and reuse the LENGTH -C register as a temporary. - -define(<FRAME_COUNT>, <(%rsp)>) -define(<TMP>,<%rdx>) +define(<TMP>,<%rbp>) .file "aes-encrypt-internal.asm" @@ -57,7 +53,7 @@ define(<TMP>,<%rdx>) .text ALIGN(4) PROLOGUE(_nettle_aes_encrypt) - test LENGTH, LENGTH + test PARAM_LENGTH, PARAM_LENGTH jz .Lend C save all registers that need to be saved @@ -68,11 +64,9 @@ PROLOGUE(_nettle_aes_encrypt) push %r14 push %r15 - C Allocates 4 bytes more than we need, for nicer alignment. 
- sub $8, %rsp - - shrl $4, LENGTH - movl LENGTH, FRAME_COUNT + mov PARAM_DST, DST + movl PARAM_LENGTH, BLOCK_COUNT + shrl $4, BLOCK_COUNT .Lblock_loop: mov CTX,KEY @@ -96,19 +90,19 @@ PROLOGUE(_nettle_aes_encrypt) AES_ROUND(TABLE, SC,SD,SA,SB, TC, TMP) xorl 8(KEY),TC - AES_ROUND(TABLE, SD,SA,SB,SC, TD, TMP) - xorl 12(KEY),TD + AES_ROUND(TABLE, SD,SA,SB,SC, SD, TMP) + xorl 12(KEY),SD - AES_ROUND(TABLE, TA,TB,TC,TD, SA, TMP) + AES_ROUND(TABLE, TA,TB,TC,SD, SA, TMP) xorl 16(KEY), SA - AES_ROUND(TABLE, TB,TC,TD,TA, SB, TMP) + AES_ROUND(TABLE, TB,TC,SD,TA, SB, TMP) xorl 20(KEY),SB - AES_ROUND(TABLE, TC,TD,TA,TB, SC, TMP) + AES_ROUND(TABLE, TC,SD,TA,TB, SC, TMP) xorl 24(KEY),SC - AES_ROUND(TABLE, TD,TA,TB,TC, SD, TMP) + AES_ROUND(TABLE, SD,TA,TB,TC, SD, TMP) xorl 28(KEY),SD add $32,KEY C point to next key @@ -126,13 +120,13 @@ PROLOGUE(_nettle_aes_encrypt) AES_ROUND(TABLE, SC,SD,SA,SB, TC, TMP) xorl 8(KEY),TC - AES_ROUND(TABLE, SD,SA,SB,SC, TD, TMP) - xorl 12(KEY),TD + AES_ROUND(TABLE, SD,SA,SB,SC, SD, TMP) + xorl 12(KEY),SD - AES_FINAL_ROUND(TA,TB,TC,TD, TABLE, SA, TMP) - AES_FINAL_ROUND(TB,TC,TD,TA, TABLE, SB, TMP) - AES_FINAL_ROUND(TC,TD,TA,TB, TABLE, SC, TMP) - AES_FINAL_ROUND(TD,TA,TB,TC, TABLE, SD, TMP) + AES_FINAL_ROUND(TA,TB,TC,SD, TABLE, SA, TMP) + AES_FINAL_ROUND(TB,TC,SD,TA, TABLE, SB, TMP) + AES_FINAL_ROUND(TC,SD,TA,TB, TABLE, SC, TMP) + AES_FINAL_ROUND(SD,TA,TB,TC, TABLE, SD, TMP) C S-box substitution mov $3, COUNT @@ -146,11 +140,10 @@ PROLOGUE(_nettle_aes_encrypt) AES_STORE(SA,SB,SC,SD, KEY, DST) add $16, DST - decl FRAME_COUNT + decl BLOCK_COUNT jnz .Lblock_loop - add $8, %rsp pop %r15 pop %r14 pop %r13 diff --git a/x86_64/aes.m4 b/x86_64/aes.m4 index 26f4b29ac6b74092dfa6e325fd63f55f4b830c7a..9f251c50d2e8ba8ed609e5d752d7a126b8815900 100644 --- a/x86_64/aes.m4 +++ b/x86_64/aes.m4 @@ -17,6 +17,28 @@ define(<LREG>,<ifelse( $1, %r14d, %r14b, $1, %r15d, %r15b)>)dnl +define(<HREG>,<ifelse( + $1, %eax, %ah, + $1, %ebx, %bh, + $1, %ecx, %ch, + $1, %edx, %dh, + 
error)>) + +dnl MOVE_HREG(src, dst) +define(<MOVE_HREG>, <ifelse( + $1, %eax, <movzb %ah, $2 + >, + $1, %ebx, <movzb %bh, $2 + >, + $1, %ecx, <movzb %ch, $2 + >, + $1, %edx, <movzb %dh, $2 + >, + <movl $1, $2 + shr <$>8, $2 + and <$>0xff, $2 + >)>) + define(<XREG>,<ifelse( $1, %rax, %eax, $1, %rbx, %ebx, @@ -26,8 +48,8 @@ define(<XREG>,<ifelse( $1, %rdi, %edi, $1, %rbp, %ebp, $1, %rsp, %esp, - $1, %r8d, %r8d, - $1, %r9d, %r9d, + $1, %r8, %r8d, + $1, %r9, %r9d, $1, %r10,%r10d, $1, %r11,%r11d, $1, %r12,%r12d, @@ -72,9 +94,7 @@ dnl Computes one word of the AES round. Leaves result in $6. define(<AES_ROUND>, < movzb LREG($2), $7 movl AES_TABLE0 ($1, $7, 4),$6 - movl $3, XREG($7) - shr <$>8,$7 - and <$>0xff,$7 + MOVE_HREG($3, XREG($7)) xorl AES_TABLE1 ($1, $7, 4),$6 movl $4,XREG($7) shr <$>16,$7