From b878b6d47b3f8cc7feacb6beb5351e6598d8a398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se> Date: Sun, 25 Jul 2010 20:23:56 +0200 Subject: [PATCH] New file. Rev: nettle/x86/camellia-crypt-internal.asm:1.1 --- x86/camellia-crypt-internal.asm | 213 ++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 x86/camellia-crypt-internal.asm diff --git a/x86/camellia-crypt-internal.asm b/x86/camellia-crypt-internal.asm new file mode 100644 index 00000000..b5c491c8 --- /dev/null +++ b/x86/camellia-crypt-internal.asm @@ -0,0 +1,213 @@ +C -*- mode: asm; asm-comment-char: ?C; -*- +C nettle, low-level cryptographics library +C +C Copyright (C) 2010, Niels Möller +C +C The nettle library is free software; you can redistribute it and/or modify +C it under the terms of the GNU Lesser General Public License as published by +C the Free Software Foundation; either version 2.1 of the License, or (at your +C option) any later version. +C +C The nettle library is distributed in the hope that it will be useful, but +C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +C License for more details. +C +C You should have received a copy of the GNU Lesser General Public License +C along with the nettle library; see the file COPYING.LIB. If not, write to +C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +C MA 02111-1307, USA. + +C Register usage: + +C Camellia state, 128-bit value in little endian order. +C L0, H0 corresponds to D1 in the spec and i0 in the C implementation. +C while L1, H1 corresponds to D2/i1. +define(<L0>,<%eax>) +define(<H0>,<%ebx>) +define(<L1>,<%ecx>) +define(<H1>,<%edx>) + +define(<TMP>,<%ebp>) +define(<KEY>,<%esi>) +define(<T>,<%edi>) + +C Locals on the stack + +define(<FRAME_L0>, <(%esp)>) +define(<FRAME_H0>, <4(%esp)>) +define(<FRAME_L1>, <8(%esp)>) +define(<FRAME_H1>, <12(%esp)>) +define(<FRAME_CNT>, <16(%esp)>) + +C Arguments on stack. +define(<FRAME_CTX>, <40(%esp)>) +define(<FRAME_TABLE>, <44(%esp)>) +define(<FRAME_LENGTH>, <48(%esp)>) +define(<FRAME_DST>, <52(%esp)>) +define(<FRAME_SRC>, <56(%esp)>) + +define(<SP1110>, <(T,$1,4)>) +define(<SP0222>, <1024(T,$1,4)>) +define(<SP3033>, <2048(T,$1,4)>) +define(<SP4404>, <3072(T,$1,4)>) + +C ROUND(xl, xh, yl, yh, key-offset) +C xl and xh are rotated 16 bits at the end +C yl and yh are read from stack, and left in registers +define(<ROUND>, < + movzbl LREG($1), TMP + movl SP1110(TMP), $4 + movzbl HREG($1), TMP + xorl SP4404(TMP), $4 + roll <$>16, $1 + + movzbl LREG($2), TMP + movl SP4404(TMP), $3 + movzbl HREG($2), TMP + xorl SP3033(TMP), $3 + roll <$>16, $2 + + movzbl LREG($1), TMP + xorl SP3033(TMP), $4 + movzbl HREG($1), TMP + xorl SP0222(TMP), $4 + + movzbl LREG($2), TMP + xorl SP0222(TMP), $3 + movzbl HREG($2), TMP + xorl SP1110(TMP), $3 + + xorl $5(KEY), $4 + xorl $5 + 4(KEY), $3 + + xorl $3, $4 + rorl <$>8, $3 + xorl $4, $3 + + xorl FRAME_$3, $3 + xorl FRAME_$4, $4 +>) + +C Six rounds, with inputs and outputs in registers. +define(<ROUND6>, < + movl L0, FRAME_L0 + movl H0, FRAME_H0 + movl L1, FRAME_L1 + movl H1, FRAME_H1 + + ROUND(L0,H0,<L1>,<H1>,0) + movl L1, FRAME_L1 + movl H1, FRAME_H1 + ROUND(L1,H1,<L0>,<H0>,8) + movl L0, FRAME_L0 + movl H0, FRAME_H0 + ROUND(L0,H0,<L1>,<H1>,16) + movl L1, FRAME_L1 + movl H1, FRAME_H1 + ROUND(L1,H1,<L0>,<H0>,24) + movl L0, FRAME_L0 + movl H0, FRAME_H0 + ROUND(L0,H0,<L1>,<H1>,32) + ROUND(L1,H1,<L0>,<H0>,40) + roll <$>16, L1 + roll <$>16, H1 +>) + +C FL(x0, x1, key-offset) +define(<FL>, < + movl $3 + 4(KEY), TMP + andl $2, TMP + roll <$>1, TMP + xorl TMP, $1 + movl $3(KEY), TMP + orl $1, TMP + xorl TMP, $2 +>) +C FLINV(x0, x1, key-offset) +define(<FLINV>, < + movl $3(KEY), TMP + orl $1, TMP + xorl TMP, $2 + movl $3 + 4(KEY), TMP + andl $2, TMP + roll <$>1, TMP + xorl TMP, $1 +>) + +.file "camellia-encrypt-internal.asm" + + C _camellia_crypt(struct camellia_context *ctx, + C const struct camellia_table *T, + C unsigned length, uint8_t *dst, + C uint8_t *src) + .text + ALIGN(4) +PROLOGUE(_nettle_camellia_crypt) + C save all registers that need to be saved + pushl %ebx C 32(%esp) + pushl %ebp C 28(%esp) + pushl %esi C 24(%esp) + pushl %edi C 20(%esp) + + subl $20, %esp + + movl FRAME_LENGTH, %ebp + testl %ebp,%ebp + jz .Lend + +.Lblock_loop: + C Load data, note that we'll happily do unaligned loads + movl FRAME_SRC, TMP + movl (TMP), H0 + bswap H0 + movl 4(TMP), L0 + bswap L0 + movl 8(TMP), H1 + bswap H1 + movl 12(TMP), L1 + bswap L1 + addl $16, FRAME_SRC + movl FRAME_CTX, KEY + movl (KEY), TMP + subl $8, TMP + mov TMP, FRAME_CNT + C Whitening using first subkey + xor 4(KEY), L0 + xor 8(KEY), H0 + add $12, KEY + + movl FRAME_TABLE, T + + ROUND6 +.Lround_loop: + add $64, KEY + FL(L0, H0, -16) + FLINV(L1, H1, -8) + ROUND6 + sub $8, FRAME_CNT + ja .Lround_loop + + movl FRAME_DST, TMP + bswap H0 + movl H0,8(TMP) + bswap L0 + movl L0,12(TMP) + xorl 52(KEY), H1 + bswap H1 + movl H1, 0(TMP) + xorl 48(KEY), L1 + bswap L1 + movl L1, 4(TMP) + addl $16, FRAME_DST + subl $16, FRAME_LENGTH + ja .Lblock_loop + +.Lend: + addl $20, %esp + popl %edi + popl %esi + popl %ebp + popl %ebx + ret +EPILOGUE(_nettle_camellia_crypt) -- GitLab