diff --git a/ChangeLog b/ChangeLog
index 155af0edd44b4223e4ec791bd604231a51d7b2f3..9ab115b1197855d280c2f59b64307c3bfb7b03be 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
 2013-04-16  Niels Möller  <nisse@lysator.liu.se>
 
+	* x86_64/umac-nh-n.asm: New file, 3.5 times speedup.
+
 	* umac32.c (umac32_digest): Fix nonce caching.
 	* umac64.c (umac64_digest): Likewise.
 
diff --git a/x86_64/umac-nh-n.asm b/x86_64/umac-nh-n.asm
new file mode 100644
index 0000000000000000000000000000000000000000..fd145ccd126f415bbdc15f71778d1172e128d991
--- /dev/null
+++ b/x86_64/umac-nh-n.asm
@@ -0,0 +1,261 @@
+C nettle, low-level cryptographic library
+C
+C Copyright (C) 2013 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB.  If not, write to
+C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+C MA 02110-1301, USA.
+
+define(<OUT>, <%rdi>)
+define(<ITERS>, <%rsi>)
+define(<KEY>, <%rdx>)
+define(<LENGTH>, <%rcx>)
+define(<MSG>, <%r8>)
+
+define(<XM0>, <%xmm0>)
+define(<XM1>, <%xmm1>)
+define(<XM2>, <%xmm2>)
+define(<XM3>, <%xmm3>)
+define(<XK0>, <%xmm4>)
+define(<XK1>, <%xmm5>)
+define(<XK2>, <%xmm6>)
+define(<XK3>, <%xmm7>)
+define(<XT0>, <%xmm8>)
+define(<XT1>, <%xmm9>)
+define(<XT2>, <%xmm10>)
+define(<XT3>, <%xmm11>)
+define(<XY0>, <%xmm12>)
+define(<XY1>, <%xmm13>)
+
+C HI2LO: copy dwords [0,1,2,3] to [1,1,3,3]
+define(<HI2LO>, <pshufd <$>0xf5,>)
+
+C FIXME: Would be nice if we could force the key array to be 16-byte
+C aligned.
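+
+C The NH transform (cf. RFC 4418): for each 32-byte message block,
+C sum the four products (m[j] + k[j]) * (m[j+4] + k[j+4]), j = 0..3,
+C with wrapping 32-bit sums widened to 64-bit products. Pass i of
+C the n interleaved passes reads the same message with the key
+C window shifted by 4 words.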
+
+        .file "umac-nh-n.asm"
+
+        C umac_nh_n(uint64_t *out, unsigned n, const uint32_t *key,
+        C           unsigned length, const uint8_t *msg)
+        .text
+        ALIGN(4)
+PROLOGUE(_nettle_umac_nh_n)
+        W64_ENTRY(5, 14)
+        pxor    XY0, XY0
+        cmp     $3, ITERS
+        jc      .Lnh2
+        je      .Lnh3
+
+.Lnh4:
+        movups  (KEY), XK0
+        movups  16(KEY), XT2
+        movups  32(KEY), XK2
+        lea     48(KEY), KEY
+        C Leave XK2 untouched, and put permuted keys in XK0, XK1, XT2, XT3
+        movaps  XK0, XT0
+        movaps  XK0, XK1
+
+        punpcklqdq      XT2, XK0        C [0,1,4,5]
+        punpckhqdq      XT2, XK1        C [2,3,6,7]
+        movaps  XT2, XT3
+        punpcklqdq      XK2, XT2        C [4,5, 8, 9]
+        punpckhqdq      XK2, XT3        C [6,7,10,11]
+
+        movaps  XY0, XY1
+
+.Loop4:
+        movups  (MSG), XT0
+        movups  16(MSG), XT1
+
+        pshufd  $0xee, XT1, XM3 C [6,7,6,7]
+        pshufd  $0x44, XT1, XM2 C [4,5,4,5]
+        pshufd  $0xee, XT0, XM1 C [2,3,2,3]
+        pshufd  $0x44, XT0, XM0 C [0,1,0,1]
+
+        paddd   XM0, XK0
+        paddd   XM1, XK1
+        paddd   XM2, XT2
+        paddd   XM3, XT3
+
+        HI2LO   XK0, XT0
+        HI2LO   XT2, XT1
+        pmuludq XK0, XT2
+        pmuludq XT0, XT1
+        paddq   XT2, XY0
+        paddq   XT1, XY0
+
+        HI2LO   XK1, XT0
+        HI2LO   XT3, XT1
+        pmuludq XK1, XT3
+        pmuludq XT0, XT1
+        paddq   XT3, XY0
+        paddq   XT1, XY0
+
+        movaps  XK2, XK0
+        movaps  XK2, XK1
+        movups  (KEY), XT2
+        movups  16(KEY), XK2
+        punpcklqdq      XT2, XK0        C [ 8, 9,12,13]
+        punpckhqdq      XT2, XK1        C [10,11,14,15]
+        movaps  XT2, XT3
+
+        punpcklqdq      XK2, XT2        C [12,13,16,17]
+        punpckhqdq      XK2, XT3        C [14,15,18,19]
+
+        paddd   XK0, XM0
+        paddd   XK1, XM1
+        paddd   XT2, XM2
+        paddd   XT3, XM3
+
+        HI2LO   XM0, XT0
+        HI2LO   XM2, XT1
+        pmuludq XM0, XM2
+        pmuludq XT0, XT1
+        paddq   XM2, XY1
+        paddq   XT1, XY1
+
+        HI2LO   XM1, XT0
+        HI2LO   XM3, XT1
+        pmuludq XM1, XM3
+        pmuludq XT0, XT1
+        paddq   XM3, XY1
+        paddq   XT1, XY1
+
+        subl    $32, XREG(LENGTH)
+        lea     32(MSG), MSG
+        lea     32(KEY), KEY
+        ja      .Loop4
+
+        movups  XY0, (OUT)
+        movups  XY1, 16(OUT)
+
+        W64_EXIT(5, 14)
+        ret
+
+.Lnh3:
+        movups  (KEY), XK0
+        movups  16(KEY), XK1
+        movaps  XY0, XY1
+.Loop3:
+        lea     32(KEY), KEY
+        movups  (MSG), XT0
+        movups  16(MSG), XT1
+        movups  (KEY), XK2
+        movups  16(KEY), XK3
+        pshufd  $0xee, XT1, XM3 C [6,7,6,7]
+        pshufd  $0x44, XT1, XM2 C [4,5,4,5]
+        pshufd  $0xee, XT0, XM1 C [2,3,2,3]
+        pshufd  $0x44, XT0, XM0 C [0,1,0,1]
+
+        C Iteration 2
+        paddd   XK2, XT0
+        paddd   XK3, XT1
+        HI2LO   XT0, XT2
+        HI2LO   XT1, XT3
+        pmuludq XT0, XT1
+        pmuludq XT2, XT3
+        paddq   XT1, XY1
+        paddq   XT3, XY1
+
+        C Iterations 0 and 1
+        movaps  XK0, XT0
+        punpcklqdq      XK1, XK0        C [0,1,4,5]
+        punpckhqdq      XK1, XT0        C [2,3,6,7]
+        paddd   XK0, XM0
+        paddd   XT0, XM1
+        movaps  XK2, XK0
+        movaps  XK1, XT0
+        punpcklqdq      XK2, XK1        C [4,5,8,9]
+        punpckhqdq      XK2, XT0        C [6,7,10,11]
+        paddd   XK1, XM2
+        paddd   XT0, XM3
+
+        HI2LO   XM0, XT0
+        HI2LO   XM2, XT1
+        pmuludq XM0, XM2
+        pmuludq XT0, XT1
+        paddq   XM2, XY0
+        paddq   XT1, XY0
+
+        HI2LO   XM1, XT0
+        HI2LO   XM3, XT1
+        pmuludq XM1, XM3
+        pmuludq XT0, XT1
+        paddq   XM3, XY0
+        paddq   XT1, XY0
+        subl    $32, XREG(LENGTH)
+        lea     32(MSG), MSG
+        movaps  XK2, XK0
+        movaps  XK3, XK1
+
+        ja      .Loop3
+
+        C Fold the two 64-bit halves of the iteration-2 accumulator
+        pshufd  $0xe, XY1, XT0
+        paddq   XT0, XY1
+        movups  XY0, (OUT)
+        movlpd  XY1, 16(OUT)
+
+        W64_EXIT(5, 14)
+        ret
+
+.Lnh2:
+        C Explode message as  [0,1,0,1] [2,3,2,3] [4,5,4,5] [6,7,6,7]
+        C Interleave keys as  [0,1,4,5] [2,3,6,7] [4,5,8,9] [6,7,10,11]
+        movups  (KEY), XK0
+        lea     16(KEY), KEY
+.Loop2:
+        movups  (MSG), XM0
+        movups  16(MSG), XM1
+        pshufd  $0xee, XM1, XM3 C [6,7,6,7]
+        pshufd  $0x44, XM1, XM2 C [4,5,4,5]
+        pshufd  $0xee, XM0, XM1 C [2,3,2,3]
+        pshufd  $0x44, XM0, XM0 C [0,1,0,1]
+
+        movups  (KEY), XK1
+        movups  16(KEY), XK2
+        movaps  XK0, XT0
+        punpcklqdq      XK1, XK0        C [0,1,4,5]
+        punpckhqdq      XK1, XT0        C [2,3,6,7]
+        paddd   XK0, XM0
+        paddd   XT0, XM1
+        movaps  XK2, XK0
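+        C (XK2 = key words [8,9,10,11] of this window; the copy
+        C above carries it over as the next block's XK0.)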
+        movaps  XK1, XT0
+        punpcklqdq      XK2, XK1        C [4,5,8,9]
+        punpckhqdq      XK2, XT0        C [6,7,10,11]
+        paddd   XK1, XM2
+        paddd   XT0, XM3
+
+        HI2LO   XM0, XT0
+        HI2LO   XM2, XT1
+        pmuludq XM0, XM2
+        pmuludq XT0, XT1
+        paddq   XM2, XY0
+        paddq   XT1, XY0
+
+        HI2LO   XM1, XT0
+        HI2LO   XM3, XT1
+        pmuludq XM1, XM3
+        pmuludq XT0, XT1
+        paddq   XM3, XY0
+        paddq   XT1, XY0
+        subl    $32, XREG(LENGTH)
+        lea     32(MSG), MSG
+        lea     32(KEY), KEY
+
+        ja      .Loop2
+
+        movups  XY0, (OUT)
+.Lend:
+        W64_EXIT(5, 14)
+        ret
+EPILOGUE(_nettle_umac_nh_n)
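For reference, a minimal portable C sketch of the transform the new assembly computes, assuming length is a positive multiple of 32 as the assembly requires. The helper and function names here (le_read_u32, nh_pass, nh_n_ref) are ours for illustration, not nettle's; nettle's own C fallback is _nettle_umac_nh in umac-nh.c.

#include <stdint.h>

/* Hypothetical helper (not nettle's name): read a 32-bit
   little-endian word. */
static uint32_t
le_read_u32 (const uint8_t *p)
{
  return (uint32_t) p[0] | ((uint32_t) p[1] << 8)
    | ((uint32_t) p[2] << 16) | ((uint32_t) p[3] << 24);
}

/* One NH pass. length must be a positive multiple of 32; each
   32-byte block contributes (m[j] + k[j]) * (m[j+4] + k[j+4]) for
   j = 0..3, with sums wrapping mod 2^32 and products accumulated
   mod 2^64. */
static uint64_t
nh_pass (const uint32_t *key, unsigned length, const uint8_t *msg)
{
  uint64_t y = 0;
  for (; length > 0; length -= 32, msg += 32, key += 8)
    for (unsigned j = 0; j < 4; j++)
      {
        uint32_t a = le_read_u32 (msg + 4*j) + key[j];
        uint32_t b = le_read_u32 (msg + 16 + 4*j) + key[j + 4];
        y += (uint64_t) a * b;
      }
  return y;
}

/* n interleaved passes (n = 2, 3 or 4 in the assembly); pass i uses
   the key window shifted by four words, so the key array must hold
   length/4 + 4*(n-1) words. */
static void
nh_n_ref (uint64_t *out, unsigned n, const uint32_t *key,
          unsigned length, const uint8_t *msg)
{
  for (unsigned i = 0; i < n; i++)
    out[i] = nh_pass (key + 4*i, length, msg);
}

The speedup comes from evaluating two passes per XMM register: each pmuludq produces two 32x32->64 products at once, so the n=4 path covers all sixteen products of a block with eight multiplies, accumulating into the four 64-bit lanes of XY0 and XY1.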