From 044d24b0ad6bce4269bc9b5d44f179b70fa24c4a Mon Sep 17 00:00:00 2001 From: Mamone Tarsha <maamoun.tk@googlemail.com> Date: Tue, 18 Jan 2022 19:32:31 +0200 Subject: [PATCH] [S390x] Optimize Chacha20 --- fat-s390x.c | 42 ++++- s390x/fat/chacha-2core.asm | 35 ++++ s390x/fat/chacha-4core.asm | 35 ++++ s390x/fat/chacha-core-internal-2.asm | 36 ++++ s390x/vf/chacha-2core.asm | 230 ++++++++++++++++++++++++ s390x/vf/chacha-4core.asm | 255 +++++++++++++++++++++++++++ s390x/vf/chacha-core-internal.asm | 131 ++++++++++++++ 7 files changed, 759 insertions(+), 5 deletions(-) create mode 100644 s390x/fat/chacha-2core.asm create mode 100644 s390x/fat/chacha-4core.asm create mode 100644 s390x/fat/chacha-core-internal-2.asm create mode 100644 s390x/vf/chacha-2core.asm create mode 100644 s390x/vf/chacha-4core.asm create mode 100644 s390x/vf/chacha-core-internal.asm diff --git a/fat-s390x.c b/fat-s390x.c index db793e2c..6c68c445 100644 --- a/fat-s390x.c +++ b/fat-s390x.c @@ -268,6 +268,18 @@ DECLARE_FAT_FUNC(nettle_sha3_permute, sha3_permute_func) DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, c) DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, s390x) +DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) +DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c) +DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, s390x) + +DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func) +DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core) +DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core) + +DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func) +DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core) +DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core) + static void CONSTRUCTOR fat_init (void) { @@ -281,18 +293,20 @@ fat_init (void) if (features.have_vector_facility) { if (verbose) - fprintf (stderr, "libnettle: enabling vectorized memxor3.\n"); + fprintf (stderr, "libnettle: enabling vector facility code.\n"); nettle_memxor3_vec = 
_nettle_memxor3_s390x; - - if (verbose) - fprintf (stderr, "libnettle: enabling vectorized sha3 permute.\n"); nettle_sha3_permute_vec = _nettle_sha3_permute_s390x; + _nettle_chacha_core_vec = _nettle_chacha_core_s390x; + nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core; + nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core; } else { nettle_memxor3_vec = _nettle_memxor3_c; - nettle_sha3_permute_vec = _nettle_sha3_permute_c; + _nettle_chacha_core_vec = _nettle_chacha_core_c; + nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core; + nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core; } /* AES128 */ @@ -497,3 +511,21 @@ DEFINE_FAT_FUNC(_nettle_sha512_compress, void, /* SHA3 */ DEFINE_FAT_FUNC(nettle_sha3_permute, void, (struct sha3_state *state), (state)) + +DEFINE_FAT_FUNC(_nettle_chacha_core, void, + (uint32_t *dst, const uint32_t *src, unsigned rounds), + (dst, src, rounds)) + +DEFINE_FAT_FUNC(nettle_chacha_crypt, void, + (struct chacha_ctx *ctx, + size_t length, + uint8_t *dst, + const uint8_t *src), + (ctx, length, dst, src)) + +DEFINE_FAT_FUNC(nettle_chacha_crypt32, void, + (struct chacha_ctx *ctx, + size_t length, + uint8_t *dst, + const uint8_t *src), + (ctx, length, dst, src)) diff --git a/s390x/fat/chacha-2core.asm b/s390x/fat/chacha-2core.asm new file mode 100644 index 00000000..a97de67e --- /dev/null +++ b/s390x/fat/chacha-2core.asm @@ -0,0 +1,35 @@ +C s390x/fat/chacha-2core.asm + +ifelse(` + Copyright (C) 2022 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. 
+ + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_fat_chacha_2core) picked up by configure + +include_src(`s390x/vf/chacha-2core.asm') diff --git a/s390x/fat/chacha-4core.asm b/s390x/fat/chacha-4core.asm new file mode 100644 index 00000000..ad9e893c --- /dev/null +++ b/s390x/fat/chacha-4core.asm @@ -0,0 +1,35 @@ +C s390x/fat/chacha-4core.asm + +ifelse(` + Copyright (C) 2022 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure + +include_src(`s390x/vf/chacha-4core.asm') diff --git a/s390x/fat/chacha-core-internal-2.asm b/s390x/fat/chacha-core-internal-2.asm new file mode 100644 index 00000000..8e17e01a --- /dev/null +++ b/s390x/fat/chacha-core-internal-2.asm @@ -0,0 +1,36 @@ +C s390x/fat/chacha-core-internal-2.asm + +ifelse(` + Copyright (C) 2022 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_chacha_core) picked up by configure + +define(`fat_transform', `$1_s390x') +include_src(`s390x/vf/chacha-core-internal.asm') diff --git a/s390x/vf/chacha-2core.asm b/s390x/vf/chacha-2core.asm new file mode 100644 index 00000000..34bd9a27 --- /dev/null +++ b/s390x/vf/chacha-2core.asm @@ -0,0 +1,230 @@ +C s390x/vf/chacha-2core.asm + +ifelse(` + Copyright (C) 2020 Niels Möller and Torbjörn Granlund + Copyright (C) 2022 Mamone Tarsha + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +C Arguments +define(`DST', `%r2') +define(`SRC', `%r3') +define(`ROUNDS', `%r4') + +C State, even elements in X, odd elements in Y +define(`X0', `%v1') +define(`X1', `%v2') +define(`X2', `%v3') +define(`X3', `%v29') +define(`Y0', `%v4') +define(`Y1', `%v5') +define(`Y2', `%v6') +define(`Y3', `%v7') + +C Original input state +define(`S0', `%v24') +define(`S1', `%v25') +define(`S2', `%v26') +define(`S3', `%v27') +define(`S3p1', `%v28') + +define(`T0', `%v0') + +define(`BRW', `%v30') +define(`EW', `%v30') +define(`OW', `%v31') + +.file "chacha-2core.asm" +.machine "z13" + +.text +C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds) + +PROLOGUE(_nettle_chacha_2core) + + vzero X1 + vleif X1, 1, 0 + + vl X3, 48(SRC) + + vaccf Y3, X3, X1 C Counter carry out + vsldb Y3, Y3, Y3, 12 + vo Y3, Y3, X1 + +.Lshared_entry: + vaf Y3, Y3, X3 + + vlm X0, X2, 0(SRC) + + vlr S0, X0 + vlr S1, X1 + vlr S2, X2 + vlr S3, X3 + vlr S3p1, Y3 + + larl %r5,.Lword_even + vlm EW, OW, 0(%r5) + + vperm Y0, X0, X0, OW C 1 1 3 3 + vperm X0, X0, X0, EW C 0 0 2 2 + vperm Y1, X1, X1, 
OW C 5 5 7 7 + vperm X1, X1, X1, EW C 4 4 6 6 + vperm Y2, X2, X2, OW C 9 9 11 11 + vperm X2, X2, X2, EW C 8 8 10 10 + vperm Y3, X3, S3p1, OW C 13 13 15 15 + vperm X3, X3, S3p1, EW C 12 12 14 14 + + srlg ROUNDS, ROUNDS, 1 +.Loop: +C Register layout (A is first block, B is second block) +C +C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3 +C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7 +C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11 +C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15 + vaf X0, X0, X1 + vaf Y0, Y0, Y1 + vx X3, X3, X0 + vx Y3, Y3, Y0 + verllf X3, X3, 16 + verllf Y3, Y3, 16 + + vaf X2, X2, X3 + vaf Y2, Y2, Y3 + vx X1, X1, X2 + vx Y1, Y1, Y2 + verllf X1, X1, 12 + verllf Y1, Y1, 12 + + vaf X0, X0, X1 + vaf Y0, Y0, Y1 + vx X3, X3, X0 + vx Y3, Y3, Y0 + verllf X3, X3, 8 + verllf Y3, Y3, 8 + + vaf X2, X2, X3 + vaf Y2, Y2, Y3 + vx X1, X1, X2 + vx Y1, Y1, Y2 + verllf X1, X1, 7 + verllf Y1, Y1, 7 + + vpdi X1, X1, X1, 0b0100 + vpdi X2, X2, X2, 0b0100 + vpdi Y2, Y2, Y2, 0b0100 + vpdi Y3, Y3, Y3, 0b0100 + +C Register layout: +C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3 +C Y1: A5 B5 A7 B7 X1: A6 B6 A4 B4 (X1 swapped) +C X2: A10 B10 A8 B8 Y2: A11 B11 A9 B9 (X2, Y2 swapped) +C Y3 A15 B15 A13 B13 X3 A12 B12 A14 B14 (Y3 swapped) + + vaf X0, X0, Y1 + vaf Y0, Y0, X1 + vx Y3, Y3, X0 + vx X3, X3, Y0 + verllf Y3, Y3, 16 + verllf X3, X3, 16 + + vaf X2, X2, Y3 + vaf Y2, Y2, X3 + vx Y1, Y1, X2 + vx X1, X1, Y2 + verllf Y1, Y1, 12 + verllf X1, X1, 12 + + vaf X0, X0, Y1 + vaf Y0, Y0, X1 + vx Y3, Y3, X0 + vx X3, X3, Y0 + verllf Y3, Y3, 8 + verllf X3, X3, 8 + + vaf X2, X2, Y3 + vaf Y2, Y2, X3 + vx Y1, Y1, X2 + vx X1, X1, Y2 + verllf Y1, Y1, 7 + verllf X1, X1, 7 + + vpdi X1, X1, X1, 0b0100 + vpdi X2, X2, X2, 0b0100 + vpdi Y2, Y2, Y2, 0b0100 + vpdi Y3, Y3, Y3, 0b0100 + + brctg ROUNDS, .Loop + + vperm T0, X0, Y0, EW + vperm Y0, X0, Y0, OW + + vperm X0, X1, Y1, EW + vperm Y1, X1, Y1, OW + + vperm X1, X2, Y2, EW + vperm Y2, X2, Y2, OW + + vperm X2, X3, Y3, EW + vperm Y3, X3, Y3, OW + + vaf T0, T0, S0 + vaf Y0, Y0, S0 + vaf X0, X0, S1 + vaf 
Y1, Y1, S1 + vaf X1, X1, S2 + vaf Y2, Y2, S2 + vaf X2, X2, S3 + vaf Y3, Y3, S3p1 + + vl BRW, 32(%r5) + vperm T0, T0, T0, BRW + vperm X0, X0, X0, BRW + vperm X1, X1, X1, BRW + vperm X2, X2, X2, BRW + vperm Y0, Y0, Y0, BRW + vperm Y1, Y1, Y1, BRW + vperm Y2, Y2, Y2, BRW + vperm Y3, Y3, Y3, BRW + + vstm T0, Y3, 0(DST) + br RA +EPILOGUE(_nettle_chacha_2core) + +PROLOGUE(_nettle_chacha_2core32) + vzero Y3 + vleif Y3, 1, 0 + vl X3, 48(SRC) + j .Lshared_entry +EPILOGUE(_nettle_chacha_2core32) + +.align 16 +.Lword_even: .long 0x00010203,0x10111213,0x08090A0B,0x18191A1B +.Lword_odd: .long 0x04050607,0x14151617,0x0C0D0E0F,0x1C1D1E1F +.Lword_byte_reverse: .long 0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C diff --git a/s390x/vf/chacha-4core.asm b/s390x/vf/chacha-4core.asm new file mode 100644 index 00000000..276de9f1 --- /dev/null +++ b/s390x/vf/chacha-4core.asm @@ -0,0 +1,255 @@ +C s390x/vf/chacha-4core.asm + +ifelse(` + Copyright (C) 2020 Niels Möller and Torbjörn Granlund + Copyright (C) 2022 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+') + +C Register usage: + +C Arguments +define(`DST', `%r2') +define(`SRC', `%r3') +define(`ROUNDS', `%r4') + +C Working state + +define(`BRW', `%v24') + +C During the loop, used to save the original values for last 4 words +C of each block. Also used as temporaries for transpose. +define(`T0', `%v25') +define(`T1', `%v26') +define(`T2', `%v27') +define(`T3', `%v28') + +C Main loop for round +define(`QR',` + vaf $1, $1, $2 + vaf $5, $5, $6 + vaf $9, $9, $10 + vaf $13, $13, $14 + vx $4, $4, $1 + vx $8, $8, $5 + vx $12, $12, $9 + vx $16, $16, $13 + verllf $4, $4, 16 + verllf $8, $8, 16 + verllf $12, $12, 16 + verllf $16, $16, 16 + + vaf $3, $3, $4 + vaf $7, $7, $8 + vaf $11, $11, $12 + vaf $15, $15, $16 + vx $2, $2, $3 + vx $6, $6, $7 + vx $10, $10, $11 + vx $14, $14, $15 + verllf $2, $2, 12 + verllf $6, $6, 12 + verllf $10, $10, 12 + verllf $14, $14, 12 + + vaf $1, $1, $2 + vaf $5, $5, $6 + vaf $9, $9, $10 + vaf $13, $13, $14 + vx $4, $4, $1 + vx $8, $8, $5 + vx $12, $12, $9 + vx $16, $16, $13 + verllf $4, $4, 8 + verllf $8, $8, 8 + verllf $12, $12, 8 + verllf $16, $16, 8 + + vaf $3, $3, $4 + vaf $7, $7, $8 + vaf $11, $11, $12 + vaf $15, $15, $16 + vx $2, $2, $3 + vx $6, $6, $7 + vx $10, $10, $11 + vx $14, $14, $15 + verllf $2, $2, 7 + verllf $6, $6, 7 + verllf $10, $10, 7 + verllf $14, $14, 7 +') + +define(`TRANSPOSE',` + vmrhf T0, $1, $3 C A0 A2 B0 B2 + vmrhf T1, $2, $4 C A1 A3 B1 B3 + vmrlf T2, $1, $3 C C0 C2 D0 D2 + vmrlf T3, $2, $4 C C1 C3 D1 D3 + + vmrhf $1, T0, T1 C A0 A1 A2 A3 + vmrlf $2, T0, T1 C B0 B1 B2 B3 + vmrhf $3, T2, T3 C C0 C1 C2 C3 + vmrlf $4, T2, T3 C D0 D1 D2 D3 +') + +.file "chacha-4core.asm" +.machine "z13" + +.text +C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds) + +PROLOGUE(_nettle_chacha_4core) + + vrepif T2, 1 C Apply counter carries + +.Lshared_entry: + + C Save callee-save registers + ALLOC_STACK(%r1,64) C Allocate 64-byte space on stack + C Save non-volatile floating point registers + std %f8,0(%r1) + std %f9,8(%r1) 
+ std %f10,16(%r1) + std %f11,24(%r1) + std %f12,32(%r1) + std %f13,40(%r1) + std %f14,48(%r1) + std %f15,56(%r1) + + larl %r5,.Lword_byte_reverse + vlm BRW, T0, 0(%r5) + +C Load state and splat + vlm %v0, %v3, 0(SRC) + + vrepf %v4, %v0, 1 + vrepf %v8, %v0, 2 + vrepf %v12, %v0, 3 + vrepf %v0, %v0, 0 + vrepf %v5, %v1, 1 + vrepf %v9, %v1, 2 + vrepf %v13, %v1, 3 + vrepf %v1, %v1, 0 + vrepf %v6, %v2, 1 + vrepf %v10, %v2, 2 + vrepf %v14, %v2, 3 + vrepf %v2, %v2, 0 + vrepf %v7, %v3, 1 + vrepf %v11, %v3, 2 + vrepf %v15, %v3, 3 + vrepf %v3, %v3, 0 + + vaccf T1, %v3, T0 C compute carry-out + vaf %v3, %v3, T0 C low adds + vn T1, T1, T2 C discard carries for 32-bit counter variant + vaf %v7, %v7, T1 C apply carries + + C Save all 4x4 of the last words. + vlr T0, %v3 + vlr T1, %v7 + vlr T2, %v11 + vlr T3, %v15 + + srlg ROUNDS, ROUNDS, 1 + +.Loop: + QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15) + QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11) + brctg ROUNDS, .Loop + + C Add in saved original words, including counters, before + C transpose. 
+ vaf %v3, %v3, T0 + vaf %v7, %v7, T1 + vaf %v11, %v11, T2 + vaf %v15, %v15, T3 + + TRANSPOSE(%v0, %v4, %v8, %v12) + TRANSPOSE(%v1, %v5, %v9, %v13) + TRANSPOSE(%v2, %v6, %v10, %v14) + TRANSPOSE(%v3, %v7, %v11, %v15) + + vlm T0, T2, 0(SRC) + + vaf %v0, %v0, T0 + vaf %v4, %v4, T0 + vaf %v8, %v8, T0 + vaf %v12, %v12, T0 + + vperm %v0, %v0, %v0, BRW + vperm %v4, %v4, %v4, BRW + vperm %v8, %v8, %v8, BRW + vperm %v12, %v12, %v12, BRW + + vaf %v1, %v1, T1 + vaf %v5, %v5, T1 + vaf %v9, %v9, T1 + vaf %v13, %v13, T1 + + vperm %v1, %v1, %v1, BRW + vperm %v5, %v5, %v5, BRW + vperm %v9, %v9, %v9, BRW + vperm %v13, %v13, %v13, BRW + + vaf %v2, %v2, T2 + vaf %v6, %v6, T2 + vaf %v10, %v10, T2 + vaf %v14, %v14, T2 + + vperm %v2, %v2, %v2, BRW + vperm %v6, %v6, %v6, BRW + vperm %v10, %v10, %v10, BRW + vperm %v14, %v14, %v14, BRW + + vperm %v3, %v3, %v3, BRW + vperm %v7, %v7, %v7, BRW + vperm %v11, %v11, %v11, BRW + vperm %v15, %v15, %v15, BRW + + vstm %v0, %v15, 0(DST) + + C Restore callee-save registers + ld %f8,0(%r1) + ld %f9,8(%r1) + ld %f10,16(%r1) + ld %f11,24(%r1) + ld %f12,32(%r1) + ld %f13,40(%r1) + ld %f14,48(%r1) + ld %f15,56(%r1) + FREE_STACK(64) C Deallocate stack space + br RA +EPILOGUE(_nettle_chacha_4core) + +PROLOGUE(_nettle_chacha_4core32) + + vzero T2 C Ignore counter carries + j .Lshared_entry +EPILOGUE(_nettle_chacha_4core32) + +.align 16 +.Lword_byte_reverse: .long 0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C +.Lcnts: .long 0,1,2,3 C increments diff --git a/s390x/vf/chacha-core-internal.asm b/s390x/vf/chacha-core-internal.asm new file mode 100644 index 00000000..c80c7e63 --- /dev/null +++ b/s390x/vf/chacha-core-internal.asm @@ -0,0 +1,131 @@ +C s390x/vf/chacha-core-internal.asm + +ifelse(` + Copyright (C) 2020 Niels Möller and Torbjörn Granlund + Copyright (C) 2022 Mamone Tarsha + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +C Arguments +define(`DST', `%r2') +define(`SRC', `%r3') +define(`ROUNDS', `%r4') + +C Working state +define(`X0', `%v0') +define(`X1', `%v1') +define(`X2', `%v2') +define(`X3', `%v3') + +C Original input state +define(`S0', `%v4') +define(`S1', `%v5') +define(`S2', `%v6') +define(`S3', `%v7') + +define(`BRW', `%v24') + +C QROUND(X0, X1, X2, X3) +define(`QROUND', ` + C x0 += x1, x3 ^= x0, x3 lrot 16 + C x2 += x3, x1 ^= x2, x1 lrot 12 + C x0 += x1, x3 ^= x0, x3 lrot 8 + C x2 += x3, x1 ^= x2, x1 lrot 7 + + vaf $1, $1, $2 + vx $4, $4, $1 + verllf $4, $4, 16 + + vaf $3, $3, $4 + vx $2, $2, $3 + verllf $2, $2, 12 + + vaf $1, $1, $2 + vx $4, $4, $1 + verllf $4, $4, 8 + + vaf $3, $3, $4 + vx $2, $2, $3 + verllf $2, $2, 7 +') + +.file "chacha-core-internal.asm" +.machine "z13" + +.text +C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds) + +PROLOGUE(_nettle_chacha_core) + vlm X0, X3, 0(SRC) + + vlr S0, X0 + vlr S1, X1 + vlr S2, X2 + vlr S3, X3 + + srlg ROUNDS, ROUNDS, 1 +.Loop: +QROUND(X0, X1, X2, X3) + C Rotate rows, to get + C 0 1 2 3 + C 5 6 7 4 <<< 1 + 
C 10 11 8 9 <<< 2 + C 15 12 13 14 <<< 3 + + vsldb X1, X1, X1, 4 + vsldb X2, X2, X2, 8 + vsldb X3, X3, X3, 12 + + QROUND(X0, X1, X2, X3) + + C Inverse rotation + vsldb X1, X1, X1, 12 + vsldb X2, X2, X2, 8 + vsldb X3, X3, X3, 4 + + brctg ROUNDS, .Loop + + vaf X0, X0, S0 + vaf X1, X1, S1 + vaf X2, X2, S2 + vaf X3, X3, S3 + + larl %r5,.Lword_byte_reverse + vl BRW, 0(%r5) + vperm X0, X0, X0, BRW + vperm X1, X1, X1, BRW + vperm X2, X2, X2, BRW + vperm X3, X3, X3, BRW + + vstm X0, X3, 0(DST) + br RA +EPILOGUE(_nettle_chacha_core) + +.align 16 +.Lword_byte_reverse: .long 0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C -- GitLab