From 044d24b0ad6bce4269bc9b5d44f179b70fa24c4a Mon Sep 17 00:00:00 2001
From: Mamone Tarsha <maamoun.tk@googlemail.com>
Date: Tue, 18 Jan 2022 19:32:31 +0200
Subject: [PATCH] [S390x] Optimize Chacha20

---
 fat-s390x.c                          |  42 ++++-
 s390x/fat/chacha-2core.asm           |  35 ++++
 s390x/fat/chacha-4core.asm           |  35 ++++
 s390x/fat/chacha-core-internal-2.asm |  36 ++++
 s390x/vf/chacha-2core.asm            | 230 ++++++++++++++++++++++++
 s390x/vf/chacha-4core.asm            | 255 +++++++++++++++++++++++++++
 s390x/vf/chacha-core-internal.asm    | 131 ++++++++++++++
 7 files changed, 759 insertions(+), 5 deletions(-)
 create mode 100644 s390x/fat/chacha-2core.asm
 create mode 100644 s390x/fat/chacha-4core.asm
 create mode 100644 s390x/fat/chacha-core-internal-2.asm
 create mode 100644 s390x/vf/chacha-2core.asm
 create mode 100644 s390x/vf/chacha-4core.asm
 create mode 100644 s390x/vf/chacha-core-internal.asm

diff --git a/fat-s390x.c b/fat-s390x.c
index db793e2c..6c68c445 100644
--- a/fat-s390x.c
+++ b/fat-s390x.c
@@ -268,6 +268,18 @@ DECLARE_FAT_FUNC(nettle_sha3_permute, sha3_permute_func)
 DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, c)
 DECLARE_FAT_FUNC_VAR(sha3_permute, sha3_permute_func, s390x)
 
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, s390x);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
 static void CONSTRUCTOR
 fat_init (void)
 {
@@ -281,18 +293,20 @@ fat_init (void)
   if (features.have_vector_facility)
   {
     if (verbose)
-      fprintf (stderr, "libnettle: enabling vectorized memxor3.\n");
+      fprintf (stderr, "libnettle: enabling vector facility code.\n");
     nettle_memxor3_vec = _nettle_memxor3_s390x;
-
-    if (verbose)
-      fprintf (stderr, "libnettle: enabling vectorized sha3 permute.\n");
     nettle_sha3_permute_vec = _nettle_sha3_permute_s390x;
+    _nettle_chacha_core_vec = _nettle_chacha_core_s390x;
+    nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+    nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
   }
   else
   {
      nettle_memxor3_vec = _nettle_memxor3_c;
-     
      nettle_sha3_permute_vec = _nettle_sha3_permute_c;
+     _nettle_chacha_core_vec = _nettle_chacha_core_c;
+     nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+     nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
   }
 
   /* AES128 */
@@ -497,3 +511,21 @@ DEFINE_FAT_FUNC(_nettle_sha512_compress, void,
 /* SHA3 */
 DEFINE_FAT_FUNC(nettle_sha3_permute, void,
 		(struct sha3_state *state), (state))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+		(uint32_t *dst, const uint32_t *src, unsigned rounds),
+		(dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+		(struct chacha_ctx *ctx,
+		 size_t length,
+		 uint8_t *dst,
+		 const uint8_t *src),
+		(ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+		(struct chacha_ctx *ctx,
+		 size_t length,
+		 uint8_t *dst,
+		 const uint8_t *src),
+		(ctx, length, dst, src))
diff --git a/s390x/fat/chacha-2core.asm b/s390x/fat/chacha-2core.asm
new file mode 100644
index 00000000..a97de67e
--- /dev/null
+++ b/s390x/fat/chacha-2core.asm
@@ -0,0 +1,35 @@
+C s390x/fat/chacha-2core.asm
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_2core) picked up by configure
+
+include_src(`s390x/vf/chacha-2core.asm')
diff --git a/s390x/fat/chacha-4core.asm b/s390x/fat/chacha-4core.asm
new file mode 100644
index 00000000..ad9e893c
--- /dev/null
+++ b/s390x/fat/chacha-4core.asm
@@ -0,0 +1,35 @@
+C s390x/fat/chacha-4core.asm
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`s390x/vf/chacha-4core.asm')
diff --git a/s390x/fat/chacha-core-internal-2.asm b/s390x/fat/chacha-core-internal-2.asm
new file mode 100644
index 00000000..8e17e01a
--- /dev/null
+++ b/s390x/fat/chacha-core-internal-2.asm
@@ -0,0 +1,36 @@
+C s390x/fat/chacha-core-internal-2.asm
+
+ifelse(`
+   Copyright (C) 2022 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_s390x')
+include_src(`s390x/vf/chacha-core-internal.asm')
diff --git a/s390x/vf/chacha-2core.asm b/s390x/vf/chacha-2core.asm
new file mode 100644
index 00000000..34bd9a27
--- /dev/null
+++ b/s390x/vf/chacha-2core.asm
@@ -0,0 +1,230 @@
+C s390x/vf/chacha-2core.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `%r2')
+define(`SRC', `%r3')
+define(`ROUNDS', `%r4')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `%v1')
+define(`X1', `%v2')
+define(`X2', `%v3')
+define(`X3', `%v29')
+define(`Y0', `%v4')
+define(`Y1', `%v5')
+define(`Y2', `%v6')
+define(`Y3', `%v7')
+
+C Original input state
+define(`S0', `%v24')
+define(`S1', `%v25')
+define(`S2', `%v26')
+define(`S3', `%v27')
+define(`S3p1', `%v28')
+
+define(`T0', `%v0')
+
+define(`BRW', `%v30')
+define(`EW', `%v30')
+define(`OW', `%v31')
+
+.file "chacha-2core.asm"
+.machine "z13"
+
+.text
+C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_2core)
+
+	vzero	X1
+	vleif	X1, 1, 0
+
+	vl		X3, 48(SRC)
+
+	vaccf	Y3, X3, X1	C Counter carry out
+	vsldb	Y3, Y3, Y3, 12
+	vo		Y3, Y3, X1
+
+.Lshared_entry:
+	vaf		Y3, Y3, X3
+
+	vlm		X0, X2, 0(SRC)
+
+	vlr		S0, X0
+	vlr		S1, X1
+	vlr		S2, X2
+	vlr		S3, X3
+	vlr		S3p1, Y3
+
+	larl	%r5,.Lword_even
+	vlm		EW, OW, 0(%r5)
+
+	vperm	Y0, X0, X0, OW	C  1  1  3  3
+	vperm	X0, X0, X0, EW	C  0  0  2  2
+	vperm	Y1, X1, X1, OW	C  5  5  7  7
+	vperm	X1, X1, X1, EW	C  4  4  6  6
+	vperm	Y2, X2, X2, OW	C  9  9 11 11
+	vperm	X2, X2, X2, EW	C  8  8 10 10
+	vperm	Y3, X3, S3p1, OW	C 13 13 15 15
+	vperm	X3, X3, S3p1, EW	C 12 12 14 14
+
+	srlg	ROUNDS, ROUNDS, 1
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0:  A0  B0  A2  B2  Y0:  A1  B1  A3  B3
+C X1:  A4  B4  A6  B6  Y1:  A5  B5  A7  B7
+C X2:  A8  B8 A10 B10  Y2:  A9  B9 A11 B11
+C X3: A12 B12 A14 B14  Y3: A13 B13 A15 B15
+	vaf		X0, X0, X1
+	 vaf	Y0, Y0, Y1
+	vx		X3, X3, X0
+	 vx		Y3, Y3, Y0
+	verllf	X3, X3, 16
+	 verllf	Y3, Y3, 16
+
+	vaf		X2, X2, X3
+	 vaf	Y2, Y2, Y3
+	vx		X1, X1, X2
+	 vx		Y1, Y1, Y2
+	verllf	X1, X1, 12
+	 verllf	Y1, Y1, 12
+
+	vaf		X0, X0, X1
+	 vaf	Y0, Y0, Y1
+	vx		X3, X3, X0
+	 vx		Y3, Y3, Y0
+	verllf	X3, X3, 8
+	 verllf	Y3, Y3, 8
+
+	vaf		X2, X2, X3
+	 vaf	Y2, Y2, Y3
+	vx		X1, X1, X2
+	 vx		Y1, Y1, Y2
+	verllf	X1, X1, 7
+	 verllf	Y1, Y1, 7
+
+	vpdi	X1, X1, X1, 0b0100
+	vpdi	X2, X2, X2, 0b0100
+	vpdi	Y2, Y2, Y2, 0b0100
+	vpdi	Y3, Y3, Y3, 0b0100
+
+C Register layout:
+C X0:  A0  B0  A2  B2  Y0:  A1  B1  A3  B3
+C Y1:  A5  B5  A7  B7  X1:  A6  B6  A4  B4 (X1 swapped)
+C X2: A10 B10  A8  B8  Y2: A11 B11  A9  B9 (X2, Y2 swapped)
+C Y3: A15 B15 A13 B13  X3: A12 B12 A14 B14 (Y3 swapped)
+
+	vaf		X0, X0, Y1
+	 vaf	Y0, Y0, X1
+	vx		Y3, Y3, X0
+	 vx		X3, X3, Y0
+	verllf	Y3, Y3, 16
+	 verllf	X3, X3, 16
+
+	vaf		X2, X2, Y3
+	 vaf	Y2, Y2, X3
+	vx		Y1, Y1, X2
+	 vx		X1, X1, Y2
+	verllf	Y1, Y1, 12
+	 verllf	X1, X1, 12
+
+	vaf		X0, X0, Y1
+	 vaf	Y0, Y0, X1
+	vx		Y3, Y3, X0
+	 vx		X3, X3, Y0
+	verllf	Y3, Y3, 8
+	 verllf	X3, X3, 8
+
+	vaf		X2, X2, Y3
+	 vaf	Y2, Y2, X3
+	vx		Y1, Y1, X2
+	 vx		X1, X1, Y2
+	verllf	Y1, Y1, 7
+	 verllf	X1, X1, 7
+
+	vpdi	X1, X1, X1, 0b0100
+	vpdi	X2, X2, X2, 0b0100
+	vpdi	Y2, Y2, Y2, 0b0100
+	vpdi	Y3, Y3, Y3, 0b0100
+
+	brctg	ROUNDS, .Loop
+
+	vperm	T0, X0, Y0, EW
+	vperm	Y0, X0, Y0, OW
+
+	vperm	X0, X1, Y1, EW
+	vperm	Y1, X1, Y1, OW
+
+	vperm	X1, X2, Y2, EW
+	vperm	Y2, X2, Y2, OW
+
+	vperm	X2, X3, Y3, EW
+	vperm	Y3, X3, Y3, OW
+
+	vaf		T0, T0, S0
+	vaf		Y0, Y0, S0
+	vaf		X0, X0, S1
+	vaf		Y1, Y1, S1
+	vaf		X1, X1, S2
+	vaf		Y2, Y2, S2
+	vaf		X2, X2, S3
+	vaf		Y3, Y3, S3p1
+
+	vl		BRW, 32(%r5)
+	vperm	T0, T0, T0, BRW
+	vperm	X0, X0, X0, BRW
+	vperm	X1, X1, X1, BRW
+	vperm	X2, X2, X2, BRW
+	vperm	Y0, Y0, Y0, BRW
+	vperm	Y1, Y1, Y1, BRW
+	vperm	Y2, Y2, Y2, BRW
+	vperm	Y3, Y3, Y3, BRW
+
+	vstm	T0, Y3, 0(DST)
+	br		RA
+EPILOGUE(_nettle_chacha_2core)
+
+PROLOGUE(_nettle_chacha_2core32)
+	vzero	Y3
+	vleif	Y3, 1, 0
+	vl		X3, 48(SRC)
+	j		.Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+.align	16
+.Lword_even: .long	0x00010203,0x10111213,0x08090A0B,0x18191A1B
+.Lword_odd: .long	0x04050607,0x14151617,0x0C0D0E0F,0x1C1D1E1F
+.Lword_byte_reverse: .long	0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C
diff --git a/s390x/vf/chacha-4core.asm b/s390x/vf/chacha-4core.asm
new file mode 100644
index 00000000..276de9f1
--- /dev/null
+++ b/s390x/vf/chacha-4core.asm
@@ -0,0 +1,255 @@
+C s390x/vf/chacha-4core.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `%r2')
+define(`SRC', `%r3')
+define(`ROUNDS', `%r4')
+
+C Working state
+
+define(`BRW', `%v24')
+
+C During the loop, used to save the original values for last 4 words
+C of each block. Also used as temporaries for transpose.
+define(`T0', `%v25')
+define(`T1', `%v26')
+define(`T2', `%v27')
+define(`T3', `%v28')
+
+C Main loop for round
+define(`QR',`
+	vaf		$1, $1, $2
+	vaf		$5, $5, $6
+	vaf		$9, $9, $10
+	vaf		$13, $13, $14
+	vx		$4, $4, $1
+	vx		$8, $8, $5
+	vx		$12, $12, $9
+	vx		$16, $16, $13
+	verllf	$4, $4, 16
+	verllf	$8, $8, 16
+	verllf	$12, $12, 16
+	verllf	$16, $16, 16
+
+	vaf		$3, $3, $4
+	vaf		$7, $7, $8
+	vaf		$11, $11, $12
+	vaf		$15, $15, $16
+	vx		$2, $2, $3
+	vx		$6, $6, $7
+	vx		$10, $10, $11
+	vx		$14, $14, $15
+	verllf	$2, $2, 12
+	verllf	$6, $6, 12
+	verllf	$10, $10, 12
+	verllf	$14, $14, 12
+
+	vaf		$1, $1, $2
+	vaf		$5, $5, $6
+	vaf		$9, $9, $10
+	vaf		$13, $13, $14
+	vx		$4, $4, $1
+	vx		$8, $8, $5
+	vx		$12, $12, $9
+	vx		$16, $16, $13
+	verllf	$4, $4, 8
+	verllf	$8, $8, 8
+	verllf	$12, $12, 8
+	verllf	$16, $16, 8
+
+	vaf		$3, $3, $4
+	vaf		$7, $7, $8
+	vaf		$11, $11, $12
+	vaf		$15, $15, $16
+	vx		$2, $2, $3
+	vx		$6, $6, $7
+	vx		$10, $10, $11
+	vx		$14, $14, $15
+	verllf	$2, $2, 7
+	verllf	$6, $6, 7
+	verllf	$10, $10, 7
+	verllf	$14, $14, 7
+')
+
+define(`TRANSPOSE',`
+	vmrhf	T0, $1, $3		C A0 A2 B0 B2
+	vmrhf	T1, $2, $4		C A1 A3 B1 B3
+	vmrlf	T2, $1, $3		C C0 C2 D0 D2
+	vmrlf	T3, $2, $4		C C1 C3 D1 D3
+
+	vmrhf	$1, T0, T1		C A0 A1 A2 A3
+	vmrlf	$2, T0, T1		C B0 B1 B2 B3
+	vmrhf	$3, T2, T3		C C0 C2 C1 C3
+	vmrlf	$4, T2, T3		C D0 D1 D2 D3
+')
+
+.file "chacha-4core.asm"
+.machine "z13"
+
+.text
+C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_4core)
+
+	vrepif	T2, 1		C Apply counter carries
+
+.Lshared_entry:
+
+	C Save callee-save registers
+    ALLOC_STACK(%r1,64)		C Allocate 64-byte space on stack
+    C Save non-volatile floating point registers
+    std		%f8,0(%r1)
+    std		%f9,8(%r1)
+	std		%f10,16(%r1)
+    std		%f11,24(%r1)
+	std		%f12,32(%r1)
+    std		%f13,40(%r1)
+	std		%f14,48(%r1)
+    std		%f15,56(%r1)
+
+	larl	%r5,.Lword_byte_reverse
+	vlm		BRW, T0, 0(%r5)
+
+C Load state and splat
+	vlm		%v0, %v3, 0(SRC)
+
+	vrepf	%v4, %v0, 1
+	vrepf	%v8, %v0, 2
+	vrepf	%v12, %v0, 3
+	vrepf	%v0, %v0, 0
+	vrepf	%v5, %v1, 1
+	vrepf	%v9, %v1, 2
+	vrepf	%v13, %v1, 3
+	vrepf	%v1, %v1, 0
+	vrepf	%v6, %v2, 1
+	vrepf	%v10, %v2, 2
+	vrepf	%v14, %v2, 3
+	vrepf	%v2, %v2, 0
+	vrepf	%v7, %v3, 1
+	vrepf	%v11, %v3, 2
+	vrepf	%v15, %v3, 3
+	vrepf	%v3, %v3, 0
+
+	vaccf	T1, %v3, T0		C compute carry-out
+	vaf		%v3, %v3, T0	C low adds
+	vn		T1, T1, T2		C discard carries for 32-bit counter variant
+	vaf		%v7, %v7, T1	C apply carries
+
+	C Save all 4x4 of the last words.
+	vlr		T0, %v3
+	vlr		T1, %v7
+	vlr		T2, %v11
+	vlr		T3, %v15
+
+	srlg	ROUNDS, ROUNDS, 1
+
+.Loop:
+	QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15)
+	QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11)
+	brctg	ROUNDS, .Loop
+
+	C Add in saved original words, including counters, before
+	C transpose.
+	vaf		%v3, %v3, T0
+	vaf		%v7, %v7, T1
+	vaf		%v11, %v11, T2
+	vaf		%v15, %v15, T3
+
+	TRANSPOSE(%v0, %v4, %v8, %v12)
+	TRANSPOSE(%v1, %v5, %v9, %v13)
+	TRANSPOSE(%v2, %v6, %v10, %v14)
+	TRANSPOSE(%v3, %v7, %v11, %v15)
+
+	vlm		T0, T2, 0(SRC)
+
+	vaf		%v0, %v0, T0
+	vaf		%v4, %v4, T0
+	vaf		%v8, %v8, T0
+	vaf		%v12, %v12, T0
+
+	vperm	%v0, %v0, %v0, BRW
+	vperm	%v4, %v4, %v4, BRW
+	vperm	%v8, %v8, %v8, BRW
+	vperm	%v12, %v12, %v12, BRW
+
+	vaf		%v1, %v1, T1
+	vaf		%v5, %v5, T1
+	vaf		%v9, %v9, T1
+	vaf		%v13, %v13, T1
+
+	vperm	%v1, %v1, %v1, BRW
+	vperm	%v5, %v5, %v5, BRW
+	vperm	%v9, %v9, %v9, BRW
+	vperm	%v13, %v13, %v13, BRW
+
+	vaf		%v2, %v2, T2
+	vaf		%v6, %v6, T2
+	vaf		%v10, %v10, T2
+	vaf		%v14, %v14, T2
+
+	vperm	%v2, %v2, %v2, BRW
+	vperm	%v6, %v6, %v6, BRW
+	vperm	%v10, %v10, %v10, BRW
+	vperm	%v14, %v14, %v14, BRW
+
+	vperm	%v3, %v3, %v3, BRW
+	vperm	%v7, %v7, %v7, BRW
+	vperm	%v11, %v11, %v11, BRW
+	vperm	%v15, %v15, %v15, BRW
+
+	vstm	%v0, %v15, 0(DST)
+
+	C Restore callee-save registers
+	ld		%f8,0(%r1)
+    ld		%f9,8(%r1)
+	ld		%f10,16(%r1)
+    ld		%f11,24(%r1)
+	ld		%f12,32(%r1)
+    ld		%f13,40(%r1)
+	ld		%f14,48(%r1)
+    ld		%f15,56(%r1)
+    FREE_STACK(64)		C Deallocate stack space
+	br		RA
+EPILOGUE(_nettle_chacha_4core)
+
+PROLOGUE(_nettle_chacha_4core32)
+
+	vzero	T2			C Ignore counter carries
+	j		.Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+.align	16
+.Lword_byte_reverse: .long	0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C
+.Lcnts: .long	0,1,2,3	C increments
diff --git a/s390x/vf/chacha-core-internal.asm b/s390x/vf/chacha-core-internal.asm
new file mode 100644
index 00000000..c80c7e63
--- /dev/null
+++ b/s390x/vf/chacha-core-internal.asm
@@ -0,0 +1,131 @@
+C s390x/vf/chacha-core-internal.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   Copyright (C) 2022 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `%r2')
+define(`SRC', `%r3')
+define(`ROUNDS', `%r4')
+
+C Working state
+define(`X0', `%v0')
+define(`X1', `%v1')
+define(`X2', `%v2')
+define(`X3', `%v3')
+
+C Original input state
+define(`S0', `%v4')
+define(`S1', `%v5')
+define(`S2', `%v6')
+define(`S3', `%v7')
+
+define(`BRW', `%v24')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+	C x0 += x1, x3 ^= x0, x3 lrot 16
+	C x2 += x3, x1 ^= x2, x1 lrot 12
+	C x0 += x1, x3 ^= x0, x3 lrot 8
+	C x2 += x3, x1 ^= x2, x1 lrot 7
+
+	vaf		$1, $1, $2
+	vx		$4, $4, $1
+	verllf	$4, $4, 16
+
+	vaf		$3, $3, $4
+	vx		$2, $2, $3
+	verllf	$2, $2, 12
+
+	vaf		$1, $1, $2
+	vx		$4, $4, $1
+	verllf	$4, $4, 8
+
+	vaf		$3, $3, $4
+	vx		$2, $2, $3
+	verllf	$2, $2, 7
+')
+
+.file "chacha-core-internal.asm"
+.machine "z13"
+
+.text
+C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_core)
+	vlm		X0, X3, 0(SRC)
+
+	vlr		S0, X0
+	vlr		S1, X1
+	vlr		S2, X2
+	vlr		S3, X3
+
+	srlg	ROUNDS, ROUNDS, 1
+.Loop:
+QROUND(X0, X1, X2, X3)
+	C Rotate rows, to get
+	C	 0  1  2  3
+	C	 5  6  7  4  <<< 1
+	C	10 11  8  9  <<< 2
+	C	15 12 13 14  <<< 3
+
+	vsldb	X1, X1, X1, 4
+	vsldb	X2, X2, X2, 8
+	vsldb	X3, X3, X3, 12
+
+	QROUND(X0, X1, X2, X3)
+
+	C Inverse rotation
+	vsldb	X1, X1, X1, 12
+	vsldb	X2, X2, X2, 8
+	vsldb	X3, X3, X3, 4
+
+	brctg	ROUNDS, .Loop
+
+	vaf		X0, X0, S0
+	vaf		X1, X1, S1
+	vaf		X2, X2, S2
+	vaf		X3, X3, S3
+
+	larl	%r5,.Lword_byte_reverse
+	vl		BRW, 0(%r5)
+	vperm	X0, X0, X0, BRW
+	vperm	X1, X1, X1, BRW
+	vperm	X2, X2, X2, BRW
+	vperm	X3, X3, X3, BRW
+
+	vstm	X0, X3, 0(DST)
+	br		RA
+EPILOGUE(_nettle_chacha_core)
+
+.align	16
+.Lword_byte_reverse: .long	0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C
-- 
GitLab