From 3e2e2b15ba43d6c2cb7e7a7b1db663be727b1071 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 25 Nov 2020 15:38:47 +0100
Subject: [PATCH] ppc: Support big-endian for _chacha_2core.

---
 ChangeLog                     |  5 +++++
 powerpc64/p7/chacha-2core.asm | 17 +++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 4c91ccbc..2bff6ccc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-11-25  Niels Möller  <nisse@lysator.liu.se>
+
+	* powerpc64/p7/chacha-2core.asm: Add byte swapping of output, for
+	big-endian builds.
+
 2020-11-24  Niels Möller  <nisse@lysator.liu.se>
 
 	Enable ppc chacha_2core in fat builds.
diff --git a/powerpc64/p7/chacha-2core.asm b/powerpc64/p7/chacha-2core.asm
index 725d72af..265918b6 100644
--- a/powerpc64/p7/chacha-2core.asm
+++ b/powerpc64/p7/chacha-2core.asm
@@ -212,6 +212,23 @@ C Y3  A15 B15 A13 B13  X3  A12 B12 A14 B14 (Y3 swapped)
 	vadduwm X2, X2, S3
 	vadduwm Y3, Y3, S3p1
 
+IF_BE(`
+	C Output always stored in little-endian byte order.
+	C Can reuse S0 and S1 to construct permutation mask.
+	li	 r9, 0
+	lvsl	 S0, r9, r9	C 00 01 02 03 ... 0c 0d 0e 0f
+	vspltisb S1, 0x03	C 03 03 03 03 ... 03 03 03 03
+	vxor	 S1, S1, S0	C 03 02 01 00 ... 0f 0e 0d 0c
+
+	vperm	T0, T0, T0, S1
+	vperm	X0, X0, X0, S1
+	vperm	X1, X1, X1, S1
+	vperm	X2, X2, X2, S1
+	vperm	Y0, Y0, Y0, S1
+	vperm	Y1, Y1, Y1, S1
+	vperm	Y2, Y2, Y2, S1
+	vperm	Y3, Y3, Y3, S1
+')
 	stxvw4x	VSR(T0), 0, DST
 	stxvw4x	VSR(X0), r6, DST
 	stxvw4x	VSR(X1), r7, DST
-- 
GitLab