From 25e4316f0e75fb16cc99111bfecd4bd8f4f2c812 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Wed, 8 Jul 2020 10:46:45 +0200
Subject: [PATCH] Rearrange salsa20, enabling ARM fat builds to use
 salsa20_2core.

---
 ChangeLog                 |  21 ++++++++
 Makefile.in               |   2 +-
 arm/fat/salsa20-2core.asm |  36 +++++++++++++
 configure.ac              |   1 +
 fat-arm.c                 |  13 +++++
 fat-setup.h               |   5 ++
 salsa20-crypt-internal.c  | 111 ++++++++++++++++++++++++++++++++++++++
 salsa20-crypt.c           |  51 +-----------------
 salsa20-internal.h        |  19 +++++++
 salsa20r12-crypt.c        |  51 +-----------------
 10 files changed, 209 insertions(+), 101 deletions(-)
 create mode 100644 arm/fat/salsa20-2core.asm
 create mode 100644 salsa20-crypt-internal.c

diff --git a/ChangeLog b/ChangeLog
index f7b2e939..84ed4923 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2020-07-08  Niels Möller  <nisse@lysator.liu.se>
+
+	Rearrange salsa20, enabling ARM fat builds to use salsa20_2core.
+	* salsa20-crypt-internal.c (_salsa20_crypt_2core)
+	(_salsa20_crypt_1core): New file, new functions. One or the other
+	is used for implementing salsa20_crypt and salsa20r12_crypt,
+	depending on availability of salsa20_2core.
+	* salsa20-crypt.c (salsa20_crypt): Call _salsa20_crypt.
+	* salsa20r12-crypt.c (salsa20r12_crypt): Likewise.
+	* salsa20-internal.h: Declare new internal functions.
+	* Makefile.in (nettle_SOURCES): Add salsa20-crypt-internal.c.
+	* fat-setup.h (salsa20_crypt_func): New typedef.
+	* fat-arm.c (_salsa20_crypt): Select _salsa20_crypt
+	implementation, use 2core version when Neon instructions are
+	available.
+	* arm/fat/salsa20-2core.asm: New file, including Neon
+	implementation. Trigger configure's HAVE_NATIVE_fat_salsa20_2core.
+	* configure.ac: Add HAVE_NATIVE_fat_salsa20_2core, to identify the
+	case that salsa20_2core is defined, but runtime checks are needed
+	to determine if it is usable.
+
 2020-07-06  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/salsa20-test.c (test_salsa20_core): New function, test
diff --git a/Makefile.in b/Makefile.in
index 77efb5c9..042ebe5f 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -123,7 +123,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c \
 		 poly1305-aes.c poly1305-internal.c \
 		 realloc.c \
 		 ripemd160.c ripemd160-compress.c ripemd160-meta.c \
-		 salsa20-core-internal.c \
+		 salsa20-core-internal.c salsa20-crypt-internal.c \
 		 salsa20-crypt.c salsa20r12-crypt.c salsa20-set-key.c \
 		 salsa20-set-nonce.c \
 		 salsa20-128-set-key.c salsa20-256-set-key.c \
diff --git a/arm/fat/salsa20-2core.asm b/arm/fat/salsa20-2core.asm
new file mode 100644
index 00000000..43d9a1d0
--- /dev/null
+++ b/arm/fat/salsa20-2core.asm
@@ -0,0 +1,36 @@
+C arm/fat/salsa20-2core.asm
+
+
+ifelse(<
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_fat_salsa20_2core) picked up by configure
+
+include_src(<arm/neon/salsa20-2core.asm>)
diff --git a/configure.ac b/configure.ac
index 3f6c2f3b..a89f3ec3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -574,6 +574,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_gcm_hash8
 #undef HAVE_NATIVE_salsa20_core
 #undef HAVE_NATIVE_salsa20_2core
+#undef HAVE_NATIVE_fat_salsa20_2core
 #undef HAVE_NATIVE_sha1_compress
 #undef HAVE_NATIVE_sha256_compress
 #undef HAVE_NATIVE_sha512_compress
diff --git a/fat-arm.c b/fat-arm.c
index 48feb5d4..a3f0f860 100644
--- a/fat-arm.c
+++ b/fat-arm.c
@@ -43,6 +43,7 @@
 #include "nettle-types.h"
 
 #include "aes-internal.h"
+#include "salsa20-internal.h"
 #include "fat-setup.h"
 
 struct arm_features
@@ -147,6 +148,10 @@ DECLARE_FAT_FUNC(_nettle_salsa20_core, salsa20_core_func)
 DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, c)
 DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, neon)
 
+DECLARE_FAT_FUNC(_nettle_salsa20_crypt, salsa20_crypt_func)
+DECLARE_FAT_FUNC_VAR(salsa20_crypt, salsa20_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(salsa20_crypt, salsa20_crypt_func, 2core)
+
 DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, c)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, armv6)
@@ -212,6 +217,7 @@ fat_init (void)
       if (verbose)
 	fprintf (stderr, "libnettle: enabling neon code.\n");
       _nettle_salsa20_core_vec = _nettle_salsa20_core_neon;
+      _nettle_salsa20_crypt_vec = _nettle_salsa20_crypt_2core;
       _nettle_sha512_compress_vec = _nettle_sha512_compress_neon;
       nettle_sha3_permute_vec = _nettle_sha3_permute_neon;
       _nettle_umac_nh_vec = _nettle_umac_nh_neon;
@@ -223,6 +229,7 @@ fat_init (void)
       if (verbose)
 	fprintf (stderr, "libnettle: not enabling neon code.\n");
       _nettle_salsa20_core_vec = _nettle_salsa20_core_c;
+      _nettle_salsa20_crypt_vec = _nettle_salsa20_crypt_1core;
       _nettle_sha512_compress_vec = _nettle_sha512_compress_c;
       nettle_sha3_permute_vec = _nettle_sha3_permute_c;
       _nettle_umac_nh_vec = _nettle_umac_nh_c;
@@ -249,6 +256,12 @@ DEFINE_FAT_FUNC(_nettle_salsa20_core, void,
 		(uint32_t *dst, const uint32_t *src, unsigned rounds),
 		(dst, src, rounds))
 
+DEFINE_FAT_FUNC(_nettle_salsa20_crypt, void,
+		(struct salsa20_ctx *ctx, unsigned rounds,
+		 size_t length, uint8_t *dst,
+		 const uint8_t *src),
+		(ctx, rounds, length, dst, src))
+
 DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 		(uint32_t *state, const uint8_t *input),
 		(state, input))
diff --git a/fat-setup.h b/fat-setup.h
index b623ebf9..58b687fd 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -93,6 +93,8 @@
 #define ENV_VERBOSE "NETTLE_FAT_VERBOSE"
 #define ENV_OVERRIDE "NETTLE_FAT_OVERRIDE"
 
+struct salsa20_ctx;
+
 /* DECLARE_FAT_FUNC(name, ftype)
  *
  *   name is the public function, e.g., _nettle_aes_encrypt.
@@ -162,6 +164,9 @@ typedef void aes_crypt_internal_func (unsigned rounds, const uint32_t *keys,
 typedef void *(memxor_func)(void *dst, const void *src, size_t n);
 
 typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds);
+typedef void salsa20_crypt_func (struct salsa20_ctx *ctx, unsigned rounds,
+				 size_t length, uint8_t *dst,
+				 const uint8_t *src);
 
 typedef void sha1_compress_func(uint32_t *state, const uint8_t *input);
 typedef void sha256_compress_func(uint32_t *state, const uint8_t *input, const uint32_t *k);
diff --git a/salsa20-crypt-internal.c b/salsa20-crypt-internal.c
new file mode 100644
index 00000000..cc46d024
--- /dev/null
+++ b/salsa20-crypt-internal.c
@@ -0,0 +1,111 @@
+/* salsa20-crypt-internal.c
+
+   The Salsa20 stream cipher.
+
+   Copyright (C) 2012 Simon Josefsson
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <string.h>
+
+#include "salsa20.h"
+#include "salsa20-internal.h"
+
+#include "macros.h"
+#include "memxor.h"
+
+#if HAVE_NATIVE_salsa20_2core
+#define _salsa20_crypt_2core _salsa20_crypt
+#elif !HAVE_NATIVE_fat_salsa20_2core
+#define _salsa20_crypt_1core _salsa20_crypt
+#endif
+
+#if HAVE_NATIVE_salsa20_2core || HAVE_NATIVE_fat_salsa20_2core
+/* Crypt LENGTH > 0 bytes, SRC to DST, two blocks per _salsa20_2core call. */
+void
+_salsa20_crypt_2core(struct salsa20_ctx *ctx, unsigned rounds,
+		     size_t length, uint8_t *dst, const uint8_t *src)
+{
+  uint32_t x[2*_SALSA20_INPUT_LENGTH];
+  while (length > SALSA20_BLOCK_SIZE)
+    {
+      _salsa20_2core (x, ctx->input, rounds);
+      ctx->input[8] += 2;
+      ctx->input[9] += (ctx->input[8] < 2);	/* carry when input[8] wraps */
+      /* <=: an exact two-block remainder must end here, not reach the tail. */
+      if (length <= 2 * SALSA20_BLOCK_SIZE)
+	{
+	  memxor3 (dst, src, x, length);
+	  return;
+	}
+      memxor3 (dst, src, x, 2*SALSA20_BLOCK_SIZE);
+      length -= 2*SALSA20_BLOCK_SIZE;
+      dst += 2*SALSA20_BLOCK_SIZE;
+      src += 2*SALSA20_BLOCK_SIZE;
+    }
+  _salsa20_core (x, ctx->input, rounds);
+  ctx->input[9] += (++ctx->input[8] == 0);
+  memxor3 (dst, src, x, length);
+}
+#endif
+
+#if !HAVE_NATIVE_salsa20_2core
+/* Crypt LENGTH bytes, SRC to DST, one block per _salsa20_core call.  The
+   last block, full or partial, is handled after the loop. */
+void
+_salsa20_crypt_1core(struct salsa20_ctx *ctx, unsigned rounds,
+		     size_t length, uint8_t *dst, const uint8_t *src)
+{
+  uint32_t block[_SALSA20_INPUT_LENGTH];
+
+  /* stopping at 2^70 length per nonce is user's responsibility */
+
+  /* Whole blocks, as long as more data follows. */
+  while (length > SALSA20_BLOCK_SIZE)
+    {
+      _salsa20_core (block, ctx->input, rounds);
+      ctx->input[9] += (++ctx->input[8] == 0);
+      memxor3 (dst, src, block, SALSA20_BLOCK_SIZE);
+
+      length -= SALSA20_BLOCK_SIZE;
+      dst += SALSA20_BLOCK_SIZE;
+      src += SALSA20_BLOCK_SIZE;
+    }
+
+  /* Final block, full or partial; the counter advances by one block
+     either way. */
+  _salsa20_core (block, ctx->input, rounds);
+  ctx->input[9] += (++ctx->input[8] == 0);
+  memxor3 (dst, src, block, length);
+}
+#endif
diff --git a/salsa20-crypt.c b/salsa20-crypt.c
index b25cfc3d..2031d42d 100644
--- a/salsa20-crypt.c
+++ b/salsa20-crypt.c
@@ -41,14 +41,9 @@
 # include "config.h"
 #endif
 
-#include <string.h>
-
 #include "salsa20.h"
 #include "salsa20-internal.h"
 
-#include "macros.h"
-#include "memxor.h"
-
 void
 salsa20_crypt(struct salsa20_ctx *ctx,
 	      size_t length,
@@ -58,49 +53,5 @@ salsa20_crypt(struct salsa20_ctx *ctx,
   if (!length)
     return;
 
-#if HAVE_NATIVE_salsa20_2core
-  uint32_t x[2*_SALSA20_INPUT_LENGTH];
-  while (length > SALSA20_BLOCK_SIZE)
-    {
-      _salsa20_2core (x, ctx->input, 20);
-      ctx->input[8] += 2;
-      ctx->input[9] += (ctx->input[8] < 2);
-      if (length < 2 * SALSA20_BLOCK_SIZE)
-	{
-	  memxor3 (c, m, x, length);
-	  return;
-	}
-      memxor3 (c, m, x, 2*SALSA20_BLOCK_SIZE);
-
-      length -= 2*SALSA20_BLOCK_SIZE;
-      c += 2*SALSA20_BLOCK_SIZE;
-      m += 2*SALSA20_BLOCK_SIZE;
-    }
-  _salsa20_core (x, ctx->input, 20);
-  ctx->input[9] += (++ctx->input[8] == 0);
-  memxor3 (c, m, x, length);
-  return;
-#else
-  for (;;)
-    {
-      uint32_t x[_SALSA20_INPUT_LENGTH];
-
-      _salsa20_core (x, ctx->input, 20);
-
-      ctx->input[9] += (++ctx->input[8] == 0);
-
-      /* stopping at 2^70 length per nonce is user's responsibility */
-      
-      if (length <= SALSA20_BLOCK_SIZE)
-	{
-	  memxor3 (c, m, x, length);
-	  return;
-	}
-      memxor3 (c, m, x, SALSA20_BLOCK_SIZE);
-
-      length -= SALSA20_BLOCK_SIZE;
-      c += SALSA20_BLOCK_SIZE;
-      m += SALSA20_BLOCK_SIZE;
-  }
-#endif
+  _salsa20_crypt (ctx, 20, length, c, m);
 }
diff --git a/salsa20-internal.h b/salsa20-internal.h
index fc1bb310..8d7684e0 100644
--- a/salsa20-internal.h
+++ b/salsa20-internal.h
@@ -36,14 +36,33 @@
 #define NETTLE_SALSA20_INTERNAL_H_INCLUDED
 
 #include "nettle-types.h"
+#include "salsa20.h"
 
 #define _salsa20_core _nettle_salsa20_core
 #define _salsa20_2core _nettle_salsa20_2core
+#define _salsa20_crypt _nettle_salsa20_crypt
+#define _salsa20_crypt_1core _nettle_salsa20_crypt_1core
+#define _salsa20_crypt_2core _nettle_salsa20_crypt_2core
 
 void
 _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+void
+_salsa20_crypt(struct salsa20_ctx *ctx, unsigned rounds,
+	       size_t length, uint8_t *dst,
+	       const uint8_t *src);
+
+/* Functions available only in some configurations */
 void
 _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+void
+_salsa20_crypt_1core(struct salsa20_ctx *ctx, unsigned rounds,
+		     size_t length, uint8_t *dst,
+		     const uint8_t *src);
+void
+_salsa20_crypt_2core(struct salsa20_ctx *ctx, unsigned rounds,
+		     size_t length, uint8_t *dst,
+		     const uint8_t *src);
+
 #endif /* NETTLE_SALSA20_INTERNAL_H_INCLUDED */
diff --git a/salsa20r12-crypt.c b/salsa20r12-crypt.c
index 41e32d8b..9515251a 100644
--- a/salsa20r12-crypt.c
+++ b/salsa20r12-crypt.c
@@ -41,14 +41,9 @@
 # include "config.h"
 #endif
 
-#include <string.h>
-
 #include "salsa20.h"
 #include "salsa20-internal.h"
 
-#include "macros.h"
-#include "memxor.h"
-
 void
 salsa20r12_crypt(struct salsa20_ctx *ctx,
 		 size_t length,
@@ -57,50 +52,6 @@ salsa20r12_crypt(struct salsa20_ctx *ctx,
 {
   if (!length)
     return;
-  
-#if HAVE_NATIVE_salsa20_2core
-  uint32_t x[2*_SALSA20_INPUT_LENGTH];
-  while (length > SALSA20_BLOCK_SIZE)
-    {
-      _salsa20_2core (x, ctx->input, 12);
-      ctx->input[8] += 2;
-      ctx->input[9] += (ctx->input[8] < 2);
-      if (length < 2 * SALSA20_BLOCK_SIZE)
-	{
-	  memxor3 (c, m, x, length);
-	  return;
-	}
-      memxor3 (c, m, x, 2*SALSA20_BLOCK_SIZE);
-
-      length -= 2*SALSA20_BLOCK_SIZE;
-      c += 2*SALSA20_BLOCK_SIZE;
-      m += 2*SALSA20_BLOCK_SIZE;
-    }
-  _salsa20_core (x, ctx->input, 12);
-  ctx->input[9] += (++ctx->input[8] == 0);
-  memxor3 (c, m, x, length);
-  return;
-#else
-  for (;;)
-    {
-      uint32_t x[_SALSA20_INPUT_LENGTH];
-
-      _salsa20_core (x, ctx->input, 12);
 
-      ctx->input[9] += (++ctx->input[8] == 0);
-
-      /* stopping at 2^70 length per nonce is user's responsibility */
-      
-      if (length <= SALSA20_BLOCK_SIZE)
-	{
-	  memxor3 (c, m, x, length);
-	  return;
-	}
-      memxor3 (c, m, x, SALSA20_BLOCK_SIZE);
-
-      length -= SALSA20_BLOCK_SIZE;
-      c += SALSA20_BLOCK_SIZE;
-      m += SALSA20_BLOCK_SIZE;
-    }
-#endif
+  _salsa20_crypt (ctx, 12, length, c, m);
 }
-- 
GitLab