From 0325eaf3ac938322b964d52716b15b106adeae0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Mon, 8 Jan 2018 22:28:52 +0100
Subject: [PATCH] CTR mode optimizations for 16-byte block size.

---
 ChangeLog |   7 +++
 ctr.c     | 132 +++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 119 insertions(+), 20 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 5975cab8..f31a3017 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -9,6 +9,13 @@
 
 2018-01-08  Niels Möller  <nisse@lysator.liu.se>
 
+	* ctr.c (ctr_crypt16): New function, with optimizations specific
+	to 16-byte block size.
+	(ctr_fill16): New helper function, definition depending on
+	WORDS_BIGENDIAN, and little endian version requiring
+	HAVE_BUILTIN_BSWAP64.
+	(ctr_crypt): Use ctr_crypt16, when appropriate.
+
 	* nettle-types.h (union nettle_block16): Add uint64_t field.
 
 	* configure.ac: Check for __builtin_bswap64, define
diff --git a/ctr.c b/ctr.c
index f4164466..8295e1af 100644
--- a/ctr.c
+++ b/ctr.c
@@ -62,36 +62,128 @@ ctr_fill (size_t block_size, uint8_t *ctr, size_t length, uint8_t *buffer)
   return i;
 }
 
+#if WORDS_BIGENDIAN
+# define USE_CTR_CRYPT16 1
+static void
+ctr_fill16(uint8_t *ctr, size_t blocks, uint64_t *buffer)
+{
+  uint64_t hi, lo;
+  hi = READ_UINT64(ctr);
+  lo = READ_UINT64(ctr + 8);
+
+  while (blocks-- > 0)
+    {
+      *buffer++ = hi;
+      *buffer++ = lo;
+      hi += !(++lo);
+    }
+  WRITE_UINT64(ctr, hi);
+  WRITE_UINT64(ctr + 8, lo);
+}
+#else /* !WORDS_BIGENDIAN */
+# if HAVE_BUILTIN_BSWAP64
+#  define USE_CTR_CRYPT16 1
+static void
+ctr_fill16(uint8_t *ctr, size_t blocks, uint64_t *buffer)
+{
+  uint64_t hi, lo;
+  /* Read hi in native endianness */
+  hi = LE_READ_UINT64(ctr);
+  lo = READ_UINT64(ctr + 8);
+
+  while (blocks-- > 0)
+    {
+      *buffer++ = hi;
+      *buffer++ = __builtin_bswap64(lo);
+      if (!++lo)
+	hi = __builtin_bswap64(__builtin_bswap64(hi) + 1);
+    }
+  LE_WRITE_UINT64(ctr, hi);
+  WRITE_UINT64(ctr + 8, lo);
+}
+# else /* ! HAVE_BUILTIN_BSWAP64 */
+#  define USE_CTR_CRYPT16 0
+# endif
+#endif /* !WORDS_BIGENDIAN */
+
+#if USE_CTR_CRYPT16
+static size_t
+ctr_crypt16(const void *ctx, nettle_cipher_func *f,
+	    uint8_t *ctr,
+	    size_t length, uint8_t *dst,
+	    const uint8_t *src)
+{
+  if (dst != src && !((uintptr_t) dst % sizeof(uint64_t)))
+    {
+      size_t blocks = length / 16u;
+      ctr_fill16 (ctr, blocks, (uint64_t *) dst);
+      f(ctx, blocks * 16, dst, dst);
+      memxor (dst, src, blocks * 16);
+      return blocks * 16;
+    }
+  else
+    {
+      /* Construct an aligned buffer of consecutive counter values, of
+	 size at most CTR_BUFFER_LIMIT. */
+      TMP_DECL(buffer, union nettle_block16, CTR_BUFFER_LIMIT / 16);
+      size_t blocks = (length + 15) / 16u;
+      size_t i;
+      TMP_ALLOC(buffer, MIN(blocks, CTR_BUFFER_LIMIT / 16));
+
+      for (i = 0; blocks >= CTR_BUFFER_LIMIT / 16;
+	   i += CTR_BUFFER_LIMIT, blocks -= CTR_BUFFER_LIMIT / 16)
+	{
+	  ctr_fill16 (ctr, CTR_BUFFER_LIMIT / 16, buffer->u64);
+	  f(ctx, CTR_BUFFER_LIMIT, buffer->b, buffer->b);
+	  if (length - i < CTR_BUFFER_LIMIT)
+	    goto done;
+	  memxor3 (dst, src, buffer->b, CTR_BUFFER_LIMIT);
+	}
+
+      if (blocks > 0)
+	{
+	  assert (length - i < CTR_BUFFER_LIMIT);
+	  ctr_fill16 (ctr, blocks, buffer->u64);
+	  f(ctx, blocks * 16, buffer->b, buffer->b);
+	done:
+	  memxor3 (dst + i, src + i, buffer->b, length - i);
+      }
+      return length;
+    }
+}
+#endif /* USE_CTR_CRYPT16 */
+
 void
 ctr_crypt(const void *ctx, nettle_cipher_func *f,
 	  size_t block_size, uint8_t *ctr,
 	  size_t length, uint8_t *dst,
 	  const uint8_t *src)
 {
-  if (src != dst)
+#if USE_CTR_CRYPT16
+  if (block_size == 16)
     {
-      if (length == block_size)
-	{
-	  f(ctx, block_size, dst, ctr);
-	  INCREMENT(block_size, ctr);
-	  memxor(dst, src, block_size);
-	}
-      else
-	{
-	  size_t filled = ctr_fill (block_size, ctr, length, dst);
+      size_t done = ctr_crypt16(ctx, f, ctr, length, dst, src);
+      length -= done;
+      src += done;
+      dst += done;
+    }
+#endif
 
-	  f(ctx, filled, dst, dst);
-	  memxor(dst, src, filled);
+  if(src != dst)
+    {
+      size_t filled = ctr_fill (block_size, ctr, length, dst);
+
+      f(ctx, filled, dst, dst);
+      memxor(dst, src, filled);
 
-	  if (filled < length)
-	    {
-	      TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
-	      TMP_ALLOC(buffer, block_size);
+      if (filled < length)
+	{
+	  TMP_DECL(block, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
+	  TMP_ALLOC(block, block_size);
 
-	      f(ctx, block_size, buffer, ctr);
-	      INCREMENT(block_size, ctr);
-	      memxor3(dst + filled, src + filled, buffer, length - filled);
-	    }
+	  f(ctx, block_size, block, ctr);
+	  INCREMENT(block_size, ctr);
+	  memxor3(dst + filled, src + filled, block, length - filled);
 	}
     }
   else
-- 
GitLab