From 54b2d297afa86a84fc3dc23e0529fb5120ef5a99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
Date: Mon, 8 Jan 2018 08:45:17 +0100
Subject: [PATCH] Increase buffer size for in-place CTR.

---
 ChangeLog |  4 ++++
 ctr.c     | 65 ++++++++++++++++++++++++++-----------------------------
 2 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index c927848c..aa9608d7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -9,6 +9,10 @@
 
 2018-01-08  Niels Möller  <nisse@lysator.liu.se>
 
+	* ctr.c (ctr_crypt): For in-place operation, increase max buffer
+	size from 4 blocks to 512 bytes, similarly to CBC and CFB.
+	Improves in-place aes128 CTR performance by 25% on x86_64.
+
 	* examples/nettle-benchmark.c (time_cipher): Benchmark in-place
 	operation separately, for cbc_decrypt and ctr_crypt.
 
diff --git a/ctr.c b/ctr.c
index f81f74ad..42048833 100644
--- a/ctr.c
+++ b/ctr.c
@@ -45,7 +45,8 @@
 #include "memxor.h"
 #include "nettle-internal.h"
 
-#define NBLOCKS 4
+/* Upper limit, in bytes, on the stack space used for the counter buffer. */
+#define CTR_BUFFER_LIMIT 512
 
 void
 ctr_crypt(const void *ctx, nettle_cipher_func *f,
@@ -90,47 +91,43 @@ ctr_crypt(const void *ctx, nettle_cipher_func *f,
     }
   else
     {
-      if (length > block_size)
-	{
-	  TMP_DECL(buffer, uint8_t, NBLOCKS * NETTLE_MAX_CIPHER_BLOCK_SIZE);
-	  size_t chunk = NBLOCKS * block_size;
-
-	  TMP_ALLOC(buffer, chunk);
+      /* For in-place CTR, construct a buffer of consecutive counter
+	 values, of size at most CTR_BUFFER_LIMIT. */
+      TMP_DECL(buffer, uint8_t, CTR_BUFFER_LIMIT);
+
+      size_t buffer_size;
+      if (length < block_size)
+	buffer_size = block_size;
+      else if (length <= CTR_BUFFER_LIMIT)
+	buffer_size = length;
+      else
+	buffer_size = CTR_BUFFER_LIMIT;
 
-	  for (; length >= chunk;
-	       length -= chunk, src += chunk, dst += chunk)
-	    {
-	      unsigned n;
-	      uint8_t *p;	  
-	      for (n = 0, p = buffer; n < NBLOCKS; n++, p += block_size)
-		{
-		  memcpy (p, ctr, block_size);
-		  INCREMENT(block_size, ctr);
-		}
-	      f(ctx, chunk, buffer, buffer);
-	      memxor(dst, buffer, chunk);
-	    }
+      TMP_ALLOC(buffer, buffer_size);
 
-	  if (length > 0)
+      while (length >= block_size)
+	{
+	  size_t i;
+	  for (i = 0;
+	       i + block_size <= buffer_size && i + block_size <= length;
+	       i += block_size)
 	    {
-	      /* Final, possibly partial, blocks */
-	      for (chunk = 0; chunk < length; chunk += block_size)
-		{
-		  memcpy (buffer + chunk, ctr, block_size);
-		  INCREMENT(block_size, ctr);
-		}
-	      f(ctx, chunk, buffer, buffer);
-	      memxor3(dst, src, buffer, length);
+	      memcpy (buffer + i, ctr, block_size);
+	      INCREMENT(block_size, ctr);
 	    }
+	  assert (i > 0);
+	  f(ctx, i, buffer, buffer);
+	  memxor(dst, buffer, i);
+	  length -= i;
+	  dst += i;
 	}
-      else if (length > 0)
-      	{
-	  TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
-	  TMP_ALLOC(buffer, block_size);
 
+      /* Final partial block: fewer than block_size bytes remain. */
+      if (length > 0)
+	{
 	  f(ctx, block_size, buffer, ctr);
 	  INCREMENT(block_size, ctr);
-	  memxor3(dst, src, buffer, length);
+	  memxor(dst, buffer, length);
 	}
     }
 }
-- 
GitLab