diff --git a/ctr.c b/ctr.c
index d94136c15c86ff79b70780f0b902716aa445e1b1..d217013cd36df25a5449f1dc0c58b210d14921ab 100644
--- a/ctr.c
+++ b/ctr.c
@@ -36,40 +36,114 @@
 #include "macros.h"
 #include "memxor.h"
 #include "nettle-internal.h"
-  
+
+/* Maximum number of counter blocks passed to the cipher function in
+   one call when operating in place; also sizes the temporary buffer. */
+#define NBLOCKS 4
+
+/* CTR mode: XORs SRC with a key stream obtained by applying F to
+   successive values of the BLOCK_SIZE-byte counter CTR, storing LENGTH
+   bytes at DST. CTR is stepped with INCREMENT once for every key
+   stream block produced, including one used only partially for a
+   trailing fragment. SRC and DST may be the same pointer; then the key
+   stream goes through a temporary buffer. A zero LENGTH leaves both
+   DST and CTR untouched. */
 void
 ctr_crypt(void *ctx, nettle_crypt_func f,
 	  unsigned block_size, uint8_t *ctr,
 	  unsigned length, uint8_t *dst,
 	  const uint8_t *src)
 {
-  TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
-  TMP_ALLOC(buffer, block_size);
-
+  /* Nothing to do; in particular, don't produce (and discard) a key
+     stream block, which would advance the counter. */
+  if (length == 0)
+    return;
+
   if (src != dst)
     {
-      for (; length >= block_size; length -= block_size, src += block_size, dst += block_size)
+      /* Single block: let F write the key stream directly to DST. */
+      if (length == block_size)
 	{
 	  f(ctx, block_size, dst, ctr);
-	  memxor(dst, src, block_size);
 	  INCREMENT(block_size, ctr);
+	  memxor(dst, src, block_size);
+	}
+      else
+	{
+	  unsigned left;
+	  uint8_t *p;
+
+	  /* Fill DST with consecutive counter values, encrypt them in
+	     place with a single call to F, then XOR in the source. */
+	  for (p = dst, left = length;
+	       left >= block_size;
+	       left -= block_size, p += block_size)
+	    {
+	      memcpy (p, ctr, block_size);
+	      INCREMENT(block_size, ctr);
+	    }
+
+	  f(ctx, length - left, dst, dst);
+	  memxor(dst, src, length - left);
+
+	  /* Trailing partial block: only LEFT key stream bytes are used. */
+	  if (left)
+	    {
+	      TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
+	      TMP_ALLOC(buffer, block_size);
+
+	      f(ctx, block_size, buffer, ctr);
+	      INCREMENT(block_size, ctr);
+	      memxor3(dst + length - left, src + length - left, buffer, left);
+	    }
 	}
     }
   else
     {
-      for (; length >= block_size; length -= block_size, src += block_size, dst += block_size)
+      /* In place, the key stream can't be generated in DST since that
+	 would overwrite the input; use a temporary buffer. */
+      if (length <= block_size)
 	{
+	  TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
+	  TMP_ALLOC(buffer, block_size);
+
 	  f(ctx, block_size, buffer, ctr);
-	  memxor3(dst, src, buffer, block_size);
 	  INCREMENT(block_size, ctr);
-	}      
-    }
-  if (length > 0)
-    {
-      /* A final partial block */
+	  memxor3(dst, src, buffer, length);
+	}
+      else
+	{
+	  TMP_DECL(buffer, uint8_t, NBLOCKS * NETTLE_MAX_CIPHER_BLOCK_SIZE);
+	  unsigned chunk = NBLOCKS * block_size;
+
+	  TMP_ALLOC(buffer, chunk);
 
-      f(ctx, block_size, buffer, ctr);
-      memxor3(dst, src, buffer, length);
-      INCREMENT(block_size, ctr);
+	  /* Process NBLOCKS blocks per call to F. */
+	  for (; length >= chunk;
+	       length -= chunk, src += chunk, dst += chunk)
+	    {
+	      unsigned n;
+	      uint8_t *p;
+	      for (n = 0, p = buffer; n < NBLOCKS; n++, p += block_size)
+		{
+		  memcpy (p, ctr, block_size);
+		  INCREMENT(block_size, ctr);
+		}
+	      f(ctx, chunk, buffer, buffer);
+	      memxor(dst, buffer, chunk);
+	    }
+
+	  if (length > 0)
+	    {
+	      /* Final, possibly partial, blocks */
+	      for (chunk = 0; chunk < length; chunk += block_size)
+		{
+		  memcpy (buffer + chunk, ctr, block_size);
+		  INCREMENT(block_size, ctr);
+		}
+	      f(ctx, chunk, buffer, buffer);
+	      memxor3(dst, src, buffer, length);
+	    }
+	}
     }
 }