diff --git a/ChangeLog b/ChangeLog
index 00d7b2c89be8254936b3c898706d45a49fa23792..643c38b4591deda878f950cb16b7409d9e369dc7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,20 @@
 2022-07-05  Niels Möller  <nisse@lysator.liu.se>
 
 	* md-internal.h (MD_FILL_OR_RETURN): New file, new macro.
+	* sha256-compress-n.c (_nettle_sha256_compress_n): New file and
+	function, replacing...
+	* sha256-compress.c (_nettle_sha256_compress): ...deleted file and
+	function.
+	* sha2-internal.h (_nettle_sha256_compress_n): Declare new function.
+	* sha256.c (sha256_compress, sha256_update): Update to use
+	_nettle_sha256_compress_n and MD_FILL_OR_RETURN.
+	* x86_64/sha256-compress-n.asm: New file, replacing...
+	* x86_64/sha256-compress.asm: ...deleted file.
+	* x86_64/sha_ni/sha256-compress-n.asm: New file, replacing...
+	* x86_64/sha_ni/sha256-compress.asm: ...deleted file.
+	* fat-setup.h (sha256_compress_n_func): New typedef, replacing...
+	(sha256_compress_func): ... deleted typedef.
+	* fat-x86_64.c: Update fat setup.
 
 2022-06-20  Niels Möller  <nisse@lysator.liu.se>
 
diff --git a/Makefile.in b/Makefile.in
index ba5364076bcc9b0da50a2f9310b3454a08cf7cd3..64027d4d962cc4716de1bf518dd74a7efdeb11f6 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -138,7 +138,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
 		 salsa20-set-nonce.c \
 		 salsa20-128-set-key.c salsa20-256-set-key.c \
 		 sha1.c sha1-compress.c sha1-meta.c \
-		 sha256.c sha256-compress.c sha224-meta.c sha256-meta.c \
+		 sha256.c sha256-compress-n.c sha224-meta.c sha256-meta.c \
 		 sha512.c sha512-compress.c sha384-meta.c sha512-meta.c \
 		 sha512-224-meta.c sha512-256-meta.c \
 		 sha3.c sha3-permute.c \
diff --git a/configure.ac b/configure.ac
index 73c6fc21ec430cf4be9c745da50eceee3b935080..cb30dfb3ef056e8f095cec61cabcf325a4b34318 100644
--- a/configure.ac
+++ b/configure.ac
@@ -591,7 +591,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		chacha-core-internal.asm \
 		salsa20-crypt.asm salsa20-core-internal.asm \
 		serpent-encrypt.asm serpent-decrypt.asm \
-		sha1-compress.asm sha256-compress.asm sha512-compress.asm \
+		sha1-compress.asm sha256-compress-n.asm sha512-compress.asm \
 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
 
 # Assembler files which generate additional object files if they are used.
@@ -607,7 +607,7 @@ asm_nettle_optional_list="cpuid.asm cpu-facility.asm \
   chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
   ghash-set-key-2.asm ghash-update-2.asm \
   salsa20-2core.asm salsa20-core-internal-2.asm \
-  sha1-compress-2.asm sha256-compress-2.asm \
+  sha1-compress-2.asm sha256-compress-n-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
 
@@ -757,7 +757,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_salsa20_2core
 #undef HAVE_NATIVE_fat_salsa20_2core
 #undef HAVE_NATIVE_sha1_compress
-#undef HAVE_NATIVE_sha256_compress
+#undef HAVE_NATIVE_sha256_compress_n
 #undef HAVE_NATIVE_sha512_compress
 #undef HAVE_NATIVE_sha3_permute
 #undef HAVE_NATIVE_umac_nh
diff --git a/fat-setup.h b/fat-setup.h
index e77cce0288a0e5cff1671e17913b2f5b0b06a757..70bc2687d5cedc534552bed4928db7fe5bc36855 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -178,7 +178,9 @@ typedef void salsa20_crypt_func (struct salsa20_ctx *ctx, unsigned rounds,
 				 const uint8_t *src);
 
 typedef void sha1_compress_func(uint32_t *state, const uint8_t *input);
-typedef void sha256_compress_func(uint32_t *state, const uint8_t *input, const uint32_t *k);
+typedef const uint8_t *
+sha256_compress_n_func(uint32_t *state, const uint32_t *k,
+		       size_t blocks, const uint8_t *input);
 
 struct sha3_state;
 typedef void sha3_permute_func (struct sha3_state *state);
diff --git a/fat-x86_64.c b/fat-x86_64.c
index 47cf78ae49c06d6349d12ccea0427726cf304daa..0a2fedf46098065e28328a6cf9ba25e6052d4ccd 100644
--- a/fat-x86_64.c
+++ b/fat-x86_64.c
@@ -155,9 +155,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, x86_64)
 DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, sha_ni)
 
-DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, x86_64)
-DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, sha_ni)
+DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, x86_64)
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, sha_ni)
 
 DECLARE_FAT_FUNC(_nettle_ghash_set_key, ghash_set_key_func)
 DECLARE_FAT_FUNC_VAR(ghash_set_key, ghash_set_key_func, c)
@@ -228,14 +228,14 @@ fat_init (void)
       if (verbose)
 	fprintf (stderr, "libnettle: using sha_ni instructions.\n");
       nettle_sha1_compress_vec = _nettle_sha1_compress_sha_ni;
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_sha_ni;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_sha_ni;
     }
   else
     {
       if (verbose)
 	fprintf (stderr, "libnettle: not using sha_ni instructions.\n");
       nettle_sha1_compress_vec = _nettle_sha1_compress_x86_64;
-      _nettle_sha256_compress_vec = _nettle_sha256_compress_x86_64;
+      _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_x86_64;
     }
 
   if (features.have_pclmul)
@@ -315,9 +315,10 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
 		(uint32_t *state, const uint8_t *input),
 		(state, input))
 
-DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
-		(uint32_t *state, const uint8_t *input, const uint32_t *k),
-		(state, input, k))
+DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *,
+		(uint32_t *state, const uint32_t *k,
+		 size_t blocks, const uint8_t *input),
+		(state, k, blocks, input))
 
 DEFINE_FAT_FUNC(_nettle_ghash_set_key, void,
 		(struct gcm_key *ctx, const union nettle_block16 *key),
diff --git a/sha2-internal.h b/sha2-internal.h
index 40f25a5f86adfb31898cf1e9450d757353a24dba..93080bee67a633a3ae7edae02b001421173a34c5 100644
--- a/sha2-internal.h
+++ b/sha2-internal.h
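@@ -39,8 +39,11 @@
-/* Internal compression function. STATE points to 8 uint32_t words,
-   DATA points to 64 bytes of input data, possibly unaligned, and K
-   points to the table of constants. */
+/* Internal compression function. STATE points to 8 uint32_t words,
+   K points to the table of constants, and DATA points to BLOCKS
+   blocks of 64 bytes of input data each, possibly unaligned. BLOCKS
+   may be zero. Returns a pointer to the first byte after the
+   processed input, i.e., DATA + 64 * BLOCKS. */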
-void
-_nettle_sha256_compress(uint32_t *state, const uint8_t *data, const uint32_t *k);
+const uint8_t *
+_nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+			  size_t blocks, const uint8_t *data);
 
 /* Internal compression function. STATE points to 8 uint64_t words,
    DATA points to 128 bytes of input data, possibly unaligned, and K
diff --git a/sha256-compress.c b/sha256-compress-n.c
similarity index 59%
rename from sha256-compress.c
rename to sha256-compress-n.c
index cf17e3e1400c3510f3abc73a74de0fda6d9c51c6..1e40cb1d19e604944a073071b91d0c0a93208b55 100644
--- a/sha256-compress.c
+++ b/sha256-compress-n.c
@@ -1,8 +1,8 @@
-/* sha256-compress.c
+/* sha256-compress-n.c
 
    The compression function of the sha256 hash function.
 
-   Copyright (C) 2001, 2010 Niels Möller
+   Copyright (C) 2001, 2010, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -124,20 +124,12 @@ _nettle_sha256_compress_c(uint32_t *state, const uint8_t *input, const uint32_t
 #define _nettle_sha256_compress _nettle_sha256_compress_c
 #endif
 
-void
-_nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+const uint8_t *
+_nettle_sha256_compress_n(uint32_t *state, const uint32_t *table,
+			  size_t blocks, const uint8_t *input)
 {
-  uint32_t data[SHA256_DATA_LENGTH];
   uint32_t A, B, C, D, E, F, G, H;     /* Local vars */
-  unsigned i;
-  uint32_t *d;
 
-  for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4)
-    {
-      data[i] = READ_UINT32(input);
-    }
-
-  /* Set up first buffer and local data buffer */
   A = state[0];
   B = state[1];
   C = state[2];
@@ -146,55 +138,68 @@ _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k
   F = state[5];
   G = state[6];
   H = state[7];
-  
-  /* Heavy mangling */
-  /* First 16 subrounds that act on the original data */
 
-  DEBUG(-1);
-  for (i = 0, d = data; i<16; i+=8, k += 8, d+= 8)
+  for (; blocks > 0; blocks--)
     {
-      ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i);
-      ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1);
-      ROUND(G, H, A, B, C, D, E, F, k[2], d[2]);
-      ROUND(F, G, H, A, B, C, D, E, k[3], d[3]);
-      ROUND(E, F, G, H, A, B, C, D, k[4], d[4]);
-      ROUND(D, E, F, G, H, A, B, C, k[5], d[5]);
-      ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6);
-      ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7);
-    }
+      uint32_t data[SHA256_DATA_LENGTH];
+      unsigned i;
+      const uint32_t *k;
+      uint32_t *d;
+      for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4)
+	{
+	  data[i] = READ_UINT32(input);
+	}
+
+      /* Heavy mangling */
+      /* First 16 subrounds that act on the original data */
+
+      DEBUG(-1);
+      for (i = 0, d = data, k = table; i<16; i+=8, k += 8, d+= 8)
+	{
+	  ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i);
+	  ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1);
+	  ROUND(G, H, A, B, C, D, E, F, k[2], d[2]);
+	  ROUND(F, G, H, A, B, C, D, E, k[3], d[3]);
+	  ROUND(E, F, G, H, A, B, C, D, k[4], d[4]);
+	  ROUND(D, E, F, G, H, A, B, C, k[5], d[5]);
+	  ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6);
+	  ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7);
+	}
   
-  for (; i<64; i += 16, k+= 16)
-    {
-      ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data,  0)); DEBUG(i);
-      ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data,  1)); DEBUG(i+1);
-      ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data,  2)); DEBUG(i+2);
-      ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data,  3)); DEBUG(i+3);
-      ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data,  4)); DEBUG(i+4);
-      ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data,  5)); DEBUG(i+5);
-      ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data,  6)); DEBUG(i+6);
-      ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data,  7)); DEBUG(i+7);
-      ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data,  8)); DEBUG(i+8);
-      ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data,  9)); DEBUG(i+9);
-      ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10);
-      ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11);
-      ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12);
-      ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13);
-      ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14);
-      ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15);
-    }
-
-  /* Update state */
-  state[0] += A;
-  state[1] += B;
-  state[2] += C;
-  state[3] += D;
-  state[4] += E;
-  state[5] += F;
-  state[6] += G;
-  state[7] += H;
+      for (; i<64; i += 16, k+= 16)
+	{
+	  ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data,  0)); DEBUG(i);
+	  ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data,  1)); DEBUG(i+1);
+	  ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data,  2)); DEBUG(i+2);
+	  ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data,  3)); DEBUG(i+3);
+	  ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data,  4)); DEBUG(i+4);
+	  ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data,  5)); DEBUG(i+5);
+	  ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data,  6)); DEBUG(i+6);
+	  ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data,  7)); DEBUG(i+7);
+	  ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data,  8)); DEBUG(i+8);
+	  ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data,  9)); DEBUG(i+9);
+	  ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10);
+	  ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11);
+	  ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12);
+	  ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13);
+	  ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14);
+	  ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15);
+	}
+
+      /* Update state */
+      state[0] = A = state[0] + A;
+      state[1] = B = state[1] + B;
+      state[2] = C = state[2] + C;
+      state[3] = D = state[3] + D;
+      state[4] = E = state[4] + E;
+      state[5] = F = state[5] + F;
+      state[6] = G = state[6] + G;
+      state[7] = H = state[7] + H;
 #if SHA256_DEBUG
-  fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n",
-	  state[0], state[1], state[2], state[3],
-	  state[4], state[5], state[6], state[7]);
+      fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n",
+	      state[0], state[1], state[2], state[3],
+	      state[4], state[5], state[6], state[7]);
 #endif
+    }
+  return input;
 }
diff --git a/sha256.c b/sha256.c
index 3872ca6fa9aac2ab9744e7532eece05af95fd02b..0c9c21a0e3a8b9bb9403ebf686dcb0a0e85b848b 100644
--- a/sha256.c
+++ b/sha256.c
@@ -46,6 +46,7 @@
 #include "sha2-internal.h"
 
 #include "macros.h"
+#include "md-internal.h"
 #include "nettle-write.h"
 
 /* Generated by the shadata program. */
@@ -70,6 +71,12 @@ K[64] =
   0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL, 
 };
 
+void
+sha256_compress(uint32_t *state, const uint8_t *input)
+{
+  _nettle_sha256_compress_n(state, K, 1, input);
+}
+
 #define COMPRESS(ctx, data) (sha256_compress((ctx)->state, (data)))
 
 /* Initialize the SHA values */
@@ -97,7 +104,22 @@ void
 sha256_update(struct sha256_ctx *ctx,
 	      size_t length, const uint8_t *data)
 {
-  MD_UPDATE (ctx, length, data, COMPRESS, ctx->count++);
+  size_t blocks;
+  if (ctx->index > 0)
+    {
+      /* Try to fill partial block */
+      MD_FILL_OR_RETURN (ctx, length, data);
+      sha256_compress (ctx->state, ctx->block);
+      ctx->count++;
+    }
+
+  blocks = length >> 6;
+  data = _nettle_sha256_compress_n (ctx->state, K, blocks, data);
+  ctx->count += blocks;
+  length &= 63;
+
+  memcpy (ctx->block, data, length);
+  ctx->index = length;
 }
 
 static void
@@ -161,9 +183,3 @@ sha224_digest(struct sha256_ctx *ctx,
   sha256_write_digest(ctx, length, digest);
   sha224_init(ctx);
 }
-
-void
-sha256_compress(uint32_t *state, const uint8_t *input)
-{
-  _nettle_sha256_compress(state, input, K);
-}
diff --git a/x86_64/fat/sha256-compress-2.asm b/x86_64/fat/sha256-compress-n-2.asm
similarity index 92%
rename from x86_64/fat/sha256-compress-2.asm
rename to x86_64/fat/sha256-compress-n-2.asm
index 996cf8c53329db80d6d316da2a0050cc31990ccf..60f7c8f6b65184733aed88f54475a16b9c90cb8b 100644
--- a/x86_64/fat/sha256-compress-2.asm
+++ b/x86_64/fat/sha256-compress-n-2.asm
@@ -1,4 +1,4 @@
-C x86_64/fat/sha256-compress-2.asm
+C x86_64/fat/sha256-compress-n-2.asm
 
 ifelse(`
    Copyright (C) 2018 Niels Möller
@@ -31,4 +31,4 @@ ifelse(`
 ')
 
 define(`fat_transform', `$1_sha_ni')
-include_src(`x86_64/sha_ni/sha256-compress.asm')
+include_src(`x86_64/sha_ni/sha256-compress-n.asm')
diff --git a/x86_64/fat/sha256-compress.asm b/x86_64/fat/sha256-compress-n.asm
similarity index 92%
rename from x86_64/fat/sha256-compress.asm
rename to x86_64/fat/sha256-compress-n.asm
index 2aaeb5e89ab3af665f0344d8dbc3033165663cef..fc35885836e0161756f02eba82d9fbb11e941758 100644
--- a/x86_64/fat/sha256-compress.asm
+++ b/x86_64/fat/sha256-compress-n.asm
@@ -1,4 +1,4 @@
-C x86_64/fat/sha256-compress.asm
+C x86_64/fat/sha256-compress-n.asm
 
 ifelse(`
    Copyright (C) 2018 Niels Möller
@@ -31,4 +31,4 @@ ifelse(`
 ')
 
 define(`fat_transform', `$1_x86_64')
-include_src(`x86_64/sha256-compress.asm')
+include_src(`x86_64/sha256-compress-n.asm')
diff --git a/x86_64/sha256-compress.asm b/x86_64/sha256-compress-n.asm
similarity index 77%
rename from x86_64/sha256-compress.asm
rename to x86_64/sha256-compress-n.asm
index 5ed669b1f0b5e77412601c95132a354b4b068925..e10d260c65e16b1b0ea1cc61cdaf49a01c02d127 100644
--- a/x86_64/sha256-compress.asm
+++ b/x86_64/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C x86_64/sha256-compress.asm
+C x86_64/sha256-compress-n.asm
 
 ifelse(`
-   Copyright (C) 2013 Niels Möller
+   Copyright (C) 2013, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -30,21 +30,24 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-	.file "sha256-compress.asm"
+	.file "sha256-compress-n.asm"
 define(`STATE', `%rdi')
-define(`INPUT', `%rsi')
-define(`K', `%rdx')
+define(`K', `%rsi')
+define(`BLOCKS', `%rdx')
+define(`INPUT', `%rcx')
+define(`STATE_SAVED', `64(%rsp)')
+
 define(`SA', `%eax')
 define(`SB', `%ebx')
-define(`SC', `%ecx')
+define(`SC', `%ebp')
 define(`SD', `%r8d')
 define(`SE', `%r9d')
 define(`SF', `%r10d')
 define(`SG', `%r11d')
 define(`SH', `%r12d')
 define(`T0', `%r13d')
-define(`T1', `%edi')	C Overlap STATE
-define(`COUNT', `%r14')
+define(`T1', `%r14d')
+define(`COUNT', `%rdi')	C Overlap STATE
 define(`W', `%r15d')
 
 define(`EXPN', `
@@ -123,18 +126,21 @@ define(`NOEXPN', `
 	movl	W, OFFSET($1)(%rsp, COUNT, 4)
 ')
 
-	C void
-	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+	C const uint8_t *
+	C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+	C                           size_t blocks, const uint8_t *input)
 
 	.text
 	ALIGN(16)
 
-PROLOGUE(_nettle_sha256_compress)
+PROLOGUE(_nettle_sha256_compress_n)
 	W64_ENTRY(3, 0)
+	test	BLOCKS, BLOCKS
+	jz	.Lend
 
 	sub	$120, %rsp
-	mov	%rbx, 64(%rsp)
-	mov	STATE, 72(%rsp)	C Save state, to free a register
+	mov	STATE, STATE_SAVED	C Save state, to free a register
+	mov	%rbx, 72(%rsp)
 	mov	%rbp, 80(%rsp)
 	mov	%r12, 88(%rsp)
 	mov	%r13, 96(%rsp)
@@ -149,7 +155,9 @@ PROLOGUE(_nettle_sha256_compress)
 	movl	20(STATE), SF
 	movl	24(STATE), SG
 	movl	28(STATE), SH
-	xor	COUNT, COUNT
+
+.Loop_block:
+	xorl	XREG(COUNT), XREG(COUNT)
 	ALIGN(16)
 
 .Loop1:
@@ -161,8 +169,8 @@ PROLOGUE(_nettle_sha256_compress)
 	NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5)
 	NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6)
 	NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7)
-	add	$8, COUNT
-	cmp	$16, COUNT
+	addl	$8, XREG(COUNT)
+	cmpl	$16, XREG(COUNT)
 	jne	.Loop1
 
 .Loop2:
@@ -182,22 +190,35 @@ PROLOGUE(_nettle_sha256_compress)
 	EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,13)
 	EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,14)
 	EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,15)
-	add	$16, COUNT
-	cmp	$64, COUNT
+	addl	$16, XREG(COUNT)
+	cmpl	$64, XREG(COUNT)
 	jne	.Loop2
 
-	mov	72(%rsp), STATE
-
-	addl	SA, (STATE)
-	addl	SB, 4(STATE)
-	addl	SC, 8(STATE)
-	addl	SD, 12(STATE)
-	addl	SE, 16(STATE)
-	addl	SF, 20(STATE)
-	addl	SG, 24(STATE)
-	addl	SH, 28(STATE)
-
-	mov	64(%rsp), %rbx
+	mov	STATE_SAVED, STATE
+
+	addl	(STATE), SA
+	addl	4(STATE), SB
+	addl	8(STATE), SC
+	addl	12(STATE), SD
+	addl	16(STATE), SE
+	addl	20(STATE), SF
+	addl	24(STATE), SG
+	addl	28(STATE), SH
+
+	movl	SA, (STATE)
+	movl	SB, 4(STATE)
+	movl	SC, 8(STATE)
+	movl	SD, 12(STATE)
+	movl	SE, 16(STATE)
+	movl	SF, 20(STATE)
+	movl	SG, 24(STATE)
+	movl	SH, 28(STATE)
+
+	add	$64, INPUT
+	dec	BLOCKS
+	jnz	.Loop_block
+
+	mov	72(%rsp), %rbx
 	mov	80(%rsp), %rbp
 	mov	88(%rsp), %r12
 	mov	96(%rsp), %r13
@@ -205,6 +226,8 @@ PROLOGUE(_nettle_sha256_compress)
 	mov	112(%rsp),%r15
 
 	add	$120, %rsp
+.Lend:
+	mov	INPUT, %rax
 	W64_EXIT(3, 0)
 	ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)
diff --git a/x86_64/sha_ni/sha256-compress.asm b/x86_64/sha_ni/sha256-compress-n.asm
similarity index 87%
rename from x86_64/sha_ni/sha256-compress.asm
rename to x86_64/sha_ni/sha256-compress-n.asm
index 00bd3cd3f18d565976ad3392691eb981dc940c6b..005909df531275b0b3ee1a5385bacc7aef36669b 100644
--- a/x86_64/sha_ni/sha256-compress.asm
+++ b/x86_64/sha_ni/sha256-compress-n.asm
@@ -1,7 +1,7 @@
-C x86_64/sha_ni/sha256-compress.asm
+C x86_64/sha_ni/sha256-compress-n.asm
 
 ifelse(`
-   Copyright (C) 2018 Niels Möller
+   Copyright (C) 2018, 2022 Niels Möller
 
    This file is part of GNU Nettle.
 
@@ -30,10 +30,11 @@ ifelse(`
    not, see http://www.gnu.org/licenses/.
 ')
 
-	.file "sha256-compress.asm"
+	.file "sha256-compress-n.asm"
 define(`STATE', `%rdi')
-define(`INPUT', `%rsi')
-define(`K', `%rdx')
+define(`K', `%rsi')
+define(`BLOCKS', `%rdx')
+define(`INPUT', `%rcx')
 
 define(`MSGK',`%xmm0')	C Implicit operand of sha256rnds2
 define(`MSG0',`%xmm1')
@@ -45,7 +46,7 @@ define(`CDGH',`%xmm6')
 define(`ABEF_ORIG',`%xmm7')
 define(`CDGH_ORIG', `%xmm8')
 define(`SWAP_MASK',`%xmm9')
-define(`TMP', `%xmm9')	C Overlaps SWAP_MASK
+define(`TMP', `%xmm10')
 
 C QROUND(M0, M1, M2, M3, R)
 define(`QROUND', `
@@ -69,15 +70,19 @@ define(`TRANSPOSE', `
 	punpcklqdq $1, $3
 ')
 
-	C void
-	C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k)
+	C const uint8_t *
+	C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+	C                           size_t blocks, const uint8_t *input)
 
 	.text
 	ALIGN(16)
 .Lswap_mask:
 	.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
-PROLOGUE(_nettle_sha256_compress)
-	W64_ENTRY(3, 10)
+PROLOGUE(_nettle_sha256_compress_n)
+	W64_ENTRY(4, 11)
+	test	BLOCKS, BLOCKS
+	jz	.Lend
+
 	movups	(STATE), TMP
 	movups	16(STATE), ABEF
 
@@ -88,12 +93,13 @@ PROLOGUE(_nettle_sha256_compress)
 
 	movdqa	.Lswap_mask(%rip), SWAP_MASK
 
-	movdqa	ABEF, ABEF_ORIG
-	movdqa	CDGH, CDGH_ORIG
-
+.Loop:
 	movups	(INPUT), MSG0
 	pshufb	SWAP_MASK, MSG0
 
+	movdqa	ABEF, ABEF_ORIG
+	movdqa	CDGH, CDGH_ORIG
+
 	movdqa	(K), MSGK
 	paddd	MSG0, MSGK
 	sha256rnds2 ABEF, CDGH		C Round 0-1
@@ -163,6 +169,10 @@ PROLOGUE(_nettle_sha256_compress)
 	paddd ABEF_ORIG, ABEF
 	paddd CDGH_ORIG, CDGH
 
+	add	$64, INPUT
+	dec	BLOCKS
+	jnz	.Loop
+
 	TRANSPOSE(ABEF, CDGH, TMP)
 
 	pshufd	$0x1b, CDGH, CDGH
@@ -170,6 +180,8 @@ PROLOGUE(_nettle_sha256_compress)
 	movups	CDGH, 0(STATE)
 	movups	TMP, 16(STATE)
 
-	W64_EXIT(3, 10)
+.Lend:
+	mov	INPUT, %rax
+	W64_EXIT(4, 11)
 	ret
-EPILOGUE(_nettle_sha256_compress)
+EPILOGUE(_nettle_sha256_compress_n)