diff --git a/ChangeLog b/ChangeLog
index 1a9e2c4ac61a0aed08311d61afccd47a9a4688cd..20726f1a2c2f138f9cc6c58874281694c25e2d5f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,11 +1,13 @@
 2013-04-16  Niels Möller  <nisse@lysator.liu.se>
 
-	* umac-l2.c (_umac_l2): Deleted redundant memcpy.
-
 	* umac.h (umac32_ctx, umac64_ctx, umac96_ctx, umac128_ctx): Make
 	block count an uint64_t. Reorder some elements to put short values
 	together.
 	* umac-l2.c (_umac_l2, _umac_l2_final): Make count argument an uint64_t.
+	(_umac_l2): Deleted redundant memcpy.
+	(_umac_l2, _umac_l2_final): Store input buffer at end of the
+	poly64/poly128 state. Deleted l1_out from corresponding context
+	structs, and updated all callers.
 
 	* configure.ac: Changed version number to 2.7.
 	(LIBNETTLE_MINOR): Bumped library version, to 4.6
diff --git a/umac-l2.c b/umac-l2.c
index 6a31893a96182e2cdd2c130a2ea8d432f15ebd3f..cdf7d81a9d72a6f8e7cea417661c520e2fa9c6e0 100644
--- a/umac-l2.c
+++ b/umac-l2.c
@@ -57,8 +57,9 @@ _umac_l2_init (unsigned size, uint32_t *k)
 
 void
 _umac_l2(const uint32_t *key, uint64_t *state, unsigned n,
-	 uint64_t count, uint64_t *prev, const uint64_t *m)
+	 uint64_t count, const uint64_t *m)
 {
+  uint64_t *prev = state + 2*n;
   unsigned i;
 
   if (count == 0)
@@ -94,8 +95,9 @@ _umac_l2(const uint32_t *key, uint64_t *state, unsigned n,
 
 void
 _umac_l2_final(const uint32_t *key, uint64_t *state, unsigned n,
-	       uint64_t count, uint64_t *prev)
+	       uint64_t count)
 {
+  uint64_t *prev = state + 2*n;
   unsigned i;
 
   assert (count > 0);
diff --git a/umac.h b/umac.h
index 6f4d831c7adc87db1e15266bd72aff545540ae1b..339130deba55d2d334eae5e411222eb1ff47709e 100644
--- a/umac.h
+++ b/umac.h
@@ -73,11 +73,10 @@ extern "C" {
   uint32_t l3_key2[(n)];				\
   /* AES cipher for encrypting the nonce */		\
   struct aes_ctx pdf_key;				\
-  /* Buffer l1 output for one block.			\
-     FIXME: Make part of l2 state? */			\
-  uint64_t l1_out[(n)];					\
-  /* For both poly64-hashing and poly128 hashing */	\
-  uint64_t l2_state[2*(n)];				\
+  /* The l2_state holds 2*n uint64_t of poly64 and	\
+     poly128 hashing state, followed by n uint64_t	\
+     buffering one block of l1 output (l2 input). */	\
+  uint64_t l2_state[3*(n)];				\
   /* Input to the pdf_key, zero-padded and low bits	\
      cleared if appropriate. */				\
   uint8_t nonce[AES_BLOCK_SIZE];			\
@@ -219,11 +218,11 @@ _umac_l2_init (unsigned size, uint32_t *k);
 
 void
 _umac_l2(const uint32_t *key, uint64_t *state, unsigned n,
-	 uint64_t count, uint64_t *prev, const uint64_t *m);
+	 uint64_t count, const uint64_t *m);
 
 void
 _umac_l2_final(const uint32_t *key, uint64_t *state, unsigned n,
-	       uint64_t count, uint64_t *prev);
+	       uint64_t count);
 
 void
 _umac_l3_init (unsigned size, uint64_t *k);
diff --git a/umac128.c b/umac128.c
index 6d33b4f9a7463db87a72a529929f03dfb8d43258..a5b5b69073a0f0437daef513cf1c0093f29477fe 100644
--- a/umac128.c
+++ b/umac128.c
@@ -66,8 +66,7 @@ umac128_set_nonce (struct umac128_ctx *ctx,
     __umac128_y[1] += 8*UMAC_BLOCK_SIZE;				\
     __umac128_y[2] += 8*UMAC_BLOCK_SIZE;				\
     __umac128_y[3] += 8*UMAC_BLOCK_SIZE;				\
-    _umac_l2 (ctx->l2_key, ctx->l2_state, 4, ctx->count++,		\
-	      ctx->l1_out, __umac128_y);				\
+    _umac_l2 (ctx->l2_key, ctx->l2_state, 4, ctx->count++, __umac128_y); \
   } while (0)
 
 void
@@ -100,8 +99,7 @@ umac128_digest (struct umac128_ctx *ctx,
       y[1] += 8 * ctx->index;
       y[2] += 8 * ctx->index;
       y[3] += 8 * ctx->index;
-      _umac_l2 (ctx->l2_key, ctx->l2_state, 4, ctx->count++,
-		ctx->l1_out, y);
+      _umac_l2 (ctx->l2_key, ctx->l2_state, 4, ctx->count++, y);
     }
   assert (ctx->count > 0);
 
@@ -110,7 +108,7 @@ umac128_digest (struct umac128_ctx *ctx,
 
   INCREMENT (ctx->nonce_length, ctx->nonce);
 
-  _umac_l2_final (ctx->l2_key, ctx->l2_state, 4, ctx->count, ctx->l1_out);
+  _umac_l2_final (ctx->l2_key, ctx->l2_state, 4, ctx->count);
   for (i = 0; i < 4; i++)
     tag[i] ^= ctx->l3_key2[i] ^ _umac_l3 (ctx->l3_key1 + 8*i,
 					  ctx->l2_state + 2*i);
diff --git a/umac32.c b/umac32.c
index ce859c15996a116642900ab44b8a530a0b3939f4..c50dfc6b7491cff35e314cda9e00d6209bf96e2a 100644
--- a/umac32.c
+++ b/umac32.c
@@ -66,8 +66,7 @@ umac32_set_nonce (struct umac32_ctx *ctx,
     uint64_t __umac32_y							\
       = _umac_nh (ctx->l1_key, UMAC_BLOCK_SIZE, block)			\
       + 8*UMAC_BLOCK_SIZE ;						\
-    _umac_l2 (ctx->l2_key, ctx->l2_state, 1, ctx->count++,		\
-	      ctx->l1_out, &__umac32_y);				\
+    _umac_l2 (ctx->l2_key, ctx->l2_state, 1, ctx->count++, &__umac32_y); \
   } while (0)
 
 void
@@ -96,8 +95,7 @@ umac32_digest (struct umac32_ctx *ctx,
 
       y = _umac_nh (ctx->l1_key, ctx->index + pad, ctx->block)
 	+ 8 * ctx->index;
-      _umac_l2 (ctx->l2_key, ctx->l2_state, 1, ctx->count++,
-		ctx->l1_out, &y);
+      _umac_l2 (ctx->l2_key, ctx->l2_state, 1, ctx->count++, &y);
     }
   assert (ctx->count > 0);
   if ( !(ctx->nonce_low & _UMAC_NONCE_CACHED))
@@ -122,7 +120,7 @@ umac32_digest (struct umac32_ctx *ctx,
 	INCREMENT (i, ctx->nonce);
     }
 
-  _umac_l2_final (ctx->l2_key, ctx->l2_state, 1, ctx->count, ctx->l1_out);
+  _umac_l2_final (ctx->l2_key, ctx->l2_state, 1, ctx->count);
   pad ^= ctx->l3_key2[0] ^ _umac_l3 (ctx->l3_key1, ctx->l2_state);
   memcpy (digest, &pad, length);
 
diff --git a/umac64.c b/umac64.c
index e92b95cb6ef6f8120dab3e815259c051e45575d2..e740e91c0b316e0ae22b5757c8f588a16fa1a33d 100644
--- a/umac64.c
+++ b/umac64.c
@@ -67,8 +67,7 @@ umac64_set_nonce (struct umac64_ctx *ctx,
     _umac_nh_n (__umac64_y, 2, ctx->l1_key, UMAC_BLOCK_SIZE, block);	\
     __umac64_y[0] += 8*UMAC_BLOCK_SIZE;					\
     __umac64_y[1] += 8*UMAC_BLOCK_SIZE;					\
-    _umac_l2 (ctx->l2_key, ctx->l2_state, 2, ctx->count++,		\
-	      ctx->l1_out, __umac64_y);					\
+    _umac_l2 (ctx->l2_key, ctx->l2_state, 2, ctx->count++, __umac64_y);	\
   } while (0)
 
 void
@@ -99,8 +98,7 @@ umac64_digest (struct umac64_ctx *ctx,
       _umac_nh_n (y, 2, ctx->l1_key, ctx->index + pad, ctx->block);
       y[0] += 8 * ctx->index;
       y[1] += 8 * ctx->index;
-      _umac_l2 (ctx->l2_key, ctx->l2_state, 2, ctx->count++,
-		ctx->l1_out, y);
+      _umac_l2 (ctx->l2_key, ctx->l2_state, 2, ctx->count++, y);
     }
   assert (ctx->count > 0);
   if ( !(ctx->nonce_low & _UMAC_NONCE_CACHED))
@@ -124,7 +122,7 @@ umac64_digest (struct umac64_ctx *ctx,
 	INCREMENT (i, ctx->nonce);
     }
 
-  _umac_l2_final (ctx->l2_key, ctx->l2_state, 2, ctx->count, ctx->l1_out);
+  _umac_l2_final (ctx->l2_key, ctx->l2_state, 2, ctx->count);
   tag[0] = pad[0] ^ ctx->l3_key2[0] ^ _umac_l3 (ctx->l3_key1,
 						ctx->l2_state);
   tag[1] = pad[1] ^ ctx->l3_key2[1] ^ _umac_l3 (ctx->l3_key1 + 8,
diff --git a/umac96.c b/umac96.c
index 2831ad14615163c4912de2028afcc74aa566e2aa..72c38311da9784d2b35a81616b42b53ead16888a 100644
--- a/umac96.c
+++ b/umac96.c
@@ -65,8 +65,7 @@ umac96_set_nonce (struct umac96_ctx *ctx,
     __umac96_y[0] += 8*UMAC_BLOCK_SIZE;					\
     __umac96_y[1] += 8*UMAC_BLOCK_SIZE;					\
     __umac96_y[2] += 8*UMAC_BLOCK_SIZE;					\
-    _umac_l2 (ctx->l2_key, ctx->l2_state, 3, ctx->count++,		\
-	      ctx->l1_out, __umac96_y);					\
+    _umac_l2 (ctx->l2_key, ctx->l2_state, 3, ctx->count++, __umac96_y);	\
   } while (0)
 
 void
@@ -98,8 +97,7 @@ umac96_digest (struct umac96_ctx *ctx,
       y[0] += 8 * ctx->index;
       y[1] += 8 * ctx->index;
       y[2] += 8 * ctx->index;
-      _umac_l2 (ctx->l2_key, ctx->l2_state, 3, ctx->count++,
-		ctx->l1_out, y);
+      _umac_l2 (ctx->l2_key, ctx->l2_state, 3, ctx->count++, y);
     }
   assert (ctx->count > 0);
 
@@ -108,7 +106,7 @@ umac96_digest (struct umac96_ctx *ctx,
 
   INCREMENT (ctx->nonce_length, ctx->nonce);
 
-  _umac_l2_final (ctx->l2_key, ctx->l2_state, 3, ctx->count, ctx->l1_out);
+  _umac_l2_final (ctx->l2_key, ctx->l2_state, 3, ctx->count);
   for (i = 0; i < 3; i++)
     tag[i] ^= ctx->l3_key2[i] ^ _umac_l3 (ctx->l3_key1 + 8*i,
 					  ctx->l2_state + 2*i);