diff --git a/ChangeLog b/ChangeLog
index ae510f649047ff526dea757ddb086075841559d1..a7f1653b37e5b31bd80a85ed2d53bdebba37341b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-11-13  Niels Möller  <nisse@lysator.liu.se>
+
+	* examples/nettle-benchmark.c (TIME_CYCLES): New macro.
+	(bench_sha1_compress, bench_salsa20_core): Use it.
+	(bench_sha3_permute): New function.
+	(main): Call bench_sha3_permute.
+
 2012-11-12  Niels Möller  <nisse@lysator.liu.se>
 
 	* examples/nettle-benchmark.c (main): Benchmark sha3_256.
diff --git a/examples/nettle-benchmark.c b/examples/nettle-benchmark.c
index f3e79cee26f6883f2bfea448f2f03d67209103fd..d3da1bcad5618ed40a67ba9c720d5c7500fab97d 100644
--- a/examples/nettle-benchmark.c
+++ b/examples/nettle-benchmark.c
@@ -50,6 +50,7 @@
 #include "salsa20.h"
 #include "serpent.h"
 #include "sha.h"
+#include "sha3.h"
 #include "twofish.h"
 
 #include "nettle-meta.h"
@@ -550,6 +551,8 @@ time_cipher(const struct nettle_cipher *cipher)
   free(key);
 }
 
+/* Try to get accurate cycle times for assembler functions. */
+#if WITH_CYCLE_COUNTER
 static int
 compare_double(const void *ap, const void *bp)
 {
@@ -563,71 +566,63 @@ compare_double(const void *ap, const void *bp)
     return 0;
 }
 
-/* Try to get accurate cycle times for assembler functions. */
-#if WITH_CYCLE_COUNTER
+#define TIME_CYCLES(t, code) do { /* t = middle of 5 sorted timings, per execution of code */ \
+  double tc_count[5];						\
+  uint32_t tc_start_lo, tc_start_hi, tc_end_lo, tc_end_hi;	\
+  unsigned tc_i, tc_j;						\
+  for (tc_j = 0; tc_j < 5; tc_j++)				\
+    {								\
+      tc_i = 0;							\
+      GET_CYCLE_COUNTER(tc_start_hi, tc_start_lo);		\
+      for (; tc_i < BENCH_ITERATIONS; tc_i++)			\
+	{ code; }						\
+      /* Counter is read again right after the timed loop. */	\
+      GET_CYCLE_COUNTER(tc_end_hi, tc_end_lo);			\
+      /* end -= start on (hi, lo) halves, with borrow into hi. */ \
+      tc_end_hi -= (tc_start_hi + (tc_start_lo > tc_end_lo));	\
+      tc_end_lo -= tc_start_lo;					\
+      /* Cycles for this run as a double: hi * 2^32 + lo. */	\
+      tc_count[tc_j] = ldexp(tc_end_hi, 32) + tc_end_lo;	\
+    }								\
+  qsort(tc_count, 5, sizeof(double), compare_double);		\
+  (t) = tc_count[2] / BENCH_ITERATIONS;				\
+} while (0)
+
 static void
 bench_sha1_compress(void)
 {
   uint32_t state[_SHA1_DIGEST_LENGTH];
-  uint8_t data[BENCH_ITERATIONS * SHA1_DATA_SIZE];
-  uint32_t start_lo, start_hi, end_lo, end_hi;
-
-  double count[5];
-  
-  uint8_t *p;
-  unsigned i, j;
-
-  for (j = 0; j < 5; j++)
-    {
-      i = 0;
-      p = data;
-      GET_CYCLE_COUNTER(start_hi, start_lo);
-      for (; i < BENCH_ITERATIONS; i++, p += SHA1_DATA_SIZE)
-	_nettle_sha1_compress(state, p);
-
-      GET_CYCLE_COUNTER(end_hi, end_lo);
-
-      end_hi -= (start_hi + (start_lo > end_lo));
-      end_lo -= start_lo;
+  uint8_t data[SHA1_DATA_SIZE]; /* same buffer passed on every call; no initializer */
+  double t; /* cycles per _nettle_sha1_compress call, set by TIME_CYCLES */
 
-      count[j] = ldexp(end_hi, 32) + end_lo;
-    }
+  TIME_CYCLES (t, _nettle_sha1_compress(state, data));
 
-  qsort(count, 5, sizeof(double), compare_double);
-  printf("sha1_compress: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);  
+  printf("sha1_compress: %.2f cycles\n", t);  
 }
 
 static void
 bench_salsa20_core(void)
 {
   uint32_t state[_SALSA20_INPUT_LENGTH];
-  uint32_t start_lo, start_hi, end_lo, end_hi;
-
-  double count[5];
-  
-  unsigned i, j;
-
-  for (j = 0; j < 5; j++)
-    {
-      i = 0;
-      GET_CYCLE_COUNTER(start_hi, start_lo);
-      for (; i < BENCH_ITERATIONS; i++)
-	_nettle_salsa20_core(state, state, 20);
-
-      GET_CYCLE_COUNTER(end_hi, end_lo);
+  double t; /* cycles per _nettle_salsa20_core call, set by TIME_CYCLES */
 
-      end_hi -= (start_hi + (start_lo > end_lo));
-      end_lo -= start_lo;
+  TIME_CYCLES (t, _nettle_salsa20_core(state, state, 20)); /* state is both 1st and 2nd argument; 20 is presumably the round count -- confirm */
+  printf("salsa20_core: %.2f cycles\n", t);  
+}
 
-      count[j] = ldexp(end_hi, 32) + end_lo;
-    }
+static void
+bench_sha3_permute(void)
+{
+  struct sha3_state state; /* no initializer; contents are not set here */
+  double t; /* cycles per sha3_permute call, set by TIME_CYCLES */
 
-  qsort(count, 5, sizeof(double), compare_double);
-  printf("salsa20_core: %.2f cycles\n\n", count[2] / BENCH_ITERATIONS);  
+  TIME_CYCLES (t, sha3_permute (&state));
+  printf("sha3_permute: %.2f cycles (%.2f / round)\n", t, t / 24.0); /* 24.0: presumably the permutation's round count -- confirm */
 }
 #else
 #define bench_sha1_compress()
 #define bench_salsa20_core()
+#define bench_sha3_permute()
 #endif
 
 #if WITH_OPENSSL
@@ -719,6 +714,8 @@ main(int argc, char **argv)
 #endif
   bench_sha1_compress();
   bench_salsa20_core();
+  bench_sha3_permute();
+  printf("\n");
   time_overhead();
 
   header();