diff --git a/aes-encrypt-internal.c b/aes-encrypt-internal.c
new file mode 100644
index 0000000000000000000000000000000000000000..d55f41e889c7abfd01b61292df7ea5f8af6f63c7
--- /dev/null
+++ b/aes-encrypt-internal.c
@@ -0,0 +1,103 @@
+/* aes-encrypt-internal.c
+ *
+ * Encryption function for the aes/rijndael block cipher.
+ */
+
+/* nettle, low-level cryptographics library
+ *
+ * Copyright (C) 2002 Niels Möller
+ *
+ * The nettle library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
+ * option) any later version.
+ *
+ * The nettle library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with the nettle library; see the file COPYING.LIB. If not, write to
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ */
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+
+#include "aes-internal.h"
+#include "macros.h"
+
+void
+_nettle_aes_encrypt(const struct aes_ctx *ctx,
+                    const struct aes_table *T,
+                    unsigned length, uint8_t *dst,
+                    const uint8_t *src)
+{
+  FOR_BLOCKS(length, dst, src, AES_BLOCK_SIZE)
+    {
+      uint32_t w0, w1, w2, w3; /* working ciphertext */
+      uint32_t t0, t1, t2, t3;
+      unsigned round;
+
+      /* Get clear text, using little-endian byte order.
+       * Also XOR with the first subkey. */
+
+      w0 = LE_READ_UINT32(src) ^ ctx->keys[0];
+      w1 = LE_READ_UINT32(src + 4) ^ ctx->keys[1];
+      w2 = LE_READ_UINT32(src + 8) ^ ctx->keys[2];
+      w3 = LE_READ_UINT32(src + 12) ^ ctx->keys[3];
+
+      for (round = 1; round < ctx->nrounds; round++)
+        {
+          t0 = AES_ROUND(T, w0, w1, w2, w3, ctx->keys[4*round]);
+          t1 = AES_ROUND(T, w1, w2, w3, w0, ctx->keys[4*round + 1]);
+          t2 = AES_ROUND(T, w2, w3, w0, w1, ctx->keys[4*round + 2]);
+          t3 = AES_ROUND(T, w3, w0, w1, w2, ctx->keys[4*round + 3]);
+
+          /* FIXME: We could unroll the loop twice, to avoid these
+             assignments. If all eight variables fit in registers, that
+             should give a speedup. */
+          w0 = t0;
+          w1 = t1;
+          w2 = t2;
+          w3 = t3;
+        }
+
+      /* Final round */
+
+      t0 = AES_FINAL_ROUND(T, w0, w1, w2, w3, ctx->keys[4*round]);
+      t1 = AES_FINAL_ROUND(T, w1, w2, w3, w0, ctx->keys[4*round + 1]);
+      t2 = AES_FINAL_ROUND(T, w2, w3, w0, w1, ctx->keys[4*round + 2]);
+      t3 = AES_FINAL_ROUND(T, w3, w0, w1, w2, ctx->keys[4*round + 3]);
+
+      LE_WRITE_UINT32(dst, t0);
+      LE_WRITE_UINT32(dst + 4, t1);
+      LE_WRITE_UINT32(dst + 8, t2);
+      LE_WRITE_UINT32(dst + 12, t3);
+    }
+}
+
+/* Some stats (on a P4 2.2 GHz), all for AES 128:
+
+   A. Table-driven indexing (the approach of the old unified
+      _aes_crypt function).
+   B. Unrolling the j-loop.
+
+   C. Eliminated the use of IDXk(j) in the main loop.
+
+   D. Put wtxt in four scalar variables.
+
+   E. Also put t in four scalar variables.
+
+        MB/s  code size
+   A    35.9  0x202
+   B    37.3  0x334
+   C    33.0  0x2a7
+   D    40.7  0x3f9
+   E    42.9  0x44a
+ */
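
Notes (not part of the patch):

The patch pulls AES_ROUND and AES_FINAL_ROUND from aes-internal.h and
macros.h, which this diff does not show. As a rough sketch only (the
byte-extraction shifts and the table layout here are assumptions, not
nettle's actual definition), a little-endian T-table round of the shape
used above could be written as:

  /* Sketch: each table[i] is a 256-entry uint32_t table folding
     SubBytes, ShiftRows and MixColumns into one lookup. Index table i
     with byte i of its input word, XOR the four results together, then
     XOR in the round subkey k. */
  #define AES_ROUND(T, w0, w1, w2, w3, k)         \
    ((T)->table[0][ (w0)         & 0xff ]         \
     ^ (T)->table[1][ ((w1) >> 8)  & 0xff ]       \
     ^ (T)->table[2][ ((w2) >> 16) & 0xff ]       \
     ^ (T)->table[3][ ((w3) >> 24) & 0xff ]       \
     ^ (k))

A separate AES_FINAL_ROUND is needed because MixColumns is skipped in
the last round, so it must use plain S-box lookups rather than the
combined tables.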
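Callers never invoke _nettle_aes_encrypt directly; it is the worker
behind the public aes_encrypt entry point, which supplies the context
and the encryption tables. Assuming the public aes.h interface of this
era (aes_set_encrypt_key and aes_encrypt; check the installed headers),
a minimal AES-128 caller would look like:

  #include <nettle/aes.h>

  int
  main(void)
  {
    /* AES-128: a 16-octet key and a single 16-octet block. */
    const uint8_t key[16] =
      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
    const uint8_t clear[AES_BLOCK_SIZE] = "a single block!";
    uint8_t cipher[AES_BLOCK_SIZE];
    struct aes_ctx ctx;

    aes_set_encrypt_key(&ctx, sizeof(key), key);

    /* The length argument must be a multiple of AES_BLOCK_SIZE;
       FOR_BLOCKS in the patch walks the buffer one block at a time. */
    aes_encrypt(&ctx, AES_BLOCK_SIZE, cipher, clear);

    return 0;
  }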