diff --git a/src/acconfig.h b/src/acconfig.h
index 91d671adecc5d708f7b4d3c87fde50ff3c7e5fd8..4a98e1329817e2755a7735de3ae1c558fb399f9f 100644
--- a/src/acconfig.h
+++ b/src/acconfig.h
@@ -1,5 +1,5 @@
 /*
- * $Id: acconfig.h,v 1.46 1999/08/10 00:18:58 mast Exp $
+ * $Id: acconfig.h,v 1.47 1999/08/11 22:13:18 hubbe Exp $
  */
 #ifndef MACHINE_H
 #define MACHINE_H
@@ -295,6 +295,9 @@
 /* The last argument to accept() is an ACCEPT_SIZE_T * */
 #define ACCEPT_SIZE_T	int
 
+/* Defined by configure when code using <mmx.h> compiles and links. */
+#undef TRY_USE_MMX
+
 @BOTTOM@
 
 /* NT stuff */
diff --git a/src/configure.in b/src/configure.in
index b9749426dbe5c8a7c3ba60b862f87a6fc318a903..09023aca179f774458c75ba3379080bff6ba5ef0 100644
--- a/src/configure.in
+++ b/src/configure.in
@@ -1,4 +1,4 @@
-AC_REVISION("$Id: configure.in,v 1.306 1999/08/10 12:35:19 grubba Exp $")
+AC_REVISION("$Id: configure.in,v 1.307 1999/08/11 22:13:19 hubbe Exp $")
 AC_INIT(interpret.c)
 AC_CONFIG_HEADER(machine.h)
 
@@ -1093,7 +1093,7 @@ thread.h dlfcn.h dld.h dl.h sys/times.h sched.h sys/procfs.h sys/param.h \
 winsock.h sys/ioct.h sys/socket.h malloc.h netinet/in.h sys/wait.h winbase.h \
 grp.h pwd.h passwd.h group.h winsock2.h signal.h sys/file.h poll.h sys/poll.h \
 socket.h ieeefp.h fp_class.h floatingpoint.h  sys/priocntl.h sched.h \
-windows.h errno.h stddef.h)
+windows.h errno.h stddef.h mmx.h)
 
 AC_CHECK_SIZEOF(char *,4)
 AC_CHECK_SIZEOF(long,4)
@@ -2235,6 +2235,28 @@ else
   AC_MSG_RESULT(not by cast)
 fi
 
+#############################################################################
+AC_MSG_CHECKING(Working MMX)
+# Compile and link probe only: nothing is executed on the build machine.
+AC_CACHE_VAL(pike_cv_sys_has_working_mmx,
+[
+AC_TRY_LINK([
+#include <mmx.h>
+],[
+{
+  mmx_t a;
+  mmx_t b;
+  paddw(a,b);
+}
+], pike_cv_sys_has_working_mmx=yes,pike_cv_sys_has_working_mmx=no)
+])
+
+AC_MSG_RESULT($pike_cv_sys_has_working_mmx)
+# TRY_USE_MMX only compiles the MMX code in; main.c asks mmx_ok at startup.
+if test "x$pike_cv_sys_has_working_mmx" = xyes ; then
+  AC_DEFINE(TRY_USE_MMX)
+fi
+
 #############################################################################
 
 AC_MSG_CHECKING(if float conversion can cause SIGFPE)
diff --git a/src/main.c b/src/main.c
index fba091089396707a4dd4d326a495b3c4685c4d71..f5066fdbe9b6530be9c0d572d80ca5c17e8a7079 100644
--- a/src/main.c
+++ b/src/main.c
@@ -5,7 +5,7 @@
 \*/
 /**/
 #include "global.h"
-RCSID("$Id: main.c,v 1.73 1999/06/02 21:21:38 marcus Exp $");
+RCSID("$Id: main.c,v 1.74 1999/08/11 22:13:21 hubbe Exp $");
 #include "fdlib.h"
 #include "backend.h"
 #include "module.h"
@@ -44,6 +44,11 @@ RCSID("$Id: main.c,v 1.73 1999/06/02 21:21:38 marcus Exp $");
 #include <sys/resource.h>
 #endif
 
+#ifdef TRY_USE_MMX
+#include <mmx.h>
+int try_use_mmx;	/* result of mmx_ok(), assigned at the start of dbm_main() */
+#endif
+
 
 char *master_file;
 char **ARGV;
@@ -132,6 +137,10 @@ int dbm_main(int argc, char **argv)
   extern char **environ;
 #endif
 
+#ifdef TRY_USE_MMX
+  try_use_mmx=mmx_ok();
+#endif
+
   ARGV=argv;
 
   fd_init();
diff --git a/src/main.h b/src/main.h
index c9d0dfb9ed34ac53489a63ddbba84832a7c5e0c1..930e94c9cbf2cc454c8102f530629d076f743269 100644
--- a/src/main.h
+++ b/src/main.h
@@ -5,7 +5,7 @@
 \*/
 
 /*
- * $Id: main.h,v 1.9 1998/04/13 14:30:52 grubba Exp $
+ * $Id: main.h,v 1.10 1999/08/11 22:13:22 hubbe Exp $
  */
 #ifndef MAIN_H
 #define MAIN_H
@@ -15,6 +15,10 @@
 extern int d_flag, t_flag, a_flag, l_flag, c_flag, p_flag, debug_options;
 extern int default_t_flag;
 
+#ifdef TRY_USE_MMX
+extern int try_use_mmx;	/* defined in main.c; only exists when TRY_USE_MMX is set */
+#endif
+
 #define DEBUG_SIGNALS 1
 #define NO_TAILRECURSION 2
 
diff --git a/src/modules/Image/layers.c b/src/modules/Image/layers.c
index 5f1e260aecbd3454d8fd71bcae164957e0f24234..93d9364879510ecf51465499afda79416b2a0a6e 100644
--- a/src/modules/Image/layers.c
+++ b/src/modules/Image/layers.c
@@ -1,7 +1,7 @@
 /*
 **! module Image
 **! note
-**!	$Id: layers.c,v 1.33 1999/08/10 12:57:36 mirar Exp $
+**!	$Id: layers.c,v 1.34 1999/08/11 22:13:30 hubbe Exp $
 **! class Layer
 **! see also: layers
 **!
@@ -203,7 +203,7 @@
 
 #include <math.h> /* floor */
 
-RCSID("$Id: layers.c,v 1.33 1999/08/10 12:57:36 mirar Exp $");
+RCSID("$Id: layers.c,v 1.34 1999/08/11 22:13:30 hubbe Exp $");
 
 #include "image_machine.h"
 
@@ -1397,6 +1397,8 @@ static void lm_normal(rgb_group *s,rgb_group *l,rgb_group *d,
 
 /* operators from template */
 
+#if 0
+/* Template expansion of lm_add, disabled; the explicit lm_add after #else is used. */
 #define LM_FUNC lm_add
 #define L_TRUNC(X) MINIMUM(255,(X))
 #define L_OPER(A,B) ((A)+(int)(B))
@@ -1405,6 +1407,152 @@ static void lm_normal(rgb_group *s,rgb_group *l,rgb_group *d,
 #undef L_TRUNC
 #undef L_OPER
 
+#else
+
+#define L_TRUNC(X) MINIMUM(255,(X))
+#define L_OPER(A,B) ((A)+(int)(B))
+
+#ifdef TRY_USE_MMX
+#include <mmx.h>
+#endif
+
+/*
+ * lm_add: "add" layer mode. Each channel becomes MINIMUM(255, s + l).
+ *
+ * s/sa: source colour and alpha rows; l/la: layer colour and alpha rows
+ * (la may be NULL, meaning a fully opaque layer); d/da: destination rows;
+ * len: number of pixels per row; alpha: overall opacity of the layer.
+ */
+static void lm_add(rgb_group *s,rgb_group *l,rgb_group *d,
+		   rgb_group *sa,rgb_group *la,rgb_group *da,
+		   int len,double alpha)
+{
+   if (alpha==0.0)
+   {
+      /* The layer contributes nothing: the result is the source. */
+      MEMCPY(d,s,sizeof(rgb_group)*len);
+      MEMCPY(da,sa,sizeof(rgb_group)*len);
+      return;
+   }
+   else if (alpha==1.0)
+   {
+      if (!la)  /* no layer alpha => full opaque */
+      {
+#ifdef TRY_USE_MMX
+	extern int try_use_mmx;
+	if(try_use_mmx)
+	{
+	  /* Saturating add over the rows as raw bytes (assumes rgb_group
+	   * is three unsigned bytes without padding): scalar until dest
+	   * is 8-byte aligned, 16 bytes per MMX iteration, scalar tail.
+	   * Strangely enough, this doesn't seem to make things any
+	   * faster. /Hubbe
+	   */
+	  int num=sizeof(rgb_group) * len;
+	  int i;
+	  unsigned char *source=(unsigned char *)s;
+	  unsigned char *dest=(unsigned char *)d;
+	  unsigned char *sourcel=(unsigned char *)l;
+
+	  while (num>0 && (7&(unsigned long)dest))
+	  {
+	    *dest=L_TRUNC(L_OPER(*source,*sourcel));
+	    source++; sourcel++; dest++; num--; /* count written bytes only */
+	  }
+	  while (num>=16)
+	  {
+	    movq_m2r(source[0], mm0);
+	    movq_m2r(source[8], mm1);
+	    paddusb_m2r(sourcel[0], mm0);
+	    paddusb_m2r(sourcel[8], mm1);
+	    movq_r2m(mm0, dest[0]);
+	    movq_r2m(mm1, dest[8]);
+	    source+=16; sourcel+=16; dest+=16; num-=16;
+	  }
+	  emms(); /* leave MMX state before any floating point code runs */
+	  for (; num>0; num--)
+	  {
+	    *dest=L_TRUNC(L_OPER(*source,*sourcel));
+	    source++; sourcel++; dest++;
+	  }
+	  /* The result is fully opaque, exactly as in the C loop below. */
+	  for (i=0; i<len; i++)
+	    da[i]=white;
+	}
+	else
+#endif
+	{
+	  while (len--)
+	  {
+	    d->r=L_TRUNC(L_OPER(s->r,l->r));
+	    d->g=L_TRUNC(L_OPER(s->g,l->g));
+	    d->b=L_TRUNC(L_OPER(s->b,l->b));
+	    *da=white;
+	    l++; s++; sa++; da++; d++;
+	  }
+	}
+      }
+      else
+	 while (len--)
+	 {
+	    if (la->r==COLORMAX && la->g==COLORMAX && la->b==COLORMAX)
+	    {
+	       d->r=L_TRUNC(L_OPER(s->r,l->r));
+	       d->g=L_TRUNC(L_OPER(s->g,l->g));
+	       d->b=L_TRUNC(L_OPER(s->b,l->b));
+	       *da=white;
+	    }
+	    else if (la->r==0 && la->g==0 && la->b==0)
+	    {
+	       *d=*s;
+	       *da=*sa;
+	    }
+	    else
+	    {
+	       d->r=L_TRUNC(L_OPER(s->r,l->r));
+	       ALPHA_ADD(s,d,d,sa,la,da,r);
+	       d->g=L_TRUNC(L_OPER(s->g,l->g));
+	       ALPHA_ADD(s,d,d,sa,la,da,g);
+	       d->b=L_TRUNC(L_OPER(s->b,l->b));
+	       ALPHA_ADD(s,d,d,sa,la,da,b);
+	    }
+	    l++; s++; la++; sa++; da++; d++;
+	 }
+   }
+   else
+   {
+      if (!la)  /* no layer alpha => full opaque */
+	 while (len--)
+	 {
+	    d->r=L_TRUNC(L_OPER(s->r,l->r));
+	    ALPHA_ADD_V_NOLA(s,d,d,sa,da,alpha,r);
+	    d->g=L_TRUNC(L_OPER(s->g,l->g));
+	    ALPHA_ADD_V_NOLA(s,d,d,sa,da,alpha,g);
+	    d->b=L_TRUNC(L_OPER(s->b,l->b));
+	    ALPHA_ADD_V_NOLA(s,d,d,sa,da,alpha,b);
+	    l++; s++; sa++; da++; d++;
+	 }
+      else
+	 while (len--)
+	 {
+	    d->r=L_TRUNC(L_OPER(s->r,l->r));
+	    ALPHA_ADD_V(s,d,d,sa,la,da,alpha,r);
+	    d->g=L_TRUNC(L_OPER(s->g,l->g));
+	    ALPHA_ADD_V(s,d,d,sa,la,da,alpha,g);
+	    d->b=L_TRUNC(L_OPER(s->b,l->b));
+	    ALPHA_ADD_V(s,d,d,sa,la,da,alpha,b);
+	    l++; s++; la++; sa++; da++; d++;
+	 }
+   }
+}
+
+#undef L_TRUNC
+#undef L_OPER
+
+#endif
+
+
+
 #define LM_FUNC lm_subtract
 #define L_TRUNC(X) MAXIMUM(0,(X))
 #define L_OPER(A,B) ((A)-(int)(B))
@@ -2872,7 +3020,7 @@ void init_image_layers(void)
    char buf[100];
    char buf2[sizeof(INT32)];
    int i;
-   
+
    for (i=0; i<LAYER_MODES; i++)
       layer_mode[i].ps=make_shared_string(layer_mode[i].name);