diff --git a/ChangeLog b/ChangeLog
index b49e1c42af880e575c48c151b27bf082aa5a0aec..5d6bf41cad93737677076feec62de47145e7c1f6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2015-01-23  Niels Möller  <nisse@lysator.liu.se>
+
+	* fat-setup.h (DEFINE_FAT_FUNC): Check value of function pointer,
+	before calling fat_init. Should be correct even without memory
+	barrier.
+	* fat-x86_64.c (fat_init): Deleted static variable initialized.
+	The checks of the relevant pointer in DEFINE_FAT_FUNC is more
+	robust.
+	* fat-arm.c (fat_init): Likewise.
+
 2015-01-21  Niels Möller  <nisse@lysator.liu.se>
 
 	* fat-arm.c (fat_init): Setup for use of neon assembly functions.
diff --git a/fat-arm.c b/fat-arm.c
index e351fdcba8f336e7fc213353d37801f2c3f2025e..6d6accfafc53791b341477a105cee29761b68b45 100644
--- a/fat-arm.c
+++ b/fat-arm.c
@@ -164,13 +164,9 @@ DECLARE_FAT_FUNC_VAR(umac_nh_n, umac_nh_n_func, neon);
 static void CONSTRUCTOR
 fat_init (void)
 {
-  static volatile int initialized = 0;
   struct arm_features features;
   int verbose;
 
-  if (initialized)
-    return;
-
   get_arm_features (&features);
 
   verbose = getenv (ENV_VERBOSE) != NULL;
@@ -213,8 +209,6 @@ fat_init (void)
       _nettle_umac_nh_vec = _nettle_umac_nh_c;
       _nettle_umac_nh_n_vec = _nettle_umac_nh_n_c;
     }
-  /* FIXME: Needs memory barrier, to enforce store ordering. */
-  initialized = 1;
 }
   
 DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
diff --git a/fat-setup.h b/fat-setup.h
index ea78a188b69f836c675aa10c37b0b495b7733e88..7c0bbf6654281f9d640c284ba528717c5a0e3d13 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -30,10 +30,11 @@
 */
 
 /* Fat library initialization works as follows. The main function is
-   fat_init. It tries to do initialization only once, but since it is
-   idempotent and pointer updates are atomic on x86_64, there's no
-   harm if it is in some cases called multiple times from several
-   threads.
+   fat_init. We try to do initialization only once, but since it is
+   idempotent, there's no harm if it is in some cases called multiple
+   times from several threads. For correctness, we rely on atomic
+   writes, but not on memory barriers or any other synchronization
+   mechanism.
 
    The fat_init function checks the cpuid flags, and sets function
    pointers, e.g, _nettle_aes_encrypt_vec, to point to the appropriate
@@ -47,18 +48,24 @@
 
    For the actual indirection, there are two cases. 
 
-   If ifunc support is available, function pointers are statically
-   initialized to NULL, and we register resolver functions, e.g.,
-   _nettle_aes_encrypt_resolve, which call fat_init, and then return
-   the function pointer, e.g., the value of _nettle_aes_encrypt_vec.
-
-   If ifunc is not available, we have to define a wrapper function to
-   jump via the function pointer. (FIXME: For internal calls, we could
-   do this as a macro). We statically initialize each function pointer
-   to point to a special initialization function, e.g.,
-   _nettle_aes_encrypt_init, which calls fat_init, and then invokes
-   the right function. This way, all pointers are setup correctly at
-   the first call to any fat function.
+   * If ifunc support is available, function pointers are statically
+     initialized to NULL, and we register resolver functions, e.g.,
+     _nettle_aes_encrypt_resolve, which call fat_init, and then return
+     the function pointer, e.g., the value of _nettle_aes_encrypt_vec.
+
+   * If ifunc is not available, we have to define a wrapper function
+     to jump via the function pointer. (FIXME: For internal calls, we
+     could do this as a macro).
+
+     We statically initialize each function pointer to point to a
+     special initialization function, e.g., _nettle_aes_encrypt_init,
+     which calls fat_init, and then invokes the right function. This
+     way, all pointers are setup correctly at the first call to any
+     fat function.
+
+     And atomic writes are required for correctness in the case that
+     several threads do "first call to any fat function" at the same
+     time.
 */
 
 #if HAVE_GCC_ATTRIBUTE
@@ -107,7 +114,8 @@
   {								  \
     if (getenv (ENV_VERBOSE))					  \
       fprintf (stderr, "libnettle: "#name"_resolve\n");		  \
-    fat_init();							  \
+    if (!name##_vec)						  \
+      fat_init();						  \
     return (void_func *) name##_vec;				  \
   }
 
@@ -125,7 +133,8 @@
   static rtype name##_init prototype {				\
     if (getenv (ENV_VERBOSE))					\
       fprintf (stderr, "libnettle: "#name"_init\n");		\
-    fat_init();							\
+    if (name##_vec == name##_init)				\
+      fat_init();						\
     assert (name##_vec != name##_init);				\
     return name##_vec args;					\
   }
diff --git a/fat-x86_64.c b/fat-x86_64.c
index 20be1c57f6a944c0baa7e0aaafde40c487681f38..2e97d1e5d185d00a2bc9cff93239164a7ce7b9f4 100644
--- a/fat-x86_64.c
+++ b/fat-x86_64.c
@@ -122,11 +122,8 @@ DECLARE_FAT_FUNC_VAR(memxor, memxor_func, sse2)
 static void CONSTRUCTOR
 fat_init (void)
 {
-  static volatile int initialized = 0;
   struct x86_features features;
   int verbose;
-  if (initialized)
-    return;
 
   /* FIXME: Replace all getenv calls by getenv_secure? */
   verbose = getenv (ENV_VERBOSE) != NULL;
@@ -169,11 +166,6 @@ fat_init (void)
 	fprintf (stderr, "libnettle: intel SSE2 will not be used for memxor.\n");
       nettle_memxor_vec = _nettle_memxor_x86_64;
     }
-
-  /* The x86_64 architecture should always make stores visible in the
-     right order to other processors (except for non-temporal stores
-     and the like). So we don't need any memory barrier. */
-  initialized = 1;
 }
 
 DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,