(ROL32): Renamed macro (was "rol"). Deleted

x86 version using inline assembly; at least gcc-4.4.5 recognizes shift-and-or expressions which are in fact rotations. (_nettle_ripemd160_compress): Use LE_READ_UINT32. Rev: nettle/ripemd160-compress.c:1.2

(ROL32): Renamed macro (was "rol"). Deleted
1bc61893 · Niels Möller · 4b10881d · 1bc61893
Commit 1bc61893 authored 13 years ago by Niels Möller
--- a/ripemd160-compress.c
+++ b/ripemd160-compress.c
@@ -21,21 +21,14 @@
 #include "ripemd160.h"
+#include "macros.h"
 /****************
- * Rotate the 32 bit unsigned integer X by N bits left/right
+ * Rotate the 32 bit unsigned integer X by N bits left
 */
-#if defined(__GNUC__) && defined(__i386__)
-static inline uint32_t
+#define ROL32(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) )
-rol(uint32_t x, int n)
-{
-  __asm__("roll %%cl,%0"
-    :"=r" (x)
-    :"0" (x),"c" (n));
-  return x;
-}
-#else
-#define rol(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) )
-#endif
 /****************
 * Transform the message X which consists of 16 32-bit-words
@@ -45,29 +38,18 @@ _nettle_ripemd160_compress(uint32_t *state, const uint8_t *data)
 {
  register uint32_t a,b,c,d,e;
  uint32_t aa,bb,cc,dd,ee,t;
-#ifdef WORDS_BIGENDIAN
  uint32_t x[16];
+#ifdef WORDS_BIGENDIAN
  {
    int i;
-    uint8_t *p2, *p1;
+    for (i=0; i < 16; i++, data += 4 )
-    for (i=0, p1=data, p2=(uint8_t*)x; i < 16; i++, p2 += 4 )
+      x[i] = LE_READ_UINT32(data);
-    {
-      p2[3] = *p1++;
-      p2[2] = *p1++;
-      p2[1] = *p1++;
-      p2[0] = *p1++;
-    }
  }
 #else
-  /* This version is better because it is always aligned;
+  /* memcpy seems a bit faster. Benchmarked on Intel SU4100, it makes
-   * The performance penalty on a 586-100 is about 6% which
+     the entire update function roughly 6% faster. */
-   * is acceptable - because the data is more local it might
+  memcpy(x, data, sizeof(x));
-   * also be possible that this is faster on some machines.
-   * This function (when compiled with -02 on gcc 2.7.2)
-   * executes on a 586-100 (39.73 bogomips) at about 1900kb/sec;
-   * [measured with a 4MB data and "gpgm --print-md rmd160"] */
-  uint32_t x[16];
-  memcpy(x, data, 64);
 #endif
@@ -87,8 +69,8 @@ _nettle_ripemd160_compress(uint32_t *state, const uint8_t *data)
 #define F3(x,y,z)   ( ((x) & (z)) | ((y) & ~(z)) )
 #define F4(x,y,z)   ( (x) ^ ((y) | ~(z)) )
 #define R(a,b,c,d,e,f,k,r,s) do { t = a + f(b,c,d) + k + x[r]; \
-          a = rol(t,s) + e;        \
+          a = ROL32(t,s) + e;        \
-          c = rol(c,10);         \
+          c = ROL32(c,10);         \
        } while(0)
  /* left lane */