diff --git a/ripemd160-compress.c b/ripemd160-compress.c index 400922a61cbc702b09fedbe87bb8b1b2d3fe3888..dbc01a984b1f98fe1708f22be3ee2e13078abff7 100644 --- a/ripemd160-compress.c +++ b/ripemd160-compress.c @@ -21,21 +21,14 @@ #include "ripemd160.h" +#include "macros.h" + /**************** - * Rotate the 32 bit unsigned integer X by N bits left/right + * Rotate the 32 bit unsigned integer X by N bits left */ -#if defined(__GNUC__) && defined(__i386__) -static inline uint32_t -rol(uint32_t x, int n) -{ - __asm__("roll %%cl,%0" - :"=r" (x) - :"0" (x),"c" (n)); - return x; -} -#else -#define rol(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) ) -#endif + +#define ROL32(x,n) ( ((x) << (n)) | ((x) >> (32-(n))) ) + /**************** * Transform the message X which consists of 16 32-bit-words @@ -45,29 +38,18 @@ _nettle_ripemd160_compress(uint32_t *state, const uint8_t *data) { register uint32_t a,b,c,d,e; uint32_t aa,bb,cc,dd,ee,t; -#ifdef WORDS_BIGENDIAN uint32_t x[16]; + +#ifdef WORDS_BIGENDIAN { int i; - uint8_t *p2, *p1; - for (i=0, p1=data, p2=(uint8_t*)x; i < 16; i++, p2 += 4 ) - { - p2[3] = *p1++; - p2[2] = *p1++; - p2[1] = *p1++; - p2[0] = *p1++; - } + for (i=0; i < 16; i++, data += 4 ) + x[i] = LE_READ_UINT32(data); } #else - /* This version is better because it is always aligned; - * The performance penalty on a 586-100 is about 6% which - * is acceptable - because the data is more local it might - * also be possible that this is faster on some machines. - * This function (when compiled with -02 on gcc 2.7.2) - * executes on a 586-100 (39.73 bogomips) at about 1900kb/sec; - * [measured with a 4MB data and "gpgm --print-md rmd160"] */ - uint32_t x[16]; - memcpy(x, data, 64); + /* memcpy seems a bit faster. Benchmarked on Intel SU4100, it makes + the entire update function roughly 6% faster. */ + memcpy(x, data, sizeof(x)); #endif @@ -87,8 +69,8 @@ _nettle_ripemd160_compress(uint32_t *state, const uint8_t *data) #define F3(x,y,z) ( ((x) & (z)) | ((y) & ~(z)) ) #define F4(x,y,z) ( (x) ^ ((y) | ~(z)) ) #define R(a,b,c,d,e,f,k,r,s) do { t = a + f(b,c,d) + k + x[r]; \ - a = rol(t,s) + e; \ - c = rol(c,10); \ + a = ROL32(t,s) + e; \ + c = ROL32(c,10); \ } while(0) /* left lane */