Commit 086023df authored by Niels Möller's avatar Niels Möller
Browse files

Introduced 4-bit tables. Gives gmac performance of 45 cycles per
byte (still on intel x86_64).
* gcm.c (gcm_gf_shift): Renamed. Tweaked little-endian masks.
(gcm_rightshift): ... old name.
(gcm_gf_mul): New argument for the output. Added length argument
for one of the inputs (implicitly padding with zeros).
(shift_table): New table (in 4-bit and 8-bit versions), generated
by gcmdata.
(gcm_gf_shift_chunk): New function shifting 4 bits at
a time.
(gcm_gf_mul_chunk): New function processing 4 bits at a time.
(gcm_set_key): Generation of 4-bit key table.
(gcm_hash): Use tables, when available.

Rev: nettle/gcm.c:1.5
parent cca7d2b2
...@@ -49,9 +49,9 @@ ...@@ -49,9 +49,9 @@
/* Multiplication by 010...0; a big-endian shift right. If the bit /* Multiplication by 010...0; a big-endian shift right. If the bit
shifted out is one, the defining polynomial is added to cancel it shifted out is one, the defining polynomial is added to cancel it
out. */ out. The argument must be properly aligned for word accesses. */
static void static void
gcm_rightshift (uint8_t *x) gcm_gf_shift (uint8_t *x)
{ {
unsigned long *w = (unsigned long *) x; unsigned long *w = (unsigned long *) x;
long mask; long mask;
...@@ -74,7 +74,7 @@ gcm_rightshift (uint8_t *x) ...@@ -74,7 +74,7 @@ gcm_rightshift (uint8_t *x)
# if SIZEOF_LONG == 4 # if SIZEOF_LONG == 4
#define RSHIFT_WORD(x) \ #define RSHIFT_WORD(x) \
((((x) & 0xfefefefeUL) >> 1) \ ((((x) & 0xfefefefeUL) >> 1) \
| (((x) & 0x01010101) << 15)) | (((x) & 0x00010101) << 15))
mask = - ((w[3] >> 24) & 1); mask = - ((w[3] >> 24) & 1);
w[3] = RSHIFT_WORD(w[3]) | ((w[2] >> 17) & 0x80); w[3] = RSHIFT_WORD(w[3]) | ((w[2] >> 17) & 0x80);
w[2] = RSHIFT_WORD(w[2]) | ((w[1] >> 17) & 0x80); w[2] = RSHIFT_WORD(w[2]) | ((w[1] >> 17) & 0x80);
...@@ -83,20 +83,22 @@ gcm_rightshift (uint8_t *x) ...@@ -83,20 +83,22 @@ gcm_rightshift (uint8_t *x)
# elif SIZEOF_LONG == 8 # elif SIZEOF_LONG == 8
#define RSHIFT_WORD(x) \ #define RSHIFT_WORD(x) \
((((x) & 0xfefefefefefefefeUL) >> 1) \ ((((x) & 0xfefefefefefefefeUL) >> 1) \
| (((x) & 0x0101010101010101UL) << 15)) | (((x) & 0x0001010101010101UL) << 15))
mask = - ((w[1] >> 56) & 1); mask = - ((w[1] >> 56) & 1);
w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 49) & 0x80); w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 49) & 0x80);
w[0] = RSHIFT_WORD(w[0]) ^ (mask & GHASH_POLYNOMIAL); w[0] = RSHIFT_WORD(w[0]) ^ (mask & GHASH_POLYNOMIAL);
# else # else
# error Unsupported word size. */ # error Unsupported word size. */
# endif # endif
# undef RSHIFT_WORD
#endif /* ! WORDS_BIGENDIAN */ #endif /* ! WORDS_BIGENDIAN */
} }
/* Sets a <- a * b mod r, using the plain bitwise algorithm from the /* Sets r <- x * y mod r, using the plain bitwise algorithm from the
specification. */ specification. y may be shorter than a full block, missing bytes
are assumed zero. */
static void static void
gcm_gf_mul (uint8_t *x, const uint8_t *y) gcm_gf_mul (uint8_t *r, const uint8_t *x, unsigned yn, const uint8_t *y)
{ {
uint8_t V[GCM_BLOCK_SIZE]; uint8_t V[GCM_BLOCK_SIZE];
uint8_t Z[GCM_BLOCK_SIZE]; uint8_t Z[GCM_BLOCK_SIZE];
...@@ -105,7 +107,7 @@ gcm_gf_mul (uint8_t *x, const uint8_t *y) ...@@ -105,7 +107,7 @@ gcm_gf_mul (uint8_t *x, const uint8_t *y)
memcpy(V, x, sizeof(V)); memcpy(V, x, sizeof(V));
memset(Z, 0, sizeof(Z)); memset(Z, 0, sizeof(Z));
for (i = 0; i < GCM_BLOCK_SIZE; i++) for (i = 0; i < yn; i++)
{ {
uint8_t b = y[i]; uint8_t b = y[i];
unsigned j; unsigned j;
...@@ -114,11 +116,136 @@ gcm_gf_mul (uint8_t *x, const uint8_t *y) ...@@ -114,11 +116,136 @@ gcm_gf_mul (uint8_t *x, const uint8_t *y)
if (b & 0x80) if (b & 0x80)
memxor(Z, V, sizeof(V)); memxor(Z, V, sizeof(V));
gcm_rightshift(V); gcm_gf_shift(V);
} }
} }
memcpy (r, Z, sizeof(Z));
}
#if GCM_TABLE_BITS
/* W(left,right) pastes two hex digit pairs into one 16-bit constant for
   the shift_table entries. On big-endian hosts the result is
   0x<left><right>; on little-endian hosts the pairs are swapped to
   0x<right><left>, so that `left` ends up in the low-order byte of the
   value. The macro is undefined again after the tables. */
# if WORDS_BIGENDIAN
# define W(left,right) (0x##left##right)
# else
# define W(left,right) (0x##right##left)
# endif
#if GCM_TABLE_BITS == 4
/* Reduction constants for a 4-bit shift (per the commit log, generated
   by gcmdata). gcm_gf_shift_chunk indexes this table with the four bits
   that fall off the end of the block and XORs the 16-bit entry into the
   first word of the block. Entries are written with W() so the byte
   order matches the host. */
static const uint16_t
shift_table[0x10] = {
W(00,00),W(1c,20),W(38,40),W(24,60),W(70,80),W(6c,a0),W(48,c0),W(54,e0),
W(e1,00),W(fd,20),W(d9,40),W(c5,60),W(91,80),W(8d,a0),W(a9,c0),W(b5,e0),
};
#elif GCM_TABLE_BITS == 8
/* 256-entry variant of the reduction table, selected when
   GCM_TABLE_BITS == 8 (per the commit log, generated by gcmdata).
   NOTE(review): presumably indexed by the whole byte shifted out by an
   8-bit shift; no user of this variant is visible in this section, so
   confirm against the code that consumes it. */
static const uint16_t
shift_table[0x100] = {
W(00,00),W(01,c2),W(03,84),W(02,46),W(07,08),W(06,ca),W(04,8c),W(05,4e),
W(0e,10),W(0f,d2),W(0d,94),W(0c,56),W(09,18),W(08,da),W(0a,9c),W(0b,5e),
W(1c,20),W(1d,e2),W(1f,a4),W(1e,66),W(1b,28),W(1a,ea),W(18,ac),W(19,6e),
W(12,30),W(13,f2),W(11,b4),W(10,76),W(15,38),W(14,fa),W(16,bc),W(17,7e),
W(38,40),W(39,82),W(3b,c4),W(3a,06),W(3f,48),W(3e,8a),W(3c,cc),W(3d,0e),
W(36,50),W(37,92),W(35,d4),W(34,16),W(31,58),W(30,9a),W(32,dc),W(33,1e),
W(24,60),W(25,a2),W(27,e4),W(26,26),W(23,68),W(22,aa),W(20,ec),W(21,2e),
W(2a,70),W(2b,b2),W(29,f4),W(28,36),W(2d,78),W(2c,ba),W(2e,fc),W(2f,3e),
W(70,80),W(71,42),W(73,04),W(72,c6),W(77,88),W(76,4a),W(74,0c),W(75,ce),
W(7e,90),W(7f,52),W(7d,14),W(7c,d6),W(79,98),W(78,5a),W(7a,1c),W(7b,de),
W(6c,a0),W(6d,62),W(6f,24),W(6e,e6),W(6b,a8),W(6a,6a),W(68,2c),W(69,ee),
W(62,b0),W(63,72),W(61,34),W(60,f6),W(65,b8),W(64,7a),W(66,3c),W(67,fe),
W(48,c0),W(49,02),W(4b,44),W(4a,86),W(4f,c8),W(4e,0a),W(4c,4c),W(4d,8e),
W(46,d0),W(47,12),W(45,54),W(44,96),W(41,d8),W(40,1a),W(42,5c),W(43,9e),
W(54,e0),W(55,22),W(57,64),W(56,a6),W(53,e8),W(52,2a),W(50,6c),W(51,ae),
W(5a,f0),W(5b,32),W(59,74),W(58,b6),W(5d,f8),W(5c,3a),W(5e,7c),W(5f,be),
W(e1,00),W(e0,c2),W(e2,84),W(e3,46),W(e6,08),W(e7,ca),W(e5,8c),W(e4,4e),
W(ef,10),W(ee,d2),W(ec,94),W(ed,56),W(e8,18),W(e9,da),W(eb,9c),W(ea,5e),
W(fd,20),W(fc,e2),W(fe,a4),W(ff,66),W(fa,28),W(fb,ea),W(f9,ac),W(f8,6e),
W(f3,30),W(f2,f2),W(f0,b4),W(f1,76),W(f4,38),W(f5,fa),W(f7,bc),W(f6,7e),
W(d9,40),W(d8,82),W(da,c4),W(db,06),W(de,48),W(df,8a),W(dd,cc),W(dc,0e),
W(d7,50),W(d6,92),W(d4,d4),W(d5,16),W(d0,58),W(d1,9a),W(d3,dc),W(d2,1e),
W(c5,60),W(c4,a2),W(c6,e4),W(c7,26),W(c2,68),W(c3,aa),W(c1,ec),W(c0,2e),
W(cb,70),W(ca,b2),W(c8,f4),W(c9,36),W(cc,78),W(cd,ba),W(cf,fc),W(ce,3e),
W(91,80),W(90,42),W(92,04),W(93,c6),W(96,88),W(97,4a),W(95,0c),W(94,ce),
W(9f,90),W(9e,52),W(9c,14),W(9d,d6),W(98,98),W(99,5a),W(9b,1c),W(9a,de),
W(8d,a0),W(8c,62),W(8e,24),W(8f,e6),W(8a,a8),W(8b,6a),W(89,2c),W(88,ee),
W(83,b0),W(82,72),W(80,34),W(81,f6),W(84,b8),W(85,7a),W(87,3c),W(86,fe),
W(a9,c0),W(a8,02),W(aa,44),W(ab,86),W(ae,c8),W(af,0a),W(ad,4c),W(ac,8e),
W(a7,d0),W(a6,12),W(a4,54),W(a5,96),W(a0,d8),W(a1,1a),W(a3,5c),W(a2,9e),
W(b5,e0),W(b4,22),W(b6,64),W(b7,a6),W(b2,e8),W(b3,2a),W(b1,6c),W(b0,ae),
W(bb,f0),W(ba,32),W(b8,74),W(b9,b6),W(bc,f8),W(bd,3a),W(bf,7c),W(be,be),
};
#else
#error Unsupported table size
#endif
#undef W
/* Like gcm_gf_shift, but shifts GCM_TABLE_BITS steps. */
#if GCM_TABLE_BITS == 4
/* Shifts the 16-byte block x right by four bits, treating the block as
   one big-endian bit string, and folds the four bits that fall off the
   end back in by XORing the matching shift_table entry into the first
   two bytes of the block.

   x is accessed as an array of unsigned long, so it must be properly
   aligned for word accesses. */
static void
gcm_gf_shift_chunk(uint8_t *x)
{
  unsigned long *w = (unsigned long *) x;
  unsigned long reduce;

  /* Shift uses big-endian representation. */
#if WORDS_BIGENDIAN
# if SIZEOF_LONG == 4
  /* The bits shifted out are the low nibble of the last word. */
  reduce = shift_table[w[3] & 0xf];
  w[3] = (w[3] >> 4) | ((w[2] & 0xf) << 28);
  w[2] = (w[2] >> 4) | ((w[1] & 0xf) << 28);
  w[1] = (w[1] >> 4) | ((w[0] & 0xf) << 28);
  w[0] = (w[0] >> 4) ^ (reduce << 16);
# elif SIZEOF_LONG == 8
  /* Only w[0] and w[1] exist with 8-byte words; the last word is w[1]. */
  reduce = shift_table[w[1] & 0xf];
  /* The nibble carried out of w[0] becomes the top four bits of w[1],
     i.e. bits 60..63. */
  w[1] = (w[1] >> 4) | ((w[0] & 0xf) << 60);
  w[0] = (w[0] >> 4) ^ (reduce << 48);
# else
#  error Unsupported word size. */
# endif
#else /* ! WORDS_BIGENDIAN */
  /* On little-endian hosts the bytes inside each word are in reverse
     order of significance. RSHIFT_WORD moves every high nibble down
     within its byte and every low nibble (except that of the last byte
     in memory) up into the high nibble of the following byte. */
# if SIZEOF_LONG == 4
#define RSHIFT_WORD(x) \
  ((((x) & 0xf0f0f0f0UL) >> 4) \
   | (((x) & 0x000f0f0f) << 12))
  /* Low nibble of the last byte of the block. */
  reduce = shift_table[(w[3] >> 24) & 0xf];
  /* Each carry is the low nibble of the previous word's last byte,
     placed in the high nibble of this word's first byte. */
  w[3] = RSHIFT_WORD(w[3]) | ((w[2] >> 20) & 0xf0);
  w[2] = RSHIFT_WORD(w[2]) | ((w[1] >> 20) & 0xf0);
  w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 20) & 0xf0);
  /* Table entries are already byte-swapped for little-endian hosts. */
  w[0] = RSHIFT_WORD(w[0]) ^ reduce;
# elif SIZEOF_LONG == 8
#define RSHIFT_WORD(x) \
  ((((x) & 0xf0f0f0f0f0f0f0f0UL) >> 4) \
   | (((x) & 0x000f0f0f0f0f0f0fUL) << 12))
  reduce = shift_table[(w[1] >> 56) & 0xf];
  w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 52) & 0xf0);
  w[0] = RSHIFT_WORD(w[0]) ^ reduce;
# else
#  error Unsupported word size. */
# endif
# undef RSHIFT_WORD
#endif /* ! WORDS_BIGENDIAN */
}
/* FIXME: Table should be const. */
/* Sets x <- x * H, four bits at a time. table[i] must hold the product
   of H with the field element whose first byte is i << 4 and whose
   remaining bytes are zero (as built by gcm_set_key).

   The bytes of x are consumed from last to first. For every nibble the
   accumulator is shifted four bit positions with gcm_gf_shift_chunk and
   the table row selected by that nibble is XORed in.

   h is not used by this implementation; it is kept so the call sites
   need not change. */
static void
gcm_gf_mul_chunk (uint8_t *x, const uint8_t *h, uint8_t table[16][16])
{
  /* gcm_gf_shift_chunk accesses its argument as unsigned long words, so
     the accumulator is placed in a union to guarantee word alignment. */
  union
  {
    uint8_t b[GCM_BLOCK_SIZE];
    unsigned long w;
  } Z;
  unsigned i;

  (void) h;

  memset(Z.b, 0, sizeof(Z.b));

  for (i = GCM_BLOCK_SIZE; i-- > 0;)
    {
      uint8_t b = x[i];

      /* Low nibble first, then the high nibble of the same byte. */
      gcm_gf_shift_chunk(Z.b);
      memxor(Z.b, table[b & 0xf], GCM_BLOCK_SIZE);
      gcm_gf_shift_chunk(Z.b);
      memxor(Z.b, table[b >> 4], GCM_BLOCK_SIZE);
    }
  memcpy (x, Z.b, sizeof(Z.b));
}
#endif /* GCM_TABLE_BITS == 4 */
#endif /* GCM_TABLE_BITS */
/* Increment the rightmost 32 bits. */ /* Increment the rightmost 32 bits. */
#define INC32(block) INCREMENT(4, (block) + GCM_BLOCK_SIZE - 4) #define INC32(block) INCREMENT(4, (block) + GCM_BLOCK_SIZE - 4)
...@@ -135,9 +262,20 @@ gcm_set_key(struct gcm_ctx *ctx, ...@@ -135,9 +262,20 @@ gcm_set_key(struct gcm_ctx *ctx,
memset (ctx->h, 0, sizeof (ctx->h)); memset (ctx->h, 0, sizeof (ctx->h));
f (cipher, GCM_BLOCK_SIZE, ctx->h, ctx->h); /* H */ f (cipher, GCM_BLOCK_SIZE, ctx->h, ctx->h); /* H */
#if GCM_TABLE_BITS #if GCM_TABLE_BITS
/* FIXME: Expand hash subkey */ #if GCM_TABLE_BITS == 4
{
unsigned i;
for (i = 0; i < 16; i++)
{
uint8_t x;
x = i << 4;
gcm_gf_mul(ctx->h_table[i], ctx->h, 1, &x);
}
}
#else
abort(); abort();
#endif #endif
#endif /* GCM_TABLE_BITS */
} }
/* /*
...@@ -171,12 +309,20 @@ gcm_hash(struct gcm_ctx *ctx, unsigned length, const uint8_t *data) ...@@ -171,12 +309,20 @@ gcm_hash(struct gcm_ctx *ctx, unsigned length, const uint8_t *data)
length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE) length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
{ {
memxor (ctx->x, data, GCM_BLOCK_SIZE); memxor (ctx->x, data, GCM_BLOCK_SIZE);
gcm_gf_mul (ctx->x, ctx->h); #if GCM_TABLE_BITS
gcm_gf_mul_chunk (ctx->x, ctx->h, ctx->h_table);
#else
gcm_gf_mul (ctx->x, ctx->x, GCM_BLOCK_SIZE, ctx->h);
#endif
} }
if (length > 0) if (length > 0)
{ {
memxor (ctx->x, data, length); memxor (ctx->x, data, length);
gcm_gf_mul (ctx->x, ctx->h); #if GCM_TABLE_BITS
gcm_gf_mul_chunk (ctx->x, ctx->h, ctx->h_table);
#else
gcm_gf_mul (ctx->x, ctx->x, GCM_BLOCK_SIZE, ctx->h);
#endif
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment