Commit 36b4cf8d authored by Niels Möller
Browse files

* src/parse.c (parse_utf8): Keep track of the length of invalid

sequences. Moved the check for overlong sequences; now it's done
after the complete sequence is processed.

* src/charset.c (low_utf8_to_local): If utf8_tolerant is set,
replace invalid utf8 sequences with '?'. Used mainly by the
testsuite.

* src/charset.h (enum utf8_flag): New flag utf8_tolerant.

Rev: src/charset.c:1.13
Rev: src/charset.h:1.9
Rev: src/parse.c:1.43
parent d5586f14
......@@ -220,11 +220,16 @@ low_utf8_to_local(uint32_t length, const uint8_t *s, enum utf8_flag flags)
break;
}
case 0: /* Error */
fail:
lsh_string_free(res);
return NULL;
if (flags & utf8_tolerant)
lsh_string_putc(res, i++, '?');
else
{
fail:
lsh_string_free(res);
return NULL;
}
break;
default:
fatal("Internal error!\n");
......
......@@ -48,6 +48,9 @@ enum utf8_flag
/* If set, control characters are treated as never existing in any
local character set */
utf8_paranoid = 2,
/* If set, also invalid utf8 sequences are replaced by '?' */
utf8_tolerant = 4
};
struct lsh_string *local_to_utf8(struct lsh_string *s, int free);
......
......@@ -134,7 +134,11 @@ parse_utf8(struct simple_buffer *buffer, uint32_t *result, unsigned *utf8_length
static const uint32_t min_value[7] =
{
0, 0,
0x80, 0x20, 0x10, 0x08, 0x04,
0x80,
0x800,
0x10000,
0x200000,
0x4000000,
};
uint32_t c;
......@@ -158,6 +162,8 @@ parse_utf8(struct simple_buffer *buffer, uint32_t *result, unsigned *utf8_length
switch(c & 0xF0)
{
default:
ADVANCE(1);
*utf8_length = 1;
return 0;
case 0xC0:
case 0xD0:
......@@ -189,42 +195,54 @@ parse_utf8(struct simple_buffer *buffer, uint32_t *result, unsigned *utf8_length
value = c & 0x01;
break;
default:
/* Invalid format 1111 111x */
/* Invalid format 1111 111x */
ADVANCE(1);
return 0;
}
break;
}
if (LEFT < length)
return 0;
{
ADVANCE(LEFT);
*utf8_length = 1;
return 0;
}
c = HERE[1];
if ( (c & 0xC0) != 0x80)
return 0;
{
ADVANCE(1);
*utf8_length = 1;
return 0;
}
value = (value << 6) | (c & 0x3f);
/* Check for overlong sequences */
if (value < min_value[length])
return 0;
for(i = 2; i<length; i++)
{
c = HERE[i];
if ( (c & 0xC0) != 0x80)
return 0;
{
ADVANCE(i);
*utf8_length = i;
return 0;
}
value = (value << 6) | (c & 0x3f);
}
ADVANCE(length);
*utf8_length = length;
/* Check for overlong sequences */
if (value < min_value[length])
return 0;
/* Surrogates and non-characters should not appear in utf8 text. */
if ( (value >= 0xd800 && value <0xe000)
|| value == 0xfffe || value == 0xffff)
return 0;
*result = value;
*utf8_length = length;
ADVANCE(length);
return 1;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment