diff --git a/lib/modules/Mysql.pmod/SqlTable.pike b/lib/modules/Mysql.pmod/SqlTable.pike index 25357426788e0ecdb819cef31e92f1900222dfe7..bc3f8d5126ac680118b4d1eac5d135273ab02686 100644 --- a/lib/modules/Mysql.pmod/SqlTable.pike +++ b/lib/modules/Mysql.pmod/SqlTable.pike @@ -1458,7 +1458,7 @@ protected void add_mysql_value (String.Buffer buf, string col_name, mixed val) // FIXME: If the column holds binary data we should throw an // error here instead of sending what is effectively garbled // data. - buf->add ("_utf8\"", string_to_utf8 (quote (val)), "\""); + buf->add ("_utf8\"", string_to_utf8 (quote (val), 2), "\""); } else if (intp (val)) { if (undefinedp (val)) diff --git a/lib/modules/Sql.pmod/mysql.pike b/lib/modules/Sql.pmod/mysql.pike index 54ee5e07c34a78a8cde894379815db4185b291ff..bcddad134a032e0a870079e4ab3a10ee3746c865 100644 --- a/lib/modules/Sql.pmod/mysql.pike +++ b/lib/modules/Sql.pmod/mysql.pike @@ -405,7 +405,7 @@ string quote(string s) ({ "\\\\", "\\\"", "\\0", "\\\'", "\\n", "\\r" })); } -string latin1_to_utf8 (string s) +string latin1_to_utf8 (string s, int extended) //! Converts a string in MySQL @expr{latin1@} format to UTF-8. { return string_to_utf8 (replace (s, ([ @@ -417,10 +417,12 @@ string latin1_to_utf8 (string s) "\x94": "\u201D", "\x95": "\u2022", "\x96": "\u2013", "\x97": "\u2014", "\x98": "\u02DC", "\x99": "\u2122", "\x9a": "\u0161", "\x9b": "\u203A", "\x9c": "\u0153", /*"\x9d": "\u009D",*/ "\x9e": "\u017E", "\x9f": "\u0178", - ]))); + ])), extended); } -string utf8_encode_query (string q, function(string:string) encode_fn) +string utf8_encode_query (string q, + function(string, mixed|void...:string) encode_fn, + mixed ... extras) //! Encodes the appropriate sections of the query with @[encode_fn]. //! Everything except strings prefixed by an introducer (i.e. //! @expr{_something@} or @expr{N@}) is encoded. @@ -429,7 +431,7 @@ string utf8_encode_query (string q, function(string:string) encode_fn) string e = ""; while (1) { sscanf(q, "%[^\'\"]%s", string prefix, string suffix); - e += encode_fn (prefix); + e += encode_fn (prefix, @extras); if (suffix == "") break; @@ -525,7 +527,7 @@ string utf8_encode_query (string q, function(string:string) encode_fn) } e += s; } else { - e += encode_fn (suffix[..end]); + e += encode_fn (suffix[..end], @extras); } q = suffix[end+1..]; @@ -671,7 +673,7 @@ int decode_datetime (string timestr) new_send_charset = "latin1"; \ else { \ CH_DEBUG ("Converting (mysql-)latin1 query to utf8.\n"); \ - query = utf8_encode_query (query, latin1_to_utf8); \ + query = utf8_encode_query (query, latin1_to_utf8, 2); \ new_send_charset = "utf8"; \ } \ } \ @@ -684,7 +686,7 @@ int decode_datetime (string timestr) */ \ if ((send_charset == "utf8") || !_can_send_as_latin1(query)) { \ CH_DEBUG ("Converting query to utf8.\n"); \ - query = utf8_encode_query (query, string_to_utf8); \ + query = utf8_encode_query (query, string_to_utf8, 2); \ new_send_charset = "utf8"; \ } \ } \ diff --git a/lib/modules/Sql.pmod/sql_util.pmod b/lib/modules/Sql.pmod/sql_util.pmod index 09b3e40fe43a6ed837417fcbde3b8e58d2cf9ffa..cf9c183cc8b22d57976dc0019926040e3aa95e21 100644 --- a/lib/modules/Sql.pmod/sql_util.pmod +++ b/lib/modules/Sql.pmod/sql_util.pmod @@ -198,12 +198,12 @@ class UnicodeWrapper ( field_info = master_result->fetch_fields(); foreach(field_info, int|mapping(string:mixed) field) { if (mappingp(field)) { - field->name = utf8_to_string(field->name); + field->name = utf8_to_string(field->name, 2); if (field->table) { - field->table = utf8_to_string(field->table); + field->table = utf8_to_string(field->table, 2); } if (field->default) { - field->default = utf8_to_string(field->default); + field->default = utf8_to_string(field->default, 2); } } } @@ -227,7 +227,7 @@ class UnicodeWrapper ( array(int|mapping(string:mixed)) field_info = fetch_fields(); foreach(row; int i; string|int val) { if (stringp(val)) { - row[i] = utf8_to_string(val); + row[i] = utf8_to_string(val, 2); } } return row; @@ -260,7 +260,7 @@ class MySQLUnicodeWrapper array(int|mapping(string:mixed)) field_info = fetch_fields(); foreach(row; int i; string|int val) { if (stringp(val) && field_info[i]->charsetnr != 63) { - row[i] = utf8_to_string(val); + row[i] = utf8_to_string(val, 2); } } return row; @@ -296,7 +296,7 @@ class MySQLBrokenUnicodeWrapper foreach(row; int i; string|int val) { if (stringp(val) && field_info[i]->flags && !field_info[i]->flags->binary) { - row[i] = utf8_to_string(val); + row[i] = utf8_to_string(val, 2); } } return row; diff --git a/src/builtin_functions.c b/src/builtin_functions.c index 57db3a9b907563b22004b5d3ccc5a31976e11d60..42eb92d0924790c02b21a9350361db7131a7bf88 100644 --- a/src/builtin_functions.c +++ b/src/builtin_functions.c @@ -2093,17 +2093,28 @@ static void f_string_filter_non_unicode( INT32 args ) /*! @decl string(0..255) string_to_utf8(string s) *! @decl string(0..255) string_to_utf8(string s, int extended) *! - *! Converts a string into an UTF-8 compliant byte-stream. + *! Convert a string into a UTF-8 compliant byte-stream. + *! + *! @param s + *! String to encode into UTF-8. + *! + *! @param extended + *! Bitmask with extension options. + *! @int + *! @value 1 + *! Accept and encode the characters outside the valid ranges + *! using the same algorithm. Such encoded characters are + *! however not UTF-8 compliant. + *! @value 2 + *! Encode characters outside the BMP with UTF-8 encoded UTF-16 + *! (ie split them into surrogate pairs and encode). + *! @endint *! *! @note *! Throws an error if characters not valid in an UTF-8 stream are *! encountered. Valid characters are in the ranges *! @expr{0x00000000-0x0000d7ff@} and @expr{0x0000e000-0x0010ffff@}. *! - *! If @[extended] is 1 then characters outside the valid ranges are - *! accepted too and encoded using the same algorithm. Such encoded - *! characters are however not UTF-8 compliant. - *! *! @seealso *! @[Charset.encoder()], @[string_to_unicode()], *! @[unicode_to_string()], @[utf8_to_string()] @@ -2141,13 +2152,16 @@ PMOD_EXPORT void f_string_to_utf8(INT32 args) if (c & ~0xffff) { /* 17bit or more. */ len++; - if (!extended && c > 0x10ffff) + if (!(extended & 1) && c > 0x10ffff) bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1, NULL, Pike_sp - args, "Character 0x%08x at index %"PRINTPTRDIFFT"d is " "outside the allowed range.\n", c, i); - if (c & ~0x1fffff) { + if ((extended & 2) && (c <= 0x10ffff)) { + /* Encode with a surrogate pair. */ + len += 2; + } else if (c & ~0x1fffff) { /* 22bit or more. */ len++; if (c & ~0x3ffffff) { @@ -2161,7 +2175,7 @@ PMOD_EXPORT void f_string_to_utf8(INT32 args) } } } - else if (!extended && c >= 0xd800 && c <= 0xdfff) + else if (!(extended & 1) && c >= 0xd800 && c <= 0xdfff) bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1, NULL, Pike_sp - args, "Character 0x%08x at index %"PRINTPTRDIFFT"d is " @@ -2191,6 +2205,23 @@ PMOD_EXPORT void f_string_to_utf8(INT32 args) out->str[j++] = 0xe0 | (c >> 12); out->str[j++] = 0x80 | ((c >> 6) & 0x3f); out->str[j++] = 0x80 | (c & 0x3f); + } else if ((extended & 2) && (c <= 0x10ffff)) { + /* Encode with surrogates. */ + c -= 0x10000; + /* 0xd800 | (c>>10) + * 0b1101 10cccc cccccc + * UTF8: 11101101 1010cccc 10cccccc + */ + out->str[j++] = 0xed; + out->str[j++] = 0xa0 | (c >> 16); + out->str[j++] = 0x80 | ((c >> 10) & 0x3f); + /* 0xdc00 | (c & 0x3ff) + * 0b1101 11cccc cccccc + * UTF8: 11101101 1011cccc 10cccccc + */ + out->str[j++] = 0xed; + out->str[j++] = 0xb0 | ((c >> 6) & 0x3f); + out->str[j++] = 0x80 | (c & 0x3f); } else if (!(c & ~0x1fffff)) { /* 21bit */ out->str[j++] = 0xf0 | (c >> 18); diff --git a/src/testsuite.in b/src/testsuite.in index 19cc1c78aebb4cebbaad5617faaff71caee0097f..91cacc8849e6bf8061ae22c23b3408dbdc06124c 100644 --- a/src/testsuite.in +++ b/src/testsuite.in @@ -12049,6 +12049,8 @@ test_eval_error(return unicode_to_string(" ")) test_eq(string_to_utf8("foo"), "foo") test_eq(string_to_utf8("bl�"), "bl\303\244") test_eq(string_to_utf8("\77077"), "\347\270\277") +test_eq(string_to_utf8("\U0010ffff\U00100000\U00010000"), "\364\217\277\277\364\200\200\200\360\220\200\200") +test_eq(string_to_utf8("\U0010ffff\U00100000\U00010000", 2), "\355\257\277\355\277\277\355\257\200\355\260\200\355\240\200\355\260\200") test_eq(string_to_utf8("\7077077", 1), "\367\207\270\277") test_eq(string_to_utf8("\77077077", 1), "\370\277\207\270\277") test_eq(string_to_utf8("\7077077077", 1), "\374\270\277\207\270\277") @@ -12064,6 +12066,9 @@ test_eq(utf8_to_string("\376\203\270\277\207\270\277", 1), "\37077077077") test_eq(utf8_to_string("\374\270\277\207\270\277", 1), "\7077077077") test_eq(utf8_to_string("\370\277\207\270\277", 1), "\77077077") test_eq(utf8_to_string("\367\207\270\277", 1), "\7077077") +test_eval_error(return utf8_to_string("\355\257\277\355\277\277\355\257\200\355\260\200\355\240\200\355\260\200")) +test_eq(utf8_to_string("\355\257\277\355\277\277\355\257\200\355\260\200\355\240\200\355\260\200", 2), "\U0010ffff\U00100000\U00010000") +test_eq(utf8_to_string("\364\217\277\277\364\200\200\200\360\220\200\200"), "\U0010ffff\U00100000\U00010000") test_eq(utf8_to_string("\347\270\277"), "\77077") test_eq(utf8_to_string("bl\303\244"), "bl�") test_eq(utf8_to_string("foo"), "foo")