From 9c9a91aead55a552fdacb9a11eb188f708fc6be7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Grubbstr=C3=B6m=20=28Grubba=29?= <grubba@grubba.org> Date: Thu, 24 May 2018 16:55:37 +0200 Subject: [PATCH] Sql.mysql: Use/support UTF-8 encoded UTF-16. MySQL/MariaDB default to a "utf8" character set that may only encode the BMP (max 3 bytes). In MySQL/MariaDB 5.5 and later there is an additional character set "utf8mb4" that also supports the code points outside the BMP. This new character set however requires redefining tables, etc for it to be able to be used. As a work-around we instead default to keep using the "utf8" character set while encoding characters outside the BMP with surrogate pairs. This works seamlessly with old table definitions, while having the minor defect of characters outside the BMP not collating as single characters. Fixes [PIKE-112]. --- lib/modules/Sql.pmod/mysql.pike | 10 ++++++---- lib/modules/Sql.pmod/sql_util.pmod | 12 ++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/modules/Sql.pmod/mysql.pike b/lib/modules/Sql.pmod/mysql.pike index f8073dacbc..506f0adeac 100644 --- a/lib/modules/Sql.pmod/mysql.pike +++ b/lib/modules/Sql.pmod/mysql.pike @@ -421,7 +421,9 @@ string latin1_to_utf8 (string s) ]))); } -string utf8_encode_query (string q, function(string:string) encode_fn) +string utf8_encode_query (string q, + function(string, mixed|void...:string) encode_fn, + mixed ... extras) //! Encodes the appropriate sections of the query with @[encode_fn]. //! Everything except strings prefixed by an introducer (i.e. //! @expr{_something@} or @expr{N@}) is encoded. 
@@ -430,7 +432,7 @@ string utf8_encode_query (string q, function(string:string) encode_fn) string e = ""; while (1) { sscanf(q, "%[^\'\"]%s", string prefix, string suffix); - e += encode_fn (prefix); + e += encode_fn (prefix, @extras); if (suffix == "") break; @@ -526,7 +528,7 @@ string utf8_encode_query (string q, function(string:string) encode_fn) } e += s; } else { - e += encode_fn (suffix[..end]); + e += encode_fn (suffix[..end], @extras); } q = suffix[end+1..]; @@ -685,7 +687,7 @@ int decode_datetime (string timestr) */ \ if ((send_charset == "utf8") || !_can_send_as_latin1(query)) { \ CH_DEBUG ("Converting query to utf8.\n"); \ - query = utf8_encode_query (query, string_to_utf8); \ + query = utf8_encode_query (query, string_to_utf8, 2); \ new_send_charset = "utf8"; \ } \ } \ diff --git a/lib/modules/Sql.pmod/sql_util.pmod b/lib/modules/Sql.pmod/sql_util.pmod index adfcfd84cb..756aa91bd9 100644 --- a/lib/modules/Sql.pmod/sql_util.pmod +++ b/lib/modules/Sql.pmod/sql_util.pmod @@ -198,12 +198,12 @@ class UnicodeWrapper ( field_info = master_result->fetch_fields(); foreach(field_info, int|mapping(string:mixed) field) { if (mappingp(field)) { - field->name = utf8_to_string(field->name); + field->name = utf8_to_string(field->name, 2); if (field->table) { - field->table = utf8_to_string(field->table); + field->table = utf8_to_string(field->table, 2); } if (field->default) { - field->default = utf8_to_string(field->default); + field->default = utf8_to_string(field->default, 2); } } } @@ -227,7 +227,7 @@ class UnicodeWrapper ( array(int|mapping(string:mixed)) field_info = fetch_fields(); foreach(row; int i; string|int val) { if (stringp(val)) { - row[i] = utf8_to_string(val); + row[i] = utf8_to_string(val, 2); } } return row; @@ -260,7 +260,7 @@ class MySQLUnicodeWrapper array(int|mapping(string:mixed)) field_info = fetch_fields(); foreach(row; int i; string|int val) { if (stringp(val) && field_info[i]->charsetnr != 63) { - row[i] = utf8_to_string(val); + row[i] = 
utf8_to_string(val, 2); } } return row; @@ -296,7 +296,7 @@ class MySQLBrokenUnicodeWrapper foreach(row; int i; string|int val) { if (stringp(val) && field_info[i]->flags && !field_info[i]->flags->binary) { - row[i] = utf8_to_string(val); + row[i] = utf8_to_string(val, 2); } } return row; -- GitLab