From 9c9a91aead55a552fdacb9a11eb188f708fc6be7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Grubbstr=C3=B6m=20=28Grubba=29?=
 <grubba@grubba.org>
Date: Thu, 24 May 2018 16:55:37 +0200
Subject: [PATCH] Sql.mysql: Use/support UTF-8 encoded UTF-16.

MySQL/MariaDB default to a "utf8" character set that may only
encode the BMP (max 3 bytes). In MySQL/MariaDB 5.5 and later
there is an additional character set "utf8mb4" that also supports
the code points outside the BMP. Using this new character set,
however, requires redefining existing tables, etc.

As a workaround we instead keep using the "utf8" character set by
default, while encoding characters outside the BMP as surrogate
pairs. This works seamlessly with old table definitions, with the
minor defect that characters outside the BMP do not collate as
single characters.

Fixes [PIKE-112].
---
 lib/modules/Sql.pmod/mysql.pike    | 10 ++++++----
 lib/modules/Sql.pmod/sql_util.pmod | 12 ++++++------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/lib/modules/Sql.pmod/mysql.pike b/lib/modules/Sql.pmod/mysql.pike
index f8073dacbc..506f0adeac 100644
--- a/lib/modules/Sql.pmod/mysql.pike
+++ b/lib/modules/Sql.pmod/mysql.pike
@@ -421,7 +421,9 @@ string latin1_to_utf8 (string s)
   ])));
 }
 
-string utf8_encode_query (string q, function(string:string) encode_fn)
+string utf8_encode_query (string q,
+			  function(string, mixed|void...:string) encode_fn,
+			  mixed ... extras)
 //! Encodes the appropriate sections of the query with @[encode_fn].
 //! Everything except strings prefixed by an introducer (i.e.
 //! @expr{_something@} or @expr{N@}) is encoded.
@@ -430,7 +432,7 @@ string utf8_encode_query (string q, function(string:string) encode_fn)
   string e = "";
   while (1) {
     sscanf(q, "%[^\'\"]%s", string prefix, string suffix);
-    e += encode_fn (prefix);
+    e += encode_fn (prefix, @extras);
 
     if (suffix == "") break;
 
@@ -526,7 +528,7 @@ string utf8_encode_query (string q, function(string:string) encode_fn)
       }
       e += s;
     } else {
-      e += encode_fn (suffix[..end]);
+      e += encode_fn (suffix[..end], @extras);
     }
 
     q = suffix[end+1..];
@@ -685,7 +687,7 @@ int decode_datetime (string timestr)
        */								\
       if ((send_charset == "utf8") || !_can_send_as_latin1(query)) {	\
 	CH_DEBUG ("Converting query to utf8.\n");			\
-	query = utf8_encode_query (query, string_to_utf8);		\
+	query = utf8_encode_query (query, string_to_utf8, 2);		\
 	new_send_charset = "utf8";					\
       }									\
     }									\
diff --git a/lib/modules/Sql.pmod/sql_util.pmod b/lib/modules/Sql.pmod/sql_util.pmod
index adfcfd84cb..756aa91bd9 100644
--- a/lib/modules/Sql.pmod/sql_util.pmod
+++ b/lib/modules/Sql.pmod/sql_util.pmod
@@ -198,12 +198,12 @@ class UnicodeWrapper (
       field_info = master_result->fetch_fields();
       foreach(field_info, int|mapping(string:mixed) field) {
 	if (mappingp(field)) {
-	  field->name = utf8_to_string(field->name);
+	  field->name = utf8_to_string(field->name, 2);
 	  if (field->table) {
-	    field->table = utf8_to_string(field->table);
+	    field->table = utf8_to_string(field->table, 2);
 	  }
 	  if (field->default) {
-	    field->default = utf8_to_string(field->default);
+	    field->default = utf8_to_string(field->default, 2);
 	  }
 	}
       }
@@ -227,7 +227,7 @@ class UnicodeWrapper (
     array(int|mapping(string:mixed)) field_info = fetch_fields();
     foreach(row; int i; string|int val) {
       if (stringp(val)) {
-	row[i] = utf8_to_string(val);
+	row[i] = utf8_to_string(val, 2);
       }
     }
     return row;
@@ -260,7 +260,7 @@ class MySQLUnicodeWrapper
     array(int|mapping(string:mixed)) field_info = fetch_fields();
     foreach(row; int i; string|int val) {
       if (stringp(val) && field_info[i]->charsetnr != 63) {
-	row[i] = utf8_to_string(val);
+	row[i] = utf8_to_string(val, 2);
       }
     }
     return row;
@@ -296,7 +296,7 @@ class MySQLBrokenUnicodeWrapper
     foreach(row; int i; string|int val) {
       if (stringp(val) && field_info[i]->flags &&
 	  !field_info[i]->flags->binary) {
-	row[i] = utf8_to_string(val);
+	row[i] = utf8_to_string(val, 2);
       }
     }
     return row;
-- 
GitLab