diff --git a/src/builtin_functions.c b/src/builtin_functions.c index 4a69af7ccaeee24db05b6e167d18e6b893dd69f8..57730315a7d6d4b9c5bc7df9e7b18a05792356ae 100644 --- a/src/builtin_functions.c +++ b/src/builtin_functions.c @@ -1927,17 +1927,28 @@ PMOD_EXPORT void f_unicode_to_string(INT32 args) /*! @decl string(0..255) string_to_utf8(string s) *! @decl string(0..255) string_to_utf8(string s, int extended) *! - *! Converts a string into an UTF-8 compliant byte-stream. + *! Convert a string into a UTF-8 compliant byte-stream. + *! + *! @param s + *! String to encode into UTF-8. + *! + *! @param extended + *! Bitmask with extension options. + *! @int + *! @value 1 + *! Accept and encode the characters outside the valid ranges + *! using the same algorithm. Such encoded characters are + *! however not UTF-8 compliant. + *! @value 2 + *! Encode characters outside the BMP with UTF-8 encoded UTF-16 + *! (i.e. split them into surrogate pairs and encode each surrogate; the result is then not strictly UTF-8 compliant). + *! @endint *! *! @note *! Throws an error if characters not valid in an UTF-8 stream are *! encountered. Valid characters are in the ranges *! @expr{0x00000000-0x0000d7ff@} and @expr{0x0000e000-0x0010ffff@}. *! - *! If @[extended] is 1 then characters outside the valid ranges are - *! accepted too and encoded using the same algorithm. Such encoded - *! characters are however not UTF-8 compliant. - *! *! @seealso *! @[Locale.Charset.encoder()], @[string_to_unicode()], *! @[unicode_to_string()], @[utf8_to_string()] @@ -1966,13 +1977,16 @@ PMOD_EXPORT void f_string_to_utf8(INT32 args) if (c & ~0xffff) { /* 17bit or more. */ len++; - if (!extended && c > 0x10ffff) + if (!(extended & 1) && c > 0x10ffff) bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1, NULL, Pike_sp - args, "Character 0x%08x at index %"PRINTPTRDIFFT"d is " "outside the allowed range.\n", c, i); - if (c & ~0x1fffff) { + if ((extended & 2) && (c <= 0x10ffff)) { + /* Encode with a surrogate pair. */ + len += 2; + } else if (c & ~0x1fffff) { /* 22bit or more. 
*/ len++; if (c & ~0x3ffffff) { @@ -1986,7 +2000,7 @@ PMOD_EXPORT void f_string_to_utf8(INT32 args) } } } - else if (!extended && c >= 0xd800 && c <= 0xdfff) + else if (!(extended & 1) && c >= 0xd800 && c <= 0xdfff) bad_arg_error ("string_to_utf8", Pike_sp - args, args, 1, NULL, Pike_sp - args, "Character 0x%08x at index %"PRINTPTRDIFFT"d is " @@ -2016,6 +2030,23 @@ PMOD_EXPORT void f_string_to_utf8(INT32 args) out->str[j++] = 0xe0 | (c >> 12); out->str[j++] = 0x80 | ((c >> 6) & 0x3f); out->str[j++] = 0x80 | (c & 0x3f); + } else if ((extended & 2) && (c <= 0x10ffff)) { + /* Encode with surrogates. */ + c -= 0x10000; + /* 0xd800 | (c>>10) + * 0b1101 10cccc cccccc + * UTF8: 11101101 1010cccc 10cccccc + */ + out->str[j++] = 0xed; + out->str[j++] = 0xa0 | (c >> 16); + out->str[j++] = 0x80 | ((c >> 10) & 0x3f); + /* 0xdc00 | (c & 0x3ff) + * 0b1101 11cccc cccccc + * UTF8: 11101101 1011cccc 10cccccc + */ + out->str[j++] = 0xed; + out->str[j++] = 0xb0 | ((c >> 6) & 0x3f); + out->str[j++] = 0x80 | (c & 0x3f); } else if (!(c & ~0x1fffff)) { /* 21bit */ out->str[j++] = 0xf0 | (c >> 18); diff --git a/src/testsuite.in b/src/testsuite.in index bbc8eeea2ac3a763e6906416bfabb56220a02f17..560b60334c5c694765807aac12542862f93ae4a5 100644 --- a/src/testsuite.in +++ b/src/testsuite.in @@ -11840,6 +11840,8 @@ test_eval_error(return unicode_to_string(" ")) test_eq(string_to_utf8("foo"), "foo") test_eq(string_to_utf8("bl�"), "bl\303\244") test_eq(string_to_utf8("\77077"), "\347\270\277") +test_eq(string_to_utf8("\U0010ffff\U00100000\U00010000"), "\364\217\277\277\364\200\200\200\360\220\200\200") +test_eq(string_to_utf8("\U0010ffff\U00100000\U00010000", 2), "\355\257\277\355\277\277\355\257\200\355\260\200\355\240\200\355\260\200") test_eq(string_to_utf8("\7077077", 1), "\367\207\270\277") test_eq(string_to_utf8("\77077077", 1), "\370\277\207\270\277") test_eq(string_to_utf8("\7077077077", 1), "\374\270\277\207\270\277") @@ -11855,6 +11857,9 @@ 
test_eq(utf8_to_string("\376\203\270\277\207\270\277", 1), "\37077077077") test_eq(utf8_to_string("\374\270\277\207\270\277", 1), "\7077077077") test_eq(utf8_to_string("\370\277\207\270\277", 1), "\77077077") test_eq(utf8_to_string("\367\207\270\277", 1), "\7077077") +test_eval_error(return utf8_to_string("\355\257\277\355\277\277\355\257\200\355\260\200\355\240\200\355\260\200")) +test_eq(utf8_to_string("\355\257\277\355\277\277\355\257\200\355\260\200\355\240\200\355\260\200", 2), "\U0010ffff\U00100000\U00010000") +test_eq(utf8_to_string("\364\217\277\277\364\200\200\200\360\220\200\200"), "\U0010ffff\U00100000\U00010000") test_eq(utf8_to_string("\347\270\277"), "\77077") test_eq(utf8_to_string("bl\303\244"), "blä") test_eq(utf8_to_string("foo"), "foo")