From be40775845d6f37a3c8d501b4e5e7cc32a56bb2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Grubbstr=C3=B6m=20=28Grubba=29?=
 <grubba@grubba.org>
Date: Thu, 15 Oct 1998 04:42:39 +0200
Subject: [PATCH] Added string_to_utf8() and utf8_to_string().

Rev: src/builtin_functions.c:1.131
---
 src/builtin_functions.c | 249 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 248 insertions(+), 1 deletion(-)

diff --git a/src/builtin_functions.c b/src/builtin_functions.c
index d56556842f..24b9fd1177 100644
--- a/src/builtin_functions.c
+++ b/src/builtin_functions.c
@@ -4,7 +4,7 @@
 ||| See the files COPYING and DISCLAIMER for more information.
 \*/
 #include "global.h"
-RCSID("$Id: builtin_functions.c,v 1.130 1998/10/14 15:21:59 grubba Exp $");
+RCSID("$Id: builtin_functions.c,v 1.131 1998/10/15 02:42:39 grubba Exp $");
 #include "interpret.h"
 #include "svalue.h"
 #include "pike_macros.h"
@@ -745,6 +745,251 @@ void f_unicode_to_string(INT32 args)
   push_string(out);
 }
 
+void f_string_to_utf8(INT32 args)
+{
+  int len; /* Number of bytes the UTF-8 output needs. */
+  struct pike_string *in; /* Argument 1: the string to encode. */
+  struct pike_string *out; /* The UTF-8 encoded result. */
+  int i,j; /* i indexes characters of in, j indexes bytes of out. */
+  int extended = 0; /* Argument 2: non-zero allows values above 21 bits. */
+  /* Encode the characters of argument 1 as UTF-8; result goes to push_string(). */
+  get_all_args("string_to_utf8", args, "%W", &in);
+
+  if (args > 1) {
+    if (sp[1-args].type != T_INT) {
+      error("string_to_utf8(): Bad argument 2, expected int|void.\n");
+    }
+    extended = sp[1-args].u.integer;
+  }
+
+  len = in->len; /* Start with one byte per character. */
+  /* Pass 1: count the bytes needed; each nested test below adds one byte. */
+  for(i=0; i < in->len; i++) {
+    unsigned INT32 c = index_shared_string(in, i);
+    if (c & ~0x7f) {
+      /* 8bit or more: at least two bytes. */
+      len++;
+      if (c & ~0x7ff) {
+	/* 12bit or more: at least three bytes. */
+	len++;
+	if (c & ~0xffff) {
+	  /* 17bit or more: at least four bytes. */
+	  len++;
+	  if (c & ~0x1fffff) {
+	    /* 22bit or more: only allowed when extended is set. */
+	    if (!extended) {
+	      error("string_to_utf8(): "
+		    "Value 0x%08x (index %d) is larger than 21 bits.\n",
+		    c, i);
+	    }
+	    len++;
+	    if (c & ~0x3ffffff) {
+	      /* 27bit or more: at least six bytes. */
+	      len++;
+	      if (c & ~0x7fffffff) {
+		/* 32bit or more: seven bytes. */
+		len++;
+		/* FIXME: Needs fixing when we get 64bit chars... */
+	      }
+	    }
+	  }
+	}
+      }
+    }
+  }
+  if (len == in->len) {
+    /* 7bit string -- already valid utf8. */
+    pop_n_elems(args - 1); /* Pops all but one element; presumably leaves argument 1 as the result. */
+    return;
+  }
+  out = begin_shared_string(len);
+  /* Pass 2: emit the bytes; the branches mirror the thresholds of pass 1. */
+  for(i=j=0; i < in->len; i++) {
+    unsigned INT32 c = index_shared_string(in, i);
+    if (!(c & ~0x7f)) {
+      /* 7bit: stored as a single byte. */
+      out->str[j++] = c;
+    } else if (!(c & ~0x7ff)) {
+      /* 11bit: 110xxxxx 10xxxxxx */
+      out->str[j++] = 0xc0 | (c >> 6);
+      out->str[j++] = 0x80 | (c & 0x3f);
+    } else if (!(c & ~0xffff)) {
+      /* 16bit: 1110xxxx + 2 continuation bytes. */
+      out->str[j++] = 0xe0 | (c >> 12);
+      out->str[j++] = 0x80 | ((c >> 6) & 0x3f);
+      out->str[j++] = 0x80 | (c & 0x3f);
+    } else if (!(c & ~0x1fffff)) {
+      /* 21bit: 11110xxx + 3 continuation bytes. */
+      out->str[j++] = 0xf0 | (c >> 18);
+      out->str[j++] = 0x80 | ((c >> 12) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 6) & 0x3f);
+      out->str[j++] = 0x80 | (c & 0x3f);
+    } else if (!(c & ~0x3ffffff)) {
+      /* This and onwards is extended UTF-8 encoding. */
+      /* 26bit: 111110xx + 4 continuation bytes. */
+      out->str[j++] = 0xf8 | (c >> 24);
+      out->str[j++] = 0x80 | ((c >> 18) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 12) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 6) & 0x3f);
+      out->str[j++] = 0x80 | (c & 0x3f);
+    } else if (!(c & ~0x7fffffff)) {
+      /* 31bit: 1111110x + 5 continuation bytes. */
+      out->str[j++] = 0xfc | (c >> 30);
+      out->str[j++] = 0x80 | ((c >> 24) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 18) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 12) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 6) & 0x3f);
+      out->str[j++] = 0x80 | (c & 0x3f);
+    } else {
+      /* 32 - 36bit: 0xfe + 6 continuation bytes. */
+      out->str[j++] = 0xfe;
+      out->str[j++] = 0x80 | ((c >> 30) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 24) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 18) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 12) & 0x3f);
+      out->str[j++] = 0x80 | ((c >> 6) & 0x3f);
+      out->str[j++] = 0x80 | (c & 0x3f);
+    }
+  }
+#ifdef DEBUG
+  if (len != j) { /* Pass 1 and pass 2 must agree on the byte count. */
+    fatal("string_to_utf8(): Calculated and actual lengths differ: %d != %d\n",
+	  len, j);
+  }
+#endif /* DEBUG */
+  out = end_shared_string(out);
+  pop_n_elems(args);
+  push_string(out);
+}
+
+void f_utf8_to_string(INT32 args)
+{
+  struct pike_string *in; /* Argument 1: the UTF-8 encoded input. */
+  struct pike_string *out; /* The decoded result. */
+  int len = 0; /* Number of characters found in pass 1. */
+  int shift = 0; /* Passed to begin_wide_shared_string(); raised for wider values. */
+  int i,j; /* i indexes bytes of in, j indexes characters of out. */
+  /* Decode the UTF-8 bytes of argument 1; the result goes to push_string(). */
+  get_all_args("utf8_to_string", args, "%S", &in);
+  /* Pass 1: validate the input, count characters (len) and choose shift. */
+  for(i=0; i < in->len; i++) {
+    unsigned int c = ((unsigned char *)in->str)[i];
+    len++; /* Every 7bit byte or lead byte yields one character. */
+    if (c & 0x80) {
+      int cont = 0; /* Number of continuation bytes expected after c. */
+      if ((c & 0xc0) == 0x80) {
+	error("utf8_to_string(): "
+	      "Unexpected continuation block 0x%02x at index %d.\n",
+	      c, i);
+      }
+      if ((c & 0xe0) == 0xc0) {
+	/* 11bit: one continuation byte. */
+	cont = 1;
+	if (c & 0x1c) { /* Value needs more than 8 bits. */
+	  if (shift < 1) {
+	    shift = 1;
+	  }
+	}
+      } else if ((c & 0xf0) == 0xe0) {
+	/* 16bit: two continuation bytes; always treated as wider than 8 bits. */
+	cont = 2;
+	if (shift < 1) {
+	  shift = 1;
+	}
+      } else {
+	shift = 2; /* All longer sequences are treated as wider than 16 bits. */
+	if ((c & 0xf8) == 0xf0) {
+	  /* 21bit: three continuation bytes. */
+	  cont = 3;
+	} else if ((c & 0xfc) == 0xf8) {
+	  /* 26bit: four continuation bytes. */
+	  cont = 4;
+	} else if ((c & 0xfe) == 0xfc) {
+	  /* 31bit: five continuation bytes. */
+	  cont = 5;
+	} else if (c == 0xfe) {
+	  /* 36bit: six continuation bytes. */
+	  cont = 6;
+	} else {
+	  error("utf8_to_string(): "
+		"Unexpected character 0xff at index %d.\n",
+		i);
+	}
+      }
+      while(cont--) { /* Check that each expected continuation byte is present. */
+	i++;
+	if (i >= in->len) {
+	  error("utf8_to_string(): Truncated UTF8 sequence.\n");
+	}
+	c = ((unsigned char *)(in->str))[i];
+	if ((c & 0xc0) != 0x80) {
+	  error("utf8_to_string(): "
+		"Expected continuation character at index %d (got 0x%02x).\n",
+		i, c);
+	}
+      }
+    }
+  }
+  if (len == in->len) {
+    /* 7bit in == 7bit out */
+    pop_n_elems(args-1); /* Pops all but one element; presumably leaves argument 1 as the result. */
+    return;
+  }
+
+  out = begin_wide_shared_string(len, shift);
+  /* Pass 2: decode each sequence and store it at index j of out. */
+  for(j=i=0; i < in->len; i++) {
+    unsigned int c = ((unsigned char *)in->str)[i];
+
+    if (c & 0x80) {
+      int cont = 0; /* Continuation bytes left to fold into c. */
+
+      /* NOTE: The tests aren't as paranoid here, since we've
+       * already tested the string above.
+       */
+      if ((c & 0xe0) == 0xc0) {
+	/* 11bit: keep the 5 payload bits of the lead byte. */
+	cont = 1;
+	c &= 0x1f;
+      } else if ((c & 0xf0) == 0xe0) {
+	/* 16bit: keep 4 payload bits. */
+	cont = 2;
+	c &= 0x0f;
+      } else if ((c & 0xf8) == 0xf0) {
+	/* 21bit: keep 3 payload bits. */
+	cont = 3;
+	c &= 0x07;
+      } else if ((c & 0xfc) == 0xf8) {
+	/* 26bit: keep 2 payload bits. */
+	cont = 4;
+	c &= 0x03;
+      } else if ((c & 0xfe) == 0xfc) {
+	/* 31bit: keep 1 payload bit. */
+	cont = 5;
+	c &= 0x01;
+      } else {
+	/* 36bit -- NOTE(review): may not fit in c; top bits would be lost -- confirm. */
+	cont = 6;
+	c = 0;
+      }
+      while(cont--) {
+	unsigned INT32 c2 = ((unsigned char *)(in->str))[++i] & 0x3f;
+	c = (c << 6) | c2; /* Append the 6 payload bits of the continuation byte. */
+      }
+    }
+    low_set_index(out, j++, c);
+  }
+#ifdef DEBUG
+  if (j != len) { /* Pass 1 and pass 2 must agree on the character count. */
+    fatal("utf8_to_string(): Calculated and actual lengths differ: %d != %d\n",
+	  len, j);
+  }
+#endif /* DEBUG */
+  out = end_shared_string(out);
+  pop_n_elems(args);
+  push_string(out);
+}
+
 void f_all_constants(INT32 args)
 {
   pop_n_elems(args);
@@ -3283,6 +3528,8 @@ void init_builtin_efuns(void)
   /* Some Wide-string stuff */
   add_efun("string_to_unicode", f_string_to_unicode, "function(string:string)", OPT_TRY_OPTIMIZE);
   add_efun("unicode_to_string", f_unicode_to_string, "function(string:string)", OPT_TRY_OPTIMIZE);
+  add_efun("string_to_utf8", f_string_to_utf8, "function(string,int|void:string)", OPT_TRY_OPTIMIZE);
+  add_efun("utf8_to_string", f_utf8_to_string, "function(string:string)", OPT_TRY_OPTIMIZE);
 
 #ifdef HAVE_LOCALTIME
   add_efun("localtime",f_localtime,"function(int:mapping(string:int))",OPT_EXTERNAL_DEPEND);
-- 
GitLab