From 4813f5e1a35accdaee2c6ea43643beaa8d6d99c4 Mon Sep 17 00:00:00 2001
From: "Mirar (Pontus Hagland)" <pike@sort.mirar.org>
Date: Wed, 24 Sep 2003 14:46:47 +0200
Subject: [PATCH] Regexp.PCRE starting to look useful; pike glue added to
 module.pmod.in

Rev: src/modules/_Regexp_PCRE/configure.in:1.6
Rev: src/modules/_Regexp_PCRE/module.pmod.in:1.1
Rev: src/modules/_Regexp_PCRE/pcre_glue.cmod:1.8
Rev: src/modules/_Regexp_PCRE/testsuite.in:1.2
---
 .gitattributes                          |   1 +
 src/modules/_Regexp_PCRE/configure.in   |   4 +-
 src/modules/_Regexp_PCRE/module.pmod.in | 389 ++++++++++++++++++++++++
 src/modules/_Regexp_PCRE/pcre_glue.cmod |  24 +-
 src/modules/_Regexp_PCRE/testsuite.in   |  12 +-
 5 files changed, 419 insertions(+), 11 deletions(-)
 create mode 100644 src/modules/_Regexp_PCRE/module.pmod.in

diff --git a/.gitattributes b/.gitattributes
index a55ab58b77..107fbb3624 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -693,6 +693,7 @@ testfont binary
 /src/modules/_Regexp_PCRE/Makefile.in foreign_ident
 /src/modules/_Regexp_PCRE/acconfig.h foreign_ident
 /src/modules/_Regexp_PCRE/configure.in foreign_ident
+/src/modules/_Regexp_PCRE/module.pmod.in foreign_ident
 /src/modules/_Regexp_PCRE/pcre_glue.cmod foreign_ident
 /src/modules/_Roxen/Makefile.in foreign_ident
 /src/modules/_Roxen/acconfig.h foreign_ident
diff --git a/src/modules/_Regexp_PCRE/configure.in b/src/modules/_Regexp_PCRE/configure.in
index e0e1abda33..05fb115863 100644
--- a/src/modules/_Regexp_PCRE/configure.in
+++ b/src/modules/_Regexp_PCRE/configure.in
@@ -1,11 +1,11 @@
 #
-# $Id: configure.in,v 1.5 2003/09/23 12:46:59 grubba Exp $
+# $Id: configure.in,v 1.6 2003/09/24 12:46:47 mirar Exp $
 #
 AC_INIT(pcre_glue.cmod)
 AC_CONFIG_HEADER(pcre_machine.h)
 AC_ARG_WITH(libpcre,     [  --with(out)-libpcre       Support Regexp.PCRE],[],[with_libpcre=yes])
 
-AC_MODULE_INIT()
+AC_MODULE_INIT(_Regexp_PCRE)
 
 PIKE_FEATURE_WITHOUT(Regexp.PCRE)
 
diff --git a/src/modules/_Regexp_PCRE/module.pmod.in b/src/modules/_Regexp_PCRE/module.pmod.in
new file mode 100644
index 0000000000..88ff24deb1
--- /dev/null
+++ b/src/modules/_Regexp_PCRE/module.pmod.in
@@ -0,0 +1,389 @@
+// -*- Pike -*-
+// $Id: module.pmod.in,v 1.1 2003/09/24 12:46:47 mirar Exp $
+// original author: mirar
+
+#pike __REAL_VERSION__
+
+inherit @module@; // C-module
+
+// there may be other useful stuff in _Regexp_PCRE,
+// so don't stop compiling just because it's not complete:
+#if constant(@module@._pcre) // there are other maybe useful stuff
+
+//! The main regexp class. Will provide anything needed 
+//! for matching regexps. 
+//!
+//! There are subclasses that add wrappers for widestrings,
+//! and that optimize the regexp pattern.
+
+class Plain
+{
+   inherit _pcre;
+
+/***************************************************************/
+
+   inline int(0..0) handle_exec_error(int error_no)
+   {
+      // "no match" is an ordinary outcome, reported to the caller as 0
+      if (error_no==ERROR.NOMATCH) return UNDEFINED;
+      if (error_no==ERROR.NOMEMORY)
+	 error("out of memory in exec() (ERROR.NOMEMORY)\n");
+
+      // every other code is thrown, by symbolic name when it has one
+      string name=([ERROR.NULL:"ERROR.NULL",
+		    ERROR.BADOPTION:"ERROR.BADOPTION",
+		    ERROR.BADMAGIC:"ERROR.BADMAGIC",
+		    ERROR.UNKNOWN_NODE:"ERROR.UNKNOWN_NODE",
+		    ERROR.MATCHLIMIT:"ERROR.MATCHLIMIT",
+		    ERROR.CALLOUT:"ERROR.CALLOUT"])[error_no];
+      // the fallback text carries no "\n" of its own, since the format
+      // string below already ends the line
+      error("error returned from exec: %s\n",
+	    name||sprintf("unknown error: %d",error_no));
+   }
+
+//! Matches a subject against the pattern,
+//! returns an array where the first element is the whole match,
+//! and the subsequent ones are the matching subpatterns.
+//! Returns 0 if there was no match.
+//!
+//! example:
+//! > Regexp.PCRE.Plain("i\(.*\) is \(.*\)u")->split2("pike is fun");
+//! Result: ({ 
+//!              "ike is fu",
+//!              "ke",
+//!              "f"
+//!          })
+
+   array(string)|int(0..0) split2(string subject,void|int startoffset)
+   {
+      array(int)|int hit=exec(subject,startoffset);   // offsets, or a status
+      if (arrayp(hit)) return split_subject(subject,[array(int)]hit);
+      return handle_exec_error([int]hit);   // 0 on no match, else throws
+   }
+
+//! Matches a subject against the pattern,
+//! compatible with the old split method:
+//! returns an array of the subpatterns,
+//! or if there are no subpatterns but still a match, ({0}).
+//! Returns 0 if there was no match.
+//!
+//! example:
+//! > Regexp.PCRE.Plain("i\(.*\) is \(.*\)u")->split("pike is fun");
+//! (1) Result: ({
+//!                 "ke",
+//!                 "f"
+//!             })
+//! > Regexp.PCRE.Plain("is fun")->split("pike is fun"); 
+//! (4) Result: ({
+//!                 0
+//!             })
+
+   array(string)|int(0..0) split(string subject,void|int startoffset)
+   {
+      array(string)|int(0..0) parts=split2(subject,startoffset);
+      if (!arrayp(parts)) return parts;   // no match: pass the 0 along
+      // skip the whole-match element; without subpatterns give ({0})
+      return sizeof(parts)>1 ? parts[1..] : copy_value(({0}));
+   }
+
+//! returns true (1) if a match is found,
+//! false otherwise
+//!
+//! example:
+//! > Regexp.PCRE.Plain("is fun")->match("pike is fun");
+//! Result: 1
+//! > Regexp.PCRE.Plain("is fun")->match("pike isn't fun");
+//! Result: 0
+
+   int(0..1) match(string subject,void|int startoffset)
+   {
+      array(int)|int outcome=exec(subject,startoffset);
+      // an array means a match; otherwise 0 for no match, or an error
+      return arrayp(outcome) ? 1 : handle_exec_error([int]outcome);
+   }
+
+//! replace all occurrences of a pattern in a subject;
+//! callbacks and replacements will be from the first occurrence,
+//! not from the last as in Regexp.Builtin.replace.
+//!
+//! example:
+//! > Regexp.PCRE("b[^-]*m")->replace("abam-boom-fooabadoom","gurka");
+//! Result: "agurka-gurka-fooagurka"
+//! > Regexp.PCRE("b[^-]*m")->replace("abam-boom-fooabadoom",
+//!      lambda(string s) { werror("%O\n",s); return "gurka"; });
+//! "bam"
+//! "boom"
+//! "badoom"
+//! Result: "agurka-gurka-fooagurka"
+
+   string replace(string subject,string|function(string:string) with)
+   {
+      int i=0;
+      String.Buffer res=String.Buffer();
+      for (;;)
+      {
+	 array(int)|int v=exec(subject,i);
+	 if (intp(v) && !handle_exec_error([int]v)) break;   // no more matches
+	 if (v[0]>i) res->add(subject[i..v[0]-1]);   // text before the match
+	 if (stringp(with)) res->add(with);
+	 else res->add(with(subject[v[0]..v[1]-1]));
+	 i=v[1];
+	 if (v[1]>v[0]) continue;
+	 // Empty match: i did not move, so copy one character and step
+	 // over it; otherwise the same match would be found forever.
+	 if (i>=sizeof(subject)) break;
+	 res->add(subject[i..i]);
+	 i++;
+      }
+      res->add(subject[i..]);   // whatever follows the last match
+      return (string)res;
+   }
+
+//! replace one (first) occurrence of a pattern in a subject
+//!
+//! example:
+//! > Regexp.PCRE("b[^-]*m")->replace1("abam-boom-fooabadoom","gurka");
+//! Result: "agurka-boom-fooabadoom"
+
+   string replace1(string subject,string|function(string:string) with)
+   {
+      array(int)|int span=exec(subject,0);
+      // handle_exec_error() yields 0 for "no match" and throws otherwise
+      if (intp(span) && !handle_exec_error([int]span)) return subject;
+
+      int from=span[0], to=span[1];   // offsets delimiting the match
+      String.Buffer out=String.Buffer();
+      if (from>0) out->add(subject[0..from-1]);
+      // a string replaces verbatim, a function is given the matched text
+      if (!stringp(with)) out->add(with(subject[from..to-1]));
+      else out->add(with);
+      out->add(subject[to..]);
+
+      return (string)out;
+   }
+
+//! Will give a callback for each match in a subject.
+//! The callback gets the matched pattern and subpatterns as an
+//! array, and as second argument the exec result array.
+//!
+//! returns called object
+//!
+//! example:
+//! > Regexp.PCRE("b(a*)([^-\1234]*)(\1234*)m")
+//!     ->matchall("abam-boom-fooabado\1234m",
+//!                lambda(mixed s) { werror("%O\n",s); return "gurka"; });
+//! ({ /* 4 elements */
+//!     "bam",
+//!     "a",
+//!     "",
+//!     ""
+//! })
+//! ({ /* 4 elements */
+//!     "boom",
+//!     "",
+//!     "oo",
+//!     ""
+//! })
+//! ({ /* 4 elements */
+//!     "bado\1234m",
+//!     "a",
+//!     "do",
+//!     "\1234"
+//! })
+//! Result: Regexp.PCRE.StudiedWidestring("b(a*)([^-\312\234]*)(\312\234*)m")
+
+   this_program matchall(string subject,
+			 function(array(string)|void,
+				  array(int)|void:mixed|void) callback)
+   {
+      for (int i=0;;)
+      {
+	 array(int)|int v=exec(subject,i);
+	 if (intp(v) && !handle_exec_error([int]v)) return this_object();
+	 callback(split_subject(subject,v),v);
+	 i=v[1];
+	 // an empty match would recur forever: step past it, stop at the end
+	 if (v[1]==v[0] && ++i>sizeof(subject)) return this_object();
+      }
+   }
+
+/**** "internal" ***********************************************/
+
+   string _sprintf(int t,mapping fum)
+   {
+      // %t reports the class name; other formats go to the inherited one
+      return t=='t' ? "Regexp.PCRE.Plain" : ::_sprintf(t,fum);
+   }
+
+   string cast(string to)
+   {
+      // only a cast to string is supported; it gives back the pattern
+      if (to!="string") error("can't cast %t to %O\n",this_object(),to);
+      return pattern;
+   }
+}
+
+//! Same as Plain,
+//! but will be studied to match faster; useful if you're doing
+//! many matches on the same pattern
+
+class Studied
+{
+   inherit Plain;
+
+   // runs the inherited create, then calls study() on the new object
+   void create(string pattern,void|int options,void|object table)
+   {
+      ::create(pattern,options,table);
+      study();
+   }
+
+   string _sprintf(int t,mapping fum)
+   {
+      return t=='t' ? "Regexp.PCRE.Studied" : ::_sprintf(t,fum);
+   }
+}
+
+#if constant(@module@.buildconfig_UTF8)
+#define PCRE_GOT_WIDESTRINGS
+
+//! Wrapper class around Plain, that will allow widestring
+//! patterns and subjects.
+//!
+//! Widestring support and this class will not be implemented if the
+//! linked libpcre lacks UTF8 support.
+
+class Widestring
+{
+   inherit Plain;
+
+   // the pattern is passed on UTF-8 encoded, with OPTION.UTF8 forced on
+   void create(string pattern,void|int options,void|object table)
+   {
+      ::create(string_to_utf8(pattern),options|OPTION.UTF8,table);
+   }
+
+   string _sprintf(int t,mapping fum)
+   {
+      return t=='t' ? "Regexp.PCRE.Widestring" : ::_sprintf(t,fum);
+   }
+
+   array(string)|int(0..0) split2(string subject,void|int startoffset)
+   {
+      string encoded=string_to_utf8(subject);
+
+      // a start offset is only translated when encoding changed the string
+      if (encoded!=subject && startoffset)
+	 startoffset=char_number_to_utf8_byte_index(startoffset,subject);
+
+      array(int)|int hit=::exec(encoded,startoffset);
+      if (!arrayp(hit)) return handle_exec_error([int]hit);
+
+      // cut the pieces out of the UTF-8 string, then decode each of them
+      return map(split_subject(encoded,[array(int)]hit),utf8_to_string);
+   }
+
+//! The exec function is wrapped to give the correct indexes for
+//! the widestring.
+
+   array(int)|int exec(string subject,void|int startoffset)
+   {
+      string encoded=string_to_utf8(subject);
+      int converted=(encoded!=subject);   // did encoding change anything?
+
+      if (startoffset && converted)
+	 startoffset=char_number_to_utf8_byte_index(startoffset,subject);
+
+      array(int)|int hit=::exec(encoded,startoffset);
+
+      // non-array results, and offsets into an unchanged string, pass
+      // through as they are; otherwise the offsets are run through
+      // utf8_byte_index_to_char_number()
+      if (!arrayp(hit) || !converted) return hit;
+      return utf8_byte_index_to_char_number(hit,encoded);
+   }
+}
+
+// really slow helper functions -- FIXME! and add to String or something
+static array(int)|int
+   utf8_byte_index_to_char_number(array(int)|int c,string utf8)
+{  // arrays go element by element; index 0 maps to 0 without decoding
+   return arrayp(c) ? map(c,utf8_byte_index_to_char_number,utf8)
+      : (c && strlen(utf8_to_string(utf8[..c-1])));
+}
+
+static array(int)|int
+   char_number_to_utf8_byte_index(array(int)|int c,string wide)
+{  // arrays go element by element; index 0 maps to 0 without encoding
+   return arrayp(c) ? map(c,char_number_to_utf8_byte_index,wide)
+      : (c && strlen(string_to_utf8(wide[..c-1])));
+}
+
+
+//! Same as Widestring,
+//! but will be studied to match faster; useful if you're doing
+//! many matches on the same pattern
+
+class StudiedWidestring
+{
+   inherit Widestring;
+
+   // runs the inherited create, then calls study() on the new object
+   void create(string pattern,void|int options,void|object table)
+   {
+      ::create(pattern,options,table);
+      study();
+   }
+
+   string _sprintf(int t,mapping fum)
+   {
+      return t=='t' ? "Regexp.PCRE.StudiedWidestring" : ::_sprintf(t,fum);
+   }
+}
+
+#define GOOD StudiedWidestring
+#define QUICK Widestring
+#else
+#define GOOD Studied
+#define QUICK Plain
+#endif // buildconfig_UTF8
+
+//! Convenience function to create a suitable PCRE Regexp object;
+//! will create a StudiedWidestring from the arguments.
+//!
+//! That means the result will be able to handle widestrings,
+//! and will produce fast matchings by studying the pattern, but the
+//! widestring wrapping will on the other hand add overhead.
+//!
+//! If you need a faster regexp and don't use widestrings,
+//! create a Regexp.PCRE.Studied instead.
+//!
+//! Widestring support will not be used if the linked libpcre
+//! lacks UTF8 support. This can be tested by
+//! checking that the Regexp.PCRE.Widestring class exists.
+
+GOOD `()(string pattern,void|int options,void|object table)
+{ // GOOD is StudiedWidestring when buildconfig_UTF8 exists, else Studied
+   return GOOD(pattern,options,table);
+}
+
+// ****************************************************************
+
+int(0..1) match(string regexp, string data) {
+  return QUICK(regexp)->match(data); // a new QUICK object on every call
+}
+
+array split(string regexp, string data) {
+  return QUICK(regexp)->split(data); // a new QUICK object on every call
+}
+
+array split2(string regexp, string data) {
+  return QUICK(regexp)->split2(data); // a new QUICK object on every call
+}
+
+string replace(string regexp, string data,
+	       string|function(string:string) transform) {
+  return QUICK(regexp)->replace(data, transform); // new QUICK object per call
+}
+
+#endif // constant(_pcre)
diff --git a/src/modules/_Regexp_PCRE/pcre_glue.cmod b/src/modules/_Regexp_PCRE/pcre_glue.cmod
index 4d95fc33ad..c8e6958dd4 100644
--- a/src/modules/_Regexp_PCRE/pcre_glue.cmod
+++ b/src/modules/_Regexp_PCRE/pcre_glue.cmod
@@ -2,12 +2,12 @@
 || This file is part of Pike. For copyright information see COPYRIGHT.
 || Pike is distributed under GPL, LGPL and MPL. See the file COPYING
 || for more information.
-|| $Id: pcre_glue.cmod,v 1.7 2003/09/23 12:48:27 grubba Exp $
+|| $Id: pcre_glue.cmod,v 1.8 2003/09/24 12:46:47 mirar Exp $
 || (original author: mirar)
 */
 
 #include "global.h"
-RCSID("$Id: pcre_glue.cmod,v 1.7 2003/09/23 12:48:27 grubba Exp $");
+RCSID("$Id: pcre_glue.cmod,v 1.8 2003/09/24 12:46:47 mirar Exp $");
 #include "pcre_machine.h"
 
 #include "pike_macros.h"
@@ -78,9 +78,12 @@ PIKECLASS _pcre
 	 switch (args)
 	 {
 	    default:
-	       get_all_args("pcre->create",args,"%S%d%o",
-			    &(THIS->pattern),&options,&table);
-	       break;
+	       if (Pike_sp[2-args].type!=T_INT) /* allow NULL table */
+	       {
+		  get_all_args("pcre->create",args,"%S%d%o",
+			       &(THIS->pattern),&options,&table);
+		  break;
+	       }
 	    case 2:
 	       get_all_args("pcre->create",args,"%S%d",
 			    &(THIS->pattern),&options);
@@ -142,12 +145,13 @@ PIKECLASS _pcre
 	    return;
 
 	 case 'O':
-	    push_constant_text ("Regexp.PCRE(%O)");
+	    push_constant_text ("%t(%O)");
+	    ref_push_object(Pike_fp->current_object);
 	    if (THIS->pattern)
 	       ref_push_string(THIS->pattern);
 	    else
 	       push_undefined();
-	    f_sprintf(2);
+	    f_sprintf(3);
 	    return;
 
 	 case 's':
@@ -156,6 +160,10 @@ PIKECLASS _pcre
 	    else
 	       push_undefined();
 	    return;
+
+	 case 't':
+	    push_text("Regexp.PCRE._pcre");
+	    return;
       }
    }
 
@@ -624,7 +632,7 @@ PIKE_MODULE_INIT
    add_integer_constant("NULL",PCRE_ERROR_NULL,0);
    add_integer_constant("BADOPTION",PCRE_ERROR_BADOPTION,0);
    add_integer_constant("BADMAGIC",PCRE_ERROR_BADMAGIC,0);
-   add_integer_constant("NODE",PCRE_ERROR_UNKNOWN_NODE,0);
+   add_integer_constant("UNKNOWN_NODE",PCRE_ERROR_UNKNOWN_NODE,0);
    add_integer_constant("NOMEMORY",PCRE_ERROR_NOMEMORY,0);
    add_integer_constant("NOSUBSTRING",PCRE_ERROR_NOSUBSTRING,0);
 #ifdef PCRE_ERROR_MATCHLIMIT
diff --git a/src/modules/_Regexp_PCRE/testsuite.in b/src/modules/_Regexp_PCRE/testsuite.in
index 8dad7c7f95..33e8e2c4b4 100644
--- a/src/modules/_Regexp_PCRE/testsuite.in
+++ b/src/modules/_Regexp_PCRE/testsuite.in
@@ -1,3 +1,13 @@
-cond([[ master()->resolv("_Regexp_PCRE") ]],
+cond([[ master()->resolv("Regexp.PCRE.Plain") ]],
 [[
+   test_eq(Regexp.PCRE("b[^-]*m")->replace1("abam-boom-fooabadoom","gurka"),
+           "agurka-boom-fooabadoom")
+   test_eq(Regexp.PCRE("b[^-]*m")->replace("abam-boom-fooabadoom","gurka"),
+           "agurka-gurka-fooagurka")
+]])
+
+cond([[ master()->resolv("Regexp.PCRE.Widestring") ]],
+[[
+   test_eq(Regexp.PCRE("\1234[^-]*m")->replace("a\1234\567m-\1234oom-fooa\1234adoom","g\1234rka"),
+           "ag\1234rka-g\1234rka-fooag\1234rka")
 ]])
-- 
GitLab