From 4813f5e1a35accdaee2c6ea43643beaa8d6d99c4 Mon Sep 17 00:00:00 2001 From: "Mirar (Pontus Hagland)" <pike@sort.mirar.org> Date: Wed, 24 Sep 2003 14:46:47 +0200 Subject: [PATCH] Regexp.PCRE starting to look useful; pike glue added to module.pmod.in Rev: src/modules/_Regexp_PCRE/configure.in:1.6 Rev: src/modules/_Regexp_PCRE/module.pmod.in:1.1 Rev: src/modules/_Regexp_PCRE/pcre_glue.cmod:1.8 Rev: src/modules/_Regexp_PCRE/testsuite.in:1.2 --- .gitattributes | 1 + src/modules/_Regexp_PCRE/configure.in | 4 +- src/modules/_Regexp_PCRE/module.pmod.in | 389 ++++++++++++++++++++++++ src/modules/_Regexp_PCRE/pcre_glue.cmod | 24 +- src/modules/_Regexp_PCRE/testsuite.in | 12 +- 5 files changed, 419 insertions(+), 11 deletions(-) create mode 100644 src/modules/_Regexp_PCRE/module.pmod.in diff --git a/.gitattributes b/.gitattributes index a55ab58b77..107fbb3624 100644 --- a/.gitattributes +++ b/.gitattributes @@ -693,6 +693,7 @@ testfont binary /src/modules/_Regexp_PCRE/Makefile.in foreign_ident /src/modules/_Regexp_PCRE/acconfig.h foreign_ident /src/modules/_Regexp_PCRE/configure.in foreign_ident +/src/modules/_Regexp_PCRE/module.pmod.in foreign_ident /src/modules/_Regexp_PCRE/pcre_glue.cmod foreign_ident /src/modules/_Roxen/Makefile.in foreign_ident /src/modules/_Roxen/acconfig.h foreign_ident diff --git a/src/modules/_Regexp_PCRE/configure.in b/src/modules/_Regexp_PCRE/configure.in index e0e1abda33..05fb115863 100644 --- a/src/modules/_Regexp_PCRE/configure.in +++ b/src/modules/_Regexp_PCRE/configure.in @@ -1,11 +1,11 @@ # -# $Id: configure.in,v 1.5 2003/09/23 12:46:59 grubba Exp $ +# $Id: configure.in,v 1.6 2003/09/24 12:46:47 mirar Exp $ # AC_INIT(pcre_glue.cmod) AC_CONFIG_HEADER(pcre_machine.h) AC_ARG_WITH(libpcre, [ --with(out)-libpcre Support Regexp.PCRE],[],[with_libpcre=yes]) -AC_MODULE_INIT() +AC_MODULE_INIT(_Regexp_PCRE) PIKE_FEATURE_WITHOUT(Regexp.PCRE) diff --git a/src/modules/_Regexp_PCRE/module.pmod.in b/src/modules/_Regexp_PCRE/module.pmod.in new file mode 
100644 index 0000000000..88ff24deb1 --- /dev/null +++ b/src/modules/_Regexp_PCRE/module.pmod.in @@ -0,0 +1,389 @@ +// -*- Pike -*- +// $Id: module.pmod.in,v 1.1 2003/09/24 12:46:47 mirar Exp $ +// original author: mirar + +#pike __REAL_VERSION__ + +inherit @module@; // C-module + +// _Regexp_PCRE may contain other useful things besides _pcre, +// so don't stop compiling just because _pcre is missing: +#if constant(@module@._pcre) // everything inside this #if builds on _pcre + +//! The main regexp class. Will provide anything needed +//! for matching regexps. +//! +//! There are subclasses that add wrappers for widestrings, +//! and that study (optimize) the regexp pattern. + +class Plain +{ + inherit _pcre; + +/***************************************************************/ + + inline int(0..0) handle_exec_error(int error_no) // NOMATCH gives UNDEFINED; every other exec() error code is thrown + { + switch (error_no) + { + case ERROR.NOMATCH: + return UNDEFINED; + case ERROR.NOMEMORY: + error("out of memory in exec() (ERROR.NOMEMORY)\n"); + default: + error("error returned from exec: %s\n", + ([ERROR.NULL :"ERROR.NULL", + ERROR.BADOPTION:"ERROR.BADOPTION", + ERROR.BADMAGIC :"ERROR.BADMAGIC", + ERROR.UNKNOWN_NODE:"ERROR.UNKNOWN_NODE", + ERROR.MATCHLIMIT :"ERROR.MATCHLIMIT", + ERROR.CALLOUT :"ERROR.CALLOUT"])[error_no] + ||sprintf("unknown error: %d\n",error_no)); + } + } + +//! Matches a subject against the pattern, +//! returns an array where the first element is the whole match, +//! and the subsequent ones are the matching subpatterns. +//! Returns 0 if there was no match. +//! +//! example: +//! > Regexp.PCRE.Plain("i\(.*\) is \(.*\)u")->split2("pike is fun"); +//! Result: ({ +//! "ike is fu", +//! "ke", +//! "f" +//! }) + + array(string)|int(0..0) split2(string subject,void|int startoffset) + { + array(int)|int v=exec(subject,startoffset); + if (intp(v)) return handle_exec_error([int]v); + return split_subject(subject,[array(int)]v); + } + +//! Matches a subject against the pattern, +//! compatible with the old split method: +//! 
returns an array of the subpatterns, +//! or, if there are no subpatterns but still a match, ({0}). +//! Returns 0 if there was no match. +//! +//! example: +//! > Regexp.PCRE.Plain("i\(.*\) is \(.*\)u")->split("pike is fun"); +//! (1) Result: ({ +//! "ke", +//! "f" +//! }) +//! > Regexp.PCRE.Plain("is fun")->split("pike is fun"); +//! (4) Result: ({ +//! 0 +//! }) + + array(string)|int(0..0) split(string subject,void|int startoffset) + { + array(string)|int(0..0) v=split2(subject,startoffset); + if (intp(v)) return v; + if (sizeof(v)>1) return v[1..]; + else return copy_value(({0})); + } + +//! Returns true (1) if a match is found, +//! false (0) otherwise. +//! +//! example: +//! > Regexp.PCRE.Plain("is fun")->match("pike is fun"); +//! Result: 1 +//! > Regexp.PCRE.Plain("is fun")->match("pike isn't fun"); +//! Result: 0 + + int(0..1) match(string subject,void|int startoffset) + { + array(int)|int v=exec(subject,startoffset); + if (intp(v)) return handle_exec_error([int]v); + return 1; + } + +//! Replaces all occurrences of a pattern in a subject; +//! callbacks and replacements will be from the first occurrence, +//! not from the last as in Regexp.Builtin.replace. +//! +//! example: +//! > Regexp.PCRE("b[^-]*m")->replace("abam-boom-fooabadoom","gurka"); +//! Result: "agurka-gurka-fooagurka" +//! > Regexp.PCRE("b[^-]*m")->replace("abam-boom-fooabadoom", +//! lambda(string s) { werror("%O\n",s); return "gurka"; }); +//! "bam" +//! "boom" +//! "badoom" +//! Result: "agurka-gurka-fooagurka" + + string replace(string subject,string|function(string:string) with) + { + int i=0; + String.Buffer res=String.Buffer(); + for (;;) + { + array(int)|int v=exec(subject,i); + + if (intp(v) && !handle_exec_error([int]v)) break; + + if (v[0]>i) res->add(subject[i..v[0]-1]); + + if (stringp(with)) res->add(with); + else res->add(with(subject[v[0]..v[1]-1])); + + i=v[1]; // NOTE(review): a zero-length match (v[0]==v[1]==i) leaves i unchanged, so the loop would not advance -- confirm and guard + } + + res->add(subject[i..]); + + return (string)res; + } + +//! 
replace one (first) occurance of a pattern in a subject +//! +//! example: +//! > Regexp.PCRE("b[^-]*m")->replace1("abam-boom-fooabadoom","gurka"); +//! Result: "agurka-boom-fooabadoom" + + string replace1(string subject,string|function(string:string) with) + { + array(int)|int v=exec(subject,0); + if (intp(v) && !handle_exec_error([int]v)) + return subject; + + String.Buffer res=String.Buffer(); + if (v[0]>0) res->add(subject[0..v[0]-1]); + + if (stringp(with)) res->add(with); + else res->add(with(subject[v[0]..v[1]-1])); + + res->add(subject[v[1]..]); + + return (string)res; + } + +//! Will give a callback for each match in a subject. +//! The callback gets an array with the match and its subpatterns +//! as first argument, and the exec result array as second argument. +//! +//! Returns the called object. +//! +//! example: +//! > Regexp.PCRE("b(a*)([^-\1234]*)(\1234*)m") +//! ->matchall("abam-boom-fooabado\1234m", +//! lambda(mixed s) { werror("%O\n",s); return "gurka"; }); +//! ({ /* 4 elements */ +//! "bam", +//! "a", +//! "", +//! "" +//! }) +//! ({ /* 4 elements */ +//! "boom", +//! "", +//! "oo", +//! "" +//! }) +//! ({ /* 4 elements */ +//! "bado\1234m", +//! "a", +//! "do", +//! "\1234" +//! }) +//! Result: Regexp.PCRE.StudiedWidestring("b(a*)([^-Ê\234]*)(Ê\234*)m") + + this_program matchall(string subject, + function(array(string)|void, + array(int)|void:mixed|void) callback) + { + int i=0; + for (;;) + { + array(int)|int v=exec(subject,i); + if (intp(v) && !handle_exec_error([int]v)) + return this_object(); + callback(split_subject(subject,v),v); + i=v[1]; // NOTE(review): a zero-length match (v[0]==v[1]==i) leaves i unchanged, so the loop would not advance -- confirm and guard + } + } + +/**** "internal" ***********************************************/ + + string _sprintf(int t,mapping fum) + { + if (t=='t') return "Regexp.PCRE.Plain"; // 't' yields the class name; every other format goes to the inherited _sprintf + return ::_sprintf(t,fum); + } + + string cast(string to) + { + if (to=="string") return pattern; // only a cast to string is supported; it gives the pattern + else error("can't cast %t to %O\n",this_object(),to); + } +} + +//! Same as Plain, +//! but will be studied to match faster; useful if you're doing +//! 
many matches on the same pattern + +class Studied +{ + inherit Plain; + + void create(string pattern,void|int options,void|object table) + { + ::create(pattern,options,table); + study(); + } + + string _sprintf(int t,mapping fum) + { + if (t=='t') return "Regexp.PCRE.Studied"; + return ::_sprintf(t,fum); + } +} + +#if constant(@module@.buildconfig_UTF8) +#define PCRE_GOT_WIDESTRINGS + +//! Wrapper class around Plain that allows widestring +//! patterns and subjects. +//! +//! Widestring support and this class will not be available if the +//! linked libpcre lacks UTF8 support. + +class Widestring +{ + inherit Plain; + + void create(string pattern,void|int options,void|object table) + { + ::create(string_to_utf8(pattern),options|OPTION.UTF8,table); // the pattern is passed on UTF-8 encoded, with OPTION.UTF8 always set + } + + string _sprintf(int t,mapping fum) + { + if (t=='t') return "Regexp.PCRE.Widestring"; + return ::_sprintf(t,fum); + } + + array(string)|int(0..0) split2(string subject,void|int startoffset) + { + string subject_utf8=string_to_utf8(subject); + + if (startoffset && subject_utf8!=subject) // startoffset counts characters; convert it to a byte offset when encoding changed the subject + startoffset=char_number_to_utf8_byte_index(startoffset,subject); + + array(int)|int v=::exec(subject_utf8,startoffset); // the inherited exec works on the UTF-8 bytes + + if (intp(v)) return handle_exec_error([int]v); + return map(split_subject(subject_utf8,[array(int)]v), + utf8_to_string); // decode every matched piece back from UTF-8 + } + +//! The exec function is wrapped so that startoffset and the returned +//! offsets are character indexes in the widestring, not UTF-8 byte indexes. + + array(int)|int exec(string subject,void|int startoffset) + { + string subject_utf8=string_to_utf8(subject); + + if (startoffset && subject_utf8!=subject) // character offset -> UTF-8 byte offset + startoffset=char_number_to_utf8_byte_index(startoffset,subject); + + array(int)|int v=::exec(subject_utf8,startoffset); + + if (arrayp(v)) + if (subject_utf8!=subject) + return utf8_byte_index_to_char_number(v,subject_utf8); // UTF-8 byte offsets -> character offsets + else + return v; // encoding changed nothing, so the offsets are returned as they are + + return v; // an integer (error code) is passed through unchanged + } +} + +// really slow helper functions -- FIXME! 
and add to String or something +static array(int)|int + utf8_byte_index_to_char_number(array(int)|int c,string utf8) +{ + if (arrayp(c)) return map(c,utf8_byte_index_to_char_number,utf8); + return c && strlen(utf8_to_string(utf8[..c-1])); +} + +static array(int)|int + char_number_to_utf8_byte_index(array(int)|int c,string wide) +{ + if (arrayp(c)) return map(c,char_number_to_utf8_byte_index,wide); + return c && strlen(string_to_utf8(wide[..c-1])); +} + + +//! Same as Widestring, +//! but will be studied to match faster; useful if you're doing +//! many matches on the same pattern. + +class StudiedWidestring +{ + inherit Widestring; + + void create(string pattern,void|int options,void|object table) + { + ::create(pattern,options,table); + study(); // study once, right after the pattern has been compiled + } + + string _sprintf(int t,mapping fum) + { + if (t=='t') return "Regexp.PCRE.StudiedWidestring"; + return ::_sprintf(t,fum); + } +} + +#define GOOD StudiedWidestring +#define QUICK Widestring +#else +#define GOOD Studied +#define QUICK Plain +#endif // buildconfig_UTF8 + +//! Convenience function to create a suitable PCRE Regexp object; +//! will create a StudiedWidestring (or, without UTF8 support, a Studied) from the arguments. +//! +//! That means the result will be able to handle widestrings, +//! and will produce fast matchings by studying the pattern, but the +//! widestring wrapping will on the other hand add overhead. +//! +//! If you need a faster regexp and don't use widestrings, +//! create a Regexp.PCRE.Studied instead. +//! +//! Widestring support will not be used if the linked libpcre +//! lacks UTF8 support. This can be tested by +//! checking that the Regexp.PCRE.Widestring class exists. 
+ +GOOD `()(string pattern,void|int options,void|object table) +{ + return GOOD(pattern,options,table); +} + +// **************************************************************** + +int(0..1) match(string regexp, string data) { // one-shot helper: builds a QUICK object from regexp on every call + return QUICK(regexp)->match(data); +} + +array split(string regexp, string data) { // one-shot helper: see split() in the regexp class + return QUICK(regexp)->split(data); +} + +array split2(string regexp, string data) { // one-shot helper: see split2() in the regexp class + return QUICK(regexp)->split2(data); +} + +string replace(string regexp, string data, + string|function(string:string) transform) { // one-shot helper: see replace() in the regexp class + return QUICK(regexp)->replace(data, transform); +} + +#endif // constant(_pcre) diff --git a/src/modules/_Regexp_PCRE/pcre_glue.cmod b/src/modules/_Regexp_PCRE/pcre_glue.cmod index 4d95fc33ad..c8e6958dd4 100644 --- a/src/modules/_Regexp_PCRE/pcre_glue.cmod +++ b/src/modules/_Regexp_PCRE/pcre_glue.cmod @@ -2,12 +2,12 @@ || This file is part of Pike. For copyright information see COPYRIGHT. || Pike is distributed under GPL, LGPL and MPL. See the file COPYING || for more information. 
-|| $Id: pcre_glue.cmod,v 1.7 2003/09/23 12:48:27 grubba Exp $ +|| $Id: pcre_glue.cmod,v 1.8 2003/09/24 12:46:47 mirar Exp $ || (original author: mirar) */ #include "global.h" -RCSID("$Id: pcre_glue.cmod,v 1.7 2003/09/23 12:48:27 grubba Exp $"); +RCSID("$Id: pcre_glue.cmod,v 1.8 2003/09/24 12:46:47 mirar Exp $"); #include "pcre_machine.h" #include "pike_macros.h" @@ -78,9 +78,12 @@ PIKECLASS _pcre switch (args) { default: - get_all_args("pcre->create",args,"%S%d%o", - &(THIS->pattern),&options,&table); - break; + if (Pike_sp[2-args].type!=T_INT) /* allow NULL table */ + { + get_all_args("pcre->create",args,"%S%d%o", + &(THIS->pattern),&options,&table); + break; + } case 2: get_all_args("pcre->create",args,"%S%d", &(THIS->pattern),&options); @@ -142,12 +145,13 @@ PIKECLASS _pcre return; case 'O': - push_constant_text ("Regexp.PCRE(%O)"); + push_constant_text ("%t(%O)"); + ref_push_object(Pike_fp->current_object); if (THIS->pattern) ref_push_string(THIS->pattern); else push_undefined(); - f_sprintf(2); + f_sprintf(3); return; case 's': @@ -156,6 +160,10 @@ PIKECLASS _pcre else push_undefined(); return; + + case 't': + push_text("Regexp.PCRE._pcre"); + return; } } @@ -624,7 +632,7 @@ PIKE_MODULE_INIT add_integer_constant("NULL",PCRE_ERROR_NULL,0); add_integer_constant("BADOPTION",PCRE_ERROR_BADOPTION,0); add_integer_constant("BADMAGIC",PCRE_ERROR_BADMAGIC,0); - add_integer_constant("NODE",PCRE_ERROR_UNKNOWN_NODE,0); + add_integer_constant("UNKNOWN_NODE",PCRE_ERROR_UNKNOWN_NODE,0); add_integer_constant("NOMEMORY",PCRE_ERROR_NOMEMORY,0); add_integer_constant("NOSUBSTRING",PCRE_ERROR_NOSUBSTRING,0); #ifdef PCRE_ERROR_MATCHLIMIT diff --git a/src/modules/_Regexp_PCRE/testsuite.in b/src/modules/_Regexp_PCRE/testsuite.in index 8dad7c7f95..33e8e2c4b4 100644 --- a/src/modules/_Regexp_PCRE/testsuite.in +++ b/src/modules/_Regexp_PCRE/testsuite.in @@ -1,3 +1,13 @@ -cond([[ master()->resolv("_Regexp_PCRE") ]], +cond([[ master()->resolv("Regexp.PCRE.Plain") ]], [[ + 
test_eq(Regexp.PCRE("b[^-]*m")->replace1("abam-boom-fooabadoom","gurka"), + "agurka-boom-fooabadoom") + test_eq(Regexp.PCRE("b[^-]*m")->replace("abam-boom-fooabadoom","gurka"), + "agurka-gurka-fooagurka") +]]) + +cond([[ master()->resolv("Regexp.PCRE.Widestring") ]], +[[ + test_eq(Regexp.PCRE("\1234[^-]*m")->replace("a\1234\567m-\1234oom-fooa\1234adoom","g\1234rka"), + "ag\1234rka-g\1234rka-fooag\1234rka") ]]) -- GitLab