From 98fdec72b78fce23cdb3195f47797d87ebda9cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fredrik=20H=C3=BCbinette=20=28Hubbe=29?= <hubbe@hubbe.net> Date: Fri, 4 Dec 1998 20:01:04 -0800 Subject: [PATCH] first version of XML parser Rev: src/modules/spider/Makefile.in:1.15 Rev: src/modules/spider/spider.c:1.77 Rev: src/modules/spider/xml.c:1.1 --- src/modules/spider/Makefile.in | 4 +- src/modules/spider/spider.c | 5 +- src/modules/spider/xml.c | 1538 ++++++++++++++++++++++++++++++++ 3 files changed, 1544 insertions(+), 3 deletions(-) create mode 100644 src/modules/spider/xml.c diff --git a/src/modules/spider/Makefile.in b/src/modules/spider/Makefile.in index 0d17fd7dcb..3e2f9d379c 100644 --- a/src/modules/spider/Makefile.in +++ b/src/modules/spider/Makefile.in @@ -1,7 +1,7 @@ -# $Id: Makefile.in,v 1.14 1998/03/28 13:53:45 grubba Exp $ +# $Id: Makefile.in,v 1.15 1998/12/05 04:01:02 hubbe Exp $ SRCDIR=@srcdir@ VPATH=@srcdir@:@srcdir@/../..:../.. -OBJS=spider.o discdate.o stardate.o dumudp.o accesseddb.o +OBJS=spider.o discdate.o stardate.o dumudp.o accesseddb.o xml.o MODULE_LDFLAGS=@LDFLAGS@ # streamed_parser.o diff --git a/src/modules/spider/spider.c b/src/modules/spider/spider.c index 7f7b4d0ccf..e94d752c43 100644 --- a/src/modules/spider/spider.c +++ b/src/modules/spider/spider.c @@ -43,7 +43,7 @@ #include "threads.h" #include "operators.h" -RCSID("$Id: spider.c,v 1.76 1998/11/22 11:08:31 hubbe Exp $"); +RCSID("$Id: spider.c,v 1.77 1998/12/05 04:01:03 hubbe Exp $"); #ifdef HAVE_PWD_H #include <pwd.h> @@ -1299,6 +1299,7 @@ static struct program *streamed_parser; #endif /* ENABLE_STREAMED_PARSER */ extern void init_udp(void); +extern void init_xml(void); /* Hohum. Here we go. This is try number three for a more optimized Roxen. */ @@ -1525,6 +1526,8 @@ void pike_module_init(void) streamed_parser = end_program(); add_program_constant("streamed_parser", streamed_parser,0); #endif /* ENABLE_STREAMED_PARSER */ + + init_xml(); } diff --git a/src/modules/spider/xml.c b/src/modules/spider/xml.c new file mode 100644 index 0000000000..997fc852d6 --- /dev/null +++ b/src/modules/spider/xml.c @@ -0,0 +1,1538 @@ +#include "global.h" + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif /* HAVE_CONFIG_H */ + +#include "pike_macros.h" +#include "stralloc.h" +#include "object.h" +#include "interpret.h" +#include "mapping.h" +#include "program.h" +#include "array.h" +#include "builtin_functions.h" +#include "module_support.h" +#include "operators.h" +#include "error.h" + +/* #define VERBOSE_XMLDEBUG */ + +struct xmldata +{ + PCHARP datap; + INT32 len; + INT32 pos; + struct svalue *func; + struct svalue *extra_args; + INT32 num_extra_args; + TYPE_FIELD extra_arg_types; +}; + +struct xmlobj +{ + struct mapping *entities; +}; + +#define THIS ((struct xmlobj *)(fp->current_storage)) + + +static int isBaseChar(INT32 c) +{ + switch(c>>8) + { + case 0x00: + if(c>=0x0041 && c<=0x005A) return 1; + if(c>=0x0061 && c<=0x007A) return 1; + if(c>=0x00C0 && c<=0x00D6) return 1; + if(c>=0x00D8 && c<=0x00F6) return 1; + if(c>=0x00F8 && c<=0x00FF) return 1; + break; + + case 0x01: + if(c>=0x0100 && c<=0x0131) return 1; + if(c>=0x0134 && c<=0x013E) return 1; + if(c>=0x0141 && c<=0x0148) return 1; + if(c>=0x014A && c<=0x017E) return 1; + if(c>=0x0180 && c<=0x01C3) return 1; + if(c>=0x01CD && c<=0x01F0) return 1; + if(c>=0x01F4 && c<=0x01F5) return 1; + if(c>=0x01FA && c<=0x0217) return 1; + break; + + case 0x002: + if(c>=0x0250 && c<=0x02A8) return 1; + if(c>=0x02BB && c<=0x02C1) return 1; + break; + + case 0x03: + if(c==0x0386) return 1; + if(c>=0x0388 && c<=0x038A) return 1; + if(c==0x038C) return 1; + if(c>=0x038E && c<=0x03A1) return 1; + if(c>=0x03A3 && c<=0x03CE) return 1; + if(c>=0x03D0 && c<=0x03D6) return 1; + if(c==0x03DA) return 1; + if(c==0x03DC) return 1; + if(c==0x03DE) return 1; + if(c==0x03E0) return 1; + if(c>=0x03E2 && c<=0x03F3) return 1; + break; + + case 0x04: + if(c>=0x0401 && c<=0x040C) return 1; + if(c>=0x040E && c<=0x044F) return 1; + if(c>=0x0451 && c<=0x045C) return 1; + if(c>=0x045E && c<=0x0481) return 1; + if(c>=0x0490 && c<=0x04C4) return 1; + if(c>=0x04C7 && c<=0x04C8) return 1; + if(c>=0x04CB && c<=0x04CC) return 1; + if(c>=0x04D0 && c<=0x04EB) return 1; + if(c>=0x04EE && c<=0x04F5) return 1; + if(c>=0x04F8 && c<=0x04F9) return 1; + break; + + case 0x05: + if(c>=0x0531 && c<=0x0556) return 1; + if(c==0x0559) return 1; + if(c>=0x0561 && c<=0x0586) return 1; + if(c>=0x05D0 && c<=0x05EA) return 1; + if(c>=0x05F0 && c<=0x05F2) return 1; + break; + + case 0x06: + if(c>=0x0621 && c<=0x063A) return 1; + if(c>=0x0641 && c<=0x064A) return 1; + if(c>=0x0671 && c<=0x06B7) return 1; + if(c>=0x06BA && c<=0x06BE) return 1; + if(c>=0x06C0 && c<=0x06CE) return 1; + if(c>=0x06D0 && c<=0x06D3) return 1; + if(c==0x06D5) return 1; + if(c>=0x06E5 && c<=0x06E6) return 1; + break; + + case 0x09: + if(c>=0x0905 && c<=0x0939) return 1; + if(c==0x093D) return 1; + if(c>=0x0958 && c<=0x0961) return 1; + if(c>=0x0985 && c<=0x098C) return 1; + if(c>=0x098F && c<=0x0990) return 1; + if(c>=0x0993 && c<=0x09A8) return 1; + if(c>=0x09AA && c<=0x09B0) return 1; + if(c==0x09B2) return 1; + if(c>=0x09B6 && c<=0x09B9) return 1; + if(c>=0x09DC && c<=0x09DD) return 1; + if(c>=0x09DF && c<=0x09E1) return 1; + if(c>=0x09F0 && c<=0x09F1) return 1; + break; + + case 0x0a: + if(c>=0x0A05 && c<=0x0A0A) return 1; + if(c>=0x0A0F && c<=0x0A10) return 1; + if(c>=0x0A13 && c<=0x0A28) return 1; + if(c>=0x0A2A && c<=0x0A30) return 1; + if(c>=0x0A32 && c<=0x0A33) return 1; + if(c>=0x0A35 && c<=0x0A36) return 1; + if(c>=0x0A38 && c<=0x0A39) return 1; + if(c>=0x0A59 && c<=0x0A5C) return 1; + if(c==0x0A5E) return 1; + if(c>=0x0A72 && c<=0x0A74) return 1; + if(c>=0x0A85 && c<=0x0A8B) return 1; + if(c==0x0A8D) return 1; + if(c>=0x0A8F && c<=0x0A91) return 1; + if(c>=0x0A93 && c<=0x0AA8) return 1; + if(c>=0x0AAA && c<=0x0AB0) return 1; + if(c>=0x0AB2 && c<=0x0AB3) return 1; + if(c>=0x0AB5 && c<=0x0AB9) return 1; + if(c==0x0ABD) return 1; + if(c==0x0AE0) return 1; + break; + + case 0x0b: + if(c>=0x0B05 && c<=0x0B0C) return 1; + if(c>=0x0B0F && c<=0x0B10) return 1; + if(c>=0x0B13 && c<=0x0B28) return 1; + if(c>=0x0B2A && c<=0x0B30) return 1; + if(c>=0x0B32 && c<=0x0B33) return 1; + if(c>=0x0B36 && c<=0x0B39) return 1; + if(c==0x0B3D) return 1; + if(c>=0x0B5C && c<=0x0B5D) return 1; + if(c>=0x0B5F && c<=0x0B61) return 1; + if(c>=0x0B85 && c<=0x0B8A) return 1; + if(c>=0x0B8E && c<=0x0B90) return 1; + if(c>=0x0B92 && c<=0x0B95) return 1; + if(c>=0x0B99 && c<=0x0B9A) return 1; + if(c==0x0B9C) return 1; + if(c>=0x0B9E && c<=0x0B9F) return 1; + if(c>=0x0BA3 && c<=0x0BA4) return 1; + if(c>=0x0BA8 && c<=0x0BAA) return 1; + if(c>=0x0BAE && c<=0x0BB5) return 1; + if(c>=0x0BB7 && c<=0x0BB9) return 1; + break; + + case 0x0c: + if(c>=0x0C05 && c<=0x0C0C) return 1; + if(c>=0x0C0E && c<=0x0C10) return 1; + if(c>=0x0C12 && c<=0x0C28) return 1; + if(c>=0x0C2A && c<=0x0C33) return 1; + if(c>=0x0C35 && c<=0x0C39) return 1; + if(c>=0x0C60 && c<=0x0C61) return 1; + if(c>=0x0C85 && c<=0x0C8C) return 1; + if(c>=0x0C8E && c<=0x0C90) return 1; + if(c>=0x0C92 && c<=0x0CA8) return 1; + if(c>=0x0CAA && c<=0x0CB3) return 1; + if(c>=0x0CB5 && c<=0x0CB9) return 1; + if(c==0x0CDE) return 1; + if(c>=0x0CE0 && c<=0x0CE1) return 1; + break; + + case 0x0d: + if(c>=0x0D05 && c<=0x0D0C) return 1; + if(c>=0x0D0E && c<=0x0D10) return 1; + if(c>=0x0D12 && c<=0x0D28) return 1; + if(c>=0x0D2A && c<=0x0D39) return 1; + if(c>=0x0D60 && c<=0x0D61) return 1; + break; + + case 0x0e: + if(c>=0x0E01 && c<=0x0E2E) return 1; + if(c==0x0E30) return 1; + if(c>=0x0E32 && c<=0x0E33) return 1; + if(c>=0x0E40 && c<=0x0E45) return 1; + if(c>=0x0E81 && c<=0x0E82) return 1; + if(c==0x0E84) return 1; + if(c>=0x0E87 && c<=0x0E88) return 1; + if(c==0x0E8A) return 1; + if(c==0x0E8D) return 1; + if(c>=0x0E94 && c<=0x0E97) return 1; + if(c>=0x0E99 && c<=0x0E9F) return 1; + if(c>=0x0EA1 && c<=0x0EA3) return 1; + if(c==0x0EA5) return 1; + if(c==0x0EA7) return 1; + if(c>=0x0EAA && c<=0x0EAB) return 1; + if(c>=0x0EAD && c<=0x0EAE) return 1; + if(c==0x0EB0) return 1; + if(c>=0x0EB2 && c<=0x0EB3) return 1; + if(c==0x0EBD) return 1; + if(c>=0x0EC0 && c<=0x0EC4) return 1; + break; + + case 0x0f: + if(c>=0x0F40 && c<=0x0F47) return 1; + if(c>=0x0F49 && c<=0x0F69) return 1; + break; + + case 0x10: + if(c>=0x10A0 && c<=0x10C5) return 1; + if(c>=0x10D0 && c<=0x10F6) return 1; + break; + + case 0x11: + if(c==0x1100) return 1; + if(c>=0x1102 && c<=0x1103) return 1; + if(c>=0x1105 && c<=0x1107) return 1; + if(c==0x1109) return 1; + if(c>=0x110B && c<=0x110C) return 1; + if(c>=0x110E && c<=0x1112) return 1; + if(c==0x113C) return 1; + if(c==0x113E) return 1; + if(c==0x1140) return 1; + if(c==0x114C) return 1; + if(c==0x114E) return 1; + if(c==0x1150) return 1; + if(c>=0x1154 && c<=0x1155) return 1; + if(c==0x1159) return 1; + if(c>=0x115F && c<=0x1161) return 1; + if(c==0x1163) return 1; + if(c==0x1165) return 1; + if(c==0x1167) return 1; + if(c==0x1169) return 1; + if(c>=0x116D && c<=0x116E) return 1; + if(c>=0x1172 && c<=0x1173) return 1; + if(c==0x1175) return 1; + if(c==0x119E) return 1; + if(c==0x11A8) return 1; + if(c==0x11AB) return 1; + if(c>=0x11AE && c<=0x11AF) return 1; + if(c>=0x11B7 && c<=0x11B8) return 1; + if(c==0x11BA) return 1; + if(c>=0x11BC && c<=0x11C2) return 1; + if(c==0x11EB) return 1; + if(c==0x11F0) return 1; + if(c==0x11F9) return 1; + break; + + case 0x1e: + if(c>=0x1E00 && c<=0x1E9B) return 1; + if(c>=0x1EA0 && c<=0x1EF9) return 1; + break; + + case 0x1f: + if(c>=0x1F00 && c<=0x1F15) return 1; + if(c>=0x1F18 && c<=0x1F1D) return 1; + if(c>=0x1F20 && c<=0x1F45) return 1; + if(c>=0x1F48 && c<=0x1F4D) return 1; + if(c>=0x1F50 && c<=0x1F57) return 1; + if(c==0x1F59) return 1; + if(c==0x1F5B) return 1; + if(c==0x1F5D) return 1; + if(c>=0x1F5F && c<=0x1F7D) return 1; + if(c>=0x1F80 && c<=0x1FB4) return 1; + if(c>=0x1FB6 && c<=0x1FBC) return 1; + if(c==0x1FBE) return 1; + if(c>=0x1FC2 && c<=0x1FC4) return 1; + if(c>=0x1FC6 && c<=0x1FCC) return 1; + if(c>=0x1FD0 && c<=0x1FD3) return 1; + if(c>=0x1FD6 && c<=0x1FDB) return 1; + if(c>=0x1FE0 && c<=0x1FEC) return 1; + if(c>=0x1FF2 && c<=0x1FF4) return 1; + if(c>=0x1FF6 && c<=0x1FFC) return 1; + break; + + case 0x21: + if(c==0x2126) return 1; + if(c>=0x212A && c<=0x212B) return 1; + if(c==0x212E) return 1; + if(c>=0x2180 && c<=0x2182) return 1; + break; + + case 0x30: + if(c>=0x3041 && c<=0x3094) return 1; + if(c>=0x30A1 && c<=0x30FA) return 1; + if(c>=0x3105 && c<=0x312C) return 1; + break; + + default: + if(c>=0xAC00 && c<=0xD7A3) return 1; + } + return 0; +} + +static INLINE int isIdeographic(INT32 c) +{ + if(c>=0x4E00 && c<=0x9FA5) return 1; + if(c==0x3007) return 1; + if(c>=0x3021 && c<=0x3029) return 1; + return 0; +} + +static INLINE int isLetter(INT32 c) +{ + return isBaseChar(c) || isIdeographic(c); +} + + +static int isCombiningChar(INT32 c) +{ + switch(c>>8) + { + case 0x03: + if(c>=0x0300 && c<=0x0345) return 1; + if(c>=0x0360 && c<=0x0361) return 1; + break; + + case 0x04: + if(c>=0x0483 && c<=0x0486) return 1; + break; + + case 0x05: + if(c>=0x0591 && c<=0x05A1) return 1; + if(c>=0x05A3 && c<=0x05B9) return 1; + if(c>=0x05BB && c<=0x05BD) return 1; + if(c==0x05BF) return 1; + if(c>=0x05C1 && c<=0x05C2) return 1; + if(c==0x05C4) return 1; + break; + + case 0x06: + if(c>=0x064B && c<=0x0652) return 1; + if(c==0x0670) return 1; + if(c>=0x06D6 && c<=0x06DC) return 1; + if(c>=0x06DD && c<=0x06DF) return 1; + if(c>=0x06E0 && c<=0x06E4) return 1; + if(c>=0x06E7 && c<=0x06E8) return 1; + if(c>=0x06EA && c<=0x06ED) return 1; + break; + + case 0x09: + if(c>=0x0901 && c<=0x0903) return 1; + if(c==0x093C) return 1; + if(c>=0x093E && c<=0x094C) return 1; + if(c==0x094D) return 1; + if(c>=0x0951 && c<=0x0954) return 1; + if(c>=0x0962 && c<=0x0963) return 1; + if(c>=0x0981 && c<=0x0983) return 1; + if(c==0x09BC) return 1; + if(c==0x09BE) return 1; + if(c==0x09BF) return 1; + if(c>=0x09C0 && c<=0x09C4) return 1; + if(c>=0x09C7 && c<=0x09C8) return 1; + if(c>=0x09CB && c<=0x09CD) return 1; + if(c==0x09D7) return 1; + if(c>=0x09E2 && c<=0x09E3) return 1; + break; + + case 0x0a: + if(c==0x0A02) return 1; + if(c==0x0A3C) return 1; + if(c==0x0A3E) return 1; + if(c==0x0A3F) return 1; + if(c>=0x0A40 && c<=0x0A42) return 1; + if(c>=0x0A47 && c<=0x0A48) return 1; + if(c>=0x0A4B && c<=0x0A4D) return 1; + if(c>=0x0A70 && c<=0x0A71) return 1; + if(c>=0x0A81 && c<=0x0A83) return 1; + if(c==0x0ABC) return 1; + if(c>=0x0ABE && c<=0x0AC5) return 1; + if(c>=0x0AC7 && c<=0x0AC9) return 1; + if(c>=0x0ACB && c<=0x0ACD) return 1; + break; + + case 0x0b: + if(c>=0x0B01 && c<=0x0B03) return 1; + if(c==0x0B3C) return 1; + if(c>=0x0B3E && c<=0x0B43) return 1; + if(c>=0x0B47 && c<=0x0B48) return 1; + if(c>=0x0B4B && c<=0x0B4D) return 1; + if(c>=0x0B56 && c<=0x0B57) return 1; + if(c>=0x0B82 && c<=0x0B83) return 1; + if(c>=0x0BBE && c<=0x0BC2) return 1; + if(c>=0x0BC6 && c<=0x0BC8) return 1; + if(c>=0x0BCA && c<=0x0BCD) return 1; + if(c==0x0BD7) return 1; + break; + + case 0x0c: + if(c>=0x0C01 && c<=0x0C03) return 1; + if(c>=0x0C3E && c<=0x0C44) return 1; + if(c>=0x0C46 && c<=0x0C48) return 1; + if(c>=0x0C4A && c<=0x0C4D) return 1; + if(c>=0x0C55 && c<=0x0C56) return 1; + if(c>=0x0C82 && c<=0x0C83) return 1; + if(c>=0x0CBE && c<=0x0CC4) return 1; + if(c>=0x0CC6 && c<=0x0CC8) return 1; + if(c>=0x0CCA && c<=0x0CCD) return 1; + if(c>=0x0CD5 && c<=0x0CD6) return 1; + break; + + case 0x0d: + if(c>=0x0D02 && c<=0x0D03) return 1; + if(c>=0x0D3E && c<=0x0D43) return 1; + if(c>=0x0D46 && c<=0x0D48) return 1; + if(c>=0x0D4A && c<=0x0D4D) return 1; + if(c==0x0D57) return 1; + break; + + case 0x0e: + if(c==0x0E31) return 1; + if(c>=0x0E34 && c<=0x0E3A) return 1; + if(c>=0x0E47 && c<=0x0E4E) return 1; + if(c==0x0EB1) return 1; + if(c>=0x0EB4 && c<=0x0EB9) return 1; + if(c>=0x0EBB && c<=0x0EBC) return 1; + if(c>=0x0EC8 && c<=0x0ECD) return 1; + break; + + case 0x0f: + if(c>=0x0F18 && c<=0x0F19) return 1; + if(c==0x0F35) return 1; + if(c==0x0F37) return 1; + if(c==0x0F39) return 1; + if(c==0x0F3E) return 1; + if(c==0x0F3F) return 1; + if(c>=0x0F71 && c<=0x0F84) return 1; + if(c>=0x0F86 && c<=0x0F8B) return 1; + if(c>=0x0F90 && c<=0x0F95) return 1; + if(c==0x0F97) return 1; + if(c>=0x0F99 && c<=0x0FAD) return 1; + if(c>=0x0FB1 && c<=0x0FB7) return 1; + if(c==0x0FB9) return 1; + break; + + case 0x20: + if(c>=0x20D0 && c<=0x20DC) return 1; + if(c==0x20E1) return 1; + break; + + case 0x30: + if(c>=0x302A && c<=0x302F) return 1; + if(c==0x3099) return 1; + if(c==0x309A) return 1; + } + return 0; +} + +static INLINE int isDigit(INT32 c) +{ + switch(c>>8) + { + case 0x00: + return c>=0x0030 && c<=0x0039; + + case 0x06: + if(c>=0x0660 && c<=0x0669) return 1; + if(c>=0x06F0 && c<=0x06F9) return 1; + break; + + case 0x09: + if(c>=0x0966 && c<=0x096F) return 1; + if(c>=0x09E6 && c<=0x09EF) return 1; + break; + + case 0x0a: + if(c>=0x0A66 && c<=0x0A6F) return 1; + if(c>=0x0AE6 && c<=0x0AEF) return 1; + break; + + case 0x0b: + if(c>=0x0B66 && c<=0x0B6F) return 1; + if(c>=0x0BE7 && c<=0x0BEF) return 1; + break; + + case 0x0c: + if(c>=0x0C66 && c<=0x0C6F) return 1; + if(c>=0x0CE6 && c<=0x0CEF) return 1; + break; + + case 0x0d: + if(c>=0x0D66 && c<=0x0D6F) return 1; + break; + + case 0x0e: + if(c>=0x0E50 && c<=0x0E59) return 1; + if(c>=0x0ED0 && c<=0x0ED9) return 1; + + case 0x0f: + if(c>=0x0F20 && c<=0x0F29) return 1; + } + return 0; +} + +static int isExtender(INT32 c) +{ + switch(c) + { + case 0x00B7: + case 0x02D0: + case 0x02D1: + case 0x0387: + case 0x0640: + case 0x0E46: + case 0x0EC6: + case 0x3005: + + case 0x3031: + case 0x3032: + case 0x3033: + case 0x3034: + case 0x3035: + + case 0x309D: + case 0x309E: + + case 0x30FC: + case 0x30FD: + case 0x30FE: + return 1; + } + return 0; +} + +static int isChar(INT32 c) +{ + + if(c <= 0xd7ff) + { + if(c>0x20 || c==0x9 || c==0x0a || c==0x0d) return 1; + }else{ + if(c <=0xffd) + { + if(c>=0xe000) return 1; + }else{ + if(c>=0x10000 && c<=0x10ffff) return 1; + } + } + return 0; +} + +static INLINE int isSpace(INT32 c) +{ + switch(c) + { + case 0x20: + case 0x09: + case 0x0d: + case 0x0a: + return 1; + } + return 0; +} + +static INLINE int isNameChar(INT32 c) +{ + return isLetter(c) || isDigit(c) || + c=='.' || c=='-' || c=='_' || c==':' || + isCombiningChar(c) || isExtender(c); +} + +static INLINE int isFirstNameChar(INT32 c) +{ + return isLetter(c) || c=='_' || c==':'; +} + +static INLINE int isHexChar(INT32 c) +{ + switch(c) + { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': case 'A': return 10; + case 'b': case 'B': return 11; + case 'c': case 'C': return 12; + case 'd': case 'D': return 13; + case 'e': case 'E': return 14; + case 'f': case 'F': return 15; + default: return -1; + } +} + + +#define PEEK(X) INDEX_PCHARP(data->datap,X) + +#ifdef VERBOSE_XMLDEBUG +#define POKE(X,Y) string_builder_putchar(&X,Y) +#define READ(Z) do { data->pos+=Z; fprintf(stderr,"Stepping %d steps to %d\n",Z,data->pos); data->len-=Z; INC_PCHARP(data->datap, Z); }while(0) +#else +#define POKE(X,Y) string_builder_putchar(&X,Y) +#define READ(Z) do { data->pos+=Z; data->len-=Z; INC_PCHARP(data->datap, Z); }while(0) +#endif + +#define SIMPLE_READ_SYSTEMLITERAL() simple_read_system_literal(data) +#define SIMPLE_READ_PUBIDLITERAL() simple_read_system_literal(data) +#define SIMPLE_READNAME() simple_readname(data) + +static void simple_readname(struct xmldata *); +static void simple_read_system_literal(struct xmldata *); +static int low_parse_xml(struct xmldata *data, + struct pike_string *end, + int toplevel); + +#define ERROR(desc) do { \ + struct svalue * save_sp=sp; \ + push_text("error"); \ + push_int(0); /* no name */ \ + push_text("location"); \ + push_int(data->pos); \ + f_aggregate_mapping(2); \ + push_text(desc); \ + SYS(); \ + if(save_sp == sp) \ + error("%s\n",desc); \ + READ(1); \ + }while(0) + + +#define SKIPSPACE() \ + do { while(isSpace(PEEK(0))) READ(1); }while(0) + +#define READNAME(X) do { \ + if(isFirstNameChar(PEEK(0))) \ + { \ + POKE(X, PEEK(0)); \ + READ(1); \ + }else{ \ + ERROR("Name expected"); \ + } \ + while(isNameChar(PEEK(0))) \ + { \ + POKE(X, PEEK(0)); \ + READ(1); \ + } \ + }while(0) + + +#define BEGIN_STRING(STR) \ + do{ \ + struct string_builder STR; \ + ONERROR tmp_ ## STR; \ + init_string_builder(&STR,0); \ + SET_ONERROR(tmp_ ## STR, free_string_builder, &STR) + + +#define END_STRING(STR) \ + check_stack(1); \ + UNSET_ONERROR(tmp_ ## STR); \ + push_string(finish_string_builder(&STR)); \ + }while(0) + + + + +#define READ_REFERENCE(X,PARSE_RECURSIVELY) do { \ + READ(1); /* Assume '&' for now */ \ + if(PEEK(0)=='#') \ + { \ + /* Character reference */ \ + INT32 num=0; \ + \ + READ(1); \ + if(PEEK(0)=='x') \ + { \ + READ(1); \ + while(isHexChar(PEEK(0))>=0) \ + { \ + num*=16; \ + num+=isHexChar(PEEK(0)); \ + READ(1); \ + } \ + }else{ \ + while(PEEK(0)>='0' && PEEK(0)<='9') \ + { \ + num*=10; \ + num+=PEEK(0)-'0'; \ + READ(1); \ + } \ + } \ + if(PEEK(0)!=';') \ + ERROR("Missing ';' after character reference."); \ + READ(1); \ + POKE(X, num); \ + }else{ \ + /* Entity reference */ \ + if(!THIS->entities) \ + { \ + ERROR("XML->__entities is not a mapping"); \ + f_aggregate_mapping(0); \ + }else{ \ + ref_push_mapping(THIS->entities); \ + } \ + SIMPLE_READNAME(); \ + if(PEEK(0)!=';') \ + ERROR("Missing ';' after entity reference."); \ + READ(1); \ + /* lookup entry in mapping and parse it recursively */ \ + /* Generate error if entity is not defined */ \ + f_index(2); \ + if(IS_ZERO(sp-1)) \ + { \ + ERROR("No such entity."); \ + pop_stack(); \ + }else{ \ + if(sp[-1].type!=T_STRING) \ + { \ + ERROR("XML->__entities value is not a string!"); \ + }else{ \ + struct pike_string *s=sp[-1].u.string; \ + struct xmldata my_tmp=*data; \ + ONERROR tmp2; \ + sp--; \ + SET_ONERROR(tmp2, do_free_string, s); \ + check_stack(10); \ + my_tmp.datap=MKPCHARP_STR(s); \ + my_tmp.len=s->len; \ + PARSE_RECURSIVELY; \ + UNSET_ONERROR(tmp2); \ + free_string(s); \ + } \ + } \ + } \ + }while(0) + + + +#define READ_PEREFERENCE(X,PARSE_RECURSIVELY) do { \ + READ(1); /* Assume '%' */ \ + if(!THIS->entities) \ + { \ + ERROR("XML->__entities is not a mapping"); \ + f_aggregate_mapping(0); \ + }else{ \ + ref_push_mapping(THIS->entities); \ + } \ + push_text("%"); \ + SIMPLE_READNAME(); \ + f_add(2); \ + if(PEEK(0)!=';') \ + ERROR("Missing ';' after entity reference."); \ + READ(1); \ + /* lookup entry in mapping and parse it recursively */ \ + /* Generate error if entity is not defined */ \ + f_index(2); \ + if(IS_ZERO(sp-1)) \ + { \ + ERROR("No such entity."); \ + pop_stack(); \ + }else{ \ + if(sp[-1].type!=T_STRING) \ + { \ + ERROR("XML->__entities value is not a string!"); \ + }else{ \ + struct pike_string *s=sp[-1].u.string; \ + struct xmldata my_tmp=*data; \ + ONERROR tmp2; \ + sp--; \ + SET_ONERROR(tmp2, do_free_string, s); \ + check_stack(10); \ + my_tmp.datap=MKPCHARP_STR(s); \ + my_tmp.len=s->len; \ + PARSE_RECURSIVELY; \ + UNSET_ONERROR(tmp2); \ + free_string(s); \ + } \ + } \ + }while(0) + + + + +#define READ_ATTVALUE(X) do { \ + SKIPSPACE(); \ + switch(PEEK(0)) \ + { \ + case '\'': \ + READ(1); \ + read_attvalue(data,&X,'\''); \ + break; \ + case '\"': \ + READ(1); \ + read_attvalue(data,&X,'\"'); \ + break; \ + default: \ + ERROR("Unquoted attribute value."); \ + } \ + }while(0) + +#define READ_ATTVALUE2(X) do { \ + SKIPSPACE(); \ + switch(PEEK(0)) \ + { \ + case '\'': \ + READ(1); \ + read_attvalue2(data,&X,'\''); \ + break; \ + case '\"': \ + READ(1); \ + read_attvalue2(data,&X,'\"'); \ + break; \ + default: \ + ERROR("Unquoted attribute value."); \ + } \ + }while(0) + + +#define READ_COMMENT() do { BEGIN_STRING(com); \ + while(!(PEEK(0)=='-' && PEEK(1)=='-' && PEEK(2)=='>')) \ + { \ + POKE(com, PEEK(0)); \ + READ(1); \ + } \ + READ(3); END_STRING(com); \ + }while(0); + + +#define INTERMISSION(X) do { \ + if((X).s->len) { \ + check_stack(4); \ + push_text(""); \ + push_int(0); /* No name */ \ + push_int(0); /* No attributes */ \ + push_string(finish_string_builder(&(X))); \ + init_string_builder(&(X),0); \ + SYS(); \ + } } while (0) + + +#define SYS() do{ \ + check_stack(data->num_extra_args); \ + assign_svalues_no_free(sp, data->extra_args, \ + data->num_extra_args, \ + data->extra_arg_types); \ + sp+=data->num_extra_args; \ + apply_svalue(data->func, 4+data->num_extra_args); \ + if(IS_ZERO(sp-1)) \ + pop_stack(); \ + }while(0); + +static void read_attvalue(struct xmldata *data, + struct string_builder *X, + p_wchar2 Y) +{ + while(1) + { + if(data->len<=0) + { + if(Y) + ERROR("End of file while looking for end of attribute value."); + break; + } + if(PEEK(0)==Y) + { + READ(1); + break; + } + switch(PEEK(0)) + { + case '&': + READ_REFERENCE((*X), read_attvalue(&my_tmp, X, 0)); + break; + + case 0x0d: if(PEEK(1)==0x0a) READ(1); + case 0x20: + case 0x0a: + case 0x09: + POKE(*X, 0x20); + READ(1); + break; + + default: + POKE(*X, PEEK(0)); + READ(1); + } + } +} + +static void read_attvalue2(struct xmldata *data, + struct string_builder *X, + p_wchar2 Y) +{ + while(1) + { + if(data->len<=0) + { + ERROR("End of file while looking for end of attribute value."); + break; + } + if(PEEK(0)==Y) + { + READ(1); + break; + } + switch(PEEK(0)) + { + case '&': + READ_REFERENCE((*X), read_attvalue2(&my_tmp, X, 0)); + break; + + case '%': + READ_PEREFERENCE((*X), read_attvalue2(&my_tmp, X, 0)); + break; + + case 0x0d: if(PEEK(1)==0x0a) READ(1); + case 0x20: + case 0x0a: + case 0x09: + POKE(*X, 0x20); + READ(1); + break; + + default: + POKE(*X, PEEK(0)); + READ(1); + } + } +} + +static void simple_read_system_literal(struct xmldata *data) +{ + BEGIN_STRING(name); + SKIPSPACE(); + READ_ATTVALUE(name); + END_STRING(name); +} + + +static void simple_readname(struct xmldata *data) +{ + BEGIN_STRING(name); + READNAME(name); + END_STRING(name); +} + +#define SIMPLE_READ_ATTRIBUTES() simple_read_attributes(data); + +static void simple_read_attributes(struct xmldata *data) +{ + SKIPSPACE(); + + /* Read unordered attributes */ + push_mapping(allocate_mapping(10)); /* Attributes */ + while(isFirstNameChar(PEEK(0))) + { + SIMPLE_READNAME(); + SKIPSPACE(); + if(PEEK(0)!='=') + ERROR("Missing '=' in attribute."); + READ(1); + + BEGIN_STRING(val); + READ_ATTVALUE(val); + END_STRING(val); + + assign_lvalue(sp-3, sp-1); + pop_n_elems(2); + SKIPSPACE(); + } +} + +static int low_parse_dtd(struct xmldata *data) +{ + int done=0; + struct svalue *save_sp=sp; + while(!done && data->len>0) + { + switch(PEEK(0)) + { + default: + if(!isSpace(PEEK(0))) + ERROR("Non-space character on DTD top level."); + READ(1); + SKIPSPACE(); + break; + + case '%': /* PEReference */ + + case '<': + switch(PEEK(1)) + { + case '!': + switch(PEEK(2)) + { + case '-': /* Comment */ + if(PEEK(3)=='-') + { + /* Comment */ + push_text("<!--"); + push_int(0); /* No name */ + push_int(0); /* No attribues */ + READ(4); + READ_COMMENT(); + SYS(); + }else{ + ERROR("Expected <!-- but got something else."); + } + break; + + case 'E': /* ELEMENT or ENTITY */ + if(PEEK(3)=='N' && + PEEK(4)=='T' && + PEEK(5)=='I' && + PEEK(6)=='T' && + PEEK(7)=='Y' && + isSpace(PEEK(8))) + { + int may_have_notation=0; + READ(9); + SKIPSPACE(); + + push_text("<!ENTITY"); + + if(PEEK(0)=='%') + { + READ(1); + SKIPSPACE(); + push_text("%"); + SIMPLE_READNAME(); + f_add(2); + }else{ + may_have_notation=1; + SIMPLE_READNAME(); + } + + SKIPSPACE(); + + switch(PEEK(0)) + { + case '\'': + case '"': + push_int(0); /* no attributes */ + BEGIN_STRING(value); + READ_ATTVALUE2(value); + END_STRING(value); + if(THIS->entities) + { + if(low_mapping_string_lookup(THIS->entities, + sp[-3].u.string)) + { + /* Duplicate entry, we should issue a warning */ + }else{ + mapping_string_insert(THIS->entities, + sp[-3].u.string, + sp-1); + } + }else{ + ERROR("XML->__entities is not a mapping."); + } + SYS(); + break; + + default: + case 'S': /* SYSTEM */ + case 'P': /* PUBLIC */ + /* FIXME, DTD's are IGNORED! */ + while(data->len>0 && PEEK(0)!='>') + ERROR("External entities not yet implemented.\n"); + } + + SKIPSPACE(); + if(PEEK(0)!='>') + ERROR("Missing '>' in ENTITY."); + READ(1); + break; + } + + case 'A': /* ATTLIST */ + case 'N': /* NOTATION */ + push_text("<!"); + READ(2); + SIMPLE_READNAME(); + SKIPSPACE(); + + /* FIXME, DTD's are IGNORED! */ + while(data->len>0 && PEEK(0)!='>') + READ(1); + + if(PEEK(0) != '>') + ERROR("Missing '>' in DTD"); + READ(1); + + push_int(0); /* No attributes */ + push_int(0); /* No data - yet */ + SYS(); + break; + + default: + ERROR("Unknown entry in DTD."); + break; + } + break; + + + case '?': /* Processing Info */ + READ(2); + push_text("<?"); + SIMPLE_READNAME(); + push_int(0); /* No attributes */ + SKIPSPACE(); + BEGIN_STRING(foo); + while(data->len>0 && !(PEEK(0)=='?' && PEEK(1)=='>')) + { + POKE(foo, PEEK(0)); + READ(1); + } + READ(2); + END_STRING(foo); + SYS(); + break; + + default: + ERROR("Unknown entry in DTD."); + break; + + } + break; + + case ']': +#ifdef VERBOSE_XMLDEBUG + fprintf(stderr,"low_parse_dtd found ']'\n"); +#endif + done=1; + } + } + f_aggregate(sp-save_sp); + /* There is now one value on the stack */ + return done; +} + +static struct pike_string *very_low_parse_xml(struct xmldata *data, + struct pike_string *end, + int toplevel, + struct string_builder *text) +{ + int done=0; + while(!done && data->len>0) + { + switch(PEEK(0)) + { + default: + if(toplevel) + { + if(!isSpace(PEEK(0))) + { + ERROR("All data must be inside tags"); + READ(1); + } + SKIPSPACE(); + break; + } + POKE(*text, PEEK(0)); + READ(1); + continue; + + /* Strangely enough, \r and \r\n should be reported as \n, + * but \n\r should be reported as \n\n + */ + case '\r': + if(toplevel) + { + SKIPSPACE(); + break; + } + POKE(*text,'\n'); + READ(1); + if(PEEK(0) == '\n') READ(1); + continue; + + case '&': + READ_REFERENCE(*text,very_low_parse_xml(&my_tmp, end, toplevel, text)); + continue; + + case '<': + INTERMISSION(*text); + + switch(PEEK(1)) + { + case '?': /* Ends with ?> */ + if(PEEK(2)=='x' && + PEEK(3)=='m' && + PEEK(4)=='l' && + isSpace(PEEK(5))) + { + push_text("<?xml"); + READ(6); + push_int(0); + SIMPLE_READ_ATTRIBUTES(); + + if(PEEK(0) != '?' && PEEK(1)!='>') + ERROR("Missing ?> at end of <?xml."); + READ(2); + + push_int(0); /* No data */ + }else{ + READ(2); + push_text("<?"); + SIMPLE_READNAME(); + push_int(0); /* No attributes */ + SKIPSPACE(); + BEGIN_STRING(foo); + while(!(PEEK(0)=='?' && PEEK(1)=='>')) + { + POKE(foo, PEEK(0)); + READ(1); + } + READ(2); + END_STRING(foo); + } + SYS(); + break; + + case '!': + switch(PEEK(2)) + { + case '-': /* Comment */ + if(PEEK(3)=='-') + { + /* Comment */ + push_text("<!--"); + push_int(0); /* No name */ + push_int(0); /* No attribues */ + READ(4); + READ_COMMENT(); + SYS(); + }else{ + ERROR("Expected <!-- but got something else."); + } + break; + + + case 'A': /* ATTLIST? */ + case 'E': /* ENTITY? ELEMENT? */ + ERROR("Invalid entry outside DTD."); + break; + + case '[': + if(PEEK(3)=='C' && + PEEK(4)=='D' && + PEEK(5)=='A' && + PEEK(6)=='T' && + PEEK(7)=='A' && + PEEK(8)=='[') + { + READ(9); + push_text("<![CDATA["); + push_int(0); + push_int(0); + BEGIN_STRING(cdata); + while(data->len>0 && !(PEEK(0)==']' && + PEEK(1)==']' && + PEEK(2)=='>')) + { + POKE(cdata, PEEK(0)); + READ(1); + } + READ(3); + END_STRING(cdata); + SYS(); + break; + } + + default: + ERROR("Invalid entry."); + break; + + case 'D': /* DOCTYPE? */ +/* fprintf(stderr,"FOO: %c%c%c%c\n",PEEK(3),PEEK(4),PEEK(5),PEEK(6)); */ + if(PEEK(3)!='O' || + PEEK(4)!='C' || + PEEK(5)!='T' || + PEEK(6)!='Y' || + PEEK(7)!='P' || + PEEK(8)!='E' || + !isSpace(PEEK(9))) + { + ERROR("Expected 'DOCTYPE', got something else."); + }else{ + READ(9); + SKIPSPACE(); + push_text("<!DOCTYPE"); + SIMPLE_READNAME(); /* NAME */ + SKIPSPACE(); + switch(PEEK(0)) + { + case 'P': + if(PEEK(1)=='U' && + PEEK(2)=='B' && + PEEK(3)=='L' && + PEEK(4)=='I' && + PEEK(5)=='C') + { + SIMPLE_READNAME(); + SIMPLE_READ_PUBIDLITERAL(); + push_text("SYSTEM"); + SIMPLE_READ_SYSTEMLITERAL(); + SKIPSPACE(); + f_aggregate_mapping(4); + }else{ + ERROR("Expected PUBLIC, found something else."); + f_aggregate_mapping(0); + } + break; + + case 'S': + if(PEEK(1)=='Y' && + PEEK(2)=='S' && + PEEK(3)=='T' && + PEEK(4)=='E' && + PEEK(5)=='M') + { + SIMPLE_READNAME(); + SIMPLE_READ_SYSTEMLITERAL(); + SKIPSPACE(); + f_aggregate_mapping(2); + }else{ + ERROR("Expected SYSTEM, found something else."); + f_aggregate_mapping(0); + } + break; + + default: + f_aggregate_mapping(0); + } + + if(PEEK(0)=='[') + { + READ(1); + low_parse_dtd(data); +#ifdef VERBOSE_XMLDEBUG + fprintf(stderr,"FOO: %c%c%c%c\n",PEEK(0),PEEK(1),PEEK(2),PEEK(3)); +#endif + if(PEEK(0) != ']') + ERROR("Missing ] in DOCTYPE tag."); + READ(1); + SKIPSPACE(); + }else{ + push_int(0); + } + if(PEEK(0)!='>') + ERROR("Missing '>' in DOCTYPE tag."); + READ(1); + SYS(); + } + break; + } + break; + + case '/': /* End tag */ + READ(2); + SIMPLE_READNAME(); + SKIPSPACE(); + if(PEEK(0)!='>') + ERROR("Missing > in end tag."); + else + READ(1); + if(end!=sp[-1].u.string) + { + ERROR("Unmatched end tag."); + }else{ + end=0; + } + done=1; + pop_stack(); + break; + + default: + /* 'Normal' tag (we hope) */ + push_text(">"); /* This might change */ + + READ(1); + SIMPLE_READNAME(); + SIMPLE_READ_ATTRIBUTES(); + + switch(PEEK(0)) + { + case '>': + READ(1); + if(low_parse_xml(data, sp[-2].u.string,0)) + ERROR("Unmatched tag."); + SYS(); + break; + + case '/': + READ(1); + if(PEEK(0)!='>') + ERROR("Missing '>' in empty tag."); + READ(1); + /* Self-contained tag */ + free_string(sp[-3].u.string); + sp[-3].u.string=make_shared_string("<>"); + push_int(0); /* No data */ + SYS(); + break; + + } + } + } + } + return end; +} + +static int low_parse_xml(struct xmldata *data, + struct pike_string *end, + int toplevel) +{ + struct svalue *save_sp=sp; + BEGIN_STRING(text); + end=very_low_parse_xml(data,end,toplevel,&text); + INTERMISSION(text); + END_STRING(text); + pop_stack(); + f_aggregate(sp-save_sp); + /* There is now one value on the stack */ + return !!end; +} + +static void parse_xml(INT32 args) +{ + struct svalue tmp; + struct pike_string *s; + struct xmldata data; + + s=sp[-args].u.string; + if(args<2) + error("Too few arguments to XML->parse()\n"); + +#if 0 + if(!s->size_shift) + { + if(STR0(s)[0]==0xfe && STR1(s)[0]==0xff) + { + /* String is UTF8, convert to unicode here */ + + } + } +#endif + data.datap=MKPCHARP_STR(s); + data.len=s->len; + data.pos=0; + data.func=sp+1-args; + data.extra_args=sp+2-args; + data.num_extra_args=args-2; + data.extra_arg_types=-1; /* FIXME */ + + low_parse_xml(&data,0,1); + tmp=*--sp; + pop_n_elems(args); + *sp++=tmp; +} + +static void parse_dtd(INT32 args) +{ + struct svalue tmp; + struct pike_string *s; + struct xmldata data; + + s=sp[-args].u.string; + if(args<2) + error("Too few arguments to XML->parse()\n"); + +#if 0 + if(!s->size_shift) + { + if(STR0(s)[0]==0xfe && STR1(s)[0]==0xff) + { + /* String is UTF8, convert to unicode here */ + + } + } +#endif + data.datap=MKPCHARP_STR(s); + data.len=s->len; + data.pos=0; + data.func=sp+1-args; + data.extra_args=sp+2-args; + data.num_extra_args=args-2; + data.extra_arg_types=-1; /* FIXME */ + + low_parse_dtd(&data); + tmp=*--sp; + pop_n_elems(args); + *sp++=tmp; +} + +static void create(INT32 args) +{ + pop_n_elems(args); + if(!THIS->entities) + { + push_text("lt"); push_text("<"); + push_text("gt"); push_text(">"); + push_text("amp"); push_text("&"); + push_text("apos"); push_text("'"); + push_text("quot"); push_text("\""); + f_aggregate_mapping(10); + THIS->entities=sp[-1].u.mapping; + sp--; + } + push_int(0); +} + + +void init_xml(void) +{ + INT32 off; + start_new_program(); + off=add_storage(sizeof(struct xmlobj)); + map_variable("__entities","mapping",0, + off + OFFSETOF(xmlobj, entities),T_MAPPING); + add_function("parse",parse_xml,"function(string,function(string,string,mapping,array|string:0=mixed),mixed...:array(0))",0); + add_function("parse_dtd",parse_dtd,"function(string,function(string,string,mapping,array|string:0=mixed),mixed...:array(0))",0); + add_function("create",create,"function(:void)",0); + end_class("XML",0); +} -- GitLab