diff --git a/src/modules/Parser/html.c b/src/modules/Parser/html.c index b5573171bdeff55be4c57611a9ce7463c08f411a..5166b92555b0b252b2889b85f93e2351f6dde68e 100644 --- a/src/modules/Parser/html.c +++ b/src/modules/Parser/html.c @@ -106,7 +106,7 @@ enum types {TYPE_TAG, TYPE_CONT, TYPE_ENTITY, TYPE_QTAG, TYPE_DATA}; /* flag: arg quote may have tag_end to end quote and tag */ #define FLAG_LAZY_END_ARG_QUOTE 0x00000002 -/* flag: the entity_break chars ends entity */ +/* flag: the chars in lazy_entity_ends end the search for the entity end */ #define FLAG_LAZY_ENTITY_END 0x00000004 /* flag: match '<' and '>' for in-tag-tags (<foo <bar>>) */ @@ -2748,7 +2748,7 @@ static newstate do_try_feed(struct parser_html_storage *this, } else if (scan_entity && ch==this->entity_start) /* entity */ { - int end_found, entity_close = 1; + int end_found; DEBUG((stderr,"%*d do_try_feed scan entity %p:%d\n", this->stack_count,this->stack_count, *feed,st->c)); @@ -2757,8 +2757,13 @@ static newstate do_try_feed(struct parser_html_storage *this, if (this->flags & FLAG_LAZY_ENTITY_END) { end_found=scan_forward(*feed,st->c+1,&dst,&cdst, this->lazy_entity_ends,this->n_lazy_entity_ends); - if (end_found && index_shared_string(dst->s,cdst) != this->entity_end) - entity_close = 0; + if (end_found && index_shared_string(dst->s,cdst) != this->entity_end) { + /* Got no entity end; send it to callback__data. 
*/ + dst=*feed; + cdst=st->c+1; + got_data=1; + goto done; + } } else { look_for[0]=this->entity_end; @@ -2775,7 +2780,7 @@ static newstate do_try_feed(struct parser_html_storage *this, struct svalue *v; push_feed_range(*feed,st->c+1,dst,cdst); - cdst+=entity_close; + cdst+=1; v=low_mapping_lookup(this->mapentity,sp-1); if (v) /* entity we want, do a callback */ @@ -2801,7 +2806,7 @@ static newstate do_try_feed(struct parser_html_storage *this, } pop_stack(); } - else cdst+=entity_close; + else cdst+=1; dmalloc_touch_svalue(&(this->callback__entity)); @@ -3782,10 +3787,11 @@ static void html_set_extra(INT32 args) **! back won't preserve the case of registered tags and **! containers. **! -**! <li><b>lazy_entity_end</b>: Normally, the entity end character -**! (i.e. ';') is required to end an entity. When this flag is -**! set, the characters '&', '<', '>', '"', ''', newline and -**! linefeed also breaks an entity. +**! <li><b>lazy_entity_end</b>: Normally, the parser searches +**! indefinitely for the entity end character (i.e. ';'). When +**! this flag is set, the characters '&', '<', '>', '"', ''', +**! newline and linefeed break the search for the entity end, and +**! the entity text is then treated as data. **! **! <li><b>match_tag</b>: Unquoted nested tag starters and enders **! will be balanced when parsing tags. This is the default. 
diff --git a/src/modules/Parser/testsuite.in b/src/modules/Parser/testsuite.in index 176dcdff9f8f511f7b574276bf5f6ffdb89bafd1..1ff5de35e451f061eeb0b6461282e883f5f1b17e 100644 --- a/src/modules/Parser/testsuite.in +++ b/src/modules/Parser/testsuite.in @@ -122,12 +122,6 @@ test_any([[ "\'"); return p->finish ("<t> <!-- <t a='> -- --> <!-<t a='> -->")->read(); ]], "x [ <t a='> -- ] {<t a=} -->"); -test_any([[ - object p = Parser.HTML(); - p->lazy_entity_end (1); - p->_set_entity_callback (lambda (object p, string s) {return ({"[",s,"]"});}); - return p->finish ("&abc|&abc<>&abc;")->read(); -]], "[&abc|][&abc]<>[&abc;]"); // Current context functions test_any([[ @@ -154,7 +148,7 @@ test_any([[ (string) p->tag_content(), ")"}); }, "p"); return p->finish ("<t a=&e; <t>> <q <\"' &e; p> &e<c x=y -- >x</c> &e; ")->read(); -]], "T(<t a=&e; <t>>|t|0) Q(<q <\"' &e; p>|q| <\"' &e; ) E(&e|e|0)C(<c x=y -- >x</c>|c|x) E(&e;|e|0) "); +]], "T(<t a=&e; <t>>|t|0) Q(<q <\"' &e; p>|q| <\"' &e; ) &eC(<c x=y -- >x</c>|c|x) E(&e;|e|0) "); // Argument quoting test_any([[ @@ -399,8 +393,21 @@ test_any([[ object p = Parser.HTML(); p->lazy_entity_end (1); p->add_entity ("e", "x"); - return p->finish("&e; &e<t> &e <t> a='&e&e' &e=f; b=\"&e\" &e>")->read(); -]], "x x<t> &e <t> a='xx' &e=f; b=\"x\" x>"); + return p->finish("&e; &e<t> &e <t> a='&e;&e' &e=f; b=\"&e\" &e>")->read(); +]], "x &e<t> &e <t> a='x&e' &e=f; b=\"&e\" &e>"); +test_any([[ + object p = Parser.HTML(); + p->lazy_entity_end (1); + p->_set_entity_callback (lambda (object p, string s) {return ({"[",s,"]"});}); + return p->finish ("&abc|&abc<>&abc;")->read(); +]], "&abc|&abc<>[&abc;]"); +test_any([[ + object p = Parser.HTML(); + p->lazy_entity_end (1); + p->_set_entity_callback (lambda (object p, string s) {return ({"[",s,"]"});}); + p->_set_data_callback (lambda (object p, string s) {return ({"{",s,"}"});}); + return p->finish ("&abc|&abc<>&abc;")->read(); +]], "{&abc|}{&abc<>}[&abc;]{}"); // Recursive parsing in tags test_any([[