webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit ec68d5635764887d323bc7e3e09c01fda411e865
parent 26361ccd0ab0f19276d7727b8f589b1109cfbfd1
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Thu, 27 Jun 2019 19:37:07 +0200

ignore all within <script> or <style> (WIP)

Diffstat:
main.c | 59 ++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 40 insertions(+), 19 deletions(-)

diff --git a/main.c b/main.c @@ -89,6 +89,34 @@ static char *blocktags[] = { static String htmldata; +static const char *ignorestate, *endtag; +static int (*getnext)(void); + +/* return a space for all data until some case-insensitive string occurs. This + is used to parse incorrect HTML/XML that contains unescaped HTML in script + or style tags. If you see some </script> tag in a CDATA or comment + section then e-mail W3C and tell them the web is too complex. */ +static inline int +getnext_ignore(void) +{ + int c; + + if ((c = getnext()) == EOF) + return EOF; + + if (tolower(c) == tolower((unsigned char)*ignorestate)) { + ignorestate++; + if (*ignorestate == '\0') { + parser.getnext = getnext; /* restore */ + return c; + } + } else { + ignorestate = endtag; + } + + return ' '; +} + /* Clear string only; don't free, prevents unnecessary reallocation. */ static void string_clear(String *s) @@ -218,10 +246,10 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen) /* TODO: add to tscrape too */ /* TODO: support some more HTML entities */ n = xml_entitytostr(data, buf, sizeof(buf)); - if (n <= 0) - xmldata(p, data, datalen); + if (n > 0) + xmldata(p, buf, (size_t)n); else - string_append(&htmldata, buf, n); + xmldata(p, data, datalen); } static void @@ -282,16 +310,6 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) } } -static char ignoretag[8]; -static XMLParser xo; /* old context */ - -static void -xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort) -{ - if (!strcasecmp(t, ignoretag)) - memcpy(p, &xo, sizeof(*p)); /* restore context */ -} - static void xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) { @@ -301,12 +319,15 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) /* temporary replace the callback except the reader and end of tag restore the context once we receive the same ignored tag in the end tag handler */ - if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) { - 
strlcpy(ignoretag, t, sizeof(ignoretag)); - memcpy(&xo, p, sizeof(xo)); /* store old context */ - memset(p, 0, sizeof(*p)); - p->xmltagend = xmlignoretagend; - p->getnext = xo.getnext; + if (!strcasecmp(t, "script")) { + ignorestate = endtag = "</script>"; + getnext = p->getnext; /* for restore */ + p->getnext = getnext_ignore; + return; + } else if (!strcasecmp(t, "style")) { + ignorestate = endtag = "</style>"; + getnext = p->getnext; /* for restore */ + p->getnext = getnext_ignore; return; }