webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit 421341e1a2b737cb269a144a1634511705161651
parent 6dae546b7c15b859321849c8b7b7294e6d916adc
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 22 Jul 2017 16:02:56 +0200

initial support to ignore tags (script and CSS)

this is not fully working yet because scripts can contain literal characters
such as < and >.

Diffstat:
main.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+), 0 deletions(-)

diff --git a/main.c b/main.c
@@ -17,6 +17,7 @@ static int isdatastart;
 struct node {
 	char tag[256];
+	int isignore;
 	int ispre;
 	int isinline;
 	int isblock;
@@ -28,6 +29,12 @@ static char src[4096]; /* src or href attribute */
 static struct node nodes[MAX_DEPTH];
 static int curnode;
+/* TODO: support literal text in script somehow? < > */
+static char *ignoretags[] = {
+	"style",
+	"script",
+};
+
 static char *pretags[] = {
 	"pre",
 	"code",
@@ -88,6 +95,8 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
 	const char *s = data;
 	cur = &nodes[curnode];
+	if (cur->isignore)
+		goto end;
 	/* TODO: if not <pre> or w/e, skip? */
 	if (isdatastart && isspace(*s)) {
@@ -110,6 +119,7 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
 		}
 	}
+end:
 	/* TODO: remove trailing space also ? */
 	isdatastart = 0;
 }
@@ -139,6 +149,12 @@ xmltagstart(XMLParser *p, const char *tag, size_t taglen)
 	src[0] = '\0'; /* src, href */
 	strlcpy(cur->tag, tag, sizeof(cur->tag));
+	for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
+		if (!strcmp(ignoretags[i], tag)) {
+			cur->isignore = 1;
+			break;
+		}
+	}
 	for (i = 0; i < sizeof(pretags) / sizeof(*pretags); i++) {
 		if (!strcmp(pretags[i], tag)) {
 			cur->ispre = 1;
@@ -168,6 +184,8 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
 	if (curnode)
 		curnode--;
 	cur = &nodes[curnode];
+	if (cur->isignore)
+		return;
 #if 0
 	if (src[0])
@@ -196,6 +214,9 @@ xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort)
 	int i;
 	cur = &nodes[curnode];
+	if (cur->isignore)
+		return;
+
 	if (cur->isblock)
 		fputs("\n", stdout);