webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit 54f38abd3722c07e900820343e7c5288c6b0fdce
parent dacc8c21011cdd6f6c9dc4ebd177478b2151a2c1
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 20 Aug 2017 20:56:39 +0200

remove preprocess code, compare tags and attribute case-insensitive

idea to ignore literal tags (HTML).

Diffstat:
main.c | 103+++++++++++++++++++++++++++++++------------------------------------------------
1 file changed, 40 insertions(+), 63 deletions(-)

diff --git a/main.c b/main.c @@ -19,7 +19,7 @@ static XMLParser parser; struct node { char tag[256]; -/* int isignore;*/ + int isignore; int ispre; int isinline; int isblock; @@ -85,7 +85,6 @@ static char *blocktags[] = { }; static String htmldata; -static String preprocess; /* Clear string only; don't free, prevents unnecessary reallocation. */ static void @@ -204,8 +203,8 @@ xmldata(XMLParser *p, const char *data, size_t datalen) cur = &nodes[curnode]; string_append(&htmldata, data, datalen); -/* if (cur->isignore) - return;*/ + if (cur->isignore) + return; } static void @@ -233,28 +232,28 @@ xmltagstart(XMLParser *p, const char *tag, size_t taglen) src[0] = '\0'; /* src, href */ strlcpy(cur->tag, tag, sizeof(cur->tag)); -#if 0 +#if 1 for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) { - if (!strcmp(ignoretags[i], tag)) { + if (!strcasecmp(ignoretags[i], tag)) { cur->isignore = 1; break; } } #endif for (i = 0; i < sizeof(pretags) / sizeof(*pretags); i++) { - if (!strcmp(pretags[i], tag)) { + if (!strcasecmp(pretags[i], tag)) { cur->ispre = 1; break; } } for (i = 0; i < sizeof(blocktags) / sizeof(*blocktags); i++) { - if (!strcmp(blocktags[i], tag)) { + if (!strcasecmp(blocktags[i], tag)) { cur->isblock = 1; break; } } for (i = 0; i < sizeof(inlinetags) / sizeof(*inlinetags); i++) { - if (!strcmp(inlinetags[i], tag)) { + if (!strcasecmp(inlinetags[i], tag)) { cur->isinline = 1; break; } @@ -270,8 +269,8 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort) if (curnode) curnode--; cur = &nodes[curnode]; -/* if (cur->isignore) - return;*/ + if (cur->isignore) + return; #if 0 if (src[0]) @@ -279,7 +278,7 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort) src[0] = '\0'; #endif - if (!strcmp(tag, "tr")) + if (!strcasecmp(tag, "tr")) fputs(" | ", stdout); /* HACK */ if (cur->isblock) @@ -300,31 +299,47 @@ static void xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort) { struct node *cur; - int i; + int c, i; cur = &nodes[curnode]; -/* if (cur->isignore) - return;*/ + if (cur->isignore) { +#if 0 + /* HACK: ignored tag is parsed, hook into reader and read raw data + until literal end tag (without using the normal parser). */ + + /* TODO: process (buffered) as xml[c]data (no entity) */ + while ((c = getchar()) != EOF) { + if (c == '<') { + /* TODO: check /endtag */ + break; + } + } + if (c == EOF) { + } + +#endif + return; + } if (cur->isblock) fputs("\n", stdout); - if (!strcmp(tag, "td") || !strcmp(tag, "th")) + if (!strcasecmp(tag, "td") || !strcasecmp(tag, "th")) fputs(" | ", stdout); /* HACK */ - if (!strcmp(cur->tag, "li")) { + if (!strcasecmp(cur->tag, "li")) { /* indent nested list items */ for (i = curnode; i; i--) { - if (!strcmp(nodes[i].tag, "li")) + if (!strcasecmp(nodes[i].tag, "li")) continue; - if (!strcmp(nodes[i].tag, "ul") || - !strcmp(nodes[i].tag, "ol")) + if (!strcasecmp(nodes[i].tag, "ul") || + !strcasecmp(nodes[i].tag, "ol")) fputs(" ", stdout); } /* TODO: for <ol>, keep list counter on ol element (parent), support ordered number type only */ fputs("* ", stdout); - } else if (!strcmp(cur->tag, "hr")) { + } else if (!strcasecmp(cur->tag, "hr")) { for (i = 0; i < 36; i++) putchar('-'); } @@ -338,65 +353,27 @@ static void xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, size_t namelen, const char *value, size_t valuelen) { - if (!strcmp(tag, "a") && !strcmp(name, "href") && valuelen) + if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen) strlcpy(src, value, sizeof(src)); - if ((!strcmp(tag, "img") || !strcmp(tag, "video") || !strcmp(tag, "audio")) && - !strcmp(name, "src") && valuelen) + if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") || + !strcasecmp(tag, "audio")) && + !strcasecmp(name, "src") && valuelen) strlcpy(src, value, sizeof(src)); } -/*static size_t readoffset;*/ - int readchar(void) { return getchar(); -#if 0 - size_t i, j; - int c; - - for (; readoffset < preprocess.len; ) { - if (preprocess.data[read_offset] != '<') - return preprocess.data[read_offset++]; - - for (j = 0; j < sizeof(ignoretags) / sizeof(*ignoretags); j++) { - if (!strncmp(&preprocess.data[i + 1], ignoretags[i], sizeof(ignoretags[i]) - 1)) { - if (strchr(" \t>", preprocess.data[i + 1 + sizeof(ignoretags[i]) - 1])) { - /* TODO: search until end of this tag */ - } - } - } - /* TODO: if no match just return char */ - return preprocess.data[read_offset++]; - } - return EOF; -#endif } -/* TODO: preprocess data, strip <script>, <style> etc */ int main(void) { - - char buf[BUFSIZ]; - int n; - if (pledge("stdio", NULL) < 0) err(1, "pledge"); -#if 0 - /* TODO: optimize later */ - while (1) { - /* TODO: check read error */ - n = read(0, buf, sizeof(buf) - 1); - if (n <= 0) - break; - buf[n] = '\0'; - string_append(&preprocess, buf, n); - } -#endif - parser.xmlattr = xmlattr; parser.xmlcdata = xmlcdata; parser.xmldata = xmldata;