webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit fd8b8950efb4f0b5d2d2bb679b7ded6131725fb5
parent d87d026a246edadd201b607c15881172ac2564f1
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 21 Sep 2019 16:25:35 +0200

more refactoring, update TODO and bump LICENSE year

Diffstat:
LICENSE | 2+-
TODO | 1+
main.c | 172++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
3 files changed, 97 insertions(+), 78 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -1,6 +1,6 @@ ISC License -Copyright (c) 2017-2018 Hiltjo Posthuma <hiltjo@codemadness.org> +Copyright (c) 2017-2019 Hiltjo Posthuma <hiltjo@codemadness.org> Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/TODO b/TODO @@ -1,5 +1,6 @@ - base href. specify and parse relative url, allow to specify base and also parse <base href=""> +- handle <link /> to RSS/Atom feed, show as link. - handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre - print safe (not certain control chars, except newline, TAB etc). - improve/remove duplicate white-space/newlines? diff --git a/main.c b/main.c @@ -19,6 +19,8 @@ struct uri { char port[6]; /* numeric port */ }; +static int termwidth = 72; + #if 0 /* linked-list of link references */ struct linkref { @@ -33,12 +35,18 @@ static int linkcount; #endif enum DisplayType { - DisplayInline = 1, - DisplayPre = 2, - DisplayInlineBlock = 4, - DisplayBlock = 8, - DisplayListItem = 16, - DisplayTableCell = 32, + DisplayUnknown = 0, + DisplayNone = 1, + DisplayPre = 2, + DisplayInline = 4, + DisplayInlineBlock = 8, + DisplayBlock = 16, + DisplayList = 32, + DisplayListItem = 64, + DisplayTable = 128, + DisplayTableRow = 256, + DisplayTableCell = 512, + DisplayHeader = 1024, }; struct node { @@ -66,10 +74,6 @@ static char src[4096]; /* src or href attribute */ static struct node nodes[MAX_DEPTH]; static int curnode; -/* TODO: temporary workaround, handle whitespace, and tag types properly: - atleast: inline-block, inline, block, pre */ -static int ignoredata; - static struct { char *tag; enum DisplayType displaytype; @@ -87,27 +91,32 @@ static struct { { "span", DisplayInline }, { "img", DisplayInline }, { "label", DisplayInline }, + /* table */ + { "table", DisplayTable }, + /* table-row */ + { "tr", DisplayTableRow }, /* table-cell */ { "td", DisplayTableCell }, { "th", DisplayTableCell }, /* list-item */ { "li", DisplayListItem }, + /* header */ + { "h1", DisplayHeader }, + { "h2", DisplayHeader }, + { "h3", DisplayHeader }, + { "h4", DisplayHeader }, + { "h5", DisplayHeader }, + { "h6", DisplayHeader }, + /* break */ + { "br", 0 }, + /* list */ + { "ul", DisplayList }, + { "ol", DisplayList }, /* block */ - { "h1", DisplayBlock }, - { "h2", DisplayBlock }, - { "h3", DisplayBlock }, - { "h4", DisplayBlock }, - { "h5", DisplayBlock }, - { "h6", DisplayBlock }, { "p", DisplayBlock }, - { "ul", DisplayBlock }, - { "lo", DisplayBlock }, + { "blockquote", DisplayBlock }, { "hr", DisplayBlock }, - { "br", DisplayBlock }, { "title", DisplayBlock }, - { "tr", DisplayBlock }, - { "table", DisplayBlock }, - { "blockquote", DisplayBlock }, { "div", DisplayBlock }, }; @@ -380,6 +389,12 @@ absuri(char *buf, size_t bufsiz, const char *link, const char *base) static void xmlcdata(XMLParser *p, const char *data, size_t datalen) { + struct node *cur; + + cur = &nodes[curnode]; + if (cur->displaytype & DisplayNone) + return; + printsafe(data); } @@ -394,21 +409,10 @@ xmldataend(XMLParser *p) return; start = htmldata.data; -#if 1 + + /* TODO: white-space handling */ s = start; e = s + strlen(s); -#else - /* TODO: white-space handling */ - for (s = start; *s; s++) { - if (*s != '\r' && *s != '\n') - break; - } - - for (e = s + strlen(s); e > s; e--) { - if (*e != '\r' && *e != '\n') - break; - } -#endif if (cur->displaytype & DisplayPre) { fwrite(s, 1, e - s, stdout); @@ -433,17 +437,26 @@ xmldataend(XMLParser *p) static void xmldata(XMLParser *p, const char *data, size_t datalen) { - if (ignoredata) + struct node *cur; + + cur = &nodes[curnode]; + if (cur->displaytype & DisplayNone) return; + string_append(&htmldata, data, datalen); } static void xmldataentity(XMLParser *p, const char *data, size_t datalen) { + struct node *cur; char buf[16]; int n; + cur = &nodes[curnode]; + if (cur->displaytype & DisplayNone) + return; + /* convert basic XML entities */ /* &nbsp; &copy;, copy table from Links (check license) */ /* rsquo, hellip, ndash, lsquo */ @@ -471,11 +484,7 @@ xmltagstart(XMLParser *x, const char *t, size_t tl) src[0] = '\0'; /* src, href */ strlcpy(cur->tag, t, sizeof(cur->tag)); - if (!strcasecmp(t, "table")) - ignoredata = 1; - else if (!strcasecmp(t, "td") || !strcasecmp(t, "th")) - ignoredata = 0; - + /* set display type */ for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) { if (!strcasecmp(tags[i].tag, t)) { cur->displaytype |= tags[i].displaytype; @@ -492,27 +501,32 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) cur = &nodes[curnode]; - if (!strcasecmp(t, "tr")) { - fputs(" | ", stdout); /* HACK: last cell */ - return; - } else if (!strcasecmp(t, "td") || !strcasecmp(t, "th")) { - ignoredata = 1; - return; - } else if (!strcasecmp(t, "table")) { - ignoredata = 0; - } - - if (cur->displaytype & DisplayBlock) + if (cur->displaytype & DisplayBlock) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayPre) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTable) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTableRow) { + fputs(" | ", stdout); /* HACK: assume last cell */ + } else if (cur->displaytype & DisplayTableCell) { + } else if (cur->displaytype & DisplayList) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayListItem) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayHeader) { + fputs("\n", stdout); + if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') { + if (t[1] >= '3') + for (i = 0; i < termwidth; i++) + putchar('-'); + else if (t[1] >= '1') + for (i = 0; i < termwidth; i++) + putchar('='); + putchar('\n'); + } + } else if (!strcasecmp(t, "br")) { fputs("\n", stdout); - - if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') { - if (t[1] >= '3') - for (i = 0; i < 72; i++) - putchar('-'); - else if (t[1] >= '1') - for (i = 0; i < 72; i++) - putchar('='); - putchar('\n'); } curnode--; @@ -541,9 +555,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) cur = &nodes[curnode]; - if (cur->displaytype & DisplayBlock) - fputs("\n", stdout); -#if 0 +#ifdef maybe /* show links as reference at the bottom */ if (src[0]) { printf(" [%d]", ++linkcount); @@ -556,6 +568,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) else links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); links_cur->type = estrdup(t); + /* TODO: absuri */ links_cur->url = estrdup(src); } src[0] = '\0'; @@ -575,26 +588,29 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) } } - if (cur->displaytype & DisplayBlock) + if (cur->displaytype & DisplayBlock) { fputs("\n", stdout); - - if (!strcasecmp(t, "td") || !strcasecmp(t, "th")) - fputs(" | ", stdout); /* HACK */ - - if (!strcasecmp(t, "li")) { + } else if (cur->displaytype & DisplayHeader) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTableRow) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTableCell) { + fputs(" | ", stdout); + } else if (cur->displaytype & DisplayList) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayListItem) { /* indent nested list items */ for (i = curnode; i; i--) { - if (!strcasecmp(nodes[i].tag, "li")) + if (nodes[i].displaytype & DisplayListItem) continue; - if (!strcasecmp(nodes[i].tag, "ul") || - !strcasecmp(nodes[i].tag, "ol")) + if (nodes[i].displaytype & DisplayList) fputs(" ", stdout); } /* TODO: for <ol>, keep list counter on ol element (parent), support ordered number type only */ fputs("* ", stdout); - } else if (!strcasecmp(t, "hr")) { - for (i = 0; i < 72; i++) + } else if (!strcasecmp(t, "hr")) { /* ruler */ + for (i = 0; i < termwidth; i++) putchar('-'); } } @@ -612,7 +628,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, strlcpy(src, value, sizeof(src)); } -#if 0 +#ifdef maybe void printlinkrefs(void) { @@ -644,7 +660,9 @@ main(void) parser.getnext = getchar; xml_parse(&parser); -/* printlinkrefs();*/ +#ifdef maybe + printlinkrefs(); +#endif putchar('\n'); return 0;