webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit 2a56590cbe1c1739171a28d4c30b5b318cb0b364
parent e4a9e2404be2db1687430631e912f1809992a23b
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 21 Sep 2019 20:02:18 +0200

testing: improve white-space handling

Diffstat:
README | 6+++---
TODO | 7+++++--
main.c | 53+++++++++++++++++++++++++++++++++++++----------------
3 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/README b/README @@ -1,15 +1,15 @@ NOTE! work-in-progress (very slowly). -Text-based webpage viewer +Text-based HTML dump Goals / scope: -The tool will render a webpage only to stdout, similarly like links -dump or +The tool will only render HTML to stdout, similarly to links -dump or lynx -dump but simpler and more secure. - It will be usable and secure for rendering HTML mails. - No remote resources will be downloaded. - Data will be written to stdout only. -- No support for Javascript, CSS support, frames or forms. +- No support for Javascript, CSS support, frames or form input. diff --git a/TODO b/TODO @@ -1,13 +1,16 @@ +- improve/remove duplicate white-space/newlines? +- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre. - base href. specify and parse relative url, allow to specify base and also parse <base href=""> -- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre. - detect <link /> to RSS/Atom feed, show as link. example: <link rel="alternate" href="atom.xml" type="application/atom+xml" title="Codemadness Atom Feed" /> or <link rel="alternate" title="Tweakers Mixed RSS feed" type="application/rss+xml" href="https://tweakers.net/feeds/mixed.xml"> - print safe (not certain control chars, except newline, TAB etc). -- improve/remove duplicate white-space/newlines? +- rework parsing of <script> and <style> with unescaped characters like < and >. - <code> should not be treated as a block (<pre> does?) +- make the code easy to embed/restructure to make a HTML-to-plain-text converter + for HTML in RSS/Atom feeds. - add links as reference, for example on page: http://absmagazin.de/2018 the MP3 urls. - add COMPATOBJ for strlcpy and strlcat. - write a proper Makefile. 
diff --git a/main.c b/main.c @@ -402,37 +402,49 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen) printsafe(data); } +#if 0 +static void +xmldatastart(XMLParser *p) +{ +// printf("DEBUG: %s\n", __func__); +} +#endif + static void xmldataend(XMLParser *p) { struct node *cur; char *start, *s, *e; +// printf("DEBUG: %s\n", __func__); + if (!htmldata.data || !htmldata.len) return; cur = &nodes[curnode]; - if (cur->displaytype & DisplayNone) { + +// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype); + + if (!cur->displaytype || (cur->displaytype & DisplayNone)) { /* nothing */ } else if (cur->displaytype & DisplayPre) { fwrite(htmldata.data, 1, htmldata.len, stdout); } else { start = htmldata.data; - s = start; - e = s + htmldata.len; - /* TODO: better white-space handling */ - for (; s < e; s++) { - if (isspace((unsigned char)*s)) { - if (s != start && !isspace((unsigned char)s[-1])) + e = htmldata.data + htmldata.len; + + /* TODO: better white-space handling, for example if there is only + white-space between 2 block elements then it can be ignored. 
*/ + for (s = start; s < e; s++) { + if (*s == '\r') { + continue; + } else if (isspace((unsigned char)*s)) { + if (s == start || !isspace((unsigned char)s[-1])) putchar(' '); - } else { - if (!iscntrl((unsigned char)*s)) - putchar(*s); + } else if (!iscntrl((unsigned char)*s)) { + putchar(*s); } } - if (s != start && e != start && !isspace((unsigned char)s[-1]) && - isspace((unsigned char)e[-1])) - putchar(' '); } string_clear(&htmldata); @@ -479,19 +491,25 @@ xmltagstart(XMLParser *x, const char *t, size_t tl) struct node *cur; int i; +// printf("start of tag: %s\n", t); + if (curnode >= MAX_DEPTH - 2) errx(1, "max tag depth reached: %d\n", curnode); curnode++; cur = &nodes[curnode]; memset(cur, 0, sizeof(*cur)); - src[0] = '\0'; /* src, href */ + cur->displaytype = DisplayInline; strlcpy(cur->tag, t, sizeof(cur->tag)); + src[0] = '\0'; /* src, href */ + /* set display type */ for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) { if (!strcasecmp(tags[i].tag, t)) { - cur->displaytype |= tags[i].displaytype; + cur->displaytype = tags[i].displaytype; +// printf("match on tag: %s == %s, displaytype: %d\n", +// tags[i].tag, t, cur->displaytype); break; } } @@ -505,6 +523,8 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) cur = &nodes[curnode]; +// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag); + if (cur->displaytype & DisplayBlock) { fputs("\n", stdout); } else if (cur->displaytype & DisplayPre) { @@ -609,7 +629,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) if (nodes[i].displaytype & DisplayListItem) continue; if (nodes[i].displaytype & DisplayList) - fputs(" ", stdout); + fputs(" ", stdout); } /* TODO: for <ol>, keep list counter on ol element (parent), support ordered number type only */ @@ -656,6 +676,7 @@ main(void) parser.xmlattr = xmlattr; parser.xmlcdata = xmlcdata; parser.xmldata = xmldata; +// parser.xmldatastart = xmldatastart; parser.xmldataend = xmldataend; 
parser.xmldataentity = xmldataentity; parser.xmltagstart = xmltagstart;