webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit b708236e10ae2b6af6e62514f2ca159fd6eeeabd
parent 69314d208de2a232366a14a9c9fef7400e4e0647
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 15 Sep 2019 20:03:21 +0200

some improvements

- improve table rendering (in a hacky way for now).
- print characters safely (disallow control characters except TAB and
  newline for now).
- print "Link references" before the links at the bottom.
- update TODO.

Diffstat:
TODO | 2 ++
main.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/TODO b/TODO @@ -1,3 +1,5 @@ +- print safe (not certain control chars, except newline, TAB etc). + - improve/remove duplicate white-space/newlines? - cleanup code. diff --git a/main.c b/main.c @@ -47,6 +47,7 @@ static char src[4096]; /* src or href attribute */ #define MAX_DEPTH 256 static struct node nodes[MAX_DEPTH]; static int curnode; +static int ignoredata; static char *pretags[] = { "pre", @@ -152,32 +153,46 @@ string_append(String *s, const char *data, size_t len) s->data[s->len] = '\0'; } -#if 0 -static void -safeprint(const char *s, size_t len) +char * +xstrdup(const char *s) { - size_t i; + char *p; + + if (!(p = strdup(s))) + err(1, "strdup"); + return p; +} + +void * +xcalloc(size_t nmemb, size_t size) +{ + void *p; + + if (!(p = calloc(nmemb, size))) + err(1, "calloc"); + return p; +} - for (i = 0; i < len && *s; i++) { +static void +printsafe(const char *s) +{ + for (; *s; s++) { switch (*s) { case '\t': case '\n': putchar(*s); break; default: - if (iscntrl(*s)) - putchar(' '); - else + if (!iscntrl((unsigned char)*s)) putchar(*s); } } } -#endif static void xmlcdata(XMLParser *p, const char *data, size_t datalen) { - fputs(data, stdout); + printsafe(data); } static void @@ -215,7 +230,8 @@ xmldataend(XMLParser *p) if (s != start && !isspace((unsigned char)s[-1])) putchar(' '); } else { - putchar(*s); + if (!iscntrl((unsigned char)*s)) + putchar(*s); } } if (s != start && e != start && !isspace((unsigned char)s[-1]) && @@ -231,6 +247,8 @@ xmldataend(XMLParser *p) static void xmldata(XMLParser *p, const char *data, size_t datalen) { + if (ignoredata) + return; string_append(&htmldata, data, datalen); } @@ -267,6 +285,11 @@ xmltagstart(XMLParser *x, const char *t, size_t tl) src[0] = '\0'; /* src, href */ strlcpy(cur->tag, t, sizeof(cur->tag)); + if (!strcasecmp(t, "table")) + ignoredata = 1; + else if (!strcasecmp(t, "td") || !strcasecmp(t, "th")) + ignoredata = 0; + for (i = 0; i < sizeof(pretags) / sizeof(*pretags); i++) { if 
(!strcasecmp(pretags[i], t)) { cur->ispre = 1; @@ -295,18 +318,25 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) cur = &nodes[curnode]; - if (!strcasecmp(t, "tr")) - fputs(" | ", stdout); /* HACK */ + if (!strcasecmp(t, "tr")) { + fputs(" | ", stdout); /* HACK: last cell */ + return; + } else if (!strcasecmp(t, "td") || !strcasecmp(t, "th")) { + ignoredata = 1; + return; + } else if (!strcasecmp(t, "table")) { + ignoredata = 0; + } if (cur->isblock) fputs("\n", stdout); if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') { if (t[1] >= '3') - for (i = 0; i < 36; i++) + for (i = 0; i < 72; i++) putchar('-'); else if (t[1] >= '1') - for (i = 0; i < 36; i++) + for (i = 0; i < 72; i++) putchar('='); putchar('\n'); } @@ -345,11 +375,11 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) printf("[%s]", t); /* TODO: check allocation */ if (!links_head) - links_cur = links_head = calloc(1, sizeof(*links_head)); + links_cur = links_head = xcalloc(1, sizeof(*links_head)); else - links_cur = links_cur->next = calloc(1, sizeof(*links_head)); - links_cur->type = strdup(t); - links_cur->url = strdup(src); + links_cur = links_cur->next = xcalloc(1, sizeof(*links_head)); + links_cur->type = xstrdup(t); + links_cur->url = xstrdup(src); } src[0] = '\0'; #endif @@ -373,7 +403,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) support ordered number type only */ fputs("* ", stdout); } else if (!strcasecmp(t, "hr")) { - for (i = 0; i < 36; i++) + for (i = 0; i < 72; i++) putchar('-'); } } @@ -396,6 +426,8 @@ printlinkrefs(void) { size_t i; + printf("\n\nLink references:\n"); + /* TODO: add title attribute or some basic description? */ for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++) printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);