webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit 114efd43e79a417abbda2e8c427d9dd57b482bce
parent ea14e82082be78917aaa6e380879c0e230330b47
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Mon, 24 Jul 2017 10:06:48 +0200

wip

Diffstat:
TODO | 1+
main.c | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/TODO b/TODO
@@ -0,0 +1 @@
+? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
diff --git a/main.c b/main.c
@@ -1,3 +1,6 @@
+/* TODO: escape control characters */
+/* TODO: specify and parse relative url */
+
 #include <ctype.h>
 #include <err.h>
 #include <stdint.h>
@@ -76,9 +79,12 @@ static char *blocktags[] = {
 	"title",
 	"tr",
 	"table",
+	"code",
+	"blockquote",
 };
 
 static String htmldata;
+static String preprocess;
 
 /* Clear string only; don't free, prevents unnecessary reallocation. */
 static void
@@ -115,6 +121,28 @@ string_append(String *s, const char *data, size_t len)
 	s->data[s->len] = '\0';
 }
 
+#if 0
+static void
+safeprint(const char *s, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len && *s; i++) {
+		switch (*s) {
+		case '\t':
+		case '\n':
+			putchar(*s);
+			break;
+		default:
+			if (iscntrl(*s))
+				putchar(' ');
+			else
+				putchar(*s);
+		}
+	}
+}
+#endif
+
 static void
 xmlcdata(XMLParser *p, const char *data, size_t datalen)
 {
@@ -128,31 +156,41 @@ xmldataend(XMLParser *p)
 	char *start, *s, *e;
 
 	cur = &nodes[curnode];
+	if (!htmldata.data || !htmldata.len)
+		return;
 
 	start = htmldata.data;
-	for (s = start; *s; s++)
+#if 1
+	s = start;
+	e = s + strlen(s);
+#else
+	for (s = start; *s; s++) {
 		if (*s != '\r' && *s != '\n')
 			break;
+	}
 
-	e = s + strlen(s);
-	for (; e > s; e--)
+	for (e = s + strlen(s); e > s; e--) {
 		if (*e != '\r' && *e != '\n')
 			break;
+	}
+#endif
 
 	if (cur->ispre) {
 		fwrite(s, 1, e - s, stdout);
 	} else {
+#if 0
 		for (; s < e; s++) {
-			if (!isspace(*s))
-				break;
-		}
-		for (; s < e; s++) {
-			if (!isspace(*s)) {
-				if (s != start && isspace(s[-1]))
+			if (isspace(*s)) {
+				if (s != start && !isspace(s[-1]))
 					putchar(' ');
+			} else {
 				putchar(*s);
 			}
 		}
+		if (s != start && e != start && !isspace(s[-1]) && isspace(e[-1]))
+			putchar(' ');
+#endif
+		printf("DEBUG: |%s|\n", start);
 	}
 
 	string_clear(&htmldata);
@@ -164,10 +202,9 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
 	struct node *cur;
 
 	cur = &nodes[curnode];
+	string_append(&htmldata, data, datalen);
 	if (cur->isignore)
 		return;
-
-	string_append(&htmldata, data, datalen);
 }
 
 static void
@@ -239,6 +276,9 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
 	src[0] = '\0';
 #endif
 
+	if (!strcmp(tag, "tr"))
+		fputs(" | ", stdout); /* HACK */
+
 	if (cur->isblock)
 		fputs("\n", stdout);
 
@@ -266,6 +306,9 @@ xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort)
 	if (cur->isblock)
 		fputs("\n", stdout);
 
+	if (!strcmp(tag, "td"))
+		fputs(" | ", stdout); /* HACK */
+
 	if (!strcmp(cur->tag, "li")) {
 		/* indent nested list items */
 		for (i = curnode; i; i--) {
@@ -295,18 +338,57 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
 	if (!strcmp(tag, "a") && !strcmp(name, "href") && valuelen)
 		strlcpy(src, value, sizeof(src));
 
-	/* TODO: check alt and title attr also? */
-	if (!strcmp(tag, "img") && !strcmp(name, "src") && valuelen)
+	if ((!strcmp(tag, "img") || !strcmp(tag, "video") || !strcmp(tag, "audio")) &&
+	    !strcmp(name, "src") && valuelen)
 		strlcpy(src, value, sizeof(src));
 }
 
+static size_t read_offset;
+
+int
+readchar(void)
+{
+	size_t i, j;
+	int c;
+
+	for (; readoffset < preprocess.len; ) {
+		if (preprocess.data[read_offset] != '<')
+			return preprocess.data[read_offset++];
+
+		for (j = 0; j < sizeof(ignoretags) / sizeof(*ignoretags); j++) {
+			if (!strncmp(&preprocess.data[i + 1], ignoretags[i], sizeof(ignoretags[i]) - 1)) {
+				if (strchr(" \t>", preprocess.data[i + 1 + sizeof(ignoretags[i]) - 1])) {
+					/* TODO: search until end of this tag */
+				}
+			}
+		}
+		/* TODO: if no match just return char */
+		return preprocess.data[read_offset++];
+	}
+	return EOF;
+}
+
 /* TODO: preprocess data, strip <script>, <style> etc */
 int
 main(void)
 {
+
+	char buf[BUFSIZ];
+	int n;
+
 	if (pledge("stdio", NULL) < 0)
 		err(1, "pledge");
 
+	/* TODO: optimize later */
+	while (1) {
+		/* TODO: check read error */
+		n = read(0, buf, sizeof(buf) - 1);
+		if (n <= 0)
+			break;
+		buf[n] = '\0';
+		string_append(&preprocess, buf, n);
+	}
+
 	parser.xmlattr = xmlattr;
 	parser.xmlcdata = xmlcdata;
 	parser.xmldata = xmldata;
@@ -316,7 +398,7 @@ main(void)
 	parser.xmltagend = xmltagend;
 	parser.xmltagstartparsed = xmltagstartparsed;
 
-	parser.getnext = getchar;
+	parser.getnext = readchar;
 	xml_parse(&parser);
 	putchar('\n');
 