webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit ea14e82082be78917aaa6e380879c0e230330b47
parent 421341e1a2b737cb269a144a1634511705161651
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 22 Jul 2017 23:49:01 +0200

improve whitespace handling (needs more work)

Diffstat:
Makefile | 2+-
main.c | 108++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
2 files changed, 78 insertions(+), 32 deletions(-)

diff --git a/Makefile b/Makefile @@ -1,5 +1,5 @@ build: clean - cc xml.c main.c -o main + cc -ggdb -O0 -Wall xml.c main.c -o main clean: rm -f main *.o diff --git a/main.c b/main.c @@ -10,10 +10,9 @@ #include "xml.h" /* string and size */ -#define STRP(s) s,sizeof(s)-1 +/*#define STRP(s) s,sizeof(s)-1*/ static XMLParser parser; -static int isdatastart; struct node { char tag[256]; @@ -23,6 +22,15 @@ struct node { int isblock; }; +typedef struct node Node; + +/* String data / memory pool */ +typedef struct string { + char *data; /* data */ + size_t len; /* string length */ + size_t bufsiz; /* allocated size */ +} String; + static char src[4096]; /* src or href attribute */ #define MAX_DEPTH 256 @@ -70,58 +78,96 @@ static char *blocktags[] = { "table", }; +static String htmldata; + +/* Clear string only; don't free, prevents unnecessary reallocation. */ static void -xmlcdata(XMLParser *p, const char *data, size_t datalen) +string_clear(String *s) { - fputs(data, stdout); + if (s->data) + s->data[0] = '\0'; + s->len = 0; } static void -xmldatastart(XMLParser *p) +string_buffer_realloc(String *s, size_t newlen) { - isdatastart = 1; + size_t alloclen; + + for (alloclen = 64; alloclen <= newlen; alloclen *= 2) + ; + if (!(s->data = realloc(s->data, alloclen))) + err(1, "realloc"); + s->bufsiz = alloclen; } static void -xmldataend(XMLParser *p) +string_append(String *s, const char *data, size_t len) { - isdatastart = 0; + if (!len) + return; + /* check if allocation is necesary, don't shrink buffer, + * should be more than bufsiz ofcourse. */ + if (s->len + len >= s->bufsiz) + string_buffer_realloc(s, s->len + len + 1); + memcpy(s->data + s->len, data, len); + s->len += len; + s->data[s->len] = '\0'; } static void -xmldata(XMLParser *p, const char *data, size_t datalen) +xmlcdata(XMLParser *p, const char *data, size_t datalen) +{ + fputs(data, stdout); +} + +static void +xmldataend(XMLParser *p) { struct node *cur; - const char *s = data; + char *start, *s, *e; cur = &nodes[curnode]; - if (cur->isignore) - goto end; - /* TODO: if not <pre> or w/e, skip? */ - if (isdatastart && isspace(*s)) { - for (s++; *s; s++) { - if (!isspace(*s)) - break; - } - putchar(' '); - } + start = htmldata.data; + for (s = start; *s; s++) + if (*s != '\r' && *s != '\n') + break; + + e = s + strlen(s); + for (; e > s; e--) + if (*e != '\r' && *e != '\n') + break; if (cur->ispre) { - for (; *s; s++) - putchar(*s); + fwrite(s, 1, e - s, stdout); } else { - for (; *s; s++) { - if (isspace(*s)) - putchar(' '); - else + for (; s < e; s++) { + if (!isspace(*s)) + break; + } + for (; s < e; s++) { + if (!isspace(*s)) { + if (s != start && isspace(s[-1])) + putchar(' '); putchar(*s); + } } } -end: - /* TODO: remove trailing space also ? */ - isdatastart = 0; + string_clear(&htmldata); +} + +static void +xmldata(XMLParser *p, const char *data, size_t datalen) +{ + struct node *cur; + + cur = &nodes[curnode]; + if (cur->isignore) + return; + + string_append(&htmldata, data, datalen); } static void @@ -136,7 +182,7 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen) if (n <= 0) xmldata(p, data, datalen); else - fputs(buf, stdout); + string_append(&htmldata, buf, n); } static void @@ -254,6 +300,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, strlcpy(src, value, sizeof(src)); } +/* TODO: preprocess data, strip <script>, <style> etc */ int main(void) { @@ -262,7 +309,6 @@ main(void) parser.xmlattr = xmlattr; parser.xmlcdata = xmlcdata; - parser.xmldatastart = xmldatastart; parser.xmldata = xmldata; parser.xmldataend = xmldataend; parser.xmldataentity = xmldataentity;