webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit dcc69463abb4a70f95b6126629e5d6ab57e393e3
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 22 Jul 2017 14:36:51 +0200

initial repo (experiment)

Diffstat:
Makefile | 5+++++
main.c | 234+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
xml.c | 436+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
xml.h | 44++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 719 insertions(+), 0 deletions(-)

diff --git a/Makefile b/Makefile
@@ -0,0 +1,5 @@
+build: clean
+	cc xml.c main.c -o main
+
+clean:
+	rm -f main *.o
diff --git a/main.c b/main.c
@@ -0,0 +1,234 @@
+#include <ctype.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "xml.h"
+
+/* string and size (STRP is not referenced anywhere else in this file) */
+#define STRP(s) s,sizeof(s)-1
+/* Parser instance, plus flags that the data and CDATA handlers below set and clear. */
+static XMLParser parser;
+static int isdatastart;
+static int iscdatastart;
+/* One slot per nesting level: the tag name and the class assigned to it in xmltagstart. */
+struct node {
+	char tag[256];
+	int ispre;
+	int isinline;
+	int isblock;
+};
+/* NOTE(review): curnode indexes nodes[]; xmltagstartparsed increments it without a check against MAX_DEPTH. */
+#define MAX_DEPTH 256
+static struct node nodes[MAX_DEPTH];
+static int curnode;
+/* Indentation hook; the body is commented out, so this currently prints nothing. */
+static void
+printindent(int count)
+{
+/*	while (count--)
+		putchar('\t');*/
+}
+/* Comment-start handler; its output is commented out, so it does nothing. */
+static void
+xmlcommentstart(XMLParser *p)
+{
+	/*printf("<!--");*/
+}
+/* Comment-data handler; its output is commented out, so comment text is dropped. */
+static void
+xmlcomment(XMLParser *p, const char *data, size_t datalen)
+{
+	/*printf("%s", data);*/
+}
+/* Comment-end handler; its output is commented out, so it does nothing. */
+static void
+xmlcommentend(XMLParser *p)
+{
+	/*printf("-->");*/
+}
+/* CDATA-start handler: sets iscdatastart. */
+static void
+xmlcdatastart(XMLParser *p)
+{
+	iscdatastart = 1;
+/*	printf("<![CDATA[");*/
+}
+/* CDATA-data handler: the data itself is not printed; only iscdatastart is cleared. */
+static void
+xmlcdata(XMLParser *p, const char *data, size_t datalen)
+{
+	/* TODO */
+/*	printf("%s", data);*/
+	iscdatastart = 0;
+}
+/* CDATA-end handler: clears iscdatastart. */
+static void
+xmlcdataend(XMLParser *p)
+{
+/*	printf("]]>");*/
+	iscdatastart = 0;
+}
+/* Marks the beginning of a text run; xmldata uses the flag to collapse leading whitespace. */
+static void
+xmldatastart(XMLParser *p)
+{
+	isdatastart = 1;
+}
+/* Marks the end of a text run by clearing isdatastart. */
+static void
+xmldataend(XMLParser *p)
+{
+	isdatastart = 0;
+}
+/* Text handler: writes the data to stdout, turning whitespace into single spaces unless the node is flagged ispre. */
+static void
+xmldata(XMLParser *p, const char *data, size_t datalen)
+{
+	struct node *cur;
+	const char *s = data;
+/* NOTE(review): curnode was already incremented in xmltagstartparsed, so this is the slot after the tag just opened. */
+	cur = &nodes[curnode];
+/* NOTE(review): s is walked up to the NUL terminator and datalen is unused; isspace() is handed a plain char. */
+	/* TODO: if not <pre> or w/e, skip? */
+	if (isdatastart && isspace(*s)) {
+		for (s++; *s; s++) {
+			if (!isspace(*s))
+				break;
+		}
+		putchar(' ');
+	}
+/* ispre copies bytes verbatim; otherwise every whitespace byte is written as a space */
+	if (cur->ispre) {
+		for (; *s; s++) {
+			putchar(*s);
+		}
+	} else {
+		for (; *s; s++) {
+			if (isspace(*s))
+				putchar(' ');
+			else
+				putchar(*s);
+		}
+	}
+
+	/* TODO: remove trailing space also ? */
+	isdatastart = 0;
+}
+/* Entity handler: passes the raw entity text on to xmldata; no conversion is done yet. */
+static void
+xmldataentity(XMLParser *p, const char *data, size_t datalen)
+{
+	/* TODO: convert HTML entity */
+	/*printf("%s", data);*/
+
+	xmldata(p, data, datalen);
+}
+/* Start-tag handler: clears nodes[curnode], stores the tag name and flags it as pre, block or inline. */
+static void
+xmltagstart(XMLParser *p, const char *tag, size_t taglen)
+{
+	struct node *cur = &nodes[curnode];
+
+	memset(cur, 0, sizeof(*cur));
+	strlcpy(cur->tag, tag, sizeof(cur->tag));
+/* the h1-h6 test is an && group, which binds tighter than the || alternatives that follow it */
+	if (!strcmp(tag, "pre"))
+		cur->ispre = 1;
+	else if (tag[0] == 'h' && tag[1] >= '1' && tag[1] <= '6' && tag[2] == '\0' ||
+	    !strcmp(tag, "p") || !strcmp(tag, "ul") || !strcmp(tag, "ol") ||
+	    !strcmp(tag, "li") || !strcmp(tag, "hr") ||
+	    !strcmp(tag, "br") || !strcmp(tag, "title") || !strcmp(tag, "tr") ||
+	    !strcmp(tag, "table"))
+		cur->isblock = 1;
+	else if (!strcmp(tag, "a") || !strcmp(tag, "span") || !strcmp(tag, "img"))
+		cur->isinline = 1;
+
+	if (!cur->isinline)
+		printindent(curnode);
+
+/*	printf("<%s", tag);*/
+}
+/* End-tag handler: steps curnode back (never below 0) and, for non-short tags, ends the line if that node is a block. */
+static void
+xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
+{
+	struct node *cur;
+/* NOTE(review): this runs before the isshort check, yet xmltagstartparsed skips its increment for short tags. */
+	if (curnode)
+		curnode--;
+
+	if (isshort) {
+/*		printf("/>");*/
+		return;
+	}
+	cur = &nodes[curnode];
+	if (!cur->isinline)
+		printindent(curnode);
+/*	printf("</%s>", tag);*/
+	if (cur->isblock)
+		fputs("\n", stdout);
+}
+/* Runs after a start tag and its attributes: newline for blocks, bullet for li, rule for hr, then one level deeper. */
+static void
+xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort)
+{
+	struct node *cur;
+
+	cur = &nodes[curnode];
+	if (cur->isblock)
+		putchar('\n');
+	if (!strcmp(cur->tag, "li"))
+		fputs("* ", stdout);
+	else if (!strcmp(cur->tag, "hr"))
+		fputs("----------", stdout);
+
+	if (isshort)
+		return;
+/* NOTE(review): no upper bound; input nested deeper than MAX_DEPTH moves curnode past the end of nodes[]. */
+	curnode++;
+
+/*	printf(">");*/
+}
+/* Attribute handler; every statement in it is commented out, so attributes are ignored. */
+static void
+xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
+        size_t namelen, const char *value, size_t valuelen)
+{
+/*	if (!strcmp(tag, "a") && !strcmp(name, "href") && valuelen)
+		printf(" [%s]", value);*/
+
+/*	printf(" %s=\"%s\"", name, value);*/
+}
+/* Entry point: calls pledge with stdio, installs the handlers, parses the getchar stream and prints a final newline. */
+int
+main(int argc, char *argv[])
+{
+	if (pledge("stdio", NULL) < 0)
+		err(1, "pledge");
+/* handlers not assigned here (xmlattrstart, xmlattrend, xmlattrentity) stay NULL in the static parser */
+	parser.xmlattr = xmlattr;
+	parser.xmlcdatastart = xmlcdatastart;
+	parser.xmlcdata = xmlcdata;
+	parser.xmlcdataend = xmlcdataend;
+	parser.xmlcommentstart = xmlcommentstart;
+	parser.xmlcomment = xmlcomment;
+	parser.xmlcommentend = xmlcommentend;
+	parser.xmldatastart = xmldatastart;
+	parser.xmldata = xmldata;
+	parser.xmldataend = xmldataend;
+	parser.xmldataentity = xmldataentity;
+	parser.xmltagstart = xmltagstart;
+	parser.xmltagend = xmltagend;
+	parser.xmltagstartparsed = xmltagstartparsed;
+/* the parser pulls its input one byte at a time through getchar */
+	parser.getnext = getchar;
+	xml_parse(&parser);
+	putchar('\n');
+
+	return 0;
+}
diff --git a/xml.c b/xml.c
@@ -0,0 +1,436 @@
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xml.h"
+/* Reads the attributes of the current tag up to the closing > or EOF and reports them through the xmlattr* handlers. */
+static void
+xml_parseattrs(XMLParser *x)
+{
+	size_t namelen = 0, valuelen;
+	int c, endsep, endname = 0;
+/* namelen counts the bytes collected in x->name; endname is set once whitespace follows a non-empty name */
+	while ((c = x->getnext()) != EOF) {
+		if (isspace(c)) { /* TODO: simplify endname ? */
+			if (namelen)
+				endname = 1;
+			continue;
+		}
+		if (c == '?')
+			; /* ignore */
+		else if (c == '=') {
+			x->name[namelen] = '\0';
+		} else if (namelen && ((endname && isalpha(c)) || (c == '>' || c == '/'))) {
+			/* attribute without value */
+			x->name[namelen] = '\0';
+			if (x->xmlattrstart)
+				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+			if (x->xmlattr)
+				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+			if (x->xmlattrend)
+				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+			endname = 0;
+			x->name[0] = c;
+			namelen = 1;
+		} else if (namelen && (c == '\'' || c == '"')) {
+			/* attribute with value */
+			endsep = c; /* c is end separator */
+			if (x->xmlattrstart)
+				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+			for (valuelen = 0; (c = x->getnext()) != EOF;) {
+				if (c == '&') { /* entities */
+					x->data[valuelen] = '\0';
+					/* call data function with data before entity if there is data */
+					if (valuelen && x->xmlattr)
+						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					x->data[0] = c;
+					valuelen = 1;
+					while ((c = x->getnext()) != EOF) {
+						if (c == endsep)
+							break;
+						if (valuelen < sizeof(x->data) - 1)
+							x->data[valuelen++] = c;
+						else {
+							/* TODO: entity too long? this should be very strange. */
+							x->data[valuelen] = '\0';
+							if (x->xmlattr)
+								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+							valuelen = 0;
+							break;
+						}
+						if (c == ';') {
+							x->data[valuelen] = '\0';
+							if (x->xmlattrentity)
+								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+							valuelen = 0;
+							break;
+						}
+					}
+				} else if (c != endsep) {
+					if (valuelen < sizeof(x->data) - 1) {
+						x->data[valuelen++] = c;
+					} else {
+						x->data[valuelen] = '\0';
+						if (x->xmlattr)
+							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+						x->data[0] = c;
+						valuelen = 1;
+					}
+				}
+				if (c == endsep) {
+					x->data[valuelen] = '\0';
+					if (x->xmlattr)
+						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+					if (x->xmlattrend)
+						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+					break;
+				}
+			}
+			namelen = 0;
+			endname = 0;
+		} else if (namelen < sizeof(x->name) - 1) {
+			x->name[namelen++] = c;
+		}
+		if (c == '>') {
+			break;
+		} else if (c == '/') {
+			x->isshorttag = 1;
+			namelen = 0;
+			x->name[0] = '\0';
+		}
+	}
+}
+/* Consumes a comment body up to the terminator held in end, handing the text to xmlcomment in buffer-sized pieces. */
+static void
+xml_parsecomment(XMLParser *x)
+{
+	static const char *end = "-->";
+	size_t datalen = 0, i = 0;
+	char tmp[4];
+	int c;
+/* NOTE(review): on a failed partial match the current byte is stored as data and not re-tested, so ---> does not end the comment. */
+	if (x->xmlcommentstart)
+		x->xmlcommentstart(x);
+	while ((c = x->getnext()) != EOF) {
+		if (c == end[i]) {
+			if (end[++i] == '\0') { /* end */
+				x->data[datalen] = '\0';
+				if (x->xmlcomment)
+					x->xmlcomment(x, x->data, datalen);
+				if (x->xmlcommentend)
+					x->xmlcommentend(x);
+				return;
+			}
+		} else if (i) {
+			if (x->xmlcomment) {
+				x->data[datalen] = '\0';
+				if (datalen)
+					x->xmlcomment(x, x->data, datalen);
+				memcpy(tmp, end, i);
+				tmp[i] = '\0';
+				x->xmlcomment(x, tmp, i);
+			}
+			i = 0;
+			x->data[0] = c;
+			datalen = 1;
+		} else if (datalen < sizeof(x->data) - 1) {
+			x->data[datalen++] = c;
+		} else {
+			x->data[datalen] = '\0';
+			if (x->xmlcomment)
+				x->xmlcomment(x, x->data, datalen);
+			x->data[0] = c;
+			datalen = 1;
+		}
+	}
+}
+/* Consumes a CDATA body up to the terminator held in end, handing the text to xmlcdata in buffer-sized pieces. */
+static void
+xml_parsecdata(XMLParser *x)
+{
+	static const char *end = "]]>";
+	size_t datalen = 0, i = 0;
+	char tmp[4];
+	int c;
+/* NOTE(review): same matching as xml_parsecomment; content ending in an extra ] (that is ]]]>) is not recognised as the end. */
+	if (x->xmlcdatastart)
+		x->xmlcdatastart(x);
+	while ((c = x->getnext()) != EOF) {
+		if (c == end[i]) {
+			if (end[++i] == '\0') { /* end */
+				x->data[datalen] = '\0';
+				if (x->xmlcdata)
+					x->xmlcdata(x, x->data, datalen);
+				if (x->xmlcdataend)
+					x->xmlcdataend(x);
+				return;
+			}
+		} else if (i) {
+			x->data[datalen] = '\0';
+			if (x->xmlcdata) {
+				if (datalen)
+					x->xmlcdata(x, x->data, datalen);
+				memcpy(tmp, end, i);
+				tmp[i] = '\0';
+				x->xmlcdata(x, tmp, i);
+			}
+			i = 0;
+			x->data[0] = c;
+			datalen = 1;
+		} else if (datalen < sizeof(x->data) - 1) {
+			x->data[datalen++] = c;
+		} else {
+			x->data[datalen] = '\0';
+			if (x->xmlcdata)
+				x->xmlcdata(x, x->data, datalen);
+			x->data[0] = c;
+			datalen = 1;
+		}
+	}
+}
+/* Packs the UTF-8 bytes for cp into *utf, lead byte in the highest used byte; returns the byte count (0 when cp is 0). */
+int
+xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
+{
+	if (cp >= 0x10000) {
+		/* 4 bytes */
+		*utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
+		       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+		       (cp & 0x3f);
+		return 4;
+	} else if (cp >= 0x00800) {
+		/* 3 bytes */
+		*utf = 0xe08080 |
+		       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
+		       (cp & 0x3f);
+		return 3;
+	} else if (cp >= 0x80) {
+		/* 2 bytes */
+		*utf = 0xc080 |
+		       ((cp & 0xfc0) << 2) | (cp & 0x3f);
+		return 2;
+	}
+	*utf = cp & 0xff;
+	return *utf ? 1 : 0; /* 1 byte */
+}
+/* Looks e up in the table below; returns 1 with the character in buf on a match, 0 on no match, -1 if bufsiz < 2. */
+ssize_t
+xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	const struct {
+		char *entity;
+		int c;
+	} entities[] = {
+		{ .entity = "&amp;", .c = '&' },
+		{ .entity = "&lt;", .c = '<' },
+		{ .entity = "&gt;", .c = '>' },
+		{ .entity = "&apos;", .c = '\'' },
+		{ .entity = "&quot;", .c = '"' },
+		{ .entity = "&AMP;", .c = '&' },
+		{ .entity = "&LT;", .c = '<' },
+		{ .entity = "&GT;", .c = '>' },
+		{ .entity = "&APOS;", .c = '\'' },
+		{ .entity = "&QUOT;", .c = '"' }
+	};
+	size_t i;
+
+	/* buffer is too small */
+	if (bufsiz < 2)
+		return -1;
+
+	/* doesn't start with &: can't match */
+	if (*e != '&')
+		return 0;
+/* strcmp compares the whole string, so e must be exactly the entity including its trailing ; */
+	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+		if (!strcmp(e, entities[i].entity)) {
+			buf[0] = entities[i].c;
+			buf[1] = '\0';
+			return 1;
+		}
+	}
+	return 0;
+}
+/* Decodes a decimal or x-prefixed hex numeric entity into UTF-8 in buf; returns its length, 0 if malformed, -1 if bufsiz < 5. */
+ssize_t
+xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	uint32_t l = 0, cp = 0;
+	size_t b, len;
+	char *end;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* not a numeric entity */
+	if (!(e[0] == '&' && e[1] == '#'))
+		return 0;
+/* NOTE(review): the strtoul result is narrowed to the uint32_t l and is not range-checked before it is encoded. */
+	/* e[1] == '#', numeric / hexadecimal entity */
+	e += 2; /* skip "&#" */
+	errno = 0;
+	/* hex (16) or decimal (10) */
+	if (*e == 'x')
+		l = strtoul(e + 1, &end, 16);
+	else
+		l = strtoul(e, &end, 10);
+	/* invalid value or not a well-formed entity */
+	if (errno || *end != ';')
+		return 0;
+	len = xml_codepointtoutf8(l, &cp);
+	/* make string: emit the packed value from its most significant used byte down */
+	for (b = 0; b < len; b++)
+		buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
+	buf[len] = '\0';
+
+	return (ssize_t)len;
+}
+
+/* convert named- or numeric entity string to buffer string
+ * returns byte-length of string, 0 if e is not a recognised entity,
+ * or -1 if bufsiz is smaller than 5. */
+ssize_t
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+	/* doesn't start with & */
+	if (e[0] != '&')
+		return 0;
+	/* named entity */
+	if (e[1] != '#')
+		return xml_namedentitytostr(e, buf, bufsiz);
+	else /* numeric entity */
+		return xml_numericentitytostr(e, buf, bufsiz);
+}
+/* Parser main loop: reads bytes through x->getnext until EOF and invokes whichever handlers are set on x. */
+void
+xml_parse(XMLParser *x)
+{
+	int c, ispi;
+	size_t datalen, tagdatalen, taglen;
+/* without an input callback there is nothing to read */
+	if (!x->getnext)
+		return;
+	while ((c = x->getnext()) != EOF && c != '<')
+		; /* skip until < */
+/* each pass handles either one tag (c is <) or the text run that lasts until the next < */
+	while (c != EOF) {
+		if (c == '<') { /* parse tag */
+			if ((c = x->getnext()) == EOF)
+				return;
+			x->tag[0] = '\0';
+			x->taglen = 0;
+			if (c == '!') { /* cdata and comments */
+				for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
+					if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */
+						x->data[tagdatalen++] = c; /* TODO: prevent overflow */
+					if (c == '>')
+						break;
+					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+					         (x->data[0] == '-')) {
+						xml_parsecomment(x);
+						break;
+					} else if (c == '[') {
+						if (tagdatalen == sizeof("[CDATA[") - 1 &&
+						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
+							xml_parsecdata(x);
+							break;
+						}
+					}
+				}
+			} else {
+				/* normal tag (open, short open, close), processing instruction. */
+				if (isspace(c))
+					while ((c = x->getnext()) != EOF && isspace(c))
+						;
+				if (c == EOF)
+					return;
+				x->tag[0] = c;
+				ispi = (c == '?') ? 1 : 0;
+				x->isshorttag = ispi;
+				taglen = 1;
+				while ((c = x->getnext()) != EOF) {
+					if (c == '/') /* TODO: simplify short tag? */
+						x->isshorttag = 1; /* short tag */
+					else if (c == '>' || isspace(c)) {
+						x->tag[taglen] = '\0';
+						if (x->tag[0] == '/') { /* end tag, starts with </ */
+							x->taglen = --taglen; /* len -1 because of / */
+							if (taglen && x->xmltagend)
+								x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
+						} else {
+							x->taglen = taglen;
+							/* start tag */
+							if (x->xmltagstart)
+								x->xmltagstart(x, x->tag, x->taglen);
+							if (isspace(c))
+								xml_parseattrs(x);
+							if (x->xmltagstartparsed)
+								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+						}
+						/* call tagend for shortform or processing instruction */
+						if ((x->isshorttag || ispi) && x->xmltagend)
+							x->xmltagend(x, x->tag, x->taglen, 1);
+						break;
+					} else if (taglen < sizeof(x->tag) - 1)
+						x->tag[taglen++] = c;
+				}
+			}
+		} else {
+			/* parse tag data */
+			datalen = 0;
+			if (x->xmldatastart)
+				x->xmldatastart(x);
+			while ((c = x->getnext()) != EOF) {
+				if (c == '&') {
+					if (datalen) {
+						x->data[datalen] = '\0';
+						if (x->xmldata)
+							x->xmldata(x, x->data, datalen);
+					}
+					x->data[0] = c;
+					datalen = 1;
+					while ((c = x->getnext()) != EOF) {
+						if (c == '<')
+							break;
+						if (datalen < sizeof(x->data) - 1)
+							x->data[datalen++] = c;
+						if (isspace(c))
+							break;
+						else if (c == ';') {
+							x->data[datalen] = '\0';
+							if (x->xmldataentity)
+								x->xmldataentity(x, x->data, datalen);
+							datalen = 0;
+							break;
+						}
+					}
+				} else if (c != '<') {
+					if (datalen < sizeof(x->data) - 1) {
+						x->data[datalen++] = c;
+					} else {
+						x->data[datalen] = '\0';
+						if (x->xmldata)
+							x->xmldata(x, x->data, datalen);
+						x->data[0] = c;
+						datalen = 1;
+					}
+				}
+				if (c == '<') {
+					x->data[datalen] = '\0';
+					if (x->xmldata && datalen)
+						x->xmldata(x, x->data, datalen);
+					if (x->xmldataend)
+						x->xmldataend(x);
+					break;
+				}
+			}
+		}
+	}
+}
diff --git a/xml.h b/xml.h
@@ -0,0 +1,44 @@
+typedef struct xmlparser {
+	/* handlers; each may be left NULL, the parser tests a pointer before calling through it */
+	void (*xmlattr)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t, const char *, size_t);
+	void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t);
+	void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t);
+	void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+	      const char *, size_t, const char *, size_t);
+	void (*xmlcdatastart)(struct xmlparser *);
+	void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+	void (*xmlcdataend)(struct xmlparser *);
+	void (*xmlcommentstart)(struct xmlparser *);
+	void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+	void (*xmlcommentend)(struct xmlparser *);
+	void (*xmldata)(struct xmlparser *, const char *, size_t);
+	void (*xmldataend)(struct xmlparser *);
+	void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+	void (*xmldatastart)(struct xmlparser *);
+	void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+	void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+	void (*xmltagstartparsed)(struct xmlparser *, const char *,
+	      size_t, int);
+/* input callback: returns the next byte, or EOF; xml_parse returns at once when it is NULL */
+	int (*getnext)(void);
+
+	/* current tag */
+	char tag[1024];
+	size_t taglen;
+	/* current tag is in short form ? <tag /> */
+	int isshorttag;
+	/* current attribute name */
+	char name[256];
+	/* data buffer used for tag data, cdata and attribute data */
+	char data[BUFSIZ];
+} XMLParser;
+/* NOTE(review): no include guard, and size_t, ssize_t, uint32_t and BUFSIZ must come from headers included earlier. */
+int xml_codepointtoutf8(uint32_t, uint32_t *);
+ssize_t xml_entitytostr(const char *, char *, size_t);
+ssize_t xml_namedentitytostr(const char *, char *, size_t);
+ssize_t xml_numericentitytostr(const char *, char *, size_t);
+
+void xml_parse(XMLParser *);