webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit cd440128ab251321e18dc25802936a30cf25a5e9
parent 0ac210d169689c8e8a33351adf6a2d06b9f7322d
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Thu, 27 Jun 2019 19:41:13 +0200

sync XML improvements from sfeed

Diffstat:
xml.c | 125 +++++++++++++++++++++++++++++++++++++------------------------------------------
xml.h | 5 +++++
2 files changed, 63 insertions(+), 67 deletions(-)

diff --git a/xml.c b/xml.c @@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x) size_t namelen = 0, valuelen; int c, endsep, endname = 0, valuestart = 0; - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { if (isspace(c)) { if (namelen) endname = 1; @@ -51,7 +51,7 @@ xml_parseattrs(XMLParser *x) goto startvalue; } - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { startvalue: if (c == '&') { /* entities */ x->data[valuelen] = '\0'; @@ -60,7 +60,7 @@ startvalue: x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); x->data[0] = c; valuelen = 1; - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) break; if (valuelen < sizeof(x->data) - 1) @@ -124,9 +124,9 @@ xml_parsecomment(XMLParser *x) if (x->xmlcommentstart) x->xmlcommentstart(x); - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { if (c == '-' || c == '>') { - if (x->xmlcomment) { + if (x->xmlcomment && datalen) { x->data[datalen] = '\0'; x->xmlcomment(x, x->data, datalen); datalen = 0; @@ -173,9 +173,9 @@ xml_parsecdata(XMLParser *x) if (x->xmlcdatastart) x->xmlcdatastart(x); - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { if (c == ']' || c == '>') { - if (x->xmlcdata) { + if (x->xmlcdata && datalen) { x->data[datalen] = '\0'; x->xmlcdata(x, x->data, datalen); datalen = 0; @@ -247,19 +247,19 @@ static int namedentitytostr(const char *e, char *buf, size_t bufsiz) { static const struct { - char *entity; + const char *entity; int c; } entities[] = { - { "&amp;", '&' }, - { "&lt;", '<' }, - { "&gt;", '>' }, - { "&apos;", '\'' }, - { "&quot;", '"' }, - { "&AMP;", '&' }, - { "&LT;", '<' }, - { "&GT;", '>' }, - { "&APOS;", '\'' }, - { "&QUOT;", '"' } + { "amp;", '&' }, + { "lt;", '<' }, + { "gt;", '>' }, + { "apos;", '\'' }, + { "quot;", '"' }, + { "AMP;", '&' }, + { "LT;", '<' }, + { "GT;", '>' }, + { "APOS;", '\'' }, + { "QUOT;", '"' 
} }; size_t i; @@ -267,10 +267,6 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz) if (bufsiz < 2) return -1; - /* doesn't start with &: can't match */ - if (*e != '&') - return 0; - for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) { if (!strcmp(e, entities[i].entity)) { buf[0] = entities[i].c; @@ -292,12 +288,6 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz) if (bufsiz < 5) return -1; - /* not a numeric entity */ - if (e[0] != '&' || e[1] != '#') - return 0; - - /* e[1] == '#', numeric / hexadecimal entity */ - e += 2; /* skip "&#" */ errno = 0; /* hex (16) or decimal (10) */ if (*e == 'x') @@ -318,37 +308,32 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz) int xml_entitytostr(const char *e, char *buf, size_t bufsiz) { - /* buffer is too small */ - if (bufsiz < 5) - return -1; /* doesn't start with & */ if (e[0] != '&') return 0; - /* named entity */ - if (e[1] != '#') - return namedentitytostr(e, buf, bufsiz); - else /* numeric entity */ - return numericentitytostr(e, buf, bufsiz); + /* numeric entity */ + if (e[1] == '#') + return numericentitytostr(e + 2, buf, bufsiz); + else /* named entity */ + return namedentitytostr(e + 1, buf, bufsiz); } void xml_parse(XMLParser *x) { - int c, ispi; - size_t datalen, tagdatalen, taglen; + size_t datalen, tagdatalen; + int c, isend; - if (!x->getnext) - return; - while ((c = x->getnext()) != EOF && c != '<') + while ((c = GETNEXT()) != EOF && c != '<') ; /* skip until < */ while (c != EOF) { if (c == '<') { /* parse tag */ - if ((c = x->getnext()) == EOF) + if ((c = GETNEXT()) == EOF) return; if (c == '!') { /* cdata and comments */ - for (tagdatalen = 0; (c = x->getnext()) != EOF;) { + for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */ if (tagdatalen <= sizeof("[CDATA[") - 1) x->data[tagdatalen++] = c; @@ -367,30 +352,32 @@ xml_parse(XMLParser *x) } } } else { - x->tag[0] = '\0'; - x->taglen = 0; - /* normal tag 
(open, short open, close), processing instruction. */ - if (isspace(c)) - while ((c = x->getnext()) != EOF && isspace(c)) - ; - if (c == EOF) - return; x->tag[0] = c; - ispi = (c == '?') ? 1 : 0; - x->isshorttag = ispi; - taglen = 1; - while ((c = x->getnext()) != EOF) { + x->taglen = 1; + x->isshorttag = isend = 0; + + /* treat processing instruction as shorttag, don't strip "?" prefix. */ + if (c == '?') { + x->isshorttag = 1; + } else if (c == '/') { + if ((c = GETNEXT()) == EOF) + return; + x->tag[0] = c; + isend = 1; + } + + while ((c = GETNEXT()) != EOF) { if (c == '/') x->isshorttag = 1; /* short tag */ else if (c == '>' || isspace(c)) { - x->tag[taglen] = '\0'; - if (x->tag[0] == '/') { /* end tag, starts with </ */ - x->taglen = --taglen; /* len -1 because of / */ - if (taglen && x->xmltagend) - x->xmltagend(x, &(x->tag)[1], x->taglen, 0); + x->tag[x->taglen] = '\0'; + if (isend) { /* end tag, starts with </ */ + if (x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; } else { - x->taglen = taglen; /* start tag */ if (x->xmltagstart) x->xmltagstart(x, x->tag, x->taglen); @@ -400,11 +387,15 @@ xml_parse(XMLParser *x) x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); } /* call tagend for shortform or processing instruction */ - if ((x->isshorttag || ispi) && x->xmltagend) - x->xmltagend(x, x->tag, x->taglen, 1); + if (x->isshorttag) { + if (x->xmltagend) + x->xmltagend(x, x->tag, x->taglen, x->isshorttag); + x->tag[0] = '\0'; + x->taglen = 0; + } break; - } else if (taglen < sizeof(x->tag) - 1) - x->tag[taglen++] = c; /* NOTE: tag name truncation */ + } else if (x->taglen < sizeof(x->tag) - 1) + x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ } } } else { @@ -412,7 +403,7 @@ xml_parse(XMLParser *x) datalen = 0; if (x->xmldatastart) x->xmldatastart(x); - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { if (c == '&') { if (datalen) { x->data[datalen] = '\0'; @@ -421,7 
+412,7 @@ xml_parse(XMLParser *x) } x->data[0] = c; datalen = 1; - while ((c = x->getnext()) != EOF) { + while ((c = GETNEXT()) != EOF) { if (c == '<') break; if (datalen < sizeof(x->data) - 1) diff --git a/xml.h b/xml.h @@ -1,3 +1,6 @@ +#ifndef _XML_H +#define _XML_H + typedef struct xmlparser { /* handlers */ void (*xmlattr)(struct xmlparser *, const char *, size_t, @@ -23,6 +26,7 @@ typedef struct xmlparser { void (*xmltagstartparsed)(struct xmlparser *, const char *, size_t, int); + #define GETNEXT (x)->getnext int (*getnext)(void); /* current tag */ @@ -38,3 +42,4 @@ typedef struct xmlparser { int xml_entitytostr(const char *, char *, size_t); void xml_parse(XMLParser *); +#endif