webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit b82529ac7152b6326161c23b267d7719090ba168
parent f3f8b7d8e8f4b72c072488b524cfd0b08791fdb4
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sun, 22 Sep 2019 19:14:41 +0200

rename main.c to webdump.c

Diffstat:
main.c | 697-------------------------------------------------------------------------------
webdump.c | 706+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 706 insertions(+), 697 deletions(-)

diff --git a/main.c b/main.c @@ -1,697 +0,0 @@ -#include <ctype.h> -#include <err.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <strings.h> -#include <unistd.h> - -#include "xml.h" - -static XMLParser parser; - -/* uri */ -struct uri { - char proto[48]; - char host[256]; - char path[2048]; - char port[6]; /* numeric port */ -}; - -static int termwidth = 72; - -#if 0 -/* linked-list of link references */ -struct linkref { - char *type; - char *url; - struct linkref *next; -}; - -static struct linkref *links_head; -static struct linkref *links_cur; -static int linkcount; -#endif - -enum DisplayType { - DisplayUnknown = 0, - DisplayNone = 1, - DisplayPre = 2, - DisplayInline = 4, - DisplayInlineBlock = 8, - DisplayBlock = 16, - DisplayList = 32, - DisplayListItem = 64, - DisplayTable = 128, - DisplayTableRow = 256, - DisplayTableCell = 512, - DisplayHeader = 1024, -}; - -struct node { - char tag[256]; - enum DisplayType displaytype; -}; - -typedef struct node Node; - -/* String data / memory pool */ -typedef struct string { - char *data; /* data */ - size_t len; /* string length */ - size_t bufsiz; /* allocated size */ -} String; - -int absuri(char *, size_t, const char *, const char *); -int parseuri(const char *, struct uri *, int); - -static char *basehref = "https://codemadness.org"; - -static char src[4096]; /* src or href attribute */ - -#define MAX_DEPTH 256 -static struct node nodes[MAX_DEPTH]; -static int curnode; - -static struct { - char *tag; - enum DisplayType displaytype; -} tags[] = { - /* pre */ - { "pre", DisplayPre }, - { "code", DisplayPre }, - /* inline */ -#if 0 - { "b", DisplayInline }, - { "i", DisplayInline }, - { "u", DisplayInline }, - { "strong", DisplayInline }, - { "em", DisplayInline }, - { "a", DisplayInline }, - { "span", DisplayInline }, - { "img", DisplayInline }, - { "label", DisplayInline }, -#endif - /* table */ - { "table", DisplayTable }, - /* table-row */ - { "tr", DisplayTableRow }, - /* table-cell */ - { "td", DisplayTableCell }, - { "th", DisplayTableCell }, - /* list-item */ - { "li", DisplayListItem }, - /* header */ - { "h1", DisplayHeader }, - { "h2", DisplayHeader }, - { "h3", DisplayHeader }, - { "h4", DisplayHeader }, - { "h5", DisplayHeader }, - { "h6", DisplayHeader }, - /* break */ - { "br", 0 }, - /* list */ - { "ul", DisplayList }, - { "ol", DisplayList }, - /* block */ - { "p", DisplayBlock }, - { "blockquote", DisplayBlock }, - { "hr", DisplayBlock }, - { "title", DisplayBlock }, - { "nav", DisplayBlock }, - { "main", DisplayBlock }, - { "article", DisplayBlock }, - { "header", DisplayBlock }, - { "footer", DisplayBlock }, - { "div", DisplayBlock }, -}; - -static String htmldata; - -static const char *ignorestate, *endtag; -static int (*getnext)(void); - -/* return a space for all data until some case-insensitive string occurs. This - is used to parse incorrect HTML/XML that contains unescaped HTML in script - or style tags. If you see some </script> tag in a CDATA or comment - section then e-mail W3C and tell them the web is too complex. */ -static inline int -getnext_ignore(void) -{ - int c; - - if ((c = getnext()) == EOF) - return EOF; - - if (tolower(c) == tolower((unsigned char)*ignorestate)) { - ignorestate++; - if (*ignorestate == '\0') { - parser.getnext = getnext; /* restore */ - return c; - } - } else { - ignorestate = endtag; - } - - return ' '; -} - -/* Clear string only; don't free, prevents unnecessary reallocation. */ -static void -string_clear(String *s) -{ - if (s->data) - s->data[0] = '\0'; - s->len = 0; -} - -static void -string_buffer_realloc(String *s, size_t newlen) -{ - size_t alloclen; - - for (alloclen = 64; alloclen <= newlen; alloclen *= 2) - ; - if (!(s->data = realloc(s->data, alloclen))) - err(1, "realloc"); - s->bufsiz = alloclen; -} - -static void -string_append(String *s, const char *data, size_t len) -{ - if (!len) - return; - /* check if allocation is necesary, don't shrink buffer, - * should be more than bufsiz ofcourse. */ - if (s->len + len >= s->bufsiz) - string_buffer_realloc(s, s->len + len + 1); - memcpy(s->data + s->len, data, len); - s->len += len; - s->data[s->len] = '\0'; -} - -char * -estrdup(const char *s) -{ - char *p; - - if (!(p = strdup(s))) - err(1, "strdup"); - return p; -} - -void * -ecalloc(size_t nmemb, size_t size) -{ - void *p; - - if (!(p = calloc(nmemb, size))) - err(1, "calloc"); - return p; -} - -static void -printsafe(const char *s) -{ - for (; *s; s++) { - switch (*s) { - case '\t': - case '\n': - putchar(*s); - break; - default: - if (!iscntrl((unsigned char)*s)) - putchar(*s); - } - } -} - -int -parseuri(const char *s, struct uri *u, int rel) -{ - const char *p = s, *b; - char *endptr = NULL; - size_t i; - unsigned long l; - - u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0'; - if (!*s) - return 0; - - /* prefix is "//", don't read protocol, skip to domain parsing */ - if (!strncmp(p, "//", 2)) { - p += 2; /* skip "//" */ - } else { - /* protocol part */ - for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || - *p == '+' || *p == '-' || *p == '.'); p++) - ; - if (!strncmp(p, "://", 3)) { - if ((size_t)(p - s) >= sizeof(u->proto)) - return -1; /* protocol too long */ - memcpy(u->proto, s, p - s); - u->proto[p - s] = '\0'; - p += 3; /* skip "://" */ - } else { - p = s; /* no protocol format, set to start */ - /* relative url: read rest as path, else as domain */ - if (rel) - goto readpath; - } - } - /* IPv6 address */ - if (*p == '[') { - /* bracket not found or host too long */ - if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 || - (size_t)(b - p) >= sizeof(u->host)) - return -1; - memcpy(u->host, p, b - p + 1); - u->host[b - p + 1] = '\0'; - p = b + 1; - } else { - /* domain / host part, skip until port, path or end. */ - if ((i = strcspn(p, ":/")) >= sizeof(u->host)) - return -1; /* host too long */ - memcpy(u->host, p, i); - u->host[i] = '\0'; - p = &p[i]; - } - /* port */ - if (*p == ':') { - if ((i = strcspn(++p, "/")) >= sizeof(u->port)) - return -1; /* port too long */ - memcpy(u->port, p, i); - u->port[i] = '\0'; - /* check for valid port: range 1 - 65535 */ - errno = 0; - l = strtoul(u->port, &endptr, 10); - if (errno || u->port[0] == '\0' || *endptr || - !l || l > 65535) - return -1; - p = &p[i]; - } -readpath: - if (u->host[0]) { - p = &p[strspn(p, "/")]; - strlcpy(u->path, "/", sizeof(u->path)); - } else { - /* absolute uri must have a host specified */ - if (!rel) - return -1; - } - /* treat truncation as an error */ - if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path)) - return -1; - return 0; -} - -static int -encodeuri(char *buf, size_t bufsiz, const char *s) -{ - static const char *table = "0123456789ABCDEF"; - size_t i, b; - - for (i = 0, b = 0; s[i]; i++) { - if (s[i] == ' ' || - (unsigned char)s[i] > 127 || - iscntrl((unsigned char)s[i])) { - if (b + 3 >= bufsiz) - return -1; - buf[b++] = '%'; - buf[b++] = table[((unsigned char)s[i] >> 4) & 15]; - buf[b++] = table[(unsigned char)s[i] & 15]; - } else if (b < bufsiz) { - buf[b++] = s[i]; - } else { - return -1; - } - } - if (b >= bufsiz) - return -1; - buf[b] = '\0'; - - return 0; -} - -/* Get absolute uri; if `link` is relative use `base` to make it absolute. - * the returned string in `buf` is uri encoded, see: encodeuri(). */ -int -absuri(char *buf, size_t bufsiz, const char *link, const char *base) -{ - struct uri ulink, ubase; - char tmp[4096], *host, *p, *port; - int c, r; - size_t i; - - buf[0] = '\0'; - if (parseuri(base, &ubase, 0) == -1 || - parseuri(link, &ulink, 1) == -1 || - (!ulink.host[0] && !ubase.host[0])) - return -1; - - if (!strncmp(link, "//", 2)) { - host = ulink.host; - port = ulink.port; - } else { - host = ulink.host[0] ? ulink.host : ubase.host; - port = ulink.port[0] ? ulink.port : ubase.port; - } - r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s", - ulink.proto[0] ? - ulink.proto : - (ubase.proto[0] ? ubase.proto : "http"), - host, - port[0] ? ":" : "", - port); - if (r < 0 || (size_t)r >= sizeof(tmp)) - return -1; /* error or truncation */ - - /* relative to root */ - if (!ulink.host[0] && ulink.path[0] != '/') { - /* relative to base url path */ - if (ulink.path[0]) { - if ((p = strrchr(ubase.path, '/'))) { - /* temporary null-terminate */ - c = *(++p); - *p = '\0'; - i = strlcat(tmp, ubase.path, sizeof(tmp)); - *p = c; /* restore */ - if (i >= sizeof(tmp)) - return -1; - } - } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >= - sizeof(tmp)) { - return -1; - } - } - if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp)) - return -1; - - return encodeuri(buf, bufsiz, tmp); -} - -static void -xmlcdata(XMLParser *p, const char *data, size_t datalen) -{ - struct node *cur; - - cur = &nodes[curnode]; - if (cur->displaytype & DisplayNone) - return; - - printsafe(data); -} - -#if 0 -static void -xmldatastart(XMLParser *p) -{ -// printf("DEBUG: %s\n", __func__); -} -#endif - -static void -xmldataend(XMLParser *p) -{ - struct node *cur; - char *start, *s, *e; - -// printf("DEBUG: %s\n", __func__); - - if (!htmldata.data || !htmldata.len) - return; - - cur = &nodes[curnode]; - -// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype); - - if (!cur->displaytype || (cur->displaytype & DisplayNone)) { - /* nothing */ - } else if (cur->displaytype & DisplayPre) { - fwrite(htmldata.data, 1, htmldata.len, stdout); - } else { - start = htmldata.data; - e = htmldata.data + htmldata.len; - - /* TODO: better white-space handling, for example if there is only - white-space between 2 block elements then it can be ignored. */ - for (s = start; s < e; s++) { - if (*s == '\r') { - continue; - } else if (isspace((unsigned char)*s)) { - if (s == start || !isspace((unsigned char)s[-1])) - putchar(' '); - } else if (!iscntrl((unsigned char)*s)) { - putchar(*s); - } - } - } - - string_clear(&htmldata); -} - -static void -xmldata(XMLParser *p, const char *data, size_t datalen) -{ - struct node *cur; - - cur = &nodes[curnode]; - if (cur->displaytype & DisplayNone) - return; - - string_append(&htmldata, data, datalen); -} - -static void -xmldataentity(XMLParser *p, const char *data, size_t datalen) -{ - struct node *cur; - char buf[16]; - int n; - - cur = &nodes[curnode]; - if (cur->displaytype & DisplayNone) - return; - - /* convert basic XML entities */ - /* &nbsp; &copy;, copy table from Links (check license) */ - /* rsquo, hellip, ndash, lsquo */ - /* TODO: add to tscrape too */ - /* TODO: support some more HTML entities */ - n = xml_entitytostr(data, buf, sizeof(buf)); - if (n > 0) - xmldata(p, buf, (size_t)n); - else - xmldata(p, data, datalen); -} - -static void -xmltagstart(XMLParser *x, const char *t, size_t tl) -{ - struct node *cur; - int i; - -// printf("start of tag: %s\n", t); - - if (curnode >= MAX_DEPTH - 2) - errx(1, "max tag depth reached: %d\n", curnode); - curnode++; - - cur = &nodes[curnode]; - memset(cur, 0, sizeof(*cur)); - cur->displaytype = DisplayInline; - strlcpy(cur->tag, t, sizeof(cur->tag)); - - src[0] = '\0'; /* src, href */ - - /* set display type */ - for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) { - if (!strcasecmp(tags[i].tag, t)) { - cur->displaytype = tags[i].displaytype; -// printf("match on tag: %s == %s, displaytype: %d\n", -// tags[i].tag, t, cur->displaytype); - break; - } - } -} - -static void -xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) -{ - struct node *cur; - int i; - - cur = &nodes[curnode]; - -// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag); - - if (cur->displaytype & DisplayBlock) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayPre) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayTable) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayTableRow) { - fputs(" | ", stdout); /* HACK: assume last cell */ - } else if (cur->displaytype & DisplayList) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayListItem) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayHeader) { - fputs("\n", stdout); - if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') { - if (t[1] >= '3') - for (i = 0; i < termwidth; i++) - putchar('-'); - else if (t[1] >= '1') - for (i = 0; i < termwidth; i++) - putchar('='); - putchar('\n'); - } - } else if (!strcasecmp(t, "br")) { - fputs("\n", stdout); - } - - curnode--; -} - -static void -xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) -{ - struct node *cur; - int i; - - /* temporary replace the callback except the reader and end of tag - restore the context once we receive the same ignored tag in the - end tag handler */ - if (!strcasecmp(t, "script")) { - ignorestate = endtag = "</script>"; - getnext = p->getnext; /* for restore */ - p->getnext = getnext_ignore; - return; - } else if (!strcasecmp(t, "style")) { - ignorestate = endtag = "</style>"; - getnext = p->getnext; /* for restore */ - p->getnext = getnext_ignore; - return; - } - - cur = &nodes[curnode]; - -#ifdef maybe - /* show links as reference at the bottom */ - if (src[0]) { - printf(" [%d]", ++linkcount); - if (!strcasecmp(t, "img") || !strcasecmp(t, "video") || - !strcasecmp(t, "audio")) - printf("[%s]", t); - /* TODO: check allocation */ - if (!links_head) - links_cur = links_head = ecalloc(1, sizeof(*links_head)); - else - links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); - links_cur->type = estrdup(t); - /* TODO: absuri */ - links_cur->url = estrdup(src); - } - src[0] = '\0'; -#endif - -#if 0 - /* show links inline */ - if (src[0]) { - char absurl[1024]; - if (absuri(absurl, sizeof(absurl), src, basehref) != -1) { - if (!strcasecmp(t, "img") || !strcasecmp(t, "video") || - !strcasecmp(t, "audio")) - printf("[%s](", t); - else - printf("[%s](", "link"); - printsafe(absurl); - putchar(')'); - } - } -#endif - - if (cur->displaytype & DisplayBlock) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayHeader) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayTableRow) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayTableCell) { - fputs(" | ", stdout); - } else if (cur->displaytype & DisplayList) { - fputs("\n", stdout); - } else if (cur->displaytype & DisplayListItem) { - /* indent nested list items */ - for (i = curnode; i; i--) { - if (nodes[i].displaytype & DisplayListItem) - continue; - if (nodes[i].displaytype & DisplayList) - fputs(" ", stdout); - } - /* TODO: for <ol>, keep list counter on ol element (parent), - support ordered number type only */ - fputs("* ", stdout); - } else if (!strcasecmp(t, "hr")) { /* ruler */ - for (i = 0; i < termwidth; i++) - putchar('-'); - } -} - -static void -xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, - size_t namelen, const char *value, size_t valuelen) -{ - if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen) - strlcpy(src, value, sizeof(src)); - - if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") || - !strcasecmp(tag, "audio")) && - !strcasecmp(name, "src") && valuelen) - strlcpy(src, value, sizeof(src)); -} - -#ifdef maybe -void -printlinkrefs(void) -{ - size_t i; - - printf("\n\nLink references:\n"); - - /* TODO: add title attribute or some basic description? */ - for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++) - printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type); -} -#endif - -int -main(void) -{ - if (pledge("stdio", NULL) < 0) - err(1, "pledge"); - - parser.xmlattr = xmlattr; - parser.xmlcdata = xmlcdata; - parser.xmldata = xmldata; -// parser.xmldatastart = xmldatastart; - parser.xmldataend = xmldataend; - parser.xmldataentity = xmldataentity; - parser.xmltagstart = xmltagstart; - parser.xmltagend = xmltagend; - parser.xmltagstartparsed = xmltagstartparsed; - - parser.getnext = getchar; - xml_parse(&parser); - -#ifdef maybe - printlinkrefs(); -#endif - putchar('\n'); - - return 0; -} diff --git a/webdump.c b/webdump.c @@ -0,0 +1,706 @@ +#include <ctype.h> +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> + +#include "xml.h" + +static XMLParser parser; + +#ifndef __OpenBSD__ +#define pledge(p1,p2) 0 +#endif + +#undef strlcat +size_t strlcat(char *, const char *, size_t); +#undef strlcpy +size_t strlcpy(char *, const char *, size_t); + +/* uri */ +struct uri { + char proto[48]; + char host[256]; + char path[2048]; + char port[6]; /* numeric port */ +}; + +static int termwidth = 72; + +#if 0 +/* linked-list of link references */ +struct linkref { + char *type; + char *url; + struct linkref *next; +}; + +static struct linkref *links_head; +static struct linkref *links_cur; +static int linkcount; +#endif + +enum DisplayType { + DisplayUnknown = 0, + DisplayNone = 1, + DisplayPre = 2, + DisplayInline = 4, + DisplayInlineBlock = 8, + DisplayBlock = 16, + DisplayList = 32, + DisplayListItem = 64, + DisplayTable = 128, + DisplayTableRow = 256, + DisplayTableCell = 512, + DisplayHeader = 1024, +}; + +struct node { + char tag[256]; + enum DisplayType displaytype; +}; + +typedef struct node Node; + +/* String data / memory pool */ +typedef struct string { + char *data; /* data */ + size_t len; /* string length */ + size_t bufsiz; /* allocated size */ +} String; + +int absuri(char *, size_t, const char *, const char *); +int parseuri(const char *, struct uri *, int); + +static char *basehref = "https://codemadness.org"; + +static char src[4096]; /* src or href attribute */ + +#define MAX_DEPTH 256 +static struct node nodes[MAX_DEPTH]; +static int curnode; + +static struct { + char *tag; + enum DisplayType displaytype; +} tags[] = { + /* pre */ + { "pre", DisplayPre }, + { "code", DisplayPre }, + /* inline */ +#if 0 + { "b", DisplayInline }, + { "i", DisplayInline }, + { "u", DisplayInline }, + { "strong", DisplayInline }, + { "em", DisplayInline }, + { "a", DisplayInline }, + { "span", DisplayInline }, + { "img", DisplayInline }, + { "label", DisplayInline }, +#endif + /* table */ + { "table", DisplayTable }, + /* table-row */ + { "tr", DisplayTableRow }, + /* table-cell */ + { "td", DisplayTableCell }, + { "th", DisplayTableCell }, + /* list-item */ + { "li", DisplayListItem }, + /* header */ + { "h1", DisplayHeader }, + { "h2", DisplayHeader }, + { "h3", DisplayHeader }, + { "h4", DisplayHeader }, + { "h5", DisplayHeader }, + { "h6", DisplayHeader }, + /* break */ + { "br", 0 }, + /* list */ + { "ul", DisplayList }, + { "ol", DisplayList }, + /* block */ + { "p", DisplayBlock }, + { "blockquote", DisplayBlock }, + { "hr", DisplayBlock }, + { "title", DisplayBlock }, + { "nav", DisplayBlock }, + { "main", DisplayBlock }, + { "article", DisplayBlock }, + { "header", DisplayBlock }, + { "footer", DisplayBlock }, + { "div", DisplayBlock }, +}; + +static String htmldata; + +static const char *ignorestate, *endtag; +static int (*getnext)(void); + +/* return a space for all data until some case-insensitive string occurs. This + is used to parse incorrect HTML/XML that contains unescaped HTML in script + or style tags. If you see some </script> tag in a CDATA or comment + section then e-mail W3C and tell them the web is too complex. */ +static inline int +getnext_ignore(void) +{ + int c; + + if ((c = getnext()) == EOF) + return EOF; + + if (tolower(c) == tolower((unsigned char)*ignorestate)) { + ignorestate++; + if (*ignorestate == '\0') { + parser.getnext = getnext; /* restore */ + return c; + } + } else { + ignorestate = endtag; + } + + return ' '; +} + +/* Clear string only; don't free, prevents unnecessary reallocation. */ +static void +string_clear(String *s) +{ + if (s->data) + s->data[0] = '\0'; + s->len = 0; +} + +static void +string_buffer_realloc(String *s, size_t newlen) +{ + size_t alloclen; + + for (alloclen = 64; alloclen <= newlen; alloclen *= 2) + ; + if (!(s->data = realloc(s->data, alloclen))) + err(1, "realloc"); + s->bufsiz = alloclen; +} + +static void +string_append(String *s, const char *data, size_t len) +{ + if (!len) + return; + /* check if allocation is necesary, don't shrink buffer, + * should be more than bufsiz ofcourse. */ + if (s->len + len >= s->bufsiz) + string_buffer_realloc(s, s->len + len + 1); + memcpy(s->data + s->len, data, len); + s->len += len; + s->data[s->len] = '\0'; +} + +char * +estrdup(const char *s) +{ + char *p; + + if (!(p = strdup(s))) + err(1, "strdup"); + return p; +} + +void * +ecalloc(size_t nmemb, size_t size) +{ + void *p; + + if (!(p = calloc(nmemb, size))) + err(1, "calloc"); + return p; +} + +static void +printsafe(const char *s) +{ + for (; *s; s++) { + switch (*s) { + case '\t': + case '\n': + putchar(*s); + break; + default: + if (!iscntrl((unsigned char)*s)) + putchar(*s); + } + } +} + +int +parseuri(const char *s, struct uri *u, int rel) +{ + const char *p = s, *b; + char *endptr = NULL; + size_t i; + unsigned long l; + + u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0'; + if (!*s) + return 0; + + /* prefix is "//", don't read protocol, skip to domain parsing */ + if (!strncmp(p, "//", 2)) { + p += 2; /* skip "//" */ + } else { + /* protocol part */ + for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || + *p == '+' || *p == '-' || *p == '.'); p++) + ; + if (!strncmp(p, "://", 3)) { + if ((size_t)(p - s) >= sizeof(u->proto)) + return -1; /* protocol too long */ + memcpy(u->proto, s, p - s); + u->proto[p - s] = '\0'; + p += 3; /* skip "://" */ + } else { + p = s; /* no protocol format, set to start */ + /* relative url: read rest as path, else as domain */ + if (rel) + goto readpath; + } + } + /* IPv6 address */ + if (*p == '[') { + /* bracket not found or host too long */ + if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 || + (size_t)(b - p) >= sizeof(u->host)) + return -1; + memcpy(u->host, p, b - p + 1); + u->host[b - p + 1] = '\0'; + p = b + 1; + } else { + /* domain / host part, skip until port, path or end. */ + if ((i = strcspn(p, ":/")) >= sizeof(u->host)) + return -1; /* host too long */ + memcpy(u->host, p, i); + u->host[i] = '\0'; + p = &p[i]; + } + /* port */ + if (*p == ':') { + if ((i = strcspn(++p, "/")) >= sizeof(u->port)) + return -1; /* port too long */ + memcpy(u->port, p, i); + u->port[i] = '\0'; + /* check for valid port: range 1 - 65535 */ + errno = 0; + l = strtoul(u->port, &endptr, 10); + if (errno || u->port[0] == '\0' || *endptr || + !l || l > 65535) + return -1; + p = &p[i]; + } +readpath: + if (u->host[0]) { + p = &p[strspn(p, "/")]; + strlcpy(u->path, "/", sizeof(u->path)); + } else { + /* absolute uri must have a host specified */ + if (!rel) + return -1; + } + /* treat truncation as an error */ + if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path)) + return -1; + return 0; +} + +static int +encodeuri(char *buf, size_t bufsiz, const char *s) +{ + static const char *table = "0123456789ABCDEF"; + size_t i, b; + + for (i = 0, b = 0; s[i]; i++) { + if (s[i] == ' ' || + (unsigned char)s[i] > 127 || + iscntrl((unsigned char)s[i])) { + if (b + 3 >= bufsiz) + return -1; + buf[b++] = '%'; + buf[b++] = table[((unsigned char)s[i] >> 4) & 15]; + buf[b++] = table[(unsigned char)s[i] & 15]; + } else if (b < bufsiz) { + buf[b++] = s[i]; + } else { + return -1; + } + } + if (b >= bufsiz) + return -1; + buf[b] = '\0'; + + return 0; +} + +/* Get absolute uri; if `link` is relative use `base` to make it absolute. + * the returned string in `buf` is uri encoded, see: encodeuri(). */ +int +absuri(char *buf, size_t bufsiz, const char *link, const char *base) +{ + struct uri ulink, ubase; + char tmp[4096], *host, *p, *port; + int c, r; + size_t i; + + buf[0] = '\0'; + if (parseuri(base, &ubase, 0) == -1 || + parseuri(link, &ulink, 1) == -1 || + (!ulink.host[0] && !ubase.host[0])) + return -1; + + if (!strncmp(link, "//", 2)) { + host = ulink.host; + port = ulink.port; + } else { + host = ulink.host[0] ? ulink.host : ubase.host; + port = ulink.port[0] ? ulink.port : ubase.port; + } + r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s", + ulink.proto[0] ? + ulink.proto : + (ubase.proto[0] ? ubase.proto : "http"), + host, + port[0] ? ":" : "", + port); + if (r < 0 || (size_t)r >= sizeof(tmp)) + return -1; /* error or truncation */ + + /* relative to root */ + if (!ulink.host[0] && ulink.path[0] != '/') { + /* relative to base url path */ + if (ulink.path[0]) { + if ((p = strrchr(ubase.path, '/'))) { + /* temporary null-terminate */ + c = *(++p); + *p = '\0'; + i = strlcat(tmp, ubase.path, sizeof(tmp)); + *p = c; /* restore */ + if (i >= sizeof(tmp)) + return -1; + } + } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >= + sizeof(tmp)) { + return -1; + } + } + if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp)) + return -1; + + return encodeuri(buf, bufsiz, tmp); +} + +static void +xmlcdata(XMLParser *p, const char *data, size_t datalen) +{ + struct node *cur; + + cur = &nodes[curnode]; + if (cur->displaytype & DisplayNone) + return; + + printsafe(data); +} + +#if 0 +static void +xmldatastart(XMLParser *p) +{ +// printf("DEBUG: %s\n", __func__); +} +#endif + +static void +xmldataend(XMLParser *p) +{ + struct node *cur; + char *start, *s, *e; + +// printf("DEBUG: %s\n", __func__); + + if (!htmldata.data || !htmldata.len) + return; + + cur = &nodes[curnode]; + +// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype); + + if (!cur->displaytype || (cur->displaytype & DisplayNone)) { + /* nothing */ + } else if (cur->displaytype & DisplayPre) { + fwrite(htmldata.data, 1, htmldata.len, stdout); + } else { + start = htmldata.data; + e = htmldata.data + htmldata.len; + + /* TODO: better white-space handling, for example if there is only + white-space between 2 block elements then it can be ignored. */ + for (s = start; s < e; s++) { + if (*s == '\r') { + continue; + } else if (isspace((unsigned char)*s)) { + if (s == start || !isspace((unsigned char)s[-1])) + putchar(' '); + } else if (!iscntrl((unsigned char)*s)) { + putchar(*s); + } + } + } + + string_clear(&htmldata); +} + +static void +xmldata(XMLParser *p, const char *data, size_t datalen) +{ + struct node *cur; + + cur = &nodes[curnode]; + if (cur->displaytype & DisplayNone) + return; + + string_append(&htmldata, data, datalen); +} + +static void +xmldataentity(XMLParser *p, const char *data, size_t datalen) +{ + struct node *cur; + char buf[16]; + int n; + + cur = &nodes[curnode]; + if (cur->displaytype & DisplayNone) + return; + + /* convert basic XML entities */ + /* &nbsp; &copy;, copy table from Links (check license) */ + /* rsquo, hellip, ndash, lsquo */ + /* TODO: add to tscrape too */ + /* TODO: support some more HTML entities */ + n = xml_entitytostr(data, buf, sizeof(buf)); + if (n > 0) + xmldata(p, buf, (size_t)n); + else + xmldata(p, data, datalen); +} + +static void +xmltagstart(XMLParser *x, const char *t, size_t tl) +{ + struct node *cur; + int i; + +// printf("start of tag: %s\n", t); + + if (curnode >= MAX_DEPTH - 2) + errx(1, "max tag depth reached: %d\n", curnode); + curnode++; + + cur = &nodes[curnode]; + memset(cur, 0, sizeof(*cur)); + cur->displaytype = DisplayInline; + strlcpy(cur->tag, t, sizeof(cur->tag)); + + src[0] = '\0'; /* src, href */ + + /* set display type */ + for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) { + if (!strcasecmp(tags[i].tag, t)) { + cur->displaytype = tags[i].displaytype; +// printf("match on tag: %s == %s, displaytype: %d\n", +// tags[i].tag, t, cur->displaytype); + break; + } + } +} + +static void +xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) +{ + struct node *cur; + int i; + + cur = &nodes[curnode]; + +// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag); + + if (cur->displaytype & DisplayBlock) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayPre) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTable) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTableRow) { + fputs(" | ", stdout); /* HACK: assume last cell */ + } else if (cur->displaytype & DisplayList) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayListItem) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayHeader) { + fputs("\n", stdout); + if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') { + if (t[1] >= '3') + for (i = 0; i < termwidth; i++) + putchar('-'); + else if (t[1] >= '1') + for (i = 0; i < termwidth; i++) + putchar('='); + putchar('\n'); + } + } else if (!strcasecmp(t, "br")) { + fputs("\n", stdout); + } + + curnode--; +} + +static void +xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) +{ + struct node *cur; + int i; + + /* temporary replace the callback except the reader and end of tag + restore the context once we receive the same ignored tag in the + end tag handler */ + if (!strcasecmp(t, "script")) { + ignorestate = endtag = "</script>"; + getnext = p->getnext; /* for restore */ + p->getnext = getnext_ignore; + return; + } else if (!strcasecmp(t, "style")) { + ignorestate = endtag = "</style>"; + getnext = p->getnext; /* for restore */ + p->getnext = getnext_ignore; + return; + } + + cur = &nodes[curnode]; + +#ifdef maybe + /* show links as reference at the bottom */ + if (src[0]) { + printf(" [%d]", ++linkcount); + if (!strcasecmp(t, "img") || !strcasecmp(t, "video") || + !strcasecmp(t, "audio")) + printf("[%s]", t); + /* TODO: check allocation */ + if (!links_head) + links_cur = links_head = ecalloc(1, sizeof(*links_head)); + else + links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); + links_cur->type = estrdup(t); + /* TODO: absuri */ + links_cur->url = estrdup(src); + } + src[0] = '\0'; +#endif + +#if 0 + /* show links inline */ + if (src[0]) { + char absurl[1024]; + if (absuri(absurl, sizeof(absurl), src, basehref) != -1) { + if (!strcasecmp(t, "img") || !strcasecmp(t, "video") || + !strcasecmp(t, "audio")) + printf("[%s](", t); + else + printf("[%s](", "link"); + printsafe(absurl); + putchar(')'); + } + } +#endif + + if (cur->displaytype & DisplayBlock) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayHeader) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTableRow) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayTableCell) { + fputs(" | ", stdout); + } else if (cur->displaytype & DisplayList) { + fputs("\n", stdout); + } else if (cur->displaytype & DisplayListItem) { + /* indent nested list items */ + for (i = curnode; i; i--) { + if (nodes[i].displaytype & DisplayListItem) + continue; + if (nodes[i].displaytype & DisplayList) + fputs(" ", stdout); + } + /* TODO: for <ol>, keep list counter on ol element (parent), + support ordered number type only */ + fputs("* ", stdout); + } else if (!strcasecmp(t, "hr")) { /* ruler */ + for (i = 0; i < termwidth; i++) + putchar('-'); + } +} + +static void +xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, + size_t namelen, const char *value, size_t valuelen) +{ + if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen) + strlcpy(src, value, sizeof(src)); + + if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") || + !strcasecmp(tag, "audio")) && + !strcasecmp(name, "src") && valuelen) + strlcpy(src, value, sizeof(src)); +} + +#ifdef maybe +void +printlinkrefs(void) +{ + size_t i; + + printf("\n\nLink references:\n"); + + /* TODO: add title attribute or some basic description? */ + for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++) + printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type); +} +#endif + +int +main(void) +{ + if (pledge("stdio", NULL) < 0) + err(1, "pledge"); + + parser.xmlattr = xmlattr; + parser.xmlcdata = xmlcdata; + parser.xmldata = xmldata; +// parser.xmldatastart = xmldatastart; + parser.xmldataend = xmldataend; + parser.xmldataentity = xmldataentity; + parser.xmltagstart = xmltagstart; + parser.xmltagend = xmltagend; + parser.xmltagstartparsed = xmltagstartparsed; + + parser.getnext = getchar; + xml_parse(&parser); + +#ifdef maybe + printlinkrefs(); +#endif + putchar('\n'); + + return 0; +}