webdump

Unnamed repository; edit this file 'description' to name the repository.
git clone git://git.z3bra.org/webdump.git
Log | Files | Refs | README | LICENSE

commit 9ac2648a64f0b2d125da2a39ed8e8f4ff2e234b4
parent b708236e10ae2b6af6e62514f2ca159fd6eeeabd
Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date:   Sat, 21 Sep 2019 15:23:08 +0200

improvements

- initial url parsing and base href support (WIP).
- rename xstrdup and xcalloc to estrdup and ecalloc (exits on failure).
- show links inline, disable printing references at the bottom for now.
- update TODO.

Diffstat:
TODO | 11 ++++-------
main.c | 223 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 213 insertions(+), 21 deletions(-)

diff --git a/TODO b/TODO @@ -1,12 +1,9 @@ +- base href. + specify and parse relative url, allow to specify base and also parse <base href=""> +- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre - print safe (not certain control chars, except newline, TAB etc). - - improve/remove duplicate white-space/newlines? - cleanup code. - -=== - - <code> should not be treated as a block (<pre> does?) - -? xml.c: make sure to always call xmldata handler even if datalen == 0 ? - - add links as reference, for example on page: http://absmagazin.de/2018 the MP3 urls. +? xml.c: make sure to always call xmldata handler even if datalen == 0 ? diff --git a/main.c b/main.c @@ -1,8 +1,6 @@ -/* TODO: escape control characters */ -/* TODO: specify and parse relative url, allow to specify base and also parse <base href=""> ? */ - #include <ctype.h> #include <err.h> +#include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -11,11 +9,18 @@ #include "xml.h" -/* string and size */ -/*#define STRP(s) s,sizeof(s)-1*/ - static XMLParser parser; +/* uri */ +struct uri { + char proto[48]; + char host[256]; + char path[2048]; + char port[6]; /* numeric port */ +}; + +#if 0 +/* linked-list of link references */ struct linkref { char *type; char *url; @@ -25,6 +30,7 @@ struct linkref { static struct linkref *links_head; static struct linkref *links_cur; static int linkcount; +#endif struct node { char tag[256]; @@ -42,11 +48,19 @@ typedef struct string { size_t bufsiz; /* allocated size */ } String; +int absuri(char *, size_t, const char *, const char *); +int parseuri(const char *, struct uri *, int); + +static char *basehref = "https://codemadness.org"; + static char src[4096]; /* src or href attribute */ #define MAX_DEPTH 256 static struct node nodes[MAX_DEPTH]; static int curnode; + +/* TODO: temporary workaround, handle whitespace, and tag types properly: + atleast: inline-block, inline, block, pre */ static int ignoredata; static char 
*pretags[] = { @@ -154,7 +168,7 @@ string_append(String *s, const char *data, size_t len) } char * -xstrdup(const char *s) +estrdup(const char *s) { char *p; @@ -164,7 +178,7 @@ xstrdup(const char *s) } void * -xcalloc(size_t nmemb, size_t size) +ecalloc(size_t nmemb, size_t size) { void *p; @@ -189,6 +203,171 @@ printsafe(const char *s) } } +int +parseuri(const char *s, struct uri *u, int rel) +{ + const char *p = s, *b; + char *endptr = NULL; + size_t i; + unsigned long l; + + u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0'; + if (!*s) + return 0; + + /* prefix is "//", don't read protocol, skip to domain parsing */ + if (!strncmp(p, "//", 2)) { + p += 2; /* skip "//" */ + } else { + /* protocol part */ + for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) || + *p == '+' || *p == '-' || *p == '.'); p++) + ; + if (!strncmp(p, "://", 3)) { + if ((size_t)(p - s) >= sizeof(u->proto)) + return -1; /* protocol too long */ + memcpy(u->proto, s, p - s); + u->proto[p - s] = '\0'; + p += 3; /* skip "://" */ + } else { + p = s; /* no protocol format, set to start */ + /* relative url: read rest as path, else as domain */ + if (rel) + goto readpath; + } + } + /* IPv6 address */ + if (*p == '[') { + /* bracket not found or host too long */ + if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 || + (size_t)(b - p) >= sizeof(u->host)) + return -1; + memcpy(u->host, p, b - p + 1); + u->host[b - p + 1] = '\0'; + p = b + 1; + } else { + /* domain / host part, skip until port, path or end. 
*/ + if ((i = strcspn(p, ":/")) >= sizeof(u->host)) + return -1; /* host too long */ + memcpy(u->host, p, i); + u->host[i] = '\0'; + p = &p[i]; + } + /* port */ + if (*p == ':') { + if ((i = strcspn(++p, "/")) >= sizeof(u->port)) + return -1; /* port too long */ + memcpy(u->port, p, i); + u->port[i] = '\0'; + /* check for valid port: range 1 - 65535 */ + errno = 0; + l = strtoul(u->port, &endptr, 10); + if (errno || u->port[0] == '\0' || *endptr || + !l || l > 65535) + return -1; + p = &p[i]; + } +readpath: + if (u->host[0]) { + p = &p[strspn(p, "/")]; + strlcpy(u->path, "/", sizeof(u->path)); + } else { + /* absolute uri must have a host specified */ + if (!rel) + return -1; + } + /* treat truncation as an error */ + if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path)) + return -1; + return 0; +} + +static int +encodeuri(char *buf, size_t bufsiz, const char *s) +{ + static const char *table = "0123456789ABCDEF"; + size_t i, b; + + for (i = 0, b = 0; s[i]; i++) { + if (s[i] == ' ' || + (unsigned char)s[i] > 127 || + iscntrl((unsigned char)s[i])) { + if (b + 3 >= bufsiz) + return -1; + buf[b++] = '%'; + buf[b++] = table[((unsigned char)s[i] >> 4) & 15]; + buf[b++] = table[(unsigned char)s[i] & 15]; + } else if (b < bufsiz) { + buf[b++] = s[i]; + } else { + return -1; + } + } + if (b >= bufsiz) + return -1; + buf[b] = '\0'; + + return 0; +} + +/* Get absolute uri; if `link` is relative use `base` to make it absolute. + * the returned string in `buf` is uri encoded, see: encodeuri(). */ +int +absuri(char *buf, size_t bufsiz, const char *link, const char *base) +{ + struct uri ulink, ubase; + char tmp[4096], *host, *p, *port; + int c, r; + size_t i; + + buf[0] = '\0'; + if (parseuri(base, &ubase, 0) == -1 || + parseuri(link, &ulink, 1) == -1 || + (!ulink.host[0] && !ubase.host[0])) + return -1; + + if (!strncmp(link, "//", 2)) { + host = ulink.host; + port = ulink.port; + } else { + host = ulink.host[0] ? ulink.host : ubase.host; + port = ulink.port[0] ? 
ulink.port : ubase.port; + } + r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s", + ulink.proto[0] ? + ulink.proto : + (ubase.proto[0] ? ubase.proto : "http"), + host, + port[0] ? ":" : "", + port); + if (r < 0 || (size_t)r >= sizeof(tmp)) + return -1; /* error or truncation */ + + /* relative to root */ + if (!ulink.host[0] && ulink.path[0] != '/') { + /* relative to base url path */ + if (ulink.path[0]) { + if ((p = strrchr(ubase.path, '/'))) { + /* temporary null-terminate */ + c = *(++p); + *p = '\0'; + i = strlcat(tmp, ubase.path, sizeof(tmp)); + *p = c; /* restore */ + if (i >= sizeof(tmp)) + return -1; + } + } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >= + sizeof(tmp)) { + return -1; + } + } + if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp)) + return -1; + + return encodeuri(buf, bufsiz, tmp); +} + + static void xmlcdata(XMLParser *p, const char *data, size_t datalen) { @@ -367,7 +546,8 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) cur = &nodes[curnode]; -#if 1 +#if 0 + /* show links as reference at the bottom */ if (src[0]) { printf(" [%d]", ++linkcount); if (!strcasecmp(t, "img") || !strcasecmp(t, "video") || @@ -375,15 +555,28 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) printf("[%s]", t); /* TODO: check allocation */ if (!links_head) - links_cur = links_head = xcalloc(1, sizeof(*links_head)); + links_cur = links_head = ecalloc(1, sizeof(*links_head)); else - links_cur = links_cur->next = xcalloc(1, sizeof(*links_head)); - links_cur->type = xstrdup(t); - links_cur->url = xstrdup(src); + links_cur = links_cur->next = ecalloc(1, sizeof(*links_head)); + links_cur->type = estrdup(t); + links_cur->url = estrdup(src); } src[0] = '\0'; #endif + /* show links inline */ + if (src[0]) { + char absurl[1024]; + if (absuri(absurl, sizeof(absurl), src, basehref) != -1) { + if (!strcasecmp(t, "img") || !strcasecmp(t, "video") || + !strcasecmp(t, "audio")) { + printf("[%s](%s) ", t, absurl); + } else { + 
printf("[%s](%s) ", "link", absurl); + } + } + } + if (cur->isblock) fputs("\n", stdout); @@ -421,6 +614,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name, strlcpy(src, value, sizeof(src)); } +#if 0 void printlinkrefs(void) { @@ -432,6 +626,7 @@ printlinkrefs(void) for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++) printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type); } +#endif int main(void) @@ -451,7 +646,7 @@ main(void) parser.getnext = getchar; xml_parse(&parser); - printlinkrefs(); +/* printlinkrefs();*/ putchar('\n'); return 0;