2 * Copyright (c) 2014 Sebastian Freundt
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 #include "archive_platform.h"
27 __FBSDID("$FreeBSD$");
30 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
32 * For the purposes of this file we used the final draft from:
33 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
36 * [ ] real-world warcs can contain resources at endpoints ending in /
37 * e.g. http://bibnum.bnf.fr/warc/
38 * if you're lucky their response contains a Content-Location: header
39 * pointing to a unix-compliant filename, in the example above it's
40 * Content-Location: http://bibnum.bnf.fr/warc/index.html
41 * however, that's not mandated and github for example doesn't follow
43 * We need a set of archive options to control what to do with
44 * entries like these, at the moment care is taken to skip them.
48 #ifdef HAVE_SYS_STAT_H
71 #include "archive_entry.h"
72 #include "archive_private.h"
73 #include "archive_read_private.h"
83 /* request, unsupported */
85 /* response, unsupported */
87 /* revisit, unsupported */
89 /* conversion, unsupported */
91 /* continuation, unsupported at the moment */
/* Fragment of the reader's per-archive state (presumably the members of
 * struct warc_s; the declaration itself is outside this view).  Code in
 * this file accesses the state as w->cntlen, w->cntoff, w->unconsumed,
 * w->pver, w->sver and w->pool. */
108 /* content length ahead */
110 /* and how much we've processed so far */
112 /* and how much we need to consume between calls */
117 /* previous version */
119 /* stringified format name */
/* written by archive_string_sprintf() in _warc_rdhdr() and released by
 * archive_string_free() in _warc_cleanup() */
120 struct archive_string sver;
/* Forward declarations.  The five functions below are the callbacks that
 * archive_read_support_format_warc() hands to
 * __archive_read_register_format(). */
123 static int _warc_bid(struct archive_read *a, int);
124 static int _warc_cleanup(struct archive_read *a);
125 static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
126 static int _warc_skip(struct archive_read *a);
127 static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
129 /* private routines */
/* Header parsers: each receives a pointer to header bytes plus a length. */
/* _warc_rdver: parses the WARC/<major>.<minor> magic; the version is
 * encoded as major * 10000 + minor * 100 */
130 static unsigned int _warc_rdver(const char buf[10], size_t bsz);
/* _warc_rdtyp: inspects the WARC-Type field */
131 static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
/* _warc_rduri: inspects the WARC-Target-URI field */
132 static warc_string_t _warc_rduri(const char *buf, size_t bsz);
/* _warc_rdlen: parses the Content-Length field */
133 static ssize_t _warc_rdlen(const char *buf, size_t bsz);
/* _warc_rdrtm: parses the WARC-Date field */
134 static time_t _warc_rdrtm(const char *buf, size_t bsz);
/* _warc_rdmtm: parses the Last-Modified field */
135 static time_t _warc_rdmtm(const char *buf, size_t bsz);
/* _warc_find_eoh: searches for the CRLF CRLF that ends a record header */
136 static const char *_warc_find_eoh(const char *buf, size_t bsz);
/* _warc_find_eol: searches for the CRLF that ends a header line */
137 static const char *_warc_find_eol(const char *buf, size_t bsz);
/* Enables WARC support on a read archive: verifies the handle is a read
 * archive in state NEW, allocates zero-filled reader state with calloc(),
 * and registers the _warc_* callbacks under the state pointer.
 * Returns ARCHIVE_FATAL (error code ENOMEM) if the allocation fails.
 * NOTE(review): what happens inside the `r != ARCHIVE_OK` branch and the
 * final return value are not visible in this view. */
140 archive_read_support_format_warc(struct archive *_a)
142 struct archive_read *a = (struct archive_read *)_a;
146 archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147 ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
/* calloc() so that every field of the state starts out zero/NULL */
149 if ((w = calloc(1, sizeof(*w))) == NULL) {
150 archive_set_error(&a->archive, ENOMEM,
151 "Can't allocate warc data");
152 return (ARCHIVE_FATAL);
155 r = __archive_read_register_format(
157 _warc_bid, NULL, _warc_rdhdr, _warc_read,
158 _warc_skip, NULL, _warc_cleanup, NULL, NULL);
160 if (r != ARCHIVE_OK) {
/* Cleanup callback: tears down the reader state stored in a->format->data.
 * Visible here: the version string is released and the format's data
 * pointer is cleared.
 * NOTE(review): the body of the pool.len branch is not visible; presumably
 * it frees the filename pool -- confirm. */
168 _warc_cleanup(struct archive_read *a)
170 struct warc_s *w = a->format->data;
172 if (w->pool.len > 0U) {
175 archive_string_free(&w->sver);
/* avoid leaving a dangling pointer behind in the format slot */
177 a->format->data = NULL;
/* Bid callback: decides whether the input looks like a WARC stream by
 * peeking at its first 12 bytes and parsing the version with _warc_rdver().
 * best_bid is ignored.  Versions outside 1200..10000 (0.12 to 1.0 in the
 * encoding used by _warc_rdver()) are not accepted.
 * NOTE(review): the actual bid values returned are not visible here. */
182 _warc_bid(struct archive_read *a, int best_bid)
188 (void)best_bid; /* UNUSED */
190 /* check first line of file, it should be a record already */
191 if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) {
192 /* no idea what to do */
194 } else if (nrd < 12) {
195 /* nah, not for us, our magic cookie is at least 12 bytes */
199 /* otherwise snarf the record's version number */
200 ver = _warc_rdver(hdr, nrd);
201 if (ver < 1200U || ver > 10000U) {
202 /* we only support WARC 0.12 to 1.0 */
206 /* otherwise be confident */
/* Read-header callback: parses one WARC record header and fills in `entry`.
 *
 * Steps visible in this view:
 *  - peek at the input with __archive_read_ahead() and locate the end of
 *    the header with _warc_find_eoh();
 *  - validate the version (only 1200..10000, i.e. WARC 0.12 to 1.0);
 *  - read Content-Length and WARC-Date; failure of either is fatal;
 *  - read the record type and, where a usable WARC-Target-URI exists,
 *    copy the name into the reusable buffer w->pool;
 *  - consume the header bytes and populate the entry as a regular file
 *    (mode 0644, size = content length, ctime = WARC-Date, mtime = mtime).
 *
 * Returns ARCHIVE_EOF when no further input is available and ARCHIVE_FATAL
 * on a malformed or unsupported header.
 * NOTE(review): several statements of this function are not visible here
 * (record-type dispatch, the fallback when Last-Modified is absent, the
 * success return), so this description covers only what is shown. */
211 _warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
213 #define HDR_PROBE_LEN (12U)
214 struct warc_s *w = a->format->data;
219 /* for the file name, saves some strndup()'ing */
221 /* warc record type, not that we really use it a lot */
223 /* content-length+error monad */
225 /* record time is the WARC-Date time we reinterpret it as ctime */
227 /* mtime is the Last-Modified time which will be the entry's mtime */
231 /* just use read_ahead() they keep track of unconsumed
232 * bits and bobs for us; no need to put an extra shift in
233 * and reproduce that functionality here */
234 buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
239 &a->archive, ARCHIVE_ERRNO_MISC,
240 "Bad record header");
241 return (ARCHIVE_FATAL);
242 } else if (buf == NULL) {
243 /* there should be room for at least WARC/bla\r\n
244 * must be EOF therefore */
245 return (ARCHIVE_EOF);
247 /* looks good so far, try and find the end of the header now */
/* the search covers the nrd bytes the read-ahead actually returned */
248 eoh = _warc_find_eoh(buf, nrd);
250 /* still no good, the header end might be beyond the
251 * probe we've requested, but then again who'd cram
252 * so much stuff into the header *and* be 28500-compliant */
254 &a->archive, ARCHIVE_ERRNO_MISC,
255 "Bad record header");
256 return (ARCHIVE_FATAL);
/* from here on every field lookup is confined to the header,
 * i.e. the eoh - buf bytes starting at buf */
258 ver = _warc_rdver(buf, eoh - buf);
259 /* we currently support WARC 0.12 to 1.0 */
262 &a->archive, ARCHIVE_ERRNO_MISC,
263 "Invalid record version");
264 return (ARCHIVE_FATAL);
265 } else if (ver < 1200U || ver > 10000U) {
/* undo the major * 10000 + minor * 100 encoding for the message */
267 &a->archive, ARCHIVE_ERRNO_MISC,
268 "Unsupported record version: %u.%u",
269 ver / 10000, (ver % 10000) / 100);
270 return (ARCHIVE_FATAL);
272 cntlen = _warc_rdlen(buf, eoh - buf);
274 /* nightmare! the specs say content-length is mandatory
275 * so I don't feel overly bad stopping the reader here */
278 "Bad content length");
279 return (ARCHIVE_FATAL);
281 rtime = _warc_rdrtm(buf, eoh - buf);
282 if (rtime == (time_t)-1) {
283 /* record time is mandatory as per WARC/1.0,
284 * so just barf here, fast and loud */
288 return (ARCHIVE_FATAL);
291 /* let the world know we're a WARC archive */
292 a->archive.archive_format = ARCHIVE_FORMAT_WARC;
/* re-format the version string only when the version differs from
 * the one seen previously */
293 if (ver != w->pver) {
294 /* stringify this entry's version */
295 archive_string_sprintf(&w->sver,
296 "WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297 /* remember the version */
300 /* start off with the type */
301 ftyp = _warc_rdtyp(buf, eoh - buf);
302 /* and let future calls know about the content */
305 mtime = 0;/* Avoid compiling error on some platform. */
310 /* only try and read the filename in the cases that are
311 * guaranteed to have one */
312 fnam = _warc_rduri(buf, eoh - buf);
313 /* check the last character in the URI to avoid creating
314 * directory endpoints as files, see Todo above */
315 if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316 /* break here for now */
321 /* bang to our string pool, so we save a
322 * malloc()+free() roundtrip */
/* The pool grows in multiples of 64 bytes and always leaves room for the
 * terminating NUL.
 * NOTE(review): the realloc() result is stored straight back into
 * w->pool.str and then written through by memcpy(); no NULL check is
 * visible in this view.  If there is none, an allocation failure would
 * lose the old buffer and write through a NULL pointer -- confirm. */
323 if (fnam.len + 1U > w->pool.len) {
324 w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325 w->pool.str = realloc(w->pool.str, w->pool.len);
327 memcpy(w->pool.str, fnam.str, fnam.len);
328 w->pool.str[fnam.len] = '\0';
329 /* let no one else know about the pool, it's a secret, shhh */
330 fnam.str = w->pool.str;
332 /* snarf mtime or deduce from rtime
333 * this is a custom header added by our writer, it's quite
334 * hard to believe anyone else would go through with it
335 * (apart from being part of some http responses of course) */
336 if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
346 /* now eat some of those delicious buffer bits */
/* eoh points just past the CRLF CRLF (see _warc_find_eoh), so this
 * consumes the header including its terminating blank line */
347 __archive_read_consume(a, eoh - buf);
353 /* populate entry object */
354 archive_entry_set_filetype(entry, AE_IFREG);
355 archive_entry_copy_pathname(entry, fnam.str);
356 archive_entry_set_size(entry, cntlen);
357 archive_entry_set_perm(entry, 0644);
358 /* rtime is the new ctime, mtime stays mtime */
359 archive_entry_set_ctime(entry, rtime, 0L);
360 archive_entry_set_mtime(entry, mtime, 0L);
365 /* consume the content and start over */
/* Read-data callback: exposes the current record's content.
 * Once w->cntoff has reached w->cntlen it reports ARCHIVE_EOF, with *off
 * set to the content offset plus the 4-byte record separator.  Otherwise
 * it consumes w->unconsumed bytes, peeks at least one byte with
 * __archive_read_ahead(), clamps the available count to the content bytes
 * still outstanding and stores that count in w->unconsumed.
 * NOTE(review): the assignments to *buf and *bsz, the update of w->cntoff
 * and the non-EOF return value are not visible in this view. */
373 _warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
375 struct warc_s *w = a->format->data;
379 if (w->cntoff >= w->cntlen) {
381 /* it's our lucky day, no work, we can leave early */
384 *off = w->cntoff + 4U/*for \r\n\r\n separator*/;
386 return (ARCHIVE_EOF);
390 __archive_read_consume(a, w->unconsumed);
/* a minimum of 1 asks for whatever is available; nrd receives the count */
394 rab = __archive_read_ahead(a, 1U, &nrd);
397 /* big catastrophe */
399 } else if (nrd == 0) {
401 } else if ((size_t)nrd > w->cntlen - w->cntoff) {
402 /* clamp to content-length */
403 nrd = w->cntlen - w->cntoff;
/* remembered here and handed to __archive_read_consume() above */
410 w->unconsumed = (size_t)nrd;
/* Skip callback: consumes the record's content length plus the 4-byte
 * CRLF CRLF separator from the input.
 * NOTE(review): the return value of __archive_read_consume() is discarded
 * on the line shown, so a failed consume is not detected here. */
415 _warc_skip(struct archive_read *a)
417 struct warc_s *w = a->format->data;
419 __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
426 /* private routines */
/* Yields the address held in c as a pointer to non-const char, without
 * writing a direct const-dropping cast on c itself.
 * NOTE(review): the expression subtracts and adds pointers formed from the
 * integer constant 0x1, which do not point into the same array object as
 * c; C11 6.5.6 does not define that arithmetic.  A round-trip through
 * uintptr_t would express the same intent with implementation-defined
 * rather than undefined behaviour. */
428 deconst(const void *c)
430 return (char *)0x1 + (((const char *)c) - (const char *)0x1);
/* Searches hay[0 .. haysize) for needle[0 .. needlesize) and, on a match,
 * returns the match position via deconst().
 * Approach visible here: memchr() finds the first byte equal to *needle,
 * then a running checksum of a needle-sized window is compared against the
 * checksum of the needle, with memcmp() confirming a candidate.
 * Note that the checksums are combined with XOR (`^=`) even though the
 * comments below call them sums.
 * NOTE(review): the return values for the empty-needle, not-found and
 * haystack-too-short cases, and the window update inside the final loop,
 * are not visible in this view. */
434 xmemmem(const char *hay, const size_t haysize,
435 const char *needle, const size_t needlesize)
437 const char *const eoh = hay + haysize;
438 const char *const eon = needle + needlesize;
446 /* trivial checks first
447 * a 0-sized needle is defined to be found anywhere in haystack
448 * then run strchr() to find a candidate in HAYSTACK (i.e. a portion
449 * that happens to begin with *NEEDLE) */
450 if (needlesize == 0UL) {
452 } else if ((hay = memchr(hay, *needle, haysize)) == NULL) {
457 /* First characters of haystack and needle are the same now. Both are
458 * guaranteed to be at least one character long. Now computes the sum
459 * of characters values of needle together with the sum of the first
460 * needle_len characters of haystack. */
/* both checksums start from *hay, which equals *needle after the memchr()
 * above; eqp stays 1 only while every compared byte pair is equal */
461 for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U;
462 hp < eoh && np < eon;
463 hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++);
465 /* HP now references the (NEEDLESIZE + 1)-th character. */
467 /* haystack is smaller than needle, :O */
474 /* now loop through the rest of haystack,
475 * updating the sum iteratively */
476 for (cand = hay; hp < eoh; hp++) {
480 /* Since the sum of the characters is already known to be
481 * equal at that point, it is enough to check just NEEDLESIZE - 1
482 * characters for equality,
483 * also CAND is by design < HP, so no need for range checks */
484 if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) {
485 return deconst(cand);
/* Parses a run of ASCII decimal digits starting at str and range-checks
 * the result against llim..ulim; the position where parsing stopped is
 * stored through ep.  Callers in this file (xstrpisotime) treat a negative
 * return value as failure.
 * NOTE(review): the digit accumulation, the exact failure return value and
 * the condition of the first `if` are not visible in this view. */
492 strtoi_lim(const char *str, const char **ep, int llim, int ulim)
496 /* we keep track of the number of digits via rulim */
/* the loop stops at the first non-digit, when rulim reaches zero, or once
 * res * 10 would exceed ulim */
499 for (sp = str, rulim = ulim > 10 ? ulim : 10;
500 res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9';
507 } else if (res < llim || res > ulim) {
510 *ep = (const char*)sp;
/* Converts a broken-down UTC time to time_t.  Per the comments below it
 * uses the platform's timegm() or _mkgmtime64() when configured, and
 * otherwise computes the value directly from the tm fields after letting
 * mktime() normalise them.
 * NOTE(review): the preprocessor conditions around the first branch and
 * the start of the fallback expression are not visible in this view. */
515 time_from_tm(struct tm *t)
518 /* Use platform timegm() if available. */
520 #elif HAVE__MKGMTIME64
521 return (_mkgmtime64(t));
523 /* Else use direct calculation using POSIX assumptions. */
524 /* First, fix up tm_yday based on the year/month/day. */
525 if (mktime(t) == (time_t)-1)
527 /* Then we can compute timegm() from first principles. */
/* NOTE(review): the visible terms are int arithmetic (tm_year is an int and
 * the constants are int).  With a 32-bit int, (tm_year - 70) * 31536000
 * exceeds INT_MAX once tm_year > 138 (year 2038), while xstrpisotime()
 * accepts years up to 4095.  Confirm whether the part of the expression not
 * shown here widens to time_t first. */
532 + (t->tm_year - 70) * 31536000
533 + ((t->tm_year - 69) / 4) * 86400
534 - ((t->tm_year - 1) / 100) * 86400
535 + ((t->tm_year + 299) / 400) * 86400);
/* Parses an ISO 8601 UTC timestamp of the form YYYY-MM-DDThh:mm:ssZ from s
 * (leading blanks and tabs are skipped) and converts it with
 * time_from_tm().  The result variable starts out as (time_t)-1, which is
 * the value callers test for.  If endptr is not NULL it receives the parse
 * position.
 * NOTE(review): the failure paths inside the individual `if` bodies and the
 * tm fix-ups before the conversion are not visible in this view. */
540 xstrpisotime(const char *s, char **endptr)
542 /** like strptime() but strictly for ISO 8601 Zulu strings */
544 time_t res = (time_t)-1;
546 /* make sure tm is clean */
547 memset(&tm, 0, sizeof(tm));
549 /* as a courtesy to our callers, and since this is a non-standard
550 * routine, we skip leading whitespace */
551 while (*s == ' ' || *s == '\t')
/* year, 1583..4095, followed by '-' */
555 if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') {
/* month, 1..12, followed by '-' */
559 if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') {
562 /* read day-of-month */
563 if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') {
/* hour, 0..23, followed by ':' */
567 if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') {
/* minute, 0..59, followed by ':' */
571 if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') {
/* second, 0..60 (60 is accepted), followed by the literal 'Z' */
575 if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') {
579 /* massage TM to fulfill some of POSIX' constraints */
583 /* now convert our custom tm struct to a unix stamp using UTC */
584 res = time_from_tm(&tm);
587 if (endptr != NULL) {
588 *endptr = deconst(s);
/* Parses the version from a record that starts with the magic WARC/ and
 * encodes it as major * 10000 + minor * 100, with the minor part being one
 * or two digits.  For example 1.0 encodes to 10000 and 0.12 to 1200.
 * At least 12 bytes must be available.  After the digits the terminator is
 * checked: CRLF in one case, blank or tab in the other (see the comment
 * lines about WARC 0.12 below).
 * NOTE(review): the return statements, the definition of `c` and the
 * condition choosing between the two terminator checks are not visible in
 * this view. */
594 _warc_rdver(const char *buf, size_t bsz)
596 static const char magic[] = "WARC/";
598 unsigned int ver = 0U;
599 unsigned int end = 0U;
601 if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) {
602 /* buffer too small or invalid magic */
605 /* looks good so far, read the version number for a laugh */
606 buf += sizeof(magic) - 1U;
/* expected shape: one major digit, a dot, then the minor digits */
608 if (isdigit((unsigned char)buf[0U]) && (buf[1U] == '.') &&
609 isdigit((unsigned char)buf[2U])) {
610 /* we support a maximum of 2 digits in the minor version */
611 if (isdigit((unsigned char)buf[3U]))
613 /* set up major version */
614 ver = (buf[0U] - '0') * 10000U;
615 /* set up minor version */
/* two-digit minor: tens digit weighs 1000, units digit 100 */
617 ver += (buf[2U] - '0') * 1000U;
618 ver += (buf[3U] - '0') * 100U;
/* one-digit minor */
620 ver += (buf[2U] - '0') * 100U;
622 * WARC below version 0.12 has a space-separated header
623 * WARC 0.12 and above terminates the version with a CRLF
627 if (memcmp(c, "\r\n", 2U) != 0)
631 if (*c != ' ' && *c != '\t')
/* Looks up the WARC-Type header field in buf[0 .. bsz) and classifies its
 * value.  The key includes the preceding CRLF, so it only matches at the
 * start of a header line.  Leading blanks and tabs are skipped and the
 * value is recognised only if it is exactly 8 bytes long and equal to
 * "resource" or "response".
 * NOTE(review): the values returned for each case are not visible in this
 * view. */
639 _warc_rdtyp(const char *buf, size_t bsz)
641 static const char _key[] = "\r\nWARC-Type:";
642 const char *val, *eol;
644 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
/* step over the key; the end-of-line search is limited to the bytes
 * remaining in the buffer */
648 val += sizeof(_key) - 1U;
649 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
654 /* overread whitespace */
655 while (val < eol && (*val == ' ' || *val == '\t'))
658 if (val + 8U == eol) {
659 if (memcmp(val, "resource", 8U) == 0)
661 else if (memcmp(val, "response", 8U) == 0)
/* Looks up the WARC-Target-URI header field in buf[0 .. bsz) and derives a
 * string from it; the result starts out as the empty string {0, NULL}.
 * Visible steps: find the key at the start of a header line, find the end
 * of that line, skip leading blanks and tabs, require "://" within the
 * value, scan the value for whitespace, and require at least three bytes
 * of scheme before "://".  For a scheme starting with "file" nothing more
 * is skipped; for one starting with "http" or "ftp" the host part up to
 * and including the first '/' is skipped.
 * NOTE(review): the early-return statements and the final assignment of
 * res are not visible in this view. */
668 _warc_rduri(const char *buf, size_t bsz)
670 static const char _key[] = "\r\nWARC-Target-URI:";
671 const char *val, *uri, *eol, *p;
672 warc_string_t res = {0U, NULL};
674 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
678 /* overread whitespace */
679 val += sizeof(_key) - 1U;
680 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
685 while (val < eol && (*val == ' ' || *val == '\t'))
688 /* overread URL designators */
689 if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
690 /* not touching that! */
694 /* spaces inside uri are not allowed, CRLF should follow */
695 for (p = val; p < eol; p++) {
696 if (isspace((unsigned char)*p))
700 /* there must be at least space for ftp */
701 if (uri < (val + 3U))
704 /* move uri to point to after :// */
707 /* now then, inspect the URI */
/* these are prefix comparisons, so "http" also matches "https" */
708 if (memcmp(val, "file", 4U) == 0) {
709 /* perfect, nothing left to do here */
711 } else if (memcmp(val, "http", 4U) == 0 ||
712 memcmp(val, "ftp", 3U) == 0) {
713 /* overread domain, and the first / */
714 while (uri < eol && *uri++ != '/');
716 /* not sure what to do? best to bugger off */
/* Looks up the Content-Length header field in buf[0 .. bsz) and parses its
 * value as a base-10 number with strtol().  Leading blanks and tabs are
 * skipped, the value must start with a digit, and the number must extend
 * exactly to the end of the line.
 * NOTE(review): the failure return values are not visible in this view. */
725 _warc_rdlen(const char *buf, size_t bsz)
727 static const char _key[] = "\r\nContent-Length:";
728 const char *val, *eol;
732 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
736 val += sizeof(_key) - 1U;
737 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
742 /* skip leading whitespace */
743 while (val < eol && (*val == ' ' || *val == '\t'))
745 /* there must be at least one digit */
746 if (!isdigit((unsigned char)*val))
/* NOTE(review): errno is tested right after strtol(); the test is only
 * meaningful if errno was reset to 0 beforehand -- confirm that this
 * happens on a line not shown here. */
749 len = strtol(val, &on, 10);
750 if (errno != 0 || on != eol) {
751 /* line must end here */
/* Looks up the WARC-Date header field in buf[0 .. bsz) and parses its
 * value with xstrpisotime().  _warc_rdhdr() treats a result of (time_t)-1
 * as a missing or invalid record time.
 * NOTE(review): the check that the timestamp ends at the end of the line,
 * and the return statements, are not visible in this view. */
759 _warc_rdrtm(const char *buf, size_t bsz)
761 static const char _key[] = "\r\nWARC-Date:";
762 const char *val, *eol;
766 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
770 val += sizeof(_key) - 1U;
771 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) {
776 /* xstrpisotime() kindly overreads whitespace for us, so use that */
777 res = xstrpisotime(val, &on);
779 /* line must end here */
/* Looks up the Last-Modified header field in buf[0 .. bsz) and parses its
 * value with xstrpisotime(), i.e. only the ISO 8601 UTC form is understood.
 * _warc_rdhdr() compares the result against (time_t)-1.
 * NOTE(review): the check that the timestamp ends at the end of the line,
 * and the return statements, are not visible in this view. */
786 _warc_rdmtm(const char *buf, size_t bsz)
788 static const char _key[] = "\r\nLast-Modified:";
789 const char *val, *eol;
793 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
797 val += sizeof(_key) - 1U;
798 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) {
803 /* xstrpisotime() kindly overreads whitespace for us, so use that */
804 res = xstrpisotime(val, &on);
806 /* line must end here */
/* Searches buf[0 .. bsz) for the CRLF CRLF sequence that terminates a
 * record header.  The hit is advanced past the 4-byte marker, so it
 * designates the first byte after the header.
 * NOTE(review): the guard around the adjustment and the return statement
 * are not visible in this view; callers test the result against NULL. */
813 _warc_find_eoh(const char *buf, size_t bsz)
815 static const char _marker[] = "\r\n\r\n";
816 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
819 hit += sizeof(_marker) - 1U;
/* Searches buf[0 .. bsz) for the first CRLF, i.e. the end of the current
 * header line.
 * NOTE(review): the return statement is not visible in this view; callers
 * test the result against NULL and use it as the end of the field value. */
825 _warc_find_eol(const char *buf, size_t bsz)
827 static const char _marker[] = "\r\n";
828 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
832 /* archive_read_support_format_warc.c ends here */