2 * Copyright (C) 2004, 2005, 2007-2009 Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2002 Internet Software Consortium.
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
18 /* $Id: journal.c,v 1.99.70.4 2009/01/19 23:47:02 tbox Exp $ */
28 #include <isc/stdio.h>
29 #include <isc/string.h>
32 #include <dns/compress.h>
34 #include <dns/dbiterator.h>
36 #include <dns/fixedname.h>
37 #include <dns/journal.h>
39 #include <dns/rdataset.h>
40 #include <dns/rdatasetiter.h>
41 #include <dns/result.h>
47 * A journal file consists of
49 * \li A fixed-size header of type journal_rawheader_t.
51 * \li The index. This is an unordered array of index entries
52 * of type journal_rawpos_t giving the locations
53 * of some arbitrary subset of the journal's addressable
54 * transactions. The index entries are used as hints to
55 * speed up the process of locating a transaction with a given
56 * serial number. Unused index entries have an "offset"
57 * field of zero. The size of the index can vary between
58 * journal files, but does not change during the lifetime
59 * of a file. The size can be zero.
61 * \li The journal data. This consists of one or more transactions.
62 * Each transaction begins with a transaction header of type
63 * journal_rawxhdr_t. The transaction header is followed by a
64 * sequence of RRs, similar in structure to an IXFR difference
65 * sequence (RFC1995). That is, the pre-transaction SOA,
66 * zero or more other deleted RRs, the post-transaction SOA,
67 * and zero or more other added RRs. Unlike in IXFR, each RR
68 * is prefixed with a 32-bit length.
70 * The journal data part grows as new transactions are
71 * appended to the file. Only those transactions
72 * whose serial number is current-(2^31-1) to current
73 * are considered "addressable" and may be pointed
74 * to from the header or index. They may be preceded
75 * by old transactions that are no longer addressable,
76 * and they may be followed by transactions that were
77 * appended to the journal but never committed by updating
78 * the "end" position in the header. The latter will
79 * be overwritten when new transactions are added.
82 * When true, accept IXFR difference sequences where the
83 * SOA serial number does not change (BIND 8 sends such sequences).
86 static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
88 /**************************************************************************/
90 * Miscellaneous utilities.
/*
 * Leading arguments shared by the isc_log_write() calls in this file:
 * dns_lctx, the general log category and the journal log module.
 */
93 #define JOURNAL_COMMON_LOGARGS \
94 dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL
/* The common arguments followed by a debug log level of 'n'. */
96 #define JOURNAL_DEBUG_LOGARGS(n) \
97 JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
100 * It would be nonsensical (or at least obtuse) to use FAIL() with an
101 * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
102 * from complaining about "end-of-loop code not reached".
105 do { result = (code); \
106 if (result != ISC_R_SUCCESS) goto failure; \
110 do { result = (op); \
111 if (result != ISC_R_SUCCESS) goto failure; \
114 static isc_result_t index_to_disk(dns_journal_t *);
/*
 * Decode an unsigned 32-bit value from the bytes at 'p'.  p[0] supplies
 * bits 24-31 of the result.
 * NOTE(review): the remaining terms of the sum are not visible in this
 * excerpt; presumably p[1], p[2] and p[3] fill the lower bytes, mirroring
 * encode_uint32() -- confirm.
 */
116 static inline isc_uint32_t
117 decode_uint32(unsigned char *p) {
118 return ((p[0] << 24) +
/*
 * Store the 32-bit value 'val' into the four bytes starting at 'p',
 * most significant byte first.
 */
125 encode_uint32(isc_uint32_t val, unsigned char *p) {
126 p[0] = (isc_uint8_t)(val >> 24);
127 p[1] = (isc_uint8_t)(val >> 16);
128 p[2] = (isc_uint8_t)(val >> 8);
129 p[3] = (isc_uint8_t)(val >> 0);
/*
 * Create a difference tuple with operation 'op' from the SOA record found
 * at the origin of 'db' in version 'ver'.
 *
 * The origin node is looked up, its SOA rdataset is fetched, and the first
 * rdata of that rdataset is handed to dns_difftuple_create() together with
 * the zone name and the rdataset TTL.  The new tuple is presumably returned
 * through 'tp' (the trailing arguments of that call are not visible here).
 *
 * The rdataset and the node are released before returning.  The last
 * visible statement reports a "missing SOA" unexpected error.
 */
133 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
134 dns_diffop_t op, dns_difftuple_t **tp)
138 dns_rdataset_t rdataset;
139 dns_rdata_t rdata = DNS_RDATA_INIT;
140 dns_name_t *zonename;
142 zonename = dns_db_origin(db);
145 result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
146 if (result != ISC_R_SUCCESS)
149 dns_rdataset_init(&rdataset);
150 result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
151 (isc_stdtime_t)0, &rdataset, NULL);
152 if (result != ISC_R_SUCCESS)
155 result = dns_rdataset_first(&rdataset);
156 if (result != ISC_R_SUCCESS)
159 dns_rdataset_current(&rdataset, &rdata);
/*
 * NOTE(review): no check of the dns_difftuple_create() result is visible
 * before ISC_R_SUCCESS is returned below; a failure here would appear to
 * be reported as success -- confirm against the full file.
 */
161 result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
164 dns_rdataset_disassociate(&rdataset);
165 dns_db_detachnode(db, &node);
166 return (ISC_R_SUCCESS);
169 dns_db_detachnode(db, &node);
171 UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
178 * On-disk representation of a "pointer" to a journal entry.
179 * These are used in the journal header to locate the beginning
180 * and end of the journal, and in the journal index to locate
181 * other transactions.
184 unsigned char serial[4]; /*%< SOA serial before update. */
186 * XXXRTH Should offset be 8 bytes?
187 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
188 * XXXAG ... but we will not be able to seek >2G anyway on many
189 * platforms as long as we are using fseek() rather
192 unsigned char offset[4]; /*%< Offset from beginning of file. */
197 * The header is of a fixed size, with some spare room for future
200 #define JOURNAL_HEADER_SIZE 64 /* Bytes. */
203 * The on-disk representation of the journal header.
204 * All numbers are stored in big-endian order.
/*
 * Raw header layout.  The encode/decode helpers in this file reach the
 * named fields as raw->h.<field> and clear the whole header through
 * raw->pad, which is JOURNAL_HEADER_SIZE bytes long.
 */
208 /*% File format version ID. */
209 unsigned char format[16];
210 /*% Position of the first addressable transaction */
211 journal_rawpos_t begin;
212 /*% Position of the next (yet nonexistent) transaction. */
213 journal_rawpos_t end;
214 /*% Number of index entries following the header. */
215 unsigned char index_size[4];
217 /* Pad the header to a fixed size. */
218 unsigned char pad[JOURNAL_HEADER_SIZE];
219 } journal_rawheader_t;
222 * The on-disk representation of the transaction header.
223 * There is one of these at the beginning of each transaction.
226 unsigned char size[4]; /*%< In bytes, excluding header. */
227 unsigned char serial0[4]; /*%< SOA serial before update. */
228 unsigned char serial1[4]; /*%< SOA serial after update. */
232 * The on-disk representation of the RR header.
233 * There is one of these at the beginning of each RR.
236 unsigned char size[4]; /*%< In bytes, excluding header. */
237 } journal_rawrrhdr_t;
240 * The in-core representation of the journal header.
/*
 * A position counts as valid when its offset is nonzero.  Invalidating
 * a position clears both its offset and its serial.
 */
247 #define POS_VALID(pos) ((pos).offset != 0)
248 #define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)
251 unsigned char format[16];
254 isc_uint32_t index_size;
258 * The in-core representation of the transaction header.
263 isc_uint32_t serial0;
264 isc_uint32_t serial1;
268 * The in-core representation of the RR header.
276 * Initial contents to store in the header of a newly created
279 * The header starts with the magic string ";BIND LOG V9\n"
280 * to identify the file as a BIND 9 journal file. An ASCII
281 * identification string is used rather than a binary magic
282 * number to be consistent with BIND 8 (BIND 8 journal files
283 * are ASCII text files).
/*
 * Template for a new header: the magic string, two zeroed { 0, 0 } entries
 * (presumably the begin and end positions) and a final 0 (presumably the
 * index size, which journal_file_create() overrides).
 */
286 static journal_header_t
287 initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };
/* A journal header describes an empty journal when begin and end offsets match. */
289 #define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
292 JOURNAL_STATE_INVALID,
295 JOURNAL_STATE_TRANSACTION
/*
 * Fields of the journal object: identification and file state, the cached
 * header and index, the state of the transaction being written (accessed
 * elsewhere in this file as j->x) and the read iterator state (accessed as
 * j->it).
 */
299 unsigned int magic; /*%< JOUR */
300 isc_mem_t *mctx; /*%< Memory context */
301 journal_state_t state;
302 const char *filename; /*%< Journal file name */
303 FILE * fp; /*%< File handle */
304 isc_offset_t offset; /*%< Current file offset */
305 journal_header_t header; /*%< In-core journal header */
306 unsigned char *rawindex; /*%< In-core buffer for journal index in on-disk format */
307 journal_pos_t *index; /*%< In-core journal index */
309 /*% Current transaction state (when writing). */
311 unsigned int n_soa; /*%< Number of SOAs seen */
312 journal_pos_t pos[2]; /*%< Begin/end position */
315 /*% Iteration state (when reading). */
317 /* These define the part of the journal we iterate over. */
318 journal_pos_t bpos; /*%< Position before first, */
319 journal_pos_t epos; /*%< and after last transaction */
320 /* The rest is iterator state. */
321 isc_uint32_t current_serial; /*%< Current SOA serial */
322 isc_buffer_t source; /*%< Data from disk */
323 isc_buffer_t target; /*%< Data from _fromwire check */
324 dns_decompress_t dctx; /*%< Dummy decompression ctx */
325 dns_name_t name; /*%< Current domain name */
326 dns_rdata_t rdata; /*%< Current rdata */
327 isc_uint32_t ttl; /*%< Current TTL */
328 unsigned int xsize; /*%< Size of transaction data */
329 unsigned int xpos; /*%< Current position in it */
330 isc_result_t result; /*%< Result of last call */
/*
 * Magic number built from the characters 'J','O','U','R', and the validity
 * test used by the REQUIRE() checks on journal arguments in this file.
 */
334 #define DNS_JOURNAL_MAGIC ISC_MAGIC('J', 'O', 'U', 'R')
335 #define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
/*
 * Convert the raw position 'raw' into the in-core form 'cooked' by
 * decoding its serial and offset fields.
 */
338 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
339 cooked->serial = decode_uint32(raw->serial);
340 cooked->offset = decode_uint32(raw->offset);
/*
 * Convert the in-core position 'cooked' into the raw form 'raw' by
 * encoding its serial and offset fields.
 */
344 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
345 encode_uint32(cooked->serial, raw->serial);
346 encode_uint32(cooked->offset, raw->offset);
/*
 * Convert the raw header 'raw' into the in-core header 'cooked': the
 * format bytes are copied verbatim, the begin/end positions and the
 * index size are decoded.
 */
350 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
/* Both format arrays must have the same size for the memcpy below. */
351 INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
352 memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
353 journal_pos_decode(&raw->h.begin, &cooked->begin);
354 journal_pos_decode(&raw->h.end, &cooked->end);
355 cooked->index_size = decode_uint32(raw->h.index_size);
/*
 * Convert the in-core header 'cooked' into the raw header 'raw'.  The
 * whole raw->pad area is zeroed first, then the format bytes, the
 * begin/end positions and the index size are filled in.
 */
359 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
360 INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
361 memset(raw->pad, 0, sizeof(raw->pad));
362 memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
363 journal_pos_encode(&raw->h.begin, &cooked->begin);
364 journal_pos_encode(&raw->h.end, &cooked->end);
365 encode_uint32(cooked->index_size, raw->h.index_size);
369 * Journal file I/O subroutines, with error checking and reporting.
/*
 * Seek the journal file to the absolute position 'offset' (SEEK_SET).
 * A seek failure is logged with the file name and reported as
 * ISC_R_UNEXPECTED; otherwise ISC_R_SUCCESS is returned.
 */
372 journal_seek(dns_journal_t *j, isc_uint32_t offset) {
374 result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
375 if (result != ISC_R_SUCCESS) {
376 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
377 "%s: seek: %s", j->filename,
378 isc_result_totext(result));
379 return (ISC_R_UNEXPECTED);
382 return (ISC_R_SUCCESS);
/*
 * Read 'nbytes' bytes from the journal file into 'mem'.
 * End of file is reported as ISC_R_NOMORE without logging; any other
 * read failure is logged and reported as ISC_R_UNEXPECTED.
 */
386 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
389 result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
390 if (result != ISC_R_SUCCESS) {
391 if (result == ISC_R_EOF)
392 return (ISC_R_NOMORE);
393 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
395 j->filename, isc_result_totext(result));
396 return (ISC_R_UNEXPECTED);
399 return (ISC_R_SUCCESS);
/*
 * Write 'nbytes' bytes from 'mem' to the journal file.
 * A write failure is logged and reported as ISC_R_UNEXPECTED.
 */
403 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
406 result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
407 if (result != ISC_R_SUCCESS) {
408 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
410 j->filename, isc_result_totext(result));
411 return (ISC_R_UNEXPECTED);
414 return (ISC_R_SUCCESS);
/*
 * Flush the journal's stdio stream and then sync it.  A failure of
 * either step is logged and reported as ISC_R_UNEXPECTED.
 */
418 journal_fsync(dns_journal_t *j) {
420 result = isc_stdio_flush(j->fp);
421 if (result != ISC_R_SUCCESS) {
422 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
424 j->filename, isc_result_totext(result));
425 return (ISC_R_UNEXPECTED);
427 result = isc_stdio_sync(j->fp);
428 if (result != ISC_R_SUCCESS) {
429 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
431 j->filename, isc_result_totext(result));
432 return (ISC_R_UNEXPECTED);
434 return (ISC_R_SUCCESS);
438 * Read/write a transaction header at the current file position.
/*
 * Read a raw transaction header at the current file position and decode
 * its size and its two serial numbers into 'xhdr'.  The read result is
 * tested before the fields are decoded.
 */
442 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
443 journal_rawxhdr_t raw;
445 result = journal_read(j, &raw, sizeof(raw));
446 if (result != ISC_R_SUCCESS)
448 xhdr->size = decode_uint32(raw.size);
449 xhdr->serial0 = decode_uint32(raw.serial0);
450 xhdr->serial1 = decode_uint32(raw.serial1);
451 return (ISC_R_SUCCESS);
/*
 * Encode 'size', 'serial0' and 'serial1' into a raw transaction header
 * and write it at the current file position.  Returns the result of
 * journal_write().
 */
455 journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
456 isc_uint32_t serial0, isc_uint32_t serial1)
458 journal_rawxhdr_t raw;
459 encode_uint32(size, raw.size);
460 encode_uint32(serial0, raw.serial0);
461 encode_uint32(serial1, raw.serial1);
462 return (journal_write(j, &raw, sizeof(raw)));
467 * Read an RR header at the current file position.
/*
 * Read a raw RR header at the current file position and decode its size
 * field into 'rrhdr'.  The read result is tested before decoding.
 */
471 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
472 journal_rawrrhdr_t raw;
474 result = journal_read(j, &raw, sizeof(raw));
475 if (result != ISC_R_SUCCESS)
477 rrhdr->size = decode_uint32(raw.size);
478 return (ISC_R_SUCCESS);
/*
 * Create a new journal file named 'filename' holding an initial header
 * and a zero-filled index of 56 entries.
 *
 * The header and index are assembled in one temporary buffer obtained
 * from 'mctx' and written with a single isc_stdio_write() call.  On the
 * visible failure paths after the file has been opened (allocation,
 * write, close) the file is removed again.  Open, write and close
 * failures are logged and reported as ISC_R_UNEXPECTED; allocation
 * failure is reported as ISC_R_NOMEMORY.
 */
482 journal_file_create(isc_mem_t *mctx, const char *filename) {
485 journal_header_t header;
486 journal_rawheader_t rawheader;
487 int index_size = 56; /* XXX configurable */
489 void *mem; /* Memory for temporary index image. */
/* The raw header type must occupy exactly the fixed on-disk header size. */
491 INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
493 result = isc_stdio_open(filename, "wb", &fp);
494 if (result != ISC_R_SUCCESS) {
495 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
497 filename, isc_result_totext(result));
498 return (ISC_R_UNEXPECTED);
/* Start from the template header and record the index size in it. */
501 header = initial_journal_header;
502 header.index_size = index_size;
503 journal_header_encode(&header, &rawheader);
/* Total image size: raw header plus one raw position per index entry. */
505 size = sizeof(journal_rawheader_t) +
506 index_size * sizeof(journal_rawpos_t);
508 mem = isc_mem_get(mctx, size);
510 (void)isc_stdio_close(fp);
511 (void)isc_file_remove(filename);
512 return (ISC_R_NOMEMORY);
/* Zero the whole image so that every index entry starts out unused. */
514 memset(mem, 0, size);
515 memcpy(mem, &rawheader, sizeof(rawheader));
517 result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
518 if (result != ISC_R_SUCCESS) {
519 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
521 filename, isc_result_totext(result));
522 (void)isc_stdio_close(fp);
523 (void)isc_file_remove(filename);
524 isc_mem_put(mctx, mem, size);
525 return (ISC_R_UNEXPECTED);
527 isc_mem_put(mctx, mem, size);
529 result = isc_stdio_close(fp);
530 if (result != ISC_R_SUCCESS) {
531 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
533 filename, isc_result_totext(result));
534 (void)isc_file_remove(filename);
535 return (ISC_R_UNEXPECTED);
538 return (ISC_R_SUCCESS);
/*
 * Open the journal file 'filename' and build the in-core journal object.
 *
 * The file is opened "rb+" when 'write' is true and "rb" otherwise.  When
 * it does not exist, the visible code either creates it with
 * journal_file_create() and reopens it, or fails with ISC_R_NOTFOUND
 * (presumably selected by 'create'; the controlling test is not visible
 * in this excerpt).
 *
 * The raw header is read from offset 0 and its format bytes are compared
 * with those of initial_journal_header; a mismatch is logged and fails
 * with ISC_R_UNEXPECTED.  If the header announces a nonzero index size,
 * the raw index is read into j->rawindex and decoded into j->index.
 * The iterator name, rdata, buffers and decompression context are then
 * initialized and the state is set according to 'write'.
 *
 * 'journalp' must point to a NULL pointer on entry.
 */
542 journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
543 isc_boolean_t create, dns_journal_t **journalp) {
546 journal_rawheader_t rawheader;
549 INSIST(journalp != NULL && *journalp == NULL);
550 j = isc_mem_get(mctx, sizeof(*j));
552 return (ISC_R_NOMEMORY);
555 j->state = JOURNAL_STATE_INVALID;
/* Only the pointer is stored; the string itself is not copied here. */
557 j->filename = filename;
561 result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
563 if (result == ISC_R_FILENOTFOUND) {
565 isc_log_write(JOURNAL_COMMON_LOGARGS,
567 "journal file %s does not exist, "
570 CHECK(journal_file_create(mctx, filename));
574 result = isc_stdio_open(j->filename, "rb+", &fp);
576 FAIL(ISC_R_NOTFOUND);
579 if (result != ISC_R_SUCCESS) {
580 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
582 j->filename, isc_result_totext(result));
583 FAIL(ISC_R_UNEXPECTED);
589 * Set magic early so that seek/read can succeed.
591 j->magic = DNS_JOURNAL_MAGIC;
593 CHECK(journal_seek(j, 0));
594 CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
596 if (memcmp(rawheader.h.format, initial_journal_header.format,
597 sizeof(initial_journal_header.format)) != 0) {
598 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
599 "%s: journal format not recognized",
601 FAIL(ISC_R_UNEXPECTED);
603 journal_header_decode(&rawheader, &j->header);
606 * If there is an index, read the raw index into a dynamically
607 * allocated buffer and then convert it into a cooked index.
609 if (j->header.index_size != 0) {
611 unsigned int rawbytes;
614 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
615 j->rawindex = isc_mem_get(mctx, rawbytes);
616 if (j->rawindex == NULL)
617 FAIL(ISC_R_NOMEMORY);
619 CHECK(journal_read(j, j->rawindex, rawbytes));
621 j->index = isc_mem_get(mctx, j->header.index_size *
622 sizeof(journal_pos_t));
623 if (j->index == NULL)
624 FAIL(ISC_R_NOMEMORY);
627 for (i = 0; i < j->header.index_size; i++) {
628 j->index[i].serial = decode_uint32(p);
630 j->index[i].offset = decode_uint32(p);
/* After decoding every entry the cursor must sit at the end of the raw index. */
633 INSIST(p == j->rawindex + rawbytes);
635 j->offset = -1; /* Invalid, must seek explicitly. */
638 * Initialize the iterator.
640 dns_name_init(&j->it.name, NULL);
641 dns_rdata_init(&j->it.rdata);
644 * Set up empty initial buffers for unchecked and checked
645 * wire format RR data. They will be reallocated
648 isc_buffer_init(&j->it.source, NULL, 0);
649 isc_buffer_init(&j->it.target, NULL, 0);
650 dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
653 write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
656 return (ISC_R_SUCCESS);
/*
 * NOTE(review): j->index is allocated above with
 * index_size * sizeof(journal_pos_t), but this cleanup path releases it
 * with index_size * sizeof(journal_rawpos_t); dns_journal_destroy() uses
 * sizeof(journal_pos_t).  The sizes passed to isc_mem_get() and
 * isc_mem_put() look inconsistent here -- confirm and align.
 */
660 if (j->index != NULL) {
661 isc_mem_put(j->mctx, j->index, j->header.index_size *
662 sizeof(journal_rawpos_t));
666 (void)isc_stdio_close(j->fp);
667 isc_mem_put(j->mctx, j, sizeof(*j));
/*
 * Public entry point for opening a journal.  journal_open() is called
 * with 'write' passed for both its write and create arguments.
 *
 * If that reports ISC_R_NOTFOUND, a second name ending in ".jbk" is
 * formatted into 'backup' and journal_open() is tried on it.  The ".jbk"
 * name is presumably only built when 'filename' ends in ".jnl" (the
 * statement guarded by that test is not visible here).
 */
672 dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
673 dns_journal_t **journalp) {
678 result = journal_open(mctx, filename, write, write, journalp);
679 if (result == ISC_R_NOTFOUND) {
680 namelen = strlen(filename);
681 if (namelen > 4 && strcmp(filename + namelen - 4, ".jnl") == 0)
684 result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
686 if (result != ISC_R_SUCCESS)
688 result = journal_open(mctx, backup, write, write, journalp);
694 * A comparison function defining the sorting order for
695 * entries in the IXFR-style journal file.
697 * The IXFR format requires that deletions are sorted before
698 * additions, and within either one, SOA records are sorted before others.
701 * Also sort the non-SOA records by type as a courtesy to the
702 * server receiving the IXFR - it may help reduce the amount of
703 * rdataset merging it has to do.
/*
 * Comparison callback over pointers to difference tuples (each argument
 * points at a dns_difftuple_t pointer).
 *
 * Three keys are computed in turn: deletions order before other
 * operations, SOA records order before other types, and finally the
 * numeric rdata type.  Presumably each nonzero key is returned at once;
 * the return statements are not visible in this excerpt.
 */
706 ixfr_order(const void *av, const void *bv) {
707 dns_difftuple_t const * const *ap = av;
708 dns_difftuple_t const * const *bp = bv;
709 dns_difftuple_t const *a = *ap;
710 dns_difftuple_t const *b = *bp;
/* Negative when only 'a' is a deletion, positive when only 'b' is. */
713 r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
/* Negative when only 'a' is an SOA, positive when only 'b' is. */
717 r = (b->rdata.type == dns_rdatatype_soa) -
718 (a->rdata.type == dns_rdatatype_soa);
722 r = (a->rdata.type - b->rdata.type);
727 * Advance '*pos' to the next journal transaction.
730 * *pos refers to a valid journal transaction.
733 * When ISC_R_SUCCESS is returned,
734 * *pos refers to the next journal transaction.
739 * ISC_R_NOMORE *pos pointed at the last transaction
740 * Other results due to file errors are possible.
/*
 * Step 'pos' from one transaction to the one that follows it.
 *
 * The file is positioned at pos->offset.  If pos->serial already equals
 * the journal's end serial, ISC_R_NOMORE is returned.  Otherwise the
 * transaction header is read; its first serial must equal pos->serial,
 * or the journal is logged as corrupt and ISC_R_UNEXPECTED is returned.
 * On success pos->offset is advanced past the header and the transaction
 * data, and pos->serial becomes the transaction's second serial.
 */
743 journal_next(dns_journal_t *j, journal_pos_t *pos) {
746 REQUIRE(DNS_JOURNAL_VALID(j));
748 result = journal_seek(j, pos->offset);
749 if (result != ISC_R_SUCCESS)
752 if (pos->serial == j->header.end.serial)
753 return (ISC_R_NOMORE);
755 * Read the header of the current transaction.
756 * This will return ISC_R_NOMORE if we are at EOF.
758 result = journal_read_xhdr(j, &xhdr);
759 if (result != ISC_R_SUCCESS)
763 * Check serial number consistency.
765 if (xhdr.serial0 != pos->serial) {
766 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
767 "%s: journal file corrupt: "
768 "expected serial %u, got %u",
769 j->filename, pos->serial, xhdr.serial0);
770 return (ISC_R_UNEXPECTED);
774 * Check for offset wraparound.
776 if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
778 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
779 "%s: offset too large", j->filename);
780 return (ISC_R_UNEXPECTED);
783 pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
784 pos->serial = xhdr.serial1;
785 return (ISC_R_SUCCESS);
789 * If the index of the journal 'j' contains an entry "better"
790 * than '*best_guess', replace '*best_guess' with it.
792 * "Better" means having a serial number closer to 'serial'
793 * but not greater than 'serial'.
/*
 * Scan the in-core index for an entry that improves on '*best_guess'.
 * An entry qualifies when it is valid, its serial is not after 'serial',
 * and its serial is after the current best guess; a qualifying entry
 * replaces '*best_guess'.  A journal without an in-core index is
 * handled by the NULL test at the top.
 */
796 index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
798 if (j->index == NULL)
800 for (i = 0; i < j->header.index_size; i++) {
801 if (POS_VALID(j->index[i]) &&
802 DNS_SERIAL_GE(serial, j->index[i].serial) &&
803 DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
804 *best_guess = j->index[i];
809 * Add a new index entry. If there is no room, make room by removing
810 * the odd-numbered entries and compacting the others into the first
811 * half of the index. This decimates old index entries exponentially
812 * over time, so that the index always contains a much larger fraction
813 * of recent serial numbers than of old ones. This is deliberate -
814 * most index searches are for outgoing IXFR, and IXFR tends to request
815 * recent versions more often than old ones.
/*
 * Find a slot in the in-core index for a new entry.
 *
 * The index is searched for the first invalid (vacant) entry.  If none
 * is found, the entries at even positions are copied down into the
 * front of the index and the remainder is invalidated, leaving 'i' at
 * the first vacant slot.  The assertions confirm that 'i' is in range
 * and vacant.  The store of '*pos' into that slot is not visible in
 * this excerpt.
 */
818 index_add(dns_journal_t *j, journal_pos_t *pos) {
820 if (j->index == NULL)
823 * Search for a vacant position.
825 for (i = 0; i < j->header.index_size; i++) {
826 if (! POS_VALID(j->index[i]))
829 if (i == j->header.index_size) {
832 * Found no vacant position. Make some room.
834 for (i = 0; i < j->header.index_size; i += 2) {
835 j->index[k++] = j->index[i];
837 i = k; /* 'i' identifies the first vacant position. */
838 while (k < j->header.index_size) {
839 POS_INVALIDATE(j->index[k]);
843 INSIST(i < j->header.index_size);
844 INSIST(! POS_VALID(j->index[i]));
847 * Store the new index entry.
853 * Invalidate any existing index entries that could become
854 * ambiguous when a new transaction with number 'serial' is added.
/*
 * Invalidate every in-core index entry whose serial is not strictly
 * before 'serial' (that is, entries for which 'serial' is not greater
 * in serial-number order).
 */
857 index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
859 if (j->index == NULL)
861 for (i = 0; i < j->header.index_size; i++) {
862 if (! DNS_SERIAL_GT(serial, j->index[i].serial))
863 POS_INVALIDATE(j->index[i]);
868 * Try to find a transaction with initial serial number 'serial'
869 * in the journal 'j'.
871 * If found, store its position at '*pos' and return ISC_R_SUCCESS.
873 * If 'serial' is current (= the ending serial number of the
874 * last transaction in the journal), set '*pos' to
875 * the position immediately following the last transaction and
876 * return ISC_R_SUCCESS.
878 * If 'serial' is within the range of addressable serial numbers
879 * covered by the journal but that particular serial number is missing
880 * (from the journal, not just from the index), return ISC_R_NOTFOUND.
882 * If 'serial' is outside the range of addressable serial numbers
883 * covered by the journal, return ISC_R_RANGE.
/*
 * Locate the transaction whose initial serial is 'serial'.
 *
 * Serials before the journal's begin serial or after its end serial
 * yield ISC_R_RANGE.  A serial equal to the end serial yields the end
 * position.  Otherwise the search starts at the journal's begin
 * position, possibly improved by index_find(), and walks forward with
 * journal_next() until the serial matches; walking past 'serial'
 * yields ISC_R_NOTFOUND.
 *
 * NOTE(review): the argument spelled "¤t_pos" in the two calls below
 * looks like a mis-encoded "&current_pos" (the text "&curren" rendered as
 * a currency sign) -- confirm against the original file.
 */
887 journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
889 journal_pos_t current_pos;
890 REQUIRE(DNS_JOURNAL_VALID(j));
892 if (DNS_SERIAL_GT(j->header.begin.serial, serial))
893 return (ISC_R_RANGE);
894 if (DNS_SERIAL_GT(serial, j->header.end.serial))
895 return (ISC_R_RANGE);
896 if (serial == j->header.end.serial) {
897 *pos = j->header.end;
898 return (ISC_R_SUCCESS);
901 current_pos = j->header.begin;
902 index_find(j, serial, ¤t_pos);
904 while (current_pos.serial != serial) {
905 if (DNS_SERIAL_GT(current_pos.serial, serial))
906 return (ISC_R_NOTFOUND);
907 result = journal_next(j, ¤t_pos);
908 if (result != ISC_R_SUCCESS)
912 return (ISC_R_SUCCESS);
/*
 * Start writing a new transaction to journal 'j', which must be in the
 * WRITE state.
 *
 * The write position is the first byte after the header and index when
 * the journal is empty, and the journal's end offset otherwise.  An
 * all-zero transaction header is written there as a placeholder, the
 * end position of the transaction is set from j->offset, and the state
 * becomes TRANSACTION.
 */
916 dns_journal_begin_transaction(dns_journal_t *j) {
919 journal_rawxhdr_t hdr;
921 REQUIRE(DNS_JOURNAL_VALID(j));
922 REQUIRE(j->state == JOURNAL_STATE_WRITE);
925 * Find the file offset where the new transaction should
926 * be written, and seek there.
928 if (JOURNAL_EMPTY(&j->header)) {
929 offset = sizeof(journal_rawheader_t) +
930 j->header.index_size * sizeof(journal_rawpos_t);
932 offset = j->header.end.offset;
934 j->x.pos[0].offset = offset;
935 j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
938 CHECK(journal_seek(j, offset));
941 * Write a dummy transaction header of all zeroes to reserve
942 * space. It will be filled in when the transaction is
945 memset(&hdr, 0, sizeof(hdr));
946 CHECK(journal_write(j, &hdr, sizeof(hdr)));
947 j->x.pos[1].offset = j->offset;
949 j->state = JOURNAL_STATE_TRANSACTION;
950 result = ISC_R_SUCCESS;
/*
 * Append the tuples of 'diff' to the transaction currently open on 'j'
 * (the journal must be in the TRANSACTION state).
 *
 * A first pass over the tuples adds up the buffer size needed and
 * records the serial of each SOA tuple in the transaction positions.
 * A second pass serializes every tuple into one buffer: a 32-bit RR
 * length, the owner name, type, class, TTL, rdata length and rdata.
 * The buffer is then written to the file with one journal_write() call
 * and the transaction end offset is advanced by its length.  The
 * temporary buffer is released before returning.
 */
956 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
964 REQUIRE(DNS_DIFF_VALID(diff));
965 REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
967 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
968 (void)dns_diff_print(diff, NULL);
971 * Pass 1: determine the buffer size needed, and
972 * keep track of SOA serial numbers.
975 for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
976 t = ISC_LIST_NEXT(t, link))
978 if (t->rdata.type == dns_rdatatype_soa) {
980 j->x.pos[j->x.n_soa].serial =
981 dns_soa_getserial(&t->rdata);
984 size += sizeof(journal_rawrrhdr_t);
985 size += t->name.length; /* XXX should have access macro? */
987 size += t->rdata.length;
990 mem = isc_mem_get(j->mctx, size);
992 return (ISC_R_NOMEMORY);
994 isc_buffer_init(&buffer, mem, size);
997 * Pass 2. Write RRs to buffer.
999 for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
1000 t = ISC_LIST_NEXT(t, link))
1003 * Write the RR header.
/*
 * The constant 10 matches the fixed fields written below: type (2),
 * class (2), TTL (4) and rdata length (2).
 */
1005 isc_buffer_putuint32(&buffer, t->name.length + 10 +
1008 * Write the owner name, RR header, and RR data.
1010 isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
1011 isc_buffer_putuint16(&buffer, t->rdata.type);
1012 isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1013 isc_buffer_putuint32(&buffer, t->ttl);
/* The rdata length must fit the 16-bit length field written next. */
1014 INSIST(t->rdata.length < 65536);
1015 isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1016 INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1017 isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1020 isc_buffer_usedregion(&buffer, &used);
/* Pass 2 must have produced exactly the size computed in pass 1. */
1021 INSIST(used.length == size);
1023 j->x.pos[1].offset += used.length;
1026 * Write the buffer contents to the journal file.
1028 CHECK(journal_write(j, used.base, used.length));
1030 result = ISC_R_SUCCESS;
1034 isc_mem_put(j->mctx, mem, size);
/*
 * Commit the transaction currently open on 'j' (state TRANSACTION).
 *
 * Consistency checks come first: exactly two SOAs must have been seen,
 * the serial must not decrease, and for a non-empty journal the
 * transaction's first serial must equal the journal's end serial.  Each
 * violation is logged and returns ISC_R_UNEXPECTED.
 *
 * For a non-empty journal, header.begin is stepped forward with
 * journal_next() while the new serial is not after the begin serial,
 * and index entries are invalidated against the new serial.
 *
 * The transaction data is synced, the real transaction header replaces
 * the placeholder, the journal header is rewritten at offset 0, the
 * transaction start is added to the index, the index is written out,
 * and a final sync is performed.  The state then returns to WRITE.
 */
1040 dns_journal_commit(dns_journal_t *j) {
1041 isc_result_t result;
1042 journal_rawheader_t rawheader;
1044 REQUIRE(DNS_JOURNAL_VALID(j));
1045 REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1048 * Perform some basic consistency checks.
1050 if (j->x.n_soa != 2) {
1051 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1052 "%s: malformed transaction: %d SOAs",
1053 j->filename, j->x.n_soa);
1054 return (ISC_R_UNEXPECTED);
1056 if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1058 j->x.pos[1].serial == j->x.pos[0].serial)))
1060 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1061 "%s: malformed transaction: serial number "
1062 "would decrease", j->filename);
1063 return (ISC_R_UNEXPECTED);
1065 if (! JOURNAL_EMPTY(&j->header)) {
1066 if (j->x.pos[0].serial != j->header.end.serial) {
1067 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1068 "malformed transaction: "
1069 "%s last serial %u != "
1070 "transaction first serial %u",
1072 j->header.end.serial,
1073 j->x.pos[0].serial);
1074 return (ISC_R_UNEXPECTED);
1079 * Some old journal entries may become non-addressable
1080 * when we increment the current serial number. Purge them
1081 * by stepping header.begin forward to the first addressable
1082 * transaction. Also purge them from the index.
1084 if (! JOURNAL_EMPTY(&j->header)) {
1085 while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1086 j->header.begin.serial)) {
1087 CHECK(journal_next(j, &j->header.begin));
1089 index_invalidate(j, j->x.pos[1].serial);
1092 if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1098 * Commit the transaction data to stable storage.
1100 CHECK(journal_fsync(j));
1103 * Update the transaction header.
1105 CHECK(journal_seek(j, j->x.pos[0].offset));
/* Recorded size is the span between the two offsets minus the header itself. */
1106 CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
1107 sizeof(journal_rawxhdr_t),
1108 j->x.pos[0].serial, j->x.pos[1].serial));
1111 * Update the journal header.
1113 if (JOURNAL_EMPTY(&j->header)) {
1114 j->header.begin = j->x.pos[0];
1116 j->header.end = j->x.pos[1];
1117 journal_header_encode(&j->header, &rawheader);
1118 CHECK(journal_seek(j, 0));
1119 CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1124 index_add(j, &j->x.pos[0]);
1127 * Convert the index into on-disk format and write
1130 CHECK(index_to_disk(j));
1133 * Commit the header to stable storage.
1135 CHECK(journal_fsync(j));
1138 * We no longer have a transaction open.
1140 j->state = JOURNAL_STATE_WRITE;
1142 result = ISC_R_SUCCESS;
/*
 * Write 'diff' to journal 'j' as one complete transaction: sort the
 * diff with ixfr_order, begin a transaction, write the diff and commit.
 * Each step is wrapped in CHECK(), which jumps to the failure label on
 * a non-success result.
 */
1149 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1150 isc_result_t result;
1151 CHECK(dns_diff_sort(diff, ixfr_order));
1152 CHECK(dns_journal_begin_transaction(j));
1153 CHECK(dns_journal_writediff(j, diff));
1154 CHECK(dns_journal_commit(j));
1155 result = ISC_R_SUCCESS;
/*
 * Tear down the journal '*journalp'.
 *
 * The iterator name and decompression context are invalidated, the raw
 * and in-core index arrays and the iterator source/target buffers are
 * returned to the memory context when present, the file is closed, and
 * the journal object itself is freed.
 */
1161 dns_journal_destroy(dns_journal_t **journalp) {
1162 dns_journal_t *j = *journalp;
1163 REQUIRE(DNS_JOURNAL_VALID(j));
1165 j->it.result = ISC_R_FAILURE;
1166 dns_name_invalidate(&j->it.name);
1167 dns_decompress_invalidate(&j->it.dctx);
/* Each array is released with the element size it was allocated with in journal_open(). */
1168 if (j->rawindex != NULL)
1169 isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1170 sizeof(journal_rawpos_t));
1171 if (j->index != NULL)
1172 isc_mem_put(j->mctx, j->index, j->header.index_size *
1173 sizeof(journal_pos_t));
1174 if (j->it.target.base != NULL)
1175 isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1176 if (j->it.source.base != NULL)
1177 isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1180 (void)isc_stdio_close(j->fp);
1182 isc_mem_put(j->mctx, j, sizeof(*j));
1187 * Roll the open journal 'j' into the database 'db'.
1188 * A new database version will be created.
1191 /* XXX Share code with incoming IXFR? */
/*
 * Apply the transactions of journal 'j' to database 'db'.
 *
 * A new database version is opened and its SOA serial is read.  The
 * journal must contain an entry for that serial (journal_find()).  If
 * the database serial already equals the journal's last serial,
 * DNS_R_UPTODATE is passed to CHECK().
 *
 * Otherwise the RRs between the database serial and the last serial are
 * iterated.  Each RR becomes a tuple in a local diff; the operation is
 * DNS_DIFFOP_DEL when n_soa is 1 and DNS_DIFFOP_ADD otherwise.  Once
 * more than 100 tuples have been appended the diff is applied to the
 * database and cleared; a final apply handles the rest.
 *
 * The version is closed with commit set only when the result is
 * ISC_R_SUCCESS, and the local buffers and diff are released.
 */
1194 roll_forward(dns_journal_t *j, dns_db_t *db) {
1195 isc_buffer_t source; /* Transaction data from disk */
1196 isc_buffer_t target; /* Ditto after _fromwire check */
1197 isc_uint32_t db_serial; /* Database SOA serial */
1198 isc_uint32_t end_serial; /* Last journal SOA serial */
1199 isc_result_t result;
1200 dns_dbversion_t *ver = NULL;
1203 unsigned int n_soa = 0;
1204 unsigned int n_put = 0;
1206 REQUIRE(DNS_JOURNAL_VALID(j));
1207 REQUIRE(DNS_DB_VALID(db));
1209 dns_diff_init(j->mctx, &diff);
1212 * Set up empty initial buffers for unchecked and checked
1213 * wire format transaction data. They will be reallocated
1216 isc_buffer_init(&source, NULL, 0);
1217 isc_buffer_init(&target, NULL, 0);
1220 * Create the new database version.
1222 CHECK(dns_db_newversion(db, &ver));
1225 * Get the current database SOA serial number.
1227 CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1230 * Locate a journal entry for the current database serial.
1232 CHECK(journal_find(j, db_serial, &pos));
1234 * XXX do more drastic things, like marking zone stale,
1238 * XXXRTH The zone code should probably mark the zone as bad and
1239 * scream loudly into the log if this is a dynamic update
1240 * log reply that failed.
1243 end_serial = dns_journal_last_serial(j);
1244 if (db_serial == end_serial)
1245 CHECK(DNS_R_UPTODATE);
1247 CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1249 for (result = dns_journal_first_rr(j);
1250 result == ISC_R_SUCCESS;
1251 result = dns_journal_next_rr(j))
1256 dns_difftuple_t *tuple = NULL;
1260 dns_journal_current_rr(j, &name, &ttl, &rdata);
1262 if (rdata->type == dns_rdatatype_soa) {
1265 db_serial = j->it.current_serial;
1271 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1272 "%s: journal file corrupt: missing "
1273 "initial SOA", j->filename);
1274 FAIL(ISC_R_UNEXPECTED);
1276 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1277 DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1278 name, ttl, rdata, &tuple));
1279 dns_diff_append(&diff, &tuple);
/* Apply in batches so the pending diff does not grow without bound. */
1281 if (++n_put > 100) {
1282 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1283 "%s: applying diff to database (%u)",
1284 j->filename, db_serial);
1285 (void)dns_diff_print(&diff, NULL);
1286 CHECK(dns_diff_apply(&diff, db, ver));
1287 dns_diff_clear(&diff);
/* Running out of RRs is the normal way for the loop above to end. */
1291 if (result == ISC_R_NOMORE)
1292 result = ISC_R_SUCCESS;
1296 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1297 "%s: applying final diff to database (%u)",
1298 j->filename, db_serial);
1299 (void)dns_diff_print(&diff, NULL);
1300 CHECK(dns_diff_apply(&diff, db, ver));
1301 dns_diff_clear(&diff);
1306 dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1307 ISC_TRUE : ISC_FALSE);
1309 if (source.base != NULL)
1310 isc_mem_put(j->mctx, source.base, source.length);
1311 if (target.base != NULL)
1312 isc_mem_put(j->mctx, target.base, target.length);
1314 dns_diff_clear(&diff);
1320 dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1322 isc_result_t result;
1324 REQUIRE(DNS_DB_VALID(db));
1325 REQUIRE(filename != NULL);
1328 result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1329 if (result == ISC_R_NOTFOUND) {
1330 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1331 "no journal file, but that's OK");
1332 return (DNS_R_NOJOURNAL);
1334 if (result != ISC_R_SUCCESS)
1336 if (JOURNAL_EMPTY(&j->header))
1337 result = DNS_R_UPTODATE;
1339 result = roll_forward(j, db);
1341 dns_journal_destroy(&j);
1347 dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1349 isc_buffer_t source; /* Transaction data from disk */
1350 isc_buffer_t target; /* Ditto after _fromwire check */
1351 isc_uint32_t start_serial; /* Database SOA serial */
1352 isc_uint32_t end_serial; /* Last journal SOA serial */
1353 isc_result_t result;
1355 unsigned int n_soa = 0;
1356 unsigned int n_put = 0;
1358 REQUIRE(filename != NULL);
1361 result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1362 if (result == ISC_R_NOTFOUND) {
1363 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1364 return (DNS_R_NOJOURNAL);
1367 if (result != ISC_R_SUCCESS) {
1368 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1369 "journal open failure: %s: %s",
1370 isc_result_totext(result), filename);
1374 dns_diff_init(j->mctx, &diff);
1377 * Set up empty initial buffers for unchecked and checked
1378 * wire format transaction data. They will be reallocated
1381 isc_buffer_init(&source, NULL, 0);
1382 isc_buffer_init(&target, NULL, 0);
1384 start_serial = dns_journal_first_serial(j);
1385 end_serial = dns_journal_last_serial(j);
1387 CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1389 for (result = dns_journal_first_rr(j);
1390 result == ISC_R_SUCCESS;
1391 result = dns_journal_next_rr(j))
1396 dns_difftuple_t *tuple = NULL;
1400 dns_journal_current_rr(j, &name, &ttl, &rdata);
1402 if (rdata->type == dns_rdatatype_soa)
1408 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1409 "%s: journal file corrupt: missing "
1410 "initial SOA", j->filename);
1411 FAIL(ISC_R_UNEXPECTED);
1413 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1414 DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1415 name, ttl, rdata, &tuple));
1416 dns_diff_append(&diff, &tuple);
1418 if (++n_put > 100) {
1419 result = dns_diff_print(&diff, file);
1420 dns_diff_clear(&diff);
1422 if (result != ISC_R_SUCCESS)
1426 if (result == ISC_R_NOMORE)
1427 result = ISC_R_SUCCESS;
1431 result = dns_diff_print(&diff, file);
1432 dns_diff_clear(&diff);
1437 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1438 "%s: cannot print: journal file corrupt", j->filename);
1441 if (source.base != NULL)
1442 isc_mem_put(j->mctx, source.base, source.length);
1443 if (target.base != NULL)
1444 isc_mem_put(j->mctx, target.base, target.length);
1446 dns_diff_clear(&diff);
1447 dns_journal_destroy(&j);
1452 /**************************************************************************/
1454 * Miscellaneous accessors.
1456 isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1457 return (j->header.begin.serial);
1460 isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1461 return (j->header.end.serial);
/**************************************************************************/
/*
 * Iteration support.
 *
 * When serving an outgoing IXFR, we transmit a part of the journal starting
 * at the serial number in the IXFR request and ending at the serial
 * number that is current when the IXFR request arrives.  The ending
 * serial number is not necessarily at the end of the journal:
 * the journal may grow while the IXFR is in progress, but we stop
 * when we reach the serial number that was current when the IXFR started.
 */
1476 static isc_result_t read_one_rr(dns_journal_t *j);
1479 * Make sure the buffer 'b' is has at least 'size' bytes
1480 * allocated, and clear it.
1483 * Either b->base is NULL, or it points to b->length bytes of memory
1484 * previously allocated by isc_mem_get().
1488 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1489 if (b->length < size) {
1490 void *mem = isc_mem_get(mctx, size);
1492 return (ISC_R_NOMEMORY);
1493 if (b->base != NULL)
1494 isc_mem_put(mctx, b->base, b->length);
1498 isc_buffer_clear(b);
1499 return (ISC_R_SUCCESS);
1503 dns_journal_iter_init(dns_journal_t *j,
1504 isc_uint32_t begin_serial, isc_uint32_t end_serial)
1506 isc_result_t result;
1508 CHECK(journal_find(j, begin_serial, &j->it.bpos));
1509 INSIST(j->it.bpos.serial == begin_serial);
1511 CHECK(journal_find(j, end_serial, &j->it.epos));
1512 INSIST(j->it.epos.serial == end_serial);
1514 result = ISC_R_SUCCESS;
1516 j->it.result = result;
1517 return (j->it.result);
1522 dns_journal_first_rr(dns_journal_t *j) {
1523 isc_result_t result;
1526 * Seek to the beginning of the first transaction we are
1529 CHECK(journal_seek(j, j->it.bpos.offset));
1530 j->it.current_serial = j->it.bpos.serial;
1532 j->it.xsize = 0; /* We have no transaction data yet... */
1533 j->it.xpos = 0; /* ...and haven't used any of it. */
1535 return (read_one_rr(j));
1542 read_one_rr(dns_journal_t *j) {
1543 isc_result_t result;
1545 dns_rdatatype_t rdtype;
1546 dns_rdataclass_t rdclass;
1549 journal_xhdr_t xhdr;
1550 journal_rrhdr_t rrhdr;
1552 INSIST(j->offset <= j->it.epos.offset);
1553 if (j->offset == j->it.epos.offset)
1554 return (ISC_R_NOMORE);
1555 if (j->it.xpos == j->it.xsize) {
1557 * We are at a transaction boundary.
1558 * Read another transaction header.
1560 CHECK(journal_read_xhdr(j, &xhdr));
1561 if (xhdr.size == 0) {
1562 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1563 "%s: journal corrupt: empty transaction",
1565 FAIL(ISC_R_UNEXPECTED);
1567 if (xhdr.serial0 != j->it.current_serial) {
1568 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1569 "%s: journal file corrupt: "
1570 "expected serial %u, got %u",
1572 j->it.current_serial, xhdr.serial0);
1573 FAIL(ISC_R_UNEXPECTED);
1575 j->it.xsize = xhdr.size;
1581 CHECK(journal_read_rrhdr(j, &rrhdr));
1583 * Perform a sanity check on the journal RR size.
1584 * The smallest possible RR has a 1-byte owner name
1585 * and a 10-byte header. The largest possible
1586 * RR has 65535 bytes of data, a header, and a maximum-
1587 * size owner name, well below 70 k total.
1589 if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1590 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1591 "%s: journal corrupt: impossible RR size "
1592 "(%d bytes)", j->filename, rrhdr.size);
1593 FAIL(ISC_R_UNEXPECTED);
1596 CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1597 CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1598 isc_buffer_add(&j->it.source, rrhdr.size);
1601 * The target buffer is made the same size
1602 * as the source buffer, with the assumption that when
1603 * no compression in present, the output of dns_*_fromwire()
1604 * is no larger than the input.
1606 CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1609 * Parse the owner name. We don't know where it
1610 * ends yet, so we make the entire "remaining"
1611 * part of the buffer "active".
1613 isc_buffer_setactive(&j->it.source,
1614 j->it.source.used - j->it.source.current);
1615 CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1616 &j->it.dctx, 0, &j->it.target));
1619 * Check that the RR header is there, and parse it.
1621 if (isc_buffer_remaininglength(&j->it.source) < 10)
1622 FAIL(DNS_R_FORMERR);
1624 rdtype = isc_buffer_getuint16(&j->it.source);
1625 rdclass = isc_buffer_getuint16(&j->it.source);
1626 ttl = isc_buffer_getuint32(&j->it.source);
1627 rdlen = isc_buffer_getuint16(&j->it.source);
1632 if (isc_buffer_remaininglength(&j->it.source) != rdlen)
1633 FAIL(DNS_R_FORMERR);
1634 isc_buffer_setactive(&j->it.source, rdlen);
1635 dns_rdata_reset(&j->it.rdata);
1636 CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1637 rdtype, &j->it.source, &j->it.dctx,
1641 j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1642 if (rdtype == dns_rdatatype_soa) {
1643 /* XXX could do additional consistency checks here */
1644 j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1647 result = ISC_R_SUCCESS;
1650 j->it.result = result;
1655 dns_journal_next_rr(dns_journal_t *j) {
1656 j->it.result = read_one_rr(j);
1657 return (j->it.result);
1661 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1662 dns_rdata_t **rdata)
1664 REQUIRE(j->it.result == ISC_R_SUCCESS);
1665 *name = &j->it.name;
1667 *rdata = &j->it.rdata;
1670 /**************************************************************************/
1672 * Generating diffs from databases
1676 * Construct a diff containing all the RRs at the current name of the
1677 * database iterator 'dbit' in database 'db', version 'ver'.
1678 * Set '*name' to the current name, and append the diff to 'diff'.
1679 * All new tuples will have the operation 'op'.
1681 * Requires: 'name' must have buffer large enough to hold the name.
1682 * Typically, a dns_fixedname_t would be used.
1685 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1686 dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1689 isc_result_t result;
1690 dns_dbnode_t *node = NULL;
1691 dns_rdatasetiter_t *rdsiter = NULL;
1692 dns_difftuple_t *tuple = NULL;
1694 result = dns_dbiterator_current(dbit, &node, name);
1695 if (result != ISC_R_SUCCESS)
1698 result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1699 if (result != ISC_R_SUCCESS)
1702 for (result = dns_rdatasetiter_first(rdsiter);
1703 result == ISC_R_SUCCESS;
1704 result = dns_rdatasetiter_next(rdsiter))
1706 dns_rdataset_t rdataset;
1708 dns_rdataset_init(&rdataset);
1709 dns_rdatasetiter_current(rdsiter, &rdataset);
1711 for (result = dns_rdataset_first(&rdataset);
1712 result == ISC_R_SUCCESS;
1713 result = dns_rdataset_next(&rdataset))
1715 dns_rdata_t rdata = DNS_RDATA_INIT;
1716 dns_rdataset_current(&rdataset, &rdata);
1717 result = dns_difftuple_create(diff->mctx, op, name,
1718 rdataset.ttl, &rdata,
1720 if (result != ISC_R_SUCCESS) {
1721 dns_rdataset_disassociate(&rdataset);
1722 goto cleanup_iterator;
1724 dns_diff_append(diff, &tuple);
1726 dns_rdataset_disassociate(&rdataset);
1727 if (result != ISC_R_NOMORE)
1728 goto cleanup_iterator;
1730 if (result != ISC_R_NOMORE)
1731 goto cleanup_iterator;
1733 result = ISC_R_SUCCESS;
1736 dns_rdatasetiter_destroy(&rdsiter);
1739 dns_db_detachnode(db, &node);
1745 * Comparison function for use by dns_diff_subtract when sorting
1746 * the diffs to be subtracted. The sort keys are the rdata type
1747 * and the rdata itself. The owner name is ignored, because
1748 * it is known to be the same for all tuples.
1751 rdata_order(const void *av, const void *bv) {
1752 dns_difftuple_t const * const *ap = av;
1753 dns_difftuple_t const * const *bp = bv;
1754 dns_difftuple_t const *a = *ap;
1755 dns_difftuple_t const *b = *bp;
1757 r = (b->rdata.type - a->rdata.type);
1760 r = dns_rdata_compare(&a->rdata, &b->rdata);
1765 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1766 isc_result_t result;
1767 dns_difftuple_t *p[2];
1769 isc_boolean_t append;
1771 CHECK(dns_diff_sort(&diff[0], rdata_order));
1772 CHECK(dns_diff_sort(&diff[1], rdata_order));
1775 p[0] = ISC_LIST_HEAD(diff[0].tuples);
1776 p[1] = ISC_LIST_HEAD(diff[1].tuples);
1777 if (p[0] == NULL && p[1] == NULL)
1780 for (i = 0; i < 2; i++)
1781 if (p[!i] == NULL) {
1782 ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1783 ISC_LIST_APPEND(r->tuples, p[i], link);
1786 t = rdata_order(&p[0], &p[1]);
1788 ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1789 ISC_LIST_APPEND(r->tuples, p[0], link);
1793 ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1794 ISC_LIST_APPEND(r->tuples, p[1], link);
1799 * Identical RRs in both databases; skip them both
1800 * if the ttl differs.
1802 append = ISC_TF(p[0]->ttl != p[1]->ttl);
1803 for (i = 0; i < 2; i++) {
1804 ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1806 ISC_LIST_APPEND(r->tuples, p[i], link);
1808 dns_difftuple_free(&p[i]);
1813 result = ISC_R_SUCCESS;
1819 * Compare the databases 'dba' and 'dbb' and generate a journal
1820 * entry containing the changes to make 'dba' from 'dbb' (note
1821 * the order). This journal entry will consist of a single,
1822 * possibly very large transaction.
1826 dns_db_diff(isc_mem_t *mctx,
1827 dns_db_t *dba, dns_dbversion_t *dbvera,
1828 dns_db_t *dbb, dns_dbversion_t *dbverb,
1829 const char *journal_filename)
1832 dns_dbversion_t *ver[2];
1833 dns_dbiterator_t *dbit[2] = { NULL, NULL };
1834 isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1835 dns_fixedname_t fixname[2];
1836 isc_result_t result, itresult[2];
1837 dns_diff_t diff[2], resultdiff;
1839 dns_journal_t *journal = NULL;
1841 db[0] = dba, db[1] = dbb;
1842 ver[0] = dbvera, ver[1] = dbverb;
1844 dns_diff_init(mctx, &diff[0]);
1845 dns_diff_init(mctx, &diff[1]);
1846 dns_diff_init(mctx, &resultdiff);
1848 dns_fixedname_init(&fixname[0]);
1849 dns_fixedname_init(&fixname[1]);
1851 result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1852 if (result != ISC_R_SUCCESS)
1855 result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1856 if (result != ISC_R_SUCCESS)
1857 goto cleanup_journal;
1858 result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1859 if (result != ISC_R_SUCCESS)
1860 goto cleanup_interator0;
1862 itresult[0] = dns_dbiterator_first(dbit[0]);
1863 itresult[1] = dns_dbiterator_first(dbit[1]);
1866 for (i = 0; i < 2; i++) {
1867 if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1868 CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1869 dns_fixedname_name(&fixname[i]),
1874 itresult[i] = dns_dbiterator_next(dbit[i]);
1879 if (! have[0] && ! have[1]) {
1880 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1881 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1885 for (i = 0; i < 2; i++) {
1887 ISC_LIST_APPENDLIST(resultdiff.tuples,
1888 diff[i].tuples, link);
1889 INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1890 have[i] = ISC_FALSE;
1895 t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1896 dns_fixedname_name(&fixname[1]));
1898 ISC_LIST_APPENDLIST(resultdiff.tuples,
1899 diff[0].tuples, link);
1900 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1901 have[0] = ISC_FALSE;
1905 ISC_LIST_APPENDLIST(resultdiff.tuples,
1906 diff[1].tuples, link);
1907 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1908 have[1] = ISC_FALSE;
1912 CHECK(dns_diff_subtract(diff, &resultdiff));
1913 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1914 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1915 have[0] = have[1] = ISC_FALSE;
1918 if (itresult[0] != ISC_R_NOMORE)
1920 if (itresult[1] != ISC_R_NOMORE)
1923 if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1924 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1926 CHECK(dns_journal_write_transaction(journal, &resultdiff));
1928 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1929 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1932 dns_diff_clear(&resultdiff);
1933 dns_dbiterator_destroy(&dbit[1]);
1935 dns_dbiterator_destroy(&dbit[0]);
1937 dns_journal_destroy(&journal);
1942 dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
1943 isc_uint32_t target_size)
1946 journal_pos_t best_guess;
1947 journal_pos_t current_pos;
1948 dns_journal_t *j = NULL;
1949 dns_journal_t *new = NULL;
1950 journal_rawheader_t rawheader;
1951 unsigned int copy_length;
1954 unsigned int size = 0;
1955 isc_result_t result;
1956 unsigned int indexend;
1959 isc_boolean_t is_backup = ISC_FALSE;
1961 namelen = strlen(filename);
1962 if (namelen > 4 && strcmp(filename + namelen - 4, ".jnl") == 0)
1965 result = isc_string_printf(newname, sizeof(newname), "%.*s.jnw",
1967 if (result != ISC_R_SUCCESS)
1970 result = isc_string_printf(backup, sizeof(backup), "%.*s.jbk",
1972 if (result != ISC_R_SUCCESS)
1975 result = journal_open(mctx, filename, ISC_FALSE, ISC_FALSE, &j);
1976 if (result == ISC_R_NOTFOUND) {
1977 is_backup = ISC_TRUE;
1978 result = journal_open(mctx, backup, ISC_FALSE, ISC_FALSE, &j);
1980 if (result != ISC_R_SUCCESS)
1983 if (JOURNAL_EMPTY(&j->header)) {
1984 dns_journal_destroy(&j);
1985 return (ISC_R_SUCCESS);
1988 if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
1989 DNS_SERIAL_GT(serial, j->header.end.serial)) {
1990 dns_journal_destroy(&j);
1991 return (ISC_R_RANGE);
1995 * Cope with very small target sizes.
1997 indexend = sizeof(journal_rawheader_t) +
1998 j->header.index_size * sizeof(journal_rawpos_t);
1999 if (target_size < indexend * 2)
2000 target_size = target_size/2 + indexend;
2003 * See if there is any work to do.
2005 if ((isc_uint32_t) j->header.end.offset < target_size) {
2006 dns_journal_destroy(&j);
2007 return (ISC_R_SUCCESS);
2010 CHECK(journal_open(mctx, newname, ISC_TRUE, ISC_TRUE, &new));
2013 * Remove overhead so space test below can succeed.
2015 if (target_size >= indexend)
2016 target_size -= indexend;
2019 * Find if we can create enough free space.
2021 best_guess = j->header.begin;
2022 for (i = 0; i < j->header.index_size; i++) {
2023 if (POS_VALID(j->index[i]) &&
2024 DNS_SERIAL_GE(serial, j->index[i].serial) &&
2025 ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
2026 >= target_size / 2) &&
2027 j->index[i].offset > best_guess.offset)
2028 best_guess = j->index[i];
2031 current_pos = best_guess;
2032 while (current_pos.serial != serial) {
2033 CHECK(journal_next(j, ¤t_pos));
2034 if (current_pos.serial == j->header.end.serial)
2037 if (DNS_SERIAL_GE(serial, current_pos.serial) &&
2038 ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
2039 >= (target_size / 2)) &&
2040 current_pos.offset > best_guess.offset)
2041 best_guess = current_pos;
2046 INSIST(best_guess.serial != j->header.end.serial);
2047 if (best_guess.serial != serial)
2048 CHECK(journal_next(j, &best_guess));
2051 * We should now be roughly half target_size provided
2052 * we did not reach 'serial'. If not we will just copy
2053 * all uncommitted deltas regardless of the size.
2055 copy_length = j->header.end.offset - best_guess.offset;
2057 if (copy_length != 0) {
2059 * Copy best_guess to end into space just freed.
2062 if (copy_length < size)
2064 buf = isc_mem_get(mctx, size);
2066 result = ISC_R_NOMEMORY;
2070 CHECK(journal_seek(j, best_guess.offset));
2071 CHECK(journal_seek(new, indexend));
2072 for (i = 0; i < copy_length; i += size) {
2073 unsigned int len = (copy_length - i) > size ? size :
2075 CHECK(journal_read(j, buf, len));
2076 CHECK(journal_write(new, buf, len));
2079 CHECK(journal_fsync(new));
2082 * Compute new header.
2084 new->header.begin.serial = best_guess.serial;
2085 new->header.begin.offset = indexend;
2086 new->header.end.serial = j->header.end.serial;
2087 new->header.end.offset = indexend + copy_length;
2090 * Update the journal header.
2092 journal_header_encode(&new->header, &rawheader);
2093 CHECK(journal_seek(new, 0));
2094 CHECK(journal_write(new, &rawheader, sizeof(rawheader)));
2095 CHECK(journal_fsync(new));
2100 current_pos = new->header.begin;
2101 while (current_pos.serial != new->header.end.serial) {
2102 index_add(new, ¤t_pos);
2103 CHECK(journal_next(new, ¤t_pos));
2109 CHECK(index_to_disk(new));
2110 CHECK(journal_fsync(new));
2112 indexend = new->header.end.offset;
2114 dns_journal_destroy(&new);
2117 * With a UFS file system this should just succeed and be atomic.
2118 * Any IXFR outs will just continue and the old journal will be
2119 * removed on final close.
2121 * With MSDOS / NTFS we need to do a two stage rename triggered
2122 * bu EEXISTS. Hopefully all IXFR's that were active at the last
2123 * rename are now complete.
2125 if (rename(newname, filename) == -1) {
2126 if (errno == EACCES && !is_backup) {
2127 result = isc_file_remove(backup);
2128 if (result != ISC_R_SUCCESS &&
2129 result != ISC_R_FILENOTFOUND)
2131 if (rename(filename, backup) == -1)
2133 if (rename(newname, filename) == -1)
2135 (void)isc_file_remove(backup);
2138 result = ISC_R_FAILURE;
2143 dns_journal_destroy(&j);
2144 result = ISC_R_SUCCESS;
2147 (void)isc_file_remove(newname);
2149 isc_mem_put(mctx, buf, size);
2151 dns_journal_destroy(&j);
2153 dns_journal_destroy(&new);
2158 index_to_disk(dns_journal_t *j) {
2159 isc_result_t result = ISC_R_SUCCESS;
2161 if (j->header.index_size != 0) {
2164 unsigned int rawbytes;
2166 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2169 for (i = 0; i < j->header.index_size; i++) {
2170 encode_uint32(j->index[i].serial, p);
2172 encode_uint32(j->index[i].offset, p);
2175 INSIST(p == j->rawindex + rawbytes);
2177 CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2178 CHECK(journal_write(j, j->rawindex, rawbytes));