Merge from vendor branch LESS:
[dragonfly.git] / contrib / bind-9.3 / lib / dns / journal.c
1 /*
2  * Copyright (C) 2004, 2005  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2002  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: journal.c,v 1.77.2.1.10.13 2005/11/03 23:08:41 marka Exp $ */
19
20 #include <config.h>
21
22 #include <stdlib.h>
23 #include <unistd.h>
24
25 #include <isc/file.h>
26 #include <isc/mem.h>
27 #include <isc/stdio.h>
28 #include <isc/string.h>
29 #include <isc/util.h>
30
31 #include <dns/compress.h>
32 #include <dns/db.h>
33 #include <dns/dbiterator.h>
34 #include <dns/diff.h>
35 #include <dns/fixedname.h>
36 #include <dns/journal.h>
37 #include <dns/log.h>
38 #include <dns/rdataset.h>
39 #include <dns/rdatasetiter.h>
40 #include <dns/result.h>
41 #include <dns/soa.h>
42
43 /*
44  * When true, accept IXFR difference sequences where the
45  * SOA serial number does not change (BIND 8 sends such
46  * sequences).
47  */
48 static isc_boolean_t bind8_compat = ISC_TRUE; /* XXX config */
49
50 /**************************************************************************/
51 /*
52  * Miscellaneous utilities.
53  */
54
/*
 * Leading arguments for the isc_log_write() calls made in this file:
 * 'dns_lctx' followed by the general category and the journal module.
 */
#define JOURNAL_COMMON_LOGARGS \
	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL

/*
 * The same arguments followed by the debug level ISC_LOG_DEBUG(n).
 */
#define JOURNAL_DEBUG_LOGARGS(n) \
	JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
60
/*
 * Error-propagation helpers.  Each one stores the value of its argument
 * in the local variable 'result' and jumps to the local label 'failure'
 * unless that value is ISC_R_SUCCESS.
 *
 * Using FAIL() with an ISC_R_SUCCESS code would be nonsensical (or at
 * least obtuse), but the test is kept so that the Solaris compiler does
 * not complain about "end-of-loop code not reached".
 */
#define FAIL(code) \
	do { \
		result = (code); \
		if (result != ISC_R_SUCCESS) \
			goto failure; \
	} while (0)

/* CHECK() behaves exactly like FAIL(). */
#define CHECK(op) FAIL(op)
75
76 static isc_result_t index_to_disk(dns_journal_t *);
77
78 static inline isc_uint32_t
79 decode_uint32(unsigned char *p) {
80         return ((p[0] << 24) +
81                 (p[1] << 16) +
82                 (p[2] <<  8) +
83                 (p[3] <<  0));
84 }
85
86 static inline void
87 encode_uint32(isc_uint32_t val, unsigned char *p) {
88         p[0] = (isc_uint8_t)(val >> 24);
89         p[1] = (isc_uint8_t)(val >> 16);
90         p[2] = (isc_uint8_t)(val >>  8);
91         p[3] = (isc_uint8_t)(val >>  0);
92 }
93
94 isc_result_t
95 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
96                       dns_diffop_t op, dns_difftuple_t **tp)
97 {
98         isc_result_t result;
99         dns_dbnode_t *node;
100         dns_rdataset_t rdataset;
101         dns_rdata_t rdata = DNS_RDATA_INIT;
102         dns_name_t *zonename;
103
104         zonename = dns_db_origin(db);
105
106         node = NULL;
107         result = dns_db_findnode(db, zonename, ISC_FALSE, &node);
108         if (result != ISC_R_SUCCESS)
109                 goto nonode;
110
111         dns_rdataset_init(&rdataset);
112         result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
113                                      (isc_stdtime_t)0, &rdataset, NULL);
114         if (result != ISC_R_SUCCESS)
115                 goto freenode;
116
117         result = dns_rdataset_first(&rdataset);
118         if (result != ISC_R_SUCCESS)
119                 goto freenode;
120
121         dns_rdataset_current(&rdataset, &rdata);
122
123         result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl,
124                                       &rdata, tp);
125
126         dns_rdataset_disassociate(&rdataset);
127         dns_db_detachnode(db, &node);
128         return (ISC_R_SUCCESS);
129
130  freenode:
131         dns_db_detachnode(db, &node);
132  nonode:
133         UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
134         return (result);
135 }
136
137 /**************************************************************************/
138 /*
139  * Journalling.
140  */
141
142 /*
143  * A journal file consists of
144  *
145  *   - A fixed-size header of type journal_rawheader_t.
146  *
147  *   - The index.  This is an unordered array of index entries
148  *     of type journal_rawpos_t giving the locations
149  *     of some arbitrary subset of the journal's addressable
150  *     transactions.  The index entries are used as hints to
151  *     speed up the process of locating a transaction with a given
152  *     serial number.  Unused index entries have an "offset"
153  *     field of zero.  The size of the index can vary between
154  *     journal files, but does not change during the lifetime
155  *     of a file.  The size can be zero.
156  *
157  *   - The journal data.  This  consists of one or more transactions.
158  *     Each transaction begins with a transaction header of type
159  *     journal_rawxhdr_t.  The transaction header is followed by a
160  *     sequence of RRs, similar in structure to an IXFR difference
161  *     sequence (RFC1995).  That is, the pre-transaction SOA,
162  *     zero or more other deleted RRs, the post-transaction SOA,
163  *     and zero or more other added RRs.  Unlike in IXFR, each RR
164  *     is prefixed with a 32-bit length.
165  *
166  *     The journal data part grows as new transactions are
167  *     appended to the file.  Only those transactions
168  *     whose serial number is current-(2^31-1) to current
169  *     are considered "addressable" and may be pointed
170  *     to from the header or index.  They may be preceded
171  *     by old transactions that are no longer addressable,
172  *     and they may be followed by transactions that were
173  *     appended to the journal but never committed by updating
174  *     the "end" position in the header.  The latter will
175  *     be overwritten when new transactions are added.
176  */
177
/*
 * A "pointer" to a journal entry as it is stored on disk.  The journal
 * header uses these to locate the beginning and end of the journal,
 * and the journal index uses them to locate other transactions.
 */
typedef struct {
	/* SOA serial before update. */
	unsigned char	serial[4];

	/*
	 * Offset from beginning of file.
	 *
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char	offset[4];
} journal_rawpos_t;
195
196 /*
197  * The on-disk representation of the journal header.
198  * All numbers are stored in big-endian order.
199  */
200
201 /*
202  * The header is of a fixed size, with some spare room for future
203  * extensions.
204  */
205 #define JOURNAL_HEADER_SIZE 64 /* Bytes. */
206
207 typedef union {
208         struct {
209                 /* File format version ID. */
210                 unsigned char           format[16];
211                 /* Position of the first addressable transaction */
212                 journal_rawpos_t        begin;
213                 /* Position of the next (yet nonexistent) transaction. */
214                 journal_rawpos_t        end;
215                 /* Number of index entries following the header. */
216                 unsigned char           index_size[4];
217         } h;
218         /* Pad the header to a fixed size. */
219         unsigned char pad[JOURNAL_HEADER_SIZE];
220 } journal_rawheader_t;
221
/*
 * The transaction header as it is stored on disk.  Every transaction
 * begins with one of these.
 */
typedef struct {
	/* In bytes, excluding header. */
	unsigned char	size[4];

	/* SOA serial before update. */
	unsigned char	serial0[4];

	/* SOA serial after update. */
	unsigned char	serial1[4];
} journal_rawxhdr_t;
231
/*
 * The RR header as it is stored on disk.  Every RR begins with one of
 * these.
 */
typedef struct {
	/* In bytes, excluding header. */
	unsigned char	size[4];
} journal_rawrrhdr_t;
239
240 /*
241  * The in-core representation of the journal header.
242  */
243 typedef struct {
244         isc_uint32_t    serial;
245         isc_offset_t    offset;
246 } journal_pos_t;
247
248 #define POS_VALID(pos)          ((pos).offset != 0)
249 #define POS_INVALIDATE(pos)     ((pos).offset = 0, (pos).serial = 0)
250
251 typedef struct {
252         unsigned char   format[16];
253         journal_pos_t   begin;
254         journal_pos_t   end;
255         isc_uint32_t    index_size;
256 } journal_header_t;
257
258 /*
259  * The in-core representation of the transaction header.
260  */
261
262 typedef struct {
263         isc_uint32_t    size;
264         isc_uint32_t    serial0;
265         isc_uint32_t    serial1;
266 } journal_xhdr_t;
267
268 /*
269  * The in-core representation of the RR header.
270  */
271 typedef struct {
272         isc_uint32_t    size;
273 } journal_rrhdr_t;
274
275
276 /*
277  * Initial contents to store in the header of a newly created
278  * journal file.
279  *
280  * The header starts with the magic string ";BIND LOG V9\n"
281  * to identify the file as a BIND 9 journal file.  An ASCII
282  * identification string is used rather than a binary magic
283  * number to be consistent with BIND 8 (BIND 8 journal files
284  * are ASCII text files).
285  */
286
287 static journal_header_t
288 initial_journal_header = { ";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0 };
289
290 #define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
291
/*
 * State of a journal object.
 */
typedef enum {
	/* Set while the journal is still being opened. */
	JOURNAL_STATE_INVALID,
	/* Set once the journal has been opened without 'write'. */
	JOURNAL_STATE_READ,
	/* Set once the journal has been opened with 'write'. */
	JOURNAL_STATE_WRITE,
	/* Presumably set while a transaction is being written; TODO confirm. */
	JOURNAL_STATE_TRANSACTION
} journal_state_t;
298
299 struct dns_journal {
300         unsigned int            magic;          /* JOUR */
301         isc_mem_t               *mctx;          /* Memory context */
302         journal_state_t         state;
303         const char              *filename;      /* Journal file name */
304         FILE *                  fp;             /* File handle */
305         isc_offset_t            offset;         /* Current file offset */
306         journal_header_t        header;         /* In-core journal header */
307         unsigned char           *rawindex;      /* In-core buffer for journal
308                                                    index in on-disk format */
309         journal_pos_t           *index;         /* In-core journal index */
310
311         /* Current transaction state (when writing). */
312         struct {
313                 unsigned int    n_soa;          /* Number of SOAs seen */
314                 journal_pos_t   pos[2];         /* Begin/end position */
315         } x;
316
317         /* Iteration state (when reading). */
318         struct {
319                 /* These define the part of the journal we iterate over. */
320                 journal_pos_t bpos;             /* Position before first, */
321                 journal_pos_t epos;             /* and after last
322                                                    transaction */
323                 /* The rest is iterator state. */
324                 isc_uint32_t current_serial;    /* Current SOA serial */
325                 isc_buffer_t source;            /* Data from disk */
326                 isc_buffer_t target;            /* Data from _fromwire check */
327                 dns_decompress_t dctx;          /* Dummy decompression ctx */
328                 dns_name_t name;                /* Current domain name */
329                 dns_rdata_t rdata;              /* Current rdata */
330                 isc_uint32_t ttl;               /* Current TTL */
331                 unsigned int xsize;             /* Size of transaction data */
332                 unsigned int xpos;              /* Current position in it */
333                 isc_result_t result;            /* Result of last call */
334         } it;
335 };
336
337 #define DNS_JOURNAL_MAGIC       ISC_MAGIC('J', 'O', 'U', 'R')
338 #define DNS_JOURNAL_VALID(t)    ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
339
340 static void
341 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
342         cooked->serial = decode_uint32(raw->serial);
343         cooked->offset = decode_uint32(raw->offset);
344 }
345
346 static void
347 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
348         encode_uint32(cooked->serial, raw->serial);
349         encode_uint32(cooked->offset, raw->offset);
350 }
351
352 static void
353 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
354         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
355         memcpy(cooked->format, raw->h.format, sizeof(cooked->format));
356         journal_pos_decode(&raw->h.begin, &cooked->begin);
357         journal_pos_decode(&raw->h.end, &cooked->end);
358         cooked->index_size = decode_uint32(raw->h.index_size);
359 }
360
361 static void
362 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
363         INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
364         memset(raw->pad, 0, sizeof(raw->pad));
365         memcpy(raw->h.format, cooked->format, sizeof(raw->h.format));
366         journal_pos_encode(&raw->h.begin, &cooked->begin);
367         journal_pos_encode(&raw->h.end, &cooked->end);
368         encode_uint32(cooked->index_size, raw->h.index_size);
369 }
370
371 /*
372  * Journal file I/O subroutines, with error checking and reporting.
373  */
374 static isc_result_t
375 journal_seek(dns_journal_t *j, isc_uint32_t offset) {
376         isc_result_t result;
377         result = isc_stdio_seek(j->fp, (long)offset, SEEK_SET);
378         if (result != ISC_R_SUCCESS) {
379                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
380                               "%s: seek: %s", j->filename,
381                               isc_result_totext(result));
382                 return (ISC_R_UNEXPECTED);
383         }
384         j->offset = offset;
385         return (ISC_R_SUCCESS);
386 }
387
388 static isc_result_t
389 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
390         isc_result_t result;
391
392         result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
393         if (result != ISC_R_SUCCESS) {
394                 if (result == ISC_R_EOF)
395                         return (ISC_R_NOMORE);
396                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
397                               "%s: read: %s",
398                               j->filename, isc_result_totext(result));
399                 return (ISC_R_UNEXPECTED);
400         }
401         j->offset += nbytes;
402         return (ISC_R_SUCCESS);
403 }
404
405 static isc_result_t
406 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
407         isc_result_t result;
408
409         result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
410         if (result != ISC_R_SUCCESS) {
411                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
412                               "%s: write: %s",
413                               j->filename, isc_result_totext(result));
414                 return (ISC_R_UNEXPECTED);
415         }
416         j->offset += nbytes;
417         return (ISC_R_SUCCESS);
418 }
419
420 static isc_result_t
421 journal_fsync(dns_journal_t *j) {
422         isc_result_t result;
423         result = isc_stdio_flush(j->fp);
424         if (result != ISC_R_SUCCESS) {
425                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
426                               "%s: flush: %s",
427                               j->filename, isc_result_totext(result));
428                 return (ISC_R_UNEXPECTED);
429         }
430         result = isc_stdio_sync(j->fp);
431         if (result != ISC_R_SUCCESS) {
432                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
433                               "%s: fsync: %s",
434                               j->filename, isc_result_totext(result));
435                 return (ISC_R_UNEXPECTED);
436         }
437         return (ISC_R_SUCCESS);
438 }
439
440 /*
441  * Read/write a transaction header at the current file position.
442  */
443
444 static isc_result_t
445 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
446         journal_rawxhdr_t raw;
447         isc_result_t result;
448         result = journal_read(j, &raw, sizeof(raw));
449         if (result != ISC_R_SUCCESS)
450                 return (result);
451         xhdr->size = decode_uint32(raw.size);
452         xhdr->serial0 = decode_uint32(raw.serial0);
453         xhdr->serial1 = decode_uint32(raw.serial1);
454         return (ISC_R_SUCCESS);
455 }
456
457 static isc_result_t
458 journal_write_xhdr(dns_journal_t *j, isc_uint32_t size,
459                    isc_uint32_t serial0, isc_uint32_t serial1)
460 {
461         journal_rawxhdr_t raw;
462         encode_uint32(size, raw.size);
463         encode_uint32(serial0, raw.serial0);
464         encode_uint32(serial1, raw.serial1);
465         return (journal_write(j, &raw, sizeof(raw)));
466 }
467
468
469 /*
470  * Read an RR header at the current file position.
471  */
472
473 static isc_result_t
474 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
475         journal_rawrrhdr_t raw;
476         isc_result_t result;
477         result = journal_read(j, &raw, sizeof(raw));
478         if (result != ISC_R_SUCCESS)
479                 return (result);
480         rrhdr->size = decode_uint32(raw.size);
481         return (ISC_R_SUCCESS);
482 }
483
484 static isc_result_t
485 journal_file_create(isc_mem_t *mctx, const char *filename) {
486         FILE *fp = NULL;
487         isc_result_t result;
488         journal_header_t header;
489         journal_rawheader_t rawheader;
490         int index_size = 56; /* XXX configurable */
491         int size;
492         void *mem; /* Memory for temporary index image. */
493
494         INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
495
496         result = isc_stdio_open(filename, "wb", &fp);
497         if (result != ISC_R_SUCCESS) {
498                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
499                               "%s: create: %s",
500                               filename, isc_result_totext(result));
501                 return (ISC_R_UNEXPECTED);
502         }
503
504         header = initial_journal_header;
505         header.index_size = index_size;
506         journal_header_encode(&header, &rawheader);
507
508         size = sizeof(journal_rawheader_t) +
509                 index_size * sizeof(journal_rawpos_t);
510
511         mem = isc_mem_get(mctx, size);
512         if (mem == NULL) {
513                 (void)isc_stdio_close(fp);
514                 (void)isc_file_remove(filename);
515                 return (ISC_R_NOMEMORY);
516         }
517         memset(mem, 0, size);
518         memcpy(mem, &rawheader, sizeof(rawheader));
519
520         result = isc_stdio_write(mem, 1, (size_t) size, fp, NULL);
521         if (result != ISC_R_SUCCESS) {
522                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
523                                  "%s: write: %s",
524                                  filename, isc_result_totext(result));
525                 (void)isc_stdio_close(fp);
526                 (void)isc_file_remove(filename);
527                 isc_mem_put(mctx, mem, size);
528                 return (ISC_R_UNEXPECTED);
529         }
530         isc_mem_put(mctx, mem, size);
531
532         result = isc_stdio_close(fp);
533         if (result != ISC_R_SUCCESS) {
534                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
535                                  "%s: close: %s",
536                                  filename, isc_result_totext(result));
537                 (void)isc_file_remove(filename);
538                 return (ISC_R_UNEXPECTED);
539         }
540
541         return (ISC_R_SUCCESS);
542 }
543
544 static isc_result_t
545 journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
546              isc_boolean_t create, dns_journal_t **journalp) {
547         FILE *fp = NULL;
548         isc_result_t result;
549         journal_rawheader_t rawheader;
550         dns_journal_t *j;
551
552         INSIST(journalp != NULL && *journalp == NULL);
553         j = isc_mem_get(mctx, sizeof(*j));
554         if (j == NULL)
555                 return (ISC_R_NOMEMORY);
556
557         j->mctx = mctx;
558         j->state = JOURNAL_STATE_INVALID;
559         j->fp = NULL;
560         j->filename = filename;
561         j->index = NULL;
562         j->rawindex = NULL;
563
564         result = isc_stdio_open(j->filename, write ? "rb+" : "rb", &fp);
565
566         if (result == ISC_R_FILENOTFOUND) {
567                 if (create) {
568                         isc_log_write(JOURNAL_COMMON_LOGARGS,
569                                       ISC_LOG_INFO,
570                                       "journal file %s does not exist, "
571                                       "creating it",
572                                       j->filename);
573                         CHECK(journal_file_create(mctx, filename));
574                         /*
575                          * Retry.
576                          */
577                         result = isc_stdio_open(j->filename, "rb+", &fp);
578                 } else {
579                         FAIL(ISC_R_NOTFOUND);
580                 }
581         }
582         if (result != ISC_R_SUCCESS) {
583                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
584                               "%s: open: %s",
585                               j->filename, isc_result_totext(result));
586                 FAIL(ISC_R_UNEXPECTED);
587         }
588
589         j->fp = fp;
590
591         /*
592          * Set magic early so that seek/read can succeed.
593          */
594         j->magic = DNS_JOURNAL_MAGIC;
595
596         CHECK(journal_seek(j, 0));
597         CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
598
599         if (memcmp(rawheader.h.format, initial_journal_header.format,
600                    sizeof(initial_journal_header.format)) != 0) {
601                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
602                                  "%s: journal format not recognized",
603                                  j->filename);
604                 FAIL(ISC_R_UNEXPECTED);
605         }
606         journal_header_decode(&rawheader, &j->header);
607
608         /*
609          * If there is an index, read the raw index into a dynamically
610          * allocated buffer and then convert it into a cooked index.
611          */
612         if (j->header.index_size != 0) {
613                 unsigned int i;
614                 unsigned int rawbytes;
615                 unsigned char *p;
616
617                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
618                 j->rawindex = isc_mem_get(mctx, rawbytes);
619                 if (j->rawindex == NULL)
620                         FAIL(ISC_R_NOMEMORY);
621
622                 CHECK(journal_read(j, j->rawindex, rawbytes));
623
624                 j->index = isc_mem_get(mctx, j->header.index_size *
625                                        sizeof(journal_pos_t));
626                 if (j->index == NULL)
627                         FAIL(ISC_R_NOMEMORY);
628
629                 p = j->rawindex;
630                 for (i = 0; i < j->header.index_size; i++) {
631                         j->index[i].serial = decode_uint32(p);
632                         p += 4;
633                         j->index[i].offset = decode_uint32(p);
634                         p += 4;
635                 }
636                 INSIST(p == j->rawindex + rawbytes);
637         }
638         j->offset = -1; /* Invalid, must seek explicitly. */
639
640         /*
641          * Initialize the iterator.
642          */
643         dns_name_init(&j->it.name, NULL);
644         dns_rdata_init(&j->it.rdata);
645
646         /*
647          * Set up empty initial buffers for uncheched and checked
648          * wire format RR data.  They will be reallocated
649          * later.
650          */
651         isc_buffer_init(&j->it.source, NULL, 0);
652         isc_buffer_init(&j->it.target, NULL, 0);
653         dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
654
655         j->state =
656                 write ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
657
658         *journalp = j;
659         return (ISC_R_SUCCESS);
660
661  failure:
662         j->magic = 0;
663         if (j->index != NULL) {
664                 isc_mem_put(j->mctx, j->index, j->header.index_size *
665                             sizeof(journal_rawpos_t));
666                 j->index = NULL;
667         }
668         if (j->fp != NULL)
669                 (void)isc_stdio_close(j->fp);
670         isc_mem_put(j->mctx, j, sizeof(*j));
671         return (result);
672 }
673
674 isc_result_t
675 dns_journal_open(isc_mem_t *mctx, const char *filename, isc_boolean_t write,
676                  dns_journal_t **journalp) {
677         return (journal_open(mctx, filename, write, write, journalp));
678 }
679
680 /*
681  * A comparison function defining the sorting order for
682  * entries in the IXFR-style journal file.
683  *
684  * The IXFR format requires that deletions are sorted before
685  * additions, and within either one, SOA records are sorted
686  * before others.
687  *
688  * Also sort the non-SOA records by type as a courtesy to the
689  * server receiving the IXFR - it may help reduce the amount of
690  * rdataset merging it has to do.
691  */
692 static int
693 ixfr_order(const void *av, const void *bv) {
694         dns_difftuple_t const * const *ap = av;
695         dns_difftuple_t const * const *bp = bv;
696         dns_difftuple_t const *a = *ap;
697         dns_difftuple_t const *b = *bp;
698         int r;
699
700         r = (b->op == DNS_DIFFOP_DEL) - (a->op == DNS_DIFFOP_DEL);
701         if (r != 0)
702                 return (r);
703
704         r = (b->rdata.type == dns_rdatatype_soa) -
705                 (a->rdata.type == dns_rdatatype_soa);
706         if (r != 0)
707                 return (r);
708
709         r = (a->rdata.type - b->rdata.type);
710         return (r);
711 }
712
713 /*
714  * Advance '*pos' to the next journal transaction.
715  *
716  * Requires:
717  *      *pos refers to a valid journal transaction.
718  *
719  * Ensures:
720  *      When ISC_R_SUCCESS is returned,
721  *      *pos refers to the next journal transaction.
722  *
723  * Returns one of:
724  *
725  *    ISC_R_SUCCESS
726  *    ISC_R_NOMORE      *pos pointed at the last transaction
727  *    Other results due to file errors are possible.
728  */
729 static isc_result_t
730 journal_next(dns_journal_t *j, journal_pos_t *pos) {
731         isc_result_t result;
732         journal_xhdr_t xhdr;
733         REQUIRE(DNS_JOURNAL_VALID(j));
734
735         result = journal_seek(j, pos->offset);
736         if (result != ISC_R_SUCCESS)
737                 return (result);
738
739         if (pos->serial == j->header.end.serial)
740                 return (ISC_R_NOMORE);
741         /*
742          * Read the header of the current transaction.
743          * This will return ISC_R_NOMORE if we are at EOF.
744          */
745         result = journal_read_xhdr(j, &xhdr);
746         if (result != ISC_R_SUCCESS)
747                 return (result);
748
749         /*
750          * Check serial number consistency.
751          */
752         if (xhdr.serial0 != pos->serial) {
753                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
754                               "%s: journal file corrupt: "
755                               "expected serial %u, got %u",
756                               j->filename, pos->serial, xhdr.serial0);
757                 return (ISC_R_UNEXPECTED);
758         }
759
760         /*
761          * Check for offset wraparound.
762          */
763         if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) + xhdr.size)
764             < pos->offset) {
765                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
766                               "%s: offset too large", j->filename);
767                 return (ISC_R_UNEXPECTED);
768         }
769
770         pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
771         pos->serial = xhdr.serial1;
772         return (ISC_R_SUCCESS);
773 }
774
775 /*
776  * If the index of the journal 'j' contains an entry "better"
777  * than '*best_guess', replace '*best_guess' with it.
778  *
779  * "Better" means having a serial number closer to 'serial'
780  * but not greater than 'serial'.
781  */
782 static void
783 index_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *best_guess) {
784         unsigned int i;
785         if (j->index == NULL)
786                 return;
787         for (i = 0; i < j->header.index_size; i++) {
788                 if (POS_VALID(j->index[i]) &&
789                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
790                     DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
791                         *best_guess = j->index[i];
792         }
793 }
794
795 /*
796  * Add a new index entry.  If there is no room, make room by removing
797  * the odd-numbered entries and compacting the others into the first
798  * half of the index.  This decimates old index entries exponentially
799  * over time, so that the index always contains a much larger fraction
800  * of recent serial numbers than of old ones.  This is deliberate -
801  * most index searches are for outgoing IXFR, and IXFR tends to request
802  * recent versions more often than old ones.
803  */
804 static void
805 index_add(dns_journal_t *j, journal_pos_t *pos) {
806         unsigned int i;
807         if (j->index == NULL)
808                 return;
809         /*
810          * Search for a vacant position.
811          */
812         for (i = 0; i < j->header.index_size; i++) {
813                 if (! POS_VALID(j->index[i]))
814                         break;
815         }
816         if (i == j->header.index_size) {
817                 unsigned int k = 0;
818                 /*
819                  * Found no vacant position.  Make some room.
820                  */
821                 for (i = 0; i < j->header.index_size; i += 2) {
822                         j->index[k++] = j->index[i];
823                 }
824                 i = k; /* 'i' identifies the first vacant position. */
825                 while (k < j->header.index_size) {
826                         POS_INVALIDATE(j->index[k]);
827                         k++;
828                 }
829         }
830         INSIST(i < j->header.index_size);
831         INSIST(! POS_VALID(j->index[i]));
832
833         /*
834          * Store the new index entry.
835          */
836         j->index[i] = *pos;
837 }
838
839 /*
840  * Invalidate any existing index entries that could become
841  * ambiguous when a new transaction with number 'serial' is added.
842  */
843 static void
844 index_invalidate(dns_journal_t *j, isc_uint32_t serial) {
845         unsigned int i;
846         if (j->index == NULL)
847                 return;
848         for (i = 0; i < j->header.index_size; i++) {
849                 if (! DNS_SERIAL_GT(serial, j->index[i].serial))
850                         POS_INVALIDATE(j->index[i]);
851         }
852 }
853
854 /*
855  * Try to find a transaction with initial serial number 'serial'
856  * in the journal 'j'.
857  *
858  * If found, store its position at '*pos' and return ISC_R_SUCCESS.
859  *
860  * If 'serial' is current (= the ending serial number of the
861  * last transaction in the journal), set '*pos' to
862  * the position immediately following the last transaction and
863  * return ISC_R_SUCCESS.
864  *
865  * If 'serial' is within the range of addressable serial numbers
866  * covered by the journal but that particular serial number is missing
867  * (from the journal, not just from the index), return ISC_R_NOTFOUND.
868  *
869  * If 'serial' is outside the range of addressable serial numbers
870  * covered by the journal, return ISC_R_RANGE.
871  *
872  */
873 static isc_result_t
874 journal_find(dns_journal_t *j, isc_uint32_t serial, journal_pos_t *pos) {
875         isc_result_t result;
876         journal_pos_t current_pos;
877         REQUIRE(DNS_JOURNAL_VALID(j));
878
879         if (DNS_SERIAL_GT(j->header.begin.serial, serial))
880                 return (ISC_R_RANGE);
881         if (DNS_SERIAL_GT(serial, j->header.end.serial))
882                 return (ISC_R_RANGE);
883         if (serial == j->header.end.serial) {
884                 *pos = j->header.end;
885                 return (ISC_R_SUCCESS);
886         }
887
888         current_pos = j->header.begin;
889         index_find(j, serial, &current_pos);
890
891         while (current_pos.serial != serial) {
892                 if (DNS_SERIAL_GT(current_pos.serial, serial))
893                         return (ISC_R_NOTFOUND);
894                 result = journal_next(j, &current_pos);
895                 if (result != ISC_R_SUCCESS)
896                         return (result);
897         }
898         *pos = current_pos;
899         return (ISC_R_SUCCESS);
900 }
901
902 isc_result_t
903 dns_journal_begin_transaction(dns_journal_t *j) {
904         isc_uint32_t offset;
905         isc_result_t result;
906         journal_rawxhdr_t hdr;
907
908         REQUIRE(DNS_JOURNAL_VALID(j));
909         REQUIRE(j->state == JOURNAL_STATE_WRITE);
910
911         /*
912          * Find the file offset where the new transaction should
913          * be written, and seek there.
914          */
915         if (JOURNAL_EMPTY(&j->header)) {
916                 offset = sizeof(journal_rawheader_t) +
917                         j->header.index_size * sizeof(journal_rawpos_t);
918         } else {
919                 offset = j->header.end.offset;
920         }
921         j->x.pos[0].offset = offset;
922         j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
923         j->x.n_soa = 0;
924
925         CHECK(journal_seek(j, offset));
926
927         /*
928          * Write a dummy transaction header of all zeroes to reserve
929          * space.  It will be filled in when the transaction is
930          * finished.
931          */
932         memset(&hdr, 0, sizeof(hdr));
933         CHECK(journal_write(j, &hdr, sizeof(hdr)));
934         j->x.pos[1].offset = j->offset;
935
936         j->state = JOURNAL_STATE_TRANSACTION;
937         result = ISC_R_SUCCESS;
938  failure:
939         return (result);
940 }
941
942 isc_result_t
943 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
944         dns_difftuple_t *t;
945         isc_buffer_t buffer;
946         void *mem = NULL;
947         unsigned int size;
948         isc_result_t result;
949         isc_region_t used;
950
951         REQUIRE(DNS_DIFF_VALID(diff));
952         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
953
954         isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
955         (void)dns_diff_print(diff, NULL);
956
957         /*
958          * Pass 1: determine the buffer size needed, and
959          * keep track of SOA serial numbers.
960          */
961         size = 0;
962         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
963              t = ISC_LIST_NEXT(t, link))
964         {
965                 if (t->rdata.type == dns_rdatatype_soa) {
966                         if (j->x.n_soa < 2)
967                                 j->x.pos[j->x.n_soa].serial =
968                                         dns_soa_getserial(&t->rdata);
969                         j->x.n_soa++;
970                 }
971                 size += sizeof(journal_rawrrhdr_t);
972                 size += t->name.length; /* XXX should have access macro? */
973                 size += 10;
974                 size += t->rdata.length;
975         }
976
977         mem = isc_mem_get(j->mctx, size);
978         if (mem == NULL)
979                 return (ISC_R_NOMEMORY);
980
981         isc_buffer_init(&buffer, mem, size);
982
983         /*
984          * Pass 2.  Write RRs to buffer.
985          */
986         for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
987              t = ISC_LIST_NEXT(t, link))
988         {
989                 /*
990                  * Write the RR header.
991                  */
992                 isc_buffer_putuint32(&buffer, t->name.length + 10 +
993                                      t->rdata.length);
994                 /*
995                  * Write the owner name, RR header, and RR data.
996                  */
997                 isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
998                 isc_buffer_putuint16(&buffer, t->rdata.type);
999                 isc_buffer_putuint16(&buffer, t->rdata.rdclass);
1000                 isc_buffer_putuint32(&buffer, t->ttl);
1001                 INSIST(t->rdata.length < 65536);
1002                 isc_buffer_putuint16(&buffer, (isc_uint16_t)t->rdata.length);
1003                 INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
1004                 isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
1005         }
1006
1007         isc_buffer_usedregion(&buffer, &used);
1008         INSIST(used.length == size);
1009
1010         j->x.pos[1].offset += used.length;
1011
1012         /*
1013          * Write the buffer contents to the journal file.
1014          */
1015         CHECK(journal_write(j, used.base, used.length));
1016
1017         result = ISC_R_SUCCESS;
1018
1019  failure:
1020         if (mem != NULL)
1021                 isc_mem_put(j->mctx, mem, size);
1022         return (result);
1023
1024 }
1025
1026 isc_result_t
1027 dns_journal_commit(dns_journal_t *j) {
1028         isc_result_t result;
1029         journal_rawheader_t rawheader;
1030
1031         REQUIRE(DNS_JOURNAL_VALID(j));
1032         REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
1033
1034         /*
1035          * Perform some basic consistency checks.
1036          */
1037         if (j->x.n_soa != 2) {
1038                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1039                               "%s: malformed transaction: %d SOAs",
1040                               j->filename, j->x.n_soa);
1041                 return (ISC_R_UNEXPECTED);
1042         }
1043         if (! (DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial) ||
1044                (bind8_compat &&
1045                 j->x.pos[1].serial == j->x.pos[0].serial)))
1046         {
1047                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1048                               "%s: malformed transaction: serial number "
1049                               "would decrease", j->filename);
1050                 return (ISC_R_UNEXPECTED);
1051         }
1052         if (! JOURNAL_EMPTY(&j->header)) {
1053                 if (j->x.pos[0].serial != j->header.end.serial) {
1054                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1055                                          "malformed transaction: "
1056                                          "%s last serial %u != "
1057                                          "transaction first serial %u",
1058                                          j->filename,
1059                                          j->header.end.serial,
1060                                          j->x.pos[0].serial);
1061                         return (ISC_R_UNEXPECTED);
1062                 }
1063         }
1064
1065         /*
1066          * Some old journal entries may become non-addressable
1067          * when we increment the current serial number.  Purge them
1068          * by stepping header.begin forward to the first addressable
1069          * transaction.  Also purge them from the index.
1070          */
1071         if (! JOURNAL_EMPTY(&j->header)) {
1072                 while (! DNS_SERIAL_GT(j->x.pos[1].serial,
1073                                        j->header.begin.serial)) {
1074                         CHECK(journal_next(j, &j->header.begin));
1075                 }
1076                 index_invalidate(j, j->x.pos[1].serial);
1077         }
1078 #ifdef notyet
1079         if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
1080                 force_dump(...);
1081         }
1082 #endif
1083
1084         /*
1085          * Commit the transaction data to stable storage.
1086          */
1087         CHECK(journal_fsync(j));
1088
1089         /*
1090          * Update the transaction header.
1091          */
1092         CHECK(journal_seek(j, j->x.pos[0].offset));
1093         CHECK(journal_write_xhdr(j, (j->x.pos[1].offset - j->x.pos[0].offset) -
1094                                  sizeof(journal_rawxhdr_t),
1095                                  j->x.pos[0].serial, j->x.pos[1].serial));
1096
1097         /*
1098          * Update the journal header.
1099          */
1100         if (JOURNAL_EMPTY(&j->header)) {
1101                 j->header.begin = j->x.pos[0];
1102         }
1103         j->header.end = j->x.pos[1];
1104         journal_header_encode(&j->header, &rawheader);
1105         CHECK(journal_seek(j, 0));
1106         CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
1107
1108         /*
1109          * Update the index.
1110          */
1111         index_add(j, &j->x.pos[0]);
1112
1113         /*
1114          * Convert the index into on-disk format and write
1115          * it to disk.
1116          */
1117         CHECK(index_to_disk(j));
1118
1119         /*
1120          * Commit the header to stable storage.
1121          */
1122         CHECK(journal_fsync(j));
1123
1124         /*
1125          * We no longer have a transaction open.
1126          */
1127         j->state = JOURNAL_STATE_WRITE;
1128
1129         result = ISC_R_SUCCESS;
1130
1131  failure:
1132         return (result);
1133 }
1134
1135 isc_result_t
1136 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
1137         isc_result_t result;
1138         CHECK(dns_diff_sort(diff, ixfr_order));
1139         CHECK(dns_journal_begin_transaction(j));
1140         CHECK(dns_journal_writediff(j, diff));
1141         CHECK(dns_journal_commit(j));
1142         result = ISC_R_SUCCESS;
1143  failure:
1144         return (result);
1145 }
1146
1147 void
1148 dns_journal_destroy(dns_journal_t **journalp) {
1149         dns_journal_t *j = *journalp;
1150         REQUIRE(DNS_JOURNAL_VALID(j));
1151
1152         j->it.result = ISC_R_FAILURE;
1153         dns_name_invalidate(&j->it.name);
1154         dns_decompress_invalidate(&j->it.dctx);
1155         if (j->rawindex != NULL)
1156                 isc_mem_put(j->mctx, j->rawindex, j->header.index_size *
1157                             sizeof(journal_rawpos_t));
1158         if (j->index != NULL)
1159                 isc_mem_put(j->mctx, j->index, j->header.index_size *
1160                             sizeof(journal_pos_t));
1161         if (j->it.target.base != NULL)
1162                 isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
1163         if (j->it.source.base != NULL)
1164                 isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
1165
1166         if (j->fp != NULL)
1167                 (void)isc_stdio_close(j->fp);
1168         j->magic = 0;
1169         isc_mem_put(j->mctx, j, sizeof(*j));
1170         *journalp = NULL;
1171 }
1172
1173 /*
1174  * Roll the open journal 'j' into the database 'db'.
1175  * A new database version will be created.
1176  */
1177
1178 /* XXX Share code with incoming IXFR? */
1179
1180 static isc_result_t
1181 roll_forward(dns_journal_t *j, dns_db_t *db) {
1182         isc_buffer_t source;            /* Transaction data from disk */
1183         isc_buffer_t target;            /* Ditto after _fromwire check */
1184         isc_uint32_t db_serial;         /* Database SOA serial */
1185         isc_uint32_t end_serial;        /* Last journal SOA serial */
1186         isc_result_t result;
1187         dns_dbversion_t *ver = NULL;
1188         journal_pos_t pos;
1189         dns_diff_t diff;
1190         unsigned int n_soa = 0;
1191         unsigned int n_put = 0;
1192
1193         REQUIRE(DNS_JOURNAL_VALID(j));
1194         REQUIRE(DNS_DB_VALID(db));
1195
1196         dns_diff_init(j->mctx, &diff);
1197
1198         /*
1199          * Set up empty initial buffers for uncheched and checked
1200          * wire format transaction data.  They will be reallocated
1201          * later.
1202          */
1203         isc_buffer_init(&source, NULL, 0);
1204         isc_buffer_init(&target, NULL, 0);
1205
1206         /*
1207          * Create the new database version.
1208          */
1209         CHECK(dns_db_newversion(db, &ver));
1210
1211         /*
1212          * Get the current database SOA serial number.
1213          */
1214         CHECK(dns_db_getsoaserial(db, ver, &db_serial));
1215
1216         /*
1217          * Locate a journal entry for the current database serial.
1218          */
1219         CHECK(journal_find(j, db_serial, &pos));
1220         /*
1221          * XXX do more drastic things, like marking zone stale,
1222          * if this fails?
1223          */
1224         /*
1225          * XXXRTH  The zone code should probably mark the zone as bad and
1226          *         scream loudly into the log if this is a dynamic update
1227          *         log reply that failed.
1228          */
1229
1230         end_serial = dns_journal_last_serial(j);
1231         if (db_serial == end_serial)
1232                 CHECK(DNS_R_UPTODATE);
1233
1234         CHECK(dns_journal_iter_init(j, db_serial, end_serial));
1235
1236         for (result = dns_journal_first_rr(j);
1237              result == ISC_R_SUCCESS;
1238              result = dns_journal_next_rr(j))
1239         {
1240                 dns_name_t *name;
1241                 isc_uint32_t ttl;
1242                 dns_rdata_t *rdata;
1243                 dns_difftuple_t *tuple = NULL;
1244
1245                 name = NULL;
1246                 rdata = NULL;
1247                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1248
1249                 if (rdata->type == dns_rdatatype_soa) {
1250                         n_soa++;
1251                         if (n_soa == 2)
1252                                 db_serial = j->it.current_serial;
1253                 }
1254
1255                 if (n_soa == 3)
1256                         n_soa = 1;
1257                 if (n_soa == 0) {
1258                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1259                                          "%s: journal file corrupt: missing "
1260                                          "initial SOA", j->filename);
1261                         FAIL(ISC_R_UNEXPECTED);
1262                 }
1263                 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1264                                            DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1265                                            name, ttl, rdata, &tuple));
1266                 dns_diff_append(&diff, &tuple);
1267
1268                 if (++n_put > 100)  {
1269                         isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1270                                       "%s: applying diff to database (%u)",
1271                                       j->filename, db_serial);
1272                         (void)dns_diff_print(&diff, NULL);
1273                         CHECK(dns_diff_apply(&diff, db, ver));
1274                         dns_diff_clear(&diff);
1275                         n_put = 0;
1276                 }
1277         }
1278         if (result == ISC_R_NOMORE)
1279                 result = ISC_R_SUCCESS;
1280         CHECK(result);
1281
1282         if (n_put != 0) {
1283                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1284                               "%s: applying final diff to database (%u)",
1285                               j->filename, db_serial);
1286                 (void)dns_diff_print(&diff, NULL);
1287                 CHECK(dns_diff_apply(&diff, db, ver));
1288                 dns_diff_clear(&diff);
1289         }
1290
1291  failure:
1292         if (ver != NULL)
1293                 dns_db_closeversion(db, &ver, result == ISC_R_SUCCESS ?
1294                                     ISC_TRUE : ISC_FALSE);
1295
1296         if (source.base != NULL)
1297                 isc_mem_put(j->mctx, source.base, source.length);
1298         if (target.base != NULL)
1299                 isc_mem_put(j->mctx, target.base, target.length);
1300
1301         dns_diff_clear(&diff);
1302
1303         return (result);
1304 }
1305
1306 isc_result_t
1307 dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, const char *filename) {
1308         dns_journal_t *j;
1309         isc_result_t result;
1310
1311         REQUIRE(DNS_DB_VALID(db));
1312         REQUIRE(filename != NULL);
1313
1314         j = NULL;
1315         result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1316         if (result == ISC_R_NOTFOUND) {
1317                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
1318                               "no journal file, but that's OK");
1319                 return (DNS_R_NOJOURNAL);
1320         }
1321         if (result != ISC_R_SUCCESS)
1322                 return (result);
1323         if (JOURNAL_EMPTY(&j->header))
1324                 result = DNS_R_UPTODATE;
1325         else
1326                 result = roll_forward(j, db);
1327
1328         dns_journal_destroy(&j);
1329
1330         return (result);
1331 }
1332
1333 isc_result_t
1334 dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
1335         dns_journal_t *j;
1336         isc_buffer_t source;            /* Transaction data from disk */
1337         isc_buffer_t target;            /* Ditto after _fromwire check */
1338         isc_uint32_t start_serial;              /* Database SOA serial */
1339         isc_uint32_t end_serial;        /* Last journal SOA serial */
1340         isc_result_t result;
1341         dns_diff_t diff;
1342         unsigned int n_soa = 0;
1343         unsigned int n_put = 0;
1344
1345         REQUIRE(filename != NULL);
1346
1347         j = NULL;
1348         result = dns_journal_open(mctx, filename, ISC_FALSE, &j);
1349         if (result == ISC_R_NOTFOUND) {
1350                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
1351                 return (DNS_R_NOJOURNAL);
1352         }
1353
1354         if (result != ISC_R_SUCCESS) {
1355                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1356                               "journal open failure: %s: %s",
1357                               isc_result_totext(result), j->filename);
1358                 return (result);
1359         }
1360
1361         dns_diff_init(j->mctx, &diff);
1362
1363         /*
1364          * Set up empty initial buffers for uncheched and checked
1365          * wire format transaction data.  They will be reallocated
1366          * later.
1367          */
1368         isc_buffer_init(&source, NULL, 0);
1369         isc_buffer_init(&target, NULL, 0);
1370
1371         start_serial = dns_journal_first_serial(j);
1372         end_serial = dns_journal_last_serial(j);
1373
1374         CHECK(dns_journal_iter_init(j, start_serial, end_serial));
1375
1376         for (result = dns_journal_first_rr(j);
1377              result == ISC_R_SUCCESS;
1378              result = dns_journal_next_rr(j))
1379         {
1380                 dns_name_t *name;
1381                 isc_uint32_t ttl;
1382                 dns_rdata_t *rdata;
1383                 dns_difftuple_t *tuple = NULL;
1384
1385                 name = NULL;
1386                 rdata = NULL;
1387                 dns_journal_current_rr(j, &name, &ttl, &rdata);
1388
1389                 if (rdata->type == dns_rdatatype_soa)
1390                         n_soa++;
1391
1392                 if (n_soa == 3)
1393                         n_soa = 1;
1394                 if (n_soa == 0) {
1395                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1396                                          "%s: journal file corrupt: missing "
1397                                          "initial SOA", j->filename);
1398                         FAIL(ISC_R_UNEXPECTED);
1399                 }
1400                 CHECK(dns_difftuple_create(diff.mctx, n_soa == 1 ?
1401                                            DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
1402                                            name, ttl, rdata, &tuple));
1403                 dns_diff_append(&diff, &tuple);
1404
1405                 if (++n_put > 100)  {
1406                         result = dns_diff_print(&diff, file);
1407                         dns_diff_clear(&diff);
1408                         n_put = 0;
1409                         if (result != ISC_R_SUCCESS)
1410                                 break;
1411                 }
1412         }
1413         if (result == ISC_R_NOMORE)
1414                 result = ISC_R_SUCCESS;
1415         CHECK(result);
1416
1417         if (n_put != 0) {
1418                 result = dns_diff_print(&diff, file);
1419                 dns_diff_clear(&diff);
1420         }
1421         goto cleanup;
1422
1423  failure:
1424         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1425                       "%s: cannot print: journal file corrupt", j->filename);
1426
1427  cleanup:
1428         if (source.base != NULL)
1429                 isc_mem_put(j->mctx, source.base, source.length);
1430         if (target.base != NULL)
1431                 isc_mem_put(j->mctx, target.base, target.length);
1432
1433         dns_diff_clear(&diff);
1434         dns_journal_destroy(&j);
1435
1436         return (result);
1437 }
1438
1439 /**************************************************************************/
1440 /*
1441  * Miscellaneous accessors.
1442  */
1443 isc_uint32_t dns_journal_first_serial(dns_journal_t *j) {
1444         return (j->header.begin.serial);
1445 }
1446
1447 isc_uint32_t dns_journal_last_serial(dns_journal_t *j) {
1448         return (j->header.end.serial);
1449 }
1450
1451 /**************************************************************************/
1452 /*
1453  * Iteration support.
1454  *
1455  * When serving an outgoing IXFR, we transmit a part of the journal starting
1456  * at the serial number in the IXFR request and ending at the serial
1457  * number that is current when the IXFR request arrives.  The ending
1458  * serial number is not necessarily at the end of the journal:
1459  * the journal may grow while the IXFR is in progress, but we stop
1460  * when we reach the serial number that was current when the IXFR started.
1461  */
1462
1463 static isc_result_t read_one_rr(dns_journal_t *j);
1464
1465 /*
1466  * Make sure the buffer 'b' is has at least 'size' bytes
1467  * allocated, and clear it.
1468  *
1469  * Requires:
1470  *      Either b->base is NULL, or it points to b->length bytes of memory
1471  *      previously allocated by isc_mem_get().
1472  */
1473
1474 static isc_result_t
1475 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
1476         if (b->length < size) {
1477                 void *mem = isc_mem_get(mctx, size);
1478                 if (mem == NULL)
1479                         return (ISC_R_NOMEMORY);
1480                 if (b->base != NULL)
1481                         isc_mem_put(mctx, b->base, b->length);
1482                 b->base = mem;
1483                 b->length = size;
1484         }
1485         isc_buffer_clear(b);
1486         return (ISC_R_SUCCESS);
1487 }
1488
1489 isc_result_t
1490 dns_journal_iter_init(dns_journal_t *j,
1491                       isc_uint32_t begin_serial, isc_uint32_t end_serial)
1492 {
1493         isc_result_t result;
1494
1495         CHECK(journal_find(j, begin_serial, &j->it.bpos));
1496         INSIST(j->it.bpos.serial == begin_serial);
1497
1498         CHECK(journal_find(j, end_serial, &j->it.epos));
1499         INSIST(j->it.epos.serial == end_serial);
1500
1501         result = ISC_R_SUCCESS;
1502  failure:
1503         j->it.result = result;
1504         return (j->it.result);
1505 }
1506
1507
1508 isc_result_t
1509 dns_journal_first_rr(dns_journal_t *j) {
1510         isc_result_t result;
1511
1512         /*
1513          * Seek to the beginning of the first transaction we are
1514          * interested in.
1515          */
1516         CHECK(journal_seek(j, j->it.bpos.offset));
1517         j->it.current_serial = j->it.bpos.serial;
1518
1519         j->it.xsize = 0;  /* We have no transaction data yet... */
1520         j->it.xpos = 0;   /* ...and haven't used any of it. */
1521
1522         return (read_one_rr(j));
1523
1524  failure:
1525         return (result);
1526 }
1527
1528 static isc_result_t
1529 read_one_rr(dns_journal_t *j) {
1530         isc_result_t result;
1531
1532         dns_rdatatype_t rdtype;
1533         dns_rdataclass_t rdclass;
1534         unsigned int rdlen;
1535         isc_uint32_t ttl;
1536         journal_xhdr_t xhdr;
1537         journal_rrhdr_t rrhdr;
1538
1539         INSIST(j->offset <= j->it.epos.offset);
1540         if (j->offset == j->it.epos.offset)
1541                 return (ISC_R_NOMORE);
1542         if (j->it.xpos == j->it.xsize) {
1543                 /*
1544                  * We are at a transaction boundary.
1545                  * Read another transaction header.
1546                  */
1547                 CHECK(journal_read_xhdr(j, &xhdr));
1548                 if (xhdr.size == 0) {
1549                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1550                                       "%s: journal corrupt: empty transaction",
1551                                       j->filename);
1552                         FAIL(ISC_R_UNEXPECTED);
1553                 }
1554                 if (xhdr.serial0 != j->it.current_serial) {
1555                         isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1556                                          "%s: journal file corrupt: "
1557                                          "expected serial %u, got %u",
1558                                          j->filename,
1559                                          j->it.current_serial, xhdr.serial0);
1560                         FAIL(ISC_R_UNEXPECTED);
1561                 }
1562                 j->it.xsize = xhdr.size;
1563                 j->it.xpos = 0;
1564         }
1565         /*
1566          * Read an RR.
1567          */
1568         CHECK(journal_read_rrhdr(j, &rrhdr));
1569         /*
1570          * Perform a sanity check on the journal RR size.
1571          * The smallest possible RR has a 1-byte owner name
1572          * and a 10-byte header.  The largest possible
1573          * RR has 65535 bytes of data, a header, and a maximum-
1574          * size owner name, well below 70 k total.
1575          */
1576         if (rrhdr.size < 1+10 || rrhdr.size > 70000) {
1577                 isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
1578                                  "%s: journal corrupt: impossible RR size "
1579                                  "(%d bytes)", j->filename, rrhdr.size);
1580                 FAIL(ISC_R_UNEXPECTED);
1581         }
1582
1583         CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
1584         CHECK(journal_read(j, j->it.source.base, rrhdr.size));
1585         isc_buffer_add(&j->it.source, rrhdr.size);
1586
1587         /*
1588          * The target buffer is made the same size
1589          * as the source buffer, with the assumption that when
1590          * no compression in present, the output of dns_*_fromwire()
1591          * is no larger than the input.
1592          */
1593         CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));
1594
1595         /*
1596          * Parse the owner name.  We don't know where it
1597          * ends yet, so we make the entire "remaining"
1598          * part of the buffer "active".
1599          */
1600         isc_buffer_setactive(&j->it.source,
1601                              j->it.source.used - j->it.source.current);
1602         CHECK(dns_name_fromwire(&j->it.name, &j->it.source,
1603                                 &j->it.dctx, 0, &j->it.target));
1604
1605         /*
1606          * Check that the RR header is there, and parse it.
1607          */
1608         if (isc_buffer_remaininglength(&j->it.source) < 10)
1609                 FAIL(DNS_R_FORMERR);
1610
1611         rdtype = isc_buffer_getuint16(&j->it.source);
1612         rdclass = isc_buffer_getuint16(&j->it.source);
1613         ttl = isc_buffer_getuint32(&j->it.source);
1614         rdlen = isc_buffer_getuint16(&j->it.source);
1615
1616         /*
1617          * Parse the rdata.
1618          */
1619         isc_buffer_setactive(&j->it.source, rdlen);
1620         dns_rdata_reset(&j->it.rdata);
1621         CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass,
1622                                  rdtype, &j->it.source, &j->it.dctx,
1623                                  0, &j->it.target));
1624         j->it.ttl = ttl;
1625
1626         j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
1627         if (rdtype == dns_rdatatype_soa) {
1628                 /* XXX could do additional consistency checks here */
1629                 j->it.current_serial = dns_soa_getserial(&j->it.rdata);
1630         }
1631
1632         result = ISC_R_SUCCESS;
1633
1634  failure:
1635         j->it.result = result;
1636         return (result);
1637 }
1638
1639 isc_result_t
1640 dns_journal_next_rr(dns_journal_t *j) {
1641         j->it.result = read_one_rr(j);
1642         return (j->it.result);
1643 }
1644
1645 void
1646 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, isc_uint32_t *ttl,
1647                    dns_rdata_t **rdata)
1648 {
1649         REQUIRE(j->it.result == ISC_R_SUCCESS);
1650         *name = &j->it.name;
1651         *ttl = j->it.ttl;
1652         *rdata = &j->it.rdata;
1653 }
1654
1655 /**************************************************************************/
1656 /*
1657  * Generating diffs from databases
1658  */
1659
1660 /*
1661  * Construct a diff containing all the RRs at the current name of the
1662  * database iterator 'dbit' in database 'db', version 'ver'.
1663  * Set '*name' to the current name, and append the diff to 'diff'.
1664  * All new tuples will have the operation 'op'.
1665  *
1666  * Requires: 'name' must have buffer large enough to hold the name.
1667  * Typically, a dns_fixedname_t would be used.
1668  */
1669 static isc_result_t
1670 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
1671               dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
1672               dns_diff_t *diff)
1673 {
1674         isc_result_t result;
1675         dns_dbnode_t *node = NULL;
1676         dns_rdatasetiter_t *rdsiter = NULL;
1677         dns_difftuple_t *tuple = NULL;
1678
1679         result = dns_dbiterator_current(dbit, &node, name);
1680         if (result != ISC_R_SUCCESS)
1681                 return (result);
1682
1683         result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
1684         if (result != ISC_R_SUCCESS)
1685                 goto cleanup_node;
1686
1687         for (result = dns_rdatasetiter_first(rdsiter);
1688              result == ISC_R_SUCCESS;
1689              result = dns_rdatasetiter_next(rdsiter))
1690         {
1691                 dns_rdataset_t rdataset;
1692
1693                 dns_rdataset_init(&rdataset);
1694                 dns_rdatasetiter_current(rdsiter, &rdataset);
1695
1696                 for (result = dns_rdataset_first(&rdataset);
1697                      result == ISC_R_SUCCESS;
1698                      result = dns_rdataset_next(&rdataset))
1699                 {
1700                         dns_rdata_t rdata = DNS_RDATA_INIT;
1701                         dns_rdataset_current(&rdataset, &rdata);
1702                         result = dns_difftuple_create(diff->mctx, op, name,
1703                                                       rdataset.ttl, &rdata,
1704                                                       &tuple);
1705                         if (result != ISC_R_SUCCESS) {
1706                                 dns_rdataset_disassociate(&rdataset);
1707                                 goto cleanup_iterator;
1708                         }
1709                         dns_diff_append(diff, &tuple);
1710                 }
1711                 dns_rdataset_disassociate(&rdataset);
1712                 if (result != ISC_R_NOMORE)
1713                         goto cleanup_iterator;
1714         }
1715         if (result != ISC_R_NOMORE)
1716                 goto cleanup_iterator;
1717
1718         result = ISC_R_SUCCESS;
1719
1720  cleanup_iterator:
1721         dns_rdatasetiter_destroy(&rdsiter);
1722
1723  cleanup_node:
1724         dns_db_detachnode(db, &node);
1725
1726         return (result);
1727 }
1728
1729 /*
1730  * Comparison function for use by dns_diff_subtract when sorting
1731  * the diffs to be subtracted.  The sort keys are the rdata type
1732  * and the rdata itself.  The owner name is ignored, because
1733  * it is known to be the same for all tuples.
1734  */
1735 static int
1736 rdata_order(const void *av, const void *bv) {
1737         dns_difftuple_t const * const *ap = av;
1738         dns_difftuple_t const * const *bp = bv;
1739         dns_difftuple_t const *a = *ap;
1740         dns_difftuple_t const *b = *bp;
1741         int r;
1742         r = (b->rdata.type - a->rdata.type);
1743         if (r != 0)
1744                 return (r);
1745         r = dns_rdata_compare(&a->rdata, &b->rdata);
1746         return (r);
1747 }
1748
1749 static isc_result_t
1750 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
1751         isc_result_t result;
1752         dns_difftuple_t *p[2];
1753         int i, t;
1754         isc_boolean_t append;
1755
1756         CHECK(dns_diff_sort(&diff[0], rdata_order));
1757         CHECK(dns_diff_sort(&diff[1], rdata_order));
1758
1759         for (;;) {
1760                 p[0] = ISC_LIST_HEAD(diff[0].tuples);
1761                 p[1] = ISC_LIST_HEAD(diff[1].tuples);
1762                 if (p[0] == NULL && p[1] == NULL)
1763                         break;
1764
1765                 for (i = 0; i < 2; i++)
1766                         if (p[!i] == NULL) {
1767                                 ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1768                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1769                                 goto next;
1770                         }
1771                 t = rdata_order(&p[0], &p[1]);
1772                 if (t < 0) {
1773                         ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
1774                         ISC_LIST_APPEND(r->tuples, p[0], link);
1775                         goto next;
1776                 }
1777                 if (t > 0) {
1778                         ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
1779                         ISC_LIST_APPEND(r->tuples, p[1], link);
1780                         goto next;
1781                 }
1782                 INSIST(t == 0);
1783                 /*
1784                  * Identical RRs in both databases; skip them both
1785                  * if the ttl differs.
1786                  */
1787                 append = ISC_TF(p[0]->ttl != p[1]->ttl);
1788                 for (i = 0; i < 2; i++) {
1789                         ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
1790                         if (append) {
1791                                 ISC_LIST_APPEND(r->tuples, p[i], link);
1792                         } else {
1793                                 dns_difftuple_free(&p[i]);
1794                         }
1795                 }
1796         next: ;
1797         }
1798         result = ISC_R_SUCCESS;
1799  failure:
1800         return (result);
1801 }
1802
1803 /*
1804  * Compare the databases 'dba' and 'dbb' and generate a journal
1805  * entry containing the changes to make 'dba' from 'dbb' (note
1806  * the order).  This journal entry will consist of a single,
1807  * possibly very large transaction.
1808  */
1809
1810 isc_result_t
1811 dns_db_diff(isc_mem_t *mctx,
1812             dns_db_t *dba, dns_dbversion_t *dbvera,
1813             dns_db_t *dbb, dns_dbversion_t *dbverb,
1814             const char *journal_filename)
1815 {
1816         dns_db_t *db[2];
1817         dns_dbversion_t *ver[2];
1818         dns_dbiterator_t *dbit[2] = { NULL, NULL };
1819         isc_boolean_t have[2] = { ISC_FALSE, ISC_FALSE };
1820         dns_fixedname_t fixname[2];
1821         isc_result_t result, itresult[2];
1822         dns_diff_t diff[2], resultdiff;
1823         int i, t;
1824         dns_journal_t *journal = NULL;
1825
1826         db[0] = dba, db[1] = dbb;
1827         ver[0] = dbvera, ver[1] = dbverb;
1828
1829         dns_diff_init(mctx, &diff[0]);
1830         dns_diff_init(mctx, &diff[1]);
1831         dns_diff_init(mctx, &resultdiff);
1832
1833         dns_fixedname_init(&fixname[0]);
1834         dns_fixedname_init(&fixname[1]);
1835
1836         result = dns_journal_open(mctx, journal_filename, ISC_TRUE, &journal);
1837         if (result != ISC_R_SUCCESS)
1838                 return (result);
1839
1840         result = dns_db_createiterator(db[0], ISC_FALSE, &dbit[0]);
1841         if (result != ISC_R_SUCCESS)
1842                 goto cleanup_journal;
1843         result = dns_db_createiterator(db[1], ISC_FALSE, &dbit[1]);
1844         if (result != ISC_R_SUCCESS)
1845                 goto cleanup_interator0;
1846
1847         itresult[0] = dns_dbiterator_first(dbit[0]);
1848         itresult[1] = dns_dbiterator_first(dbit[1]);
1849
1850         for (;;) {
1851                 for (i = 0; i < 2; i++) {
1852                         if (! have[i] && itresult[i] == ISC_R_SUCCESS) {
1853                                 CHECK(get_name_diff(db[i], ver[i], 0, dbit[i],
1854                                             dns_fixedname_name(&fixname[i]),
1855                                             i == 0 ?
1856                                             DNS_DIFFOP_ADD :
1857                                             DNS_DIFFOP_DEL,
1858                                             &diff[i]));
1859                                 itresult[i] = dns_dbiterator_next(dbit[i]);
1860                                 have[i] = ISC_TRUE;
1861                         }
1862                 }
1863
1864                 if (! have[0] && ! have[1]) {
1865                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1866                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1867                         break;
1868                 }
1869
1870                 for (i = 0; i < 2; i++) {
1871                         if (! have[!i]) {
1872                                 ISC_LIST_APPENDLIST(resultdiff.tuples,
1873                                                     diff[i].tuples, link);
1874                                 INSIST(ISC_LIST_EMPTY(diff[i].tuples));
1875                                 have[i] = ISC_FALSE;
1876                                 goto next;
1877                         }
1878                 }
1879
1880                 t = dns_name_compare(dns_fixedname_name(&fixname[0]),
1881                                      dns_fixedname_name(&fixname[1]));
1882                 if (t < 0) {
1883                         ISC_LIST_APPENDLIST(resultdiff.tuples,
1884                                             diff[0].tuples, link);
1885                         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1886                         have[0] = ISC_FALSE;
1887                         continue;
1888                 }
1889                 if (t > 0) {
1890                         ISC_LIST_APPENDLIST(resultdiff.tuples,
1891                                             diff[1].tuples, link);
1892                         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1893                         have[1] = ISC_FALSE;
1894                         continue;
1895                 }
1896                 INSIST(t == 0);
1897                 CHECK(dns_diff_subtract(diff, &resultdiff));
1898                 INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1899                 INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1900                 have[0] = have[1] = ISC_FALSE;
1901         next: ;
1902         }
1903         if (itresult[0] != ISC_R_NOMORE)
1904                 FAIL(itresult[0]);
1905         if (itresult[1] != ISC_R_NOMORE)
1906                 FAIL(itresult[1]);
1907
1908         if (ISC_LIST_EMPTY(resultdiff.tuples)) {
1909                 isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
1910         } else {
1911                 CHECK(dns_journal_write_transaction(journal, &resultdiff));
1912         }
1913         INSIST(ISC_LIST_EMPTY(diff[0].tuples));
1914         INSIST(ISC_LIST_EMPTY(diff[1].tuples));
1915
1916  failure:
1917         dns_diff_clear(&resultdiff);
1918         dns_dbiterator_destroy(&dbit[1]);
1919  cleanup_interator0:
1920         dns_dbiterator_destroy(&dbit[0]);
1921  cleanup_journal:
1922         dns_journal_destroy(&journal);
1923         return (result);
1924 }
1925
1926 isc_result_t
1927 dns_journal_compact(isc_mem_t *mctx, char *filename, isc_uint32_t serial,
1928                     isc_uint32_t target_size)
1929 {
1930         unsigned int i;
1931         journal_pos_t best_guess;
1932         journal_pos_t current_pos;
1933         dns_journal_t *j = NULL;
1934         journal_rawheader_t rawheader;
1935         unsigned int copy_length;
1936         unsigned int len;
1937         char *buf = NULL;
1938         unsigned int size = 0;
1939         isc_result_t result;
1940         unsigned int indexend;
1941
1942         CHECK(journal_open(mctx, filename, ISC_TRUE, ISC_FALSE, &j));
1943
1944         if (JOURNAL_EMPTY(&j->header)) {
1945                 dns_journal_destroy(&j);
1946                 return (ISC_R_SUCCESS);
1947         }
1948                 
1949         if (DNS_SERIAL_GT(j->header.begin.serial, serial) ||
1950             DNS_SERIAL_GT(serial, j->header.end.serial)) {
1951                 dns_journal_destroy(&j);
1952                 return (ISC_R_RANGE);
1953         }
1954
1955         /*
1956          * Cope with very small target sizes.
1957          */
1958         indexend = sizeof(journal_rawheader_t) +
1959                    j->header.index_size * sizeof(journal_rawpos_t);
1960         if (target_size < indexend * 2)
1961                 target_size = target_size/2 + indexend;
1962
1963         /*
1964          * See if there is any work to do.
1965          */
1966         if ((isc_uint32_t) j->header.end.offset < target_size) {
1967                 dns_journal_destroy(&j);
1968                 return (ISC_R_SUCCESS);
1969         }
1970         
1971         /*
1972          * Remove overhead so space test below can succeed.
1973          */
1974         if (target_size >= indexend)
1975                 target_size -= indexend;
1976
1977         /*
1978          * Find if we can create enough free space.
1979          */
1980         best_guess = j->header.begin;
1981         for (i = 0; i < j->header.index_size; i++) {
1982                 if (POS_VALID(j->index[i]) &&
1983                     DNS_SERIAL_GE(serial, j->index[i].serial) &&
1984                     ((isc_uint32_t)(j->header.end.offset - j->index[i].offset)
1985                      >= target_size / 2) &&
1986                     j->index[i].offset > best_guess.offset)
1987                         best_guess = j->index[i];
1988         }
1989
1990         current_pos = best_guess;
1991         while (current_pos.serial != serial) {
1992                 CHECK(journal_next(j, &current_pos));
1993                 if (current_pos.serial == j->header.end.serial)
1994                         break;
1995
1996                 if (DNS_SERIAL_GE(serial, current_pos.serial) &&
1997                    ((isc_uint32_t)(j->header.end.offset - current_pos.offset)
1998                      >= (target_size / 2)) &&
1999                     current_pos.offset > best_guess.offset)
2000                         best_guess = current_pos;
2001                 else
2002                         break;
2003         }
2004
2005         INSIST(best_guess.serial != j->header.end.serial);
2006         if (best_guess.serial != serial)
2007                 CHECK(journal_next(j, &best_guess));
2008
2009         /*
2010          * Enough space to proceed?
2011          */
2012         if ((isc_uint32_t) (j->header.end.offset - best_guess.offset) >
2013              (isc_uint32_t) (best_guess.offset - indexend)) {
2014                 dns_journal_destroy(&j);
2015                 return (ISC_R_NOSPACE);
2016         }
2017
2018         copy_length = j->header.end.offset - best_guess.offset;
2019
2020         /*
2021          * Invalidate entire index, will be rebuilt at end.
2022          */
2023         for (i = 0; i < j->header.index_size; i++) {
2024                 if (POS_VALID(j->index[i]))
2025                         POS_INVALIDATE(j->index[i]);
2026         }
2027
2028         /*
2029          * Convert the index into on-disk format and write
2030          * it to disk.
2031          */
2032         CHECK(index_to_disk(j));
2033         CHECK(journal_fsync(j));
2034
2035         /*
2036          * Update the journal header.
2037          */
2038         if (copy_length == 0) {
2039                 j->header.begin.serial = 0;
2040                 j->header.end.serial = 0;
2041                 j->header.begin.offset = 0;
2042                 j->header.end.offset = 0;
2043         } else {
2044                 j->header.begin = best_guess;
2045         }
2046         journal_header_encode(&j->header, &rawheader);
2047         CHECK(journal_seek(j, 0));
2048         CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2049         CHECK(journal_fsync(j));
2050
2051         if (copy_length != 0) {
2052                 /*
2053                  * Copy best_guess to end into space just freed.
2054                  */
2055                 size = 64*1024;
2056                 if (copy_length < size)
2057                         size = copy_length;
2058                 buf = isc_mem_get(mctx, size);
2059                 if (buf == NULL) {
2060                         result = ISC_R_NOMEMORY;
2061                         goto failure;
2062                 }
2063         
2064                 for (i = 0; i < copy_length; i += size) {
2065                         len = (copy_length - i) > size ? size :
2066                                                          (copy_length - i);
2067                         CHECK(journal_seek(j, best_guess.offset + i));
2068                         CHECK(journal_read(j, buf, len));
2069                         CHECK(journal_seek(j, indexend + i));
2070                         CHECK(journal_write(j, buf, len));
2071                 }
2072
2073                 CHECK(journal_fsync(j));
2074
2075                 /*
2076                  * Compute new header.
2077                  */
2078                 j->header.begin.offset = indexend;
2079                 j->header.end.offset = indexend + copy_length;
2080                 /*
2081                  * Update the journal header.
2082                  */
2083                 journal_header_encode(&j->header, &rawheader);
2084                 CHECK(journal_seek(j, 0));
2085                 CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
2086                 CHECK(journal_fsync(j));
2087
2088                 /*
2089                  * Build new index.
2090                  */
2091                 current_pos = j->header.begin;
2092                 while (current_pos.serial != j->header.end.serial) {
2093                         index_add(j, &current_pos);
2094                         CHECK(journal_next(j, &current_pos));
2095                 }
2096
2097                 /*
2098                  * Write index.
2099                  */
2100                 CHECK(index_to_disk(j));
2101                 CHECK(journal_fsync(j));
2102
2103                 indexend = j->header.end.offset;
2104         }
2105         dns_journal_destroy(&j);
2106         (void)isc_file_truncate(filename, (isc_offset_t)indexend);
2107         result = ISC_R_SUCCESS;
2108
2109  failure:
2110         if (buf != NULL)
2111                 isc_mem_put(mctx, buf, size);
2112         if (j != NULL)
2113                 dns_journal_destroy(&j);
2114         return (result);
2115 }
2116
2117 static isc_result_t
2118 index_to_disk(dns_journal_t *j) {
2119         isc_result_t result = ISC_R_SUCCESS;
2120
2121         if (j->header.index_size != 0) {
2122                 unsigned int i;
2123                 unsigned char *p;
2124                 unsigned int rawbytes;
2125
2126                 rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
2127
2128                 p = j->rawindex;
2129                 for (i = 0; i < j->header.index_size; i++) {
2130                         encode_uint32(j->index[i].serial, p);
2131                         p += 4;
2132                         encode_uint32(j->index[i].offset, p);
2133                         p += 4;
2134                 }
2135                 INSIST(p == j->rawindex + rawbytes);
2136
2137                 CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
2138                 CHECK(journal_write(j, j->rawindex, rawbytes));
2139         }
2140 failure:
2141         return (result);
2142 }