BIND - Update BIND to 9.5.2
[dragonfly.git] / contrib / bind-9.5.2 / lib / dns / rbtdb.c
1 /*
2  * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: rbtdb.c,v 1.248.12.18 2009/05/06 23:34:47 jinmei Exp $ */
19
20 /*! \file */
21
22 /*
23  * Principal Author: Bob Halley
24  */
25
26 #include <config.h>
27
28 #include <isc/heap.h>
29 #include <isc/event.h>
30 #include <isc/mem.h>
31 #include <isc/platform.h>
32 #include <isc/print.h>
33 #include <isc/mutex.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/rwlock.h>
37 #include <isc/string.h>
38 #include <isc/task.h>
39 #include <isc/time.h>
40 #include <isc/util.h>
41
42 #include <dns/acache.h>
43 #include <dns/db.h>
44 #include <dns/dbiterator.h>
45 #include <dns/events.h>
46 #include <dns/fixedname.h>
47 #include <dns/lib.h>
48 #include <dns/log.h>
49 #include <dns/masterdump.h>
50 #include <dns/rbt.h>
51 #include <dns/rdata.h>
52 #include <dns/rdataset.h>
53 #include <dns/rdatasetiter.h>
54 #include <dns/rdataslab.h>
55 #include <dns/result.h>
56 #include <dns/stats.h>
57 #include <dns/view.h>
58 #include <dns/zone.h>
59 #include <dns/zonekey.h>
60
61 #ifdef DNS_RBTDB_VERSION64
62 #include "rbtdb64.h"
63 #else
64 #include "rbtdb.h"
65 #endif
66
67 #ifdef DNS_RBTDB_VERSION64
68 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
69 #else
70 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
71 #endif
72
73 /*%
74  * Note that "impmagic" is not the first four bytes of the struct, so
75  * ISC_MAGIC_VALID cannot be used.
76  */
77 #define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
78                                  (rbtdb)->common.impmagic == RBTDB_MAGIC)
79
80 #ifdef DNS_RBTDB_VERSION64
81 typedef isc_uint64_t                    rbtdb_serial_t;
82 /*%
83  * Make casting easier in symbolic debuggers by using different names
84  * for the 64 bit version.
85  */
86 #define dns_rbtdb_t dns_rbtdb64_t
87 #define rdatasetheader_t rdatasetheader64_t
88 #define rbtdb_version_t rbtdb_version64_t
89 #else
90 typedef isc_uint32_t                    rbtdb_serial_t;
91 #endif
92
93 typedef isc_uint32_t                    rbtdb_rdatatype_t;
94
/*
 * An rbtdb_rdatatype_t packs two 16-bit quantities into one 32-bit value:
 * the "base" type lives in the low 16 bits and the "ext" type in the high
 * 16 bits.  RBTDB_RDATATYPE_BASE() and RBTDB_RDATATYPE_EXT() extract the two
 * halves; RBTDB_RDATATYPE_VALUE(b, e) builds the packed value.
 *
 * 'e' is widened to rbtdb_rdatatype_t before it is shifted.  Without that,
 * a 16-bit operand is promoted to (signed) int, and shifting a value
 * >= 0x8000 left by 16 moves a bit into the sign position, which is
 * undefined behavior.
 */
#define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
#define RBTDB_RDATATYPE_VALUE(b, e) \
        ((rbtdb_rdatatype_t)(((rbtdb_rdatatype_t)(e) << 16) | (b)))
98
99 #define RBTDB_RDATATYPE_SIGNSEC \
100                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
101 #define RBTDB_RDATATYPE_SIGNS \
102                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
103 #define RBTDB_RDATATYPE_SIGCNAME \
104                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
105 #define RBTDB_RDATATYPE_SIGDNAME \
106                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
107 #define RBTDB_RDATATYPE_NCACHEANY \
108                 RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
109
110 /*
111  * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
112  * Using rwlock is effective with regard to lookup performance only when
113  * it is implemented in an efficient way.
114  * Otherwise, it is generally wise to stick to the simple locking since rwlock
115  * would require more memory or can even make lookups slower due to its own
116  * overhead (when it internally calls mutex locks).
117  */
118 #ifdef ISC_RWLOCK_USEATOMIC
119 #define DNS_RBTDB_USERWLOCK 1
120 #else
121 #define DNS_RBTDB_USERWLOCK 0
122 #endif
123
124 #if DNS_RBTDB_USERWLOCK
125 #define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
126 #define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
127 #define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
128 #define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
129 #else
130 #define RBTDB_INITLOCK(l)       isc_mutex_init(l)
131 #define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
132 #define RBTDB_LOCK(l, t)        LOCK(l)
133 #define RBTDB_UNLOCK(l, t)      UNLOCK(l)
134 #endif
135
136 /*
137  * Since node locking is sensitive to both performance and memory footprint,
138  * we need some trick here.  If we have both high-performance rwlock and
139  * high performance and small-memory reference counters, we use rwlock for
140  * node lock and isc_refcount for node references.  In this case, we don't have
141  * to protect the access to the counters by locks.
142  * Otherwise, we simply use ordinary mutex lock for node locking, and use
143  * simple integers as reference counters which is protected by the lock.
144  * In most cases, we can simply use wrapper macros such as NODE_LOCK and
145  * NODE_UNLOCK.  In some other cases, however, we need to protect reference
146  * counters first and then protect other parts of a node as read-only data.
147  * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
148  * provided for these special cases.  When we can use the efficient backend
149  * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
150  * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
151  * section including the access to the reference counter.
152  * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
153  * section is also protected by NODE_STRONGLOCK().
154  */
155 #if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
156 typedef isc_rwlock_t nodelock_t;
157
158 #define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
159 #define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
160 #define NODE_LOCK(l, t)         RWLOCK((l), (t))
161 #define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
162 #define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)
163
164 #define NODE_STRONGLOCK(l)      ((void)0)
165 #define NODE_STRONGUNLOCK(l)    ((void)0)
166 #define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
167 #define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
168 #define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
169 #else
170 typedef isc_mutex_t nodelock_t;
171
172 #define NODE_INITLOCK(l)        isc_mutex_init(l)
173 #define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
174 #define NODE_LOCK(l, t)         LOCK(l)
175 #define NODE_UNLOCK(l, t)       UNLOCK(l)
176 #define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS
177
178 #define NODE_STRONGLOCK(l)      LOCK(l)
179 #define NODE_STRONGUNLOCK(l)    UNLOCK(l)
180 #define NODE_WEAKLOCK(l, t)     ((void)0)
181 #define NODE_WEAKUNLOCK(l, t)   ((void)0)
182 #define NODE_WEAKDOWNGRADE(l)   ((void)0)
183 #endif
184
185 /*%
186  * Whether to rate-limit updating the LRU to avoid possible thread contention.
187  * Our performance measurement has shown the cost is marginal, so it's defined
188  * to be 0 by default either with or without threads.
189  */
190 #ifndef DNS_RBTDB_LIMITLRUUPDATE
191 #define DNS_RBTDB_LIMITLRUUPDATE 0
192 #endif
193
194 /*
195  * Allow clients with a virtual time of up to 5 minutes in the past to see
196  * records that would otherwise have expired.
197  */
198 #define RBTDB_VIRTUAL 300
199
200 struct noqname {
201         dns_name_t name;
202         void *     nsec;
203         void *     nsecsig;
204 };
205
206 typedef struct acachectl acachectl_t;
207
208 typedef struct rdatasetheader {
209         /*%
210          * Locked by the owning node's lock.
211          */
212         rbtdb_serial_t                  serial;
213         dns_ttl_t                       rdh_ttl;
214         rbtdb_rdatatype_t               type;
215         isc_uint16_t                    attributes;
216         dns_trust_t                     trust;
217         struct noqname                  *noqname;
218         /*%<
219          * We don't use the LIST macros, because the LIST structure has
220          * both head and tail pointers, and is doubly linked.
221          */
222
223         struct rdatasetheader           *next;
224         /*%<
225          * If this is the top header for an rdataset, 'next' points
226          * to the top header for the next rdataset (i.e., the next type).
227          * Otherwise, it points up to the header whose down pointer points
228          * at this header.
229          */
230
231         struct rdatasetheader           *down;
232         /*%<
233          * Points to the header for the next older version of
234          * this rdataset.
235          */
236
237         isc_uint32_t                    count;
238         /*%<
239          * Monotonically increased every time this rdataset is bound so that
240          * it is used as the base of the starting point in DNS responses
241          * when the "cyclic" rrset-order is required.  Since the ordering
242          * should not be so crucial, no lock is set for the counter for
243          * performance reasons.
244          */
245
246         acachectl_t                     *additional_auth;
247         acachectl_t                     *additional_glue;
248
249         dns_rbtnode_t                   *node;
250         isc_stdtime_t                   last_used;
251         ISC_LINK(struct rdatasetheader) lru_link;
252         /*%<
253          * Used for LRU-based cache management.  We should probably make
254          * these cache-DB specific.  We might also make it a pointer and
255          * ensure only the top header has a valid link to save memory.
256          * The linked-list is locked by the rbtdb->lrulock.
257          */
258
259         /*
260          * It's possible this should not be here anymore, but instead
261          * referenced from the bucket's heap directly.
262          */
263 #if 0
264         isc_heap_t                      *heap;
265 #endif
266         unsigned int                    heap_index;
267         /*%<
268          * Used for TTL-based cache cleaning.
269          */
270 } rdatasetheader_t;
271
272 typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
273 typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;
274
275 #define RDATASET_ATTR_NONEXISTENT       0x0001
276 #define RDATASET_ATTR_STALE             0x0002
277 #define RDATASET_ATTR_IGNORE            0x0004
278 #define RDATASET_ATTR_RETAIN            0x0008
279 #define RDATASET_ATTR_NXDOMAIN          0x0010
280 #define RDATASET_ATTR_RESIGN            0x0020
281 #define RDATASET_ATTR_STATCOUNT         0x0040
282
283 typedef struct acache_cbarg {
284         dns_rdatasetadditional_t        type;
285         unsigned int                    count;
286         dns_db_t                        *db;
287         dns_dbnode_t                    *node;
288         rdatasetheader_t                *header;
289 } acache_cbarg_t;
290
291 struct acachectl {
292         dns_acacheentry_t               *entry;
293         acache_cbarg_t                  *cbarg;
294 };
295
296 /*
297  * XXX
298  * When the cache will pre-expire data (due to memory low or other
299  * situations) before the rdataset's TTL has expired, it MUST
300  * respect the RETAIN bit and not expire the data until its TTL is
301  * expired.
302  */
303
#undef IGNORE                   /* WIN32 winbase.h defines this. */

/*
 * Predicates on the RDATASET_ATTR_* bits stored in (header)->attributes.
 * Each one evaluates to 1 or 0.  EXISTS() is the negation of NONEXISTENT();
 * the others report whether the like-named attribute bit is set.
 */
#define RDATASET_ATTR_ISSET(header, bit) \
        (((header)->attributes & (bit)) != 0)

#define EXISTS(header) \
        (!RDATASET_ATTR_ISSET(header, RDATASET_ATTR_NONEXISTENT))
#define NONEXISTENT(header) \
        RDATASET_ATTR_ISSET(header, RDATASET_ATTR_NONEXISTENT)
#define IGNORE(header) \
        RDATASET_ATTR_ISSET(header, RDATASET_ATTR_IGNORE)
#define RETAIN(header) \
        RDATASET_ATTR_ISSET(header, RDATASET_ATTR_RETAIN)
#define NXDOMAIN(header) \
        RDATASET_ATTR_ISSET(header, RDATASET_ATTR_NXDOMAIN)
316
317 #define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
318
319 /*%
320  * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
321  * There is a tradeoff issue about configuring this value: if this is too
322  * small, it may cause heavier contention between threads; if this is too large,
323  * LRU purge algorithm won't work well (entries tend to be purged prematurely).
324  * The default value should work well for most environments, but this can
325  * also be configurable at compilation time via the
326  * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
327  * 1 due to the assumption of overmem_purge().
328  */
329 #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
330 #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
331 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
332 #else
333 #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
334 #endif
335 #else
336 #define DEFAULT_CACHE_NODE_LOCK_COUNT   16
337 #endif  /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
338
339 typedef struct {
340         nodelock_t                      lock;
341         /* Protected in the refcount routines. */
342         isc_refcount_t                  references;
343         /* Locked by lock. */
344         isc_boolean_t                   exiting;
345 } rbtdb_nodelock_t;
346
347 typedef struct rbtdb_changed {
348         dns_rbtnode_t *                 node;
349         isc_boolean_t                   dirty;
350         ISC_LINK(struct rbtdb_changed)  link;
351 } rbtdb_changed_t;
352
353 typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
354
355 typedef struct rbtdb_version {
356         /* Not locked */
357         rbtdb_serial_t                  serial;
358         /*
359          * Protected in the refcount routines.
360          * XXXJT: should we change the lock policy based on the refcount
361          * performance?
362          */
363         isc_refcount_t                  references;
364         /* Locked by database lock. */
365         isc_boolean_t                   writer;
366         isc_boolean_t                   commit_ok;
367         rbtdb_changedlist_t             changed_list;
368         ISC_LINK(struct rbtdb_version)  link;
369 } rbtdb_version_t;
370
371 typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;
372
373 typedef struct {
374         /* Unlocked. */
375         dns_db_t                        common;
376 #if DNS_RBTDB_USERWLOCK
377         isc_rwlock_t                    lock;
378 #else
379         isc_mutex_t                     lock;
380 #endif
381         isc_rwlock_t                    tree_lock;
382         unsigned int                    node_lock_count;
383         rbtdb_nodelock_t *              node_locks;
384         dns_rbtnode_t *                 origin_node;
385         dns_stats_t *                   rrsetstats; /* cache DB only */
386         /* Locked by lock. */
387         unsigned int                    active;
388         isc_refcount_t                  references;
389         unsigned int                    attributes;
390         rbtdb_serial_t                  current_serial;
391         rbtdb_serial_t                  least_serial;
392         rbtdb_serial_t                  next_serial;
393         rbtdb_version_t *               current_version;
394         rbtdb_version_t *               future_version;
395         rbtdb_versionlist_t             open_versions;
396         isc_boolean_t                   overmem;
397         isc_task_t *                    task;
398         dns_dbnode_t                    *soanode;
399         dns_dbnode_t                    *nsnode;
400
401         /*
402          * This is a linked list used to implement the LRU cache.  There will
403          * be node_lock_count linked lists here.  Nodes in bucket 1 will be
404          * placed on the linked list rdatasets[1].
405          */
406         rdatasetheaderlist_t            *rdatasets;
407
408         /*%
409          * Temporary storage for stale cache nodes and dynamically deleted
410          * nodes that await being cleaned up.
411          */
412         rbtnodelist_t                   *deadnodes;
413
414         /*
415          * Heaps.  Each of these is used for TTL based expiry.
416          */
417         isc_heap_t                      **heaps;
418
419         /* Locked by tree_lock. */
420         dns_rbt_t *                     tree;
421         isc_boolean_t                   secure;
422
423         /* Unlocked */
424         unsigned int                    quantum;
425 } dns_rbtdb_t;
426
427 #define RBTDB_ATTR_LOADED               0x01
428 #define RBTDB_ATTR_LOADING              0x02
429
430 /*%
431  * Search Context
432  */
433 typedef struct {
434         dns_rbtdb_t *           rbtdb;
435         rbtdb_version_t *       rbtversion;
436         rbtdb_serial_t          serial;
437         unsigned int            options;
438         dns_rbtnodechain_t      chain;
439         isc_boolean_t           copy_name;
440         isc_boolean_t           need_cleanup;
441         isc_boolean_t           wild;
442         dns_rbtnode_t *         zonecut;
443         rdatasetheader_t *      zonecut_rdataset;
444         rdatasetheader_t *      zonecut_sigrdataset;
445         dns_fixedname_t         zonecut_name;
446         isc_stdtime_t           now;
447 } rbtdb_search_t;
448
449 /*%
450  * Load Context
451  */
452 typedef struct {
453         dns_rbtdb_t *           rbtdb;
454         isc_stdtime_t           now;
455 } rbtdb_load_t;
456
457 static void rdataset_disassociate(dns_rdataset_t *rdataset);
458 static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
459 static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
460 static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
461 static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
462 static unsigned int rdataset_count(dns_rdataset_t *rdataset);
463 static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
464                                         dns_name_t *name,
465                                         dns_rdataset_t *nsec,
466                                         dns_rdataset_t *nsecsig);
467 static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
468                                            dns_rdatasetadditional_t type,
469                                            dns_rdatatype_t qtype,
470                                            dns_acache_t *acache,
471                                            dns_zone_t **zonep,
472                                            dns_db_t **dbp,
473                                            dns_dbversion_t **versionp,
474                                            dns_dbnode_t **nodep,
475                                            dns_name_t *fname,
476                                            dns_message_t *msg,
477                                            isc_stdtime_t now);
478 static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
479                                            dns_rdatasetadditional_t type,
480                                            dns_rdatatype_t qtype,
481                                            dns_acache_t *acache,
482                                            dns_zone_t *zone,
483                                            dns_db_t *db,
484                                            dns_dbversion_t *version,
485                                            dns_dbnode_t *node,
486                                            dns_name_t *fname);
487 static isc_result_t rdataset_putadditional(dns_acache_t *acache,
488                                            dns_rdataset_t *rdataset,
489                                            dns_rdatasetadditional_t type,
490                                            dns_rdatatype_t qtype);
491 static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
492                                               isc_stdtime_t now);
493 static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
494                           isc_stdtime_t now);
495 static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
496                           isc_boolean_t tree_locked);
497 static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
498                           isc_stdtime_t now, isc_boolean_t tree_locked);
499 static void prune_tree(isc_task_t *task, isc_event_t *event);
500
501 static dns_rdatasetmethods_t rdataset_methods = {
502         rdataset_disassociate,
503         rdataset_first,
504         rdataset_next,
505         rdataset_current,
506         rdataset_clone,
507         rdataset_count,
508         NULL,
509         rdataset_getnoqname,
510         rdataset_getadditional,
511         rdataset_setadditional,
512         rdataset_putadditional
513 };
514
515 static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
516 static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
517 static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
518 static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
519                                  dns_rdataset_t *rdataset);
520
521 static dns_rdatasetitermethods_t rdatasetiter_methods = {
522         rdatasetiter_destroy,
523         rdatasetiter_first,
524         rdatasetiter_next,
525         rdatasetiter_current
526 };
527
528 typedef struct rbtdb_rdatasetiter {
529         dns_rdatasetiter_t              common;
530         rdatasetheader_t *              current;
531 } rbtdb_rdatasetiter_t;
532
533 static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
534 static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
535 static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
536 static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
537                                         dns_name_t *name);
538 static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
539 static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
540 static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
541                                            dns_dbnode_t **nodep,
542                                            dns_name_t *name);
543 static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
544 static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
545                                           dns_name_t *name);
546
547 static dns_dbiteratormethods_t dbiterator_methods = {
548         dbiterator_destroy,
549         dbiterator_first,
550         dbiterator_last,
551         dbiterator_seek,
552         dbiterator_prev,
553         dbiterator_next,
554         dbiterator_current,
555         dbiterator_pause,
556         dbiterator_origin
557 };
558
559 #define DELETION_BATCH_MAX 64
560
561 /*
562  * If 'paused' is ISC_TRUE, then the tree lock is not being held.
563  */
564 typedef struct rbtdb_dbiterator {
565         dns_dbiterator_t                common;
566         isc_boolean_t                   paused;
567         isc_boolean_t                   new_origin;
568         isc_rwlocktype_t                tree_locked;
569         isc_result_t                    result;
570         dns_fixedname_t                 name;
571         dns_fixedname_t                 origin;
572         dns_rbtnodechain_t              chain;
573         dns_rbtnode_t                   *node;
574         dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
575         int                             delete;
576 } rbtdb_dbiterator_t;
577
578
579 #define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
580 #define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
581
582 static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
583                        isc_event_t *event);
584 static void overmem(dns_db_t *db, isc_boolean_t overmem);
585
586 /*%
587  * 'init_count' is used to initialize 'newheader->count' which in turn
588  * is used to determine where in the cycle rrset-order cyclic starts.
589  * We don't lock this as we don't care about simultaneous updates.
590  *
591  * Note:
592  *      Both init_count and header->count can be ISC_UINT32_MAX.
593  *      The count on the returned rdataset however can't be as
594  *      that indicates that the database does not implement cyclic
595  *      processing.
596  */
597 static unsigned int init_count;
598
599 /*
600  * Locking
601  *
602  * If a routine is going to lock more than one lock in this module, then
603  * the locking must be done in the following order:
604  *
605  *      Tree Lock
606  *
607  *      Node Lock       (Only one from the set may be locked at one time by
608  *                       any caller)
609  *
610  *      Database Lock
611  *
612  * Failure to follow this hierarchy can result in deadlock.
613  */
614
615 /*
616  * Deleting Nodes
617  *
618  * For zone databases the node for the origin of the zone MUST NOT be deleted.
619  */
620
621
622 /*
623  * DB Routines
624  */
625
626 static void
627 attach(dns_db_t *source, dns_db_t **targetp) {
628         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
629
630         REQUIRE(VALID_RBTDB(rbtdb));
631
632         isc_refcount_increment(&rbtdb->references, NULL);
633
634         *targetp = source;
635 }
636
637 static void
638 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
639         dns_rbtdb_t *rbtdb = event->ev_arg;
640
641         UNUSED(task);
642
643         free_rbtdb(rbtdb, ISC_TRUE, event);
644 }
645
646 static void
647 update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
648                   isc_boolean_t increment)
649 {
650         dns_rdatastatstype_t statattributes = 0;
651         dns_rdatastatstype_t base = 0;
652         dns_rdatastatstype_t type;
653
654         /* At the moment we count statistics only for cache DB */
655         INSIST(IS_CACHE(rbtdb));
656
657         if (NXDOMAIN(header))
658                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
659         else if (RBTDB_RDATATYPE_BASE(header->type) == 0) {
660                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
661                 base = RBTDB_RDATATYPE_EXT(header->type);
662         } else
663                 base = RBTDB_RDATATYPE_BASE(header->type);
664
665         type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
666         if (increment)
667                 dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
668         else
669                 dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
670 }
671
672 static void
673 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
674         int idx;
675         isc_heap_t *heap;
676         dns_ttl_t oldttl;
677
678         oldttl = header->rdh_ttl;
679         header->rdh_ttl = newttl;
680
681         /*
682          * It's possible the rbtdb is not a cache.  If this is the case,
683          * we will not have a heap, and we move on.  If we do, though,
684          * we might need to adjust things.
685          */
686         if (header->heap_index == 0 || newttl == oldttl)
687                 return;
688         idx = header->node->locknum;
689         if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
690             return;
691         heap = rbtdb->heaps[idx];
692
693         if (newttl < oldttl)
694                 isc_heap_increased(heap, header->heap_index);
695         else
696                 isc_heap_decreased(heap, header->heap_index);
697 }
698
699 /*%
700  * This function allows the heap code to rank the priority of each
701  * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
702  */
703 static isc_boolean_t
704 ttl_sooner(void *v1, void *v2) {
705         rdatasetheader_t *h1 = v1;
706         rdatasetheader_t *h2 = v2;
707
708         if (h1->rdh_ttl < h2->rdh_ttl)
709                 return (ISC_TRUE);
710         return (ISC_FALSE);
711 }
712
713 /*%
714  * This function sets the heap index into the header.
715  */
716 static void
717 ttl_set_index(void *what, unsigned int index) {
718         rdatasetheader_t *h = what;
719
720         h->heap_index = index;
721 }
722
723 /*%
724  * Work out how many nodes can be deleted in the time between two
725  * requests to the nameserver.  Smooth the resulting number and use it
726  * as a estimate for the number of nodes to be deleted in the next
727  * iteration.
728  */
729 static unsigned int
730 adjust_quantum(unsigned int old, isc_time_t *start) {
731         unsigned int pps = dns_pps;     /* packets per second */
732         unsigned int interval;
733         isc_uint64_t usecs;
734         isc_time_t end;
735         unsigned int new;
736
737         if (pps < 100)
738                 pps = 100;
739         isc_time_now(&end);
740
741         interval = 1000000 / pps;       /* interval in usec */
742         if (interval == 0)
743                 interval = 1;
744         usecs = isc_time_microdiff(&end, start);
745         if (usecs == 0) {
746                 /*
747                  * We were unable to measure the amount of time taken.
748                  * Double the nodes deleted next time.
749                  */
750                 old *= 2;
751                 if (old > 1000)
752                         old = 1000;
753                 return (old);
754         }
755         new = old * interval;
756         new /= (unsigned int)usecs;
757         if (new == 0)
758                 new = 1;
759         else if (new > 1000)
760                 new = 1000;
761
762         /* Smooth */
763         new = (new + old * 3) / 4;
764
765         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
766                       ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
767
768         return (new);
769 }
770
771 static void
772 free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
773         unsigned int i;
774         isc_ondestroy_t ondest;
775         isc_result_t result;
776         char buf[DNS_NAME_FORMATSIZE];
777         isc_time_t start;
778
779         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
780                 overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);
781
782         REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
783         REQUIRE(rbtdb->future_version == NULL);
784
785         if (rbtdb->current_version != NULL) {
786                 unsigned int refs;
787
788                 isc_refcount_decrement(&rbtdb->current_version->references,
789                                        &refs);
790                 INSIST(refs == 0);
791                 UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
792                 isc_refcount_destroy(&rbtdb->current_version->references);
793                 isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
794                             sizeof(rbtdb_version_t));
795         }
796
797         /*
798          * We assume the number of remaining dead nodes is reasonably small;
799          * the overhead of unlinking all nodes here should be negligible.
800          */
801         for (i = 0; i < rbtdb->node_lock_count; i++) {
802                 dns_rbtnode_t *node;
803
804                 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
805                 while (node != NULL) {
806                         ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
807                         node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
808                 }
809         }
810
811         if (event == NULL)
812                 rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
813  again:
814         if (rbtdb->tree != NULL) {
815                 isc_time_now(&start);
816                 result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
817                 if (result == ISC_R_QUOTA) {
818                         INSIST(rbtdb->task != NULL);
819                         if (rbtdb->quantum != 0)
820                                 rbtdb->quantum = adjust_quantum(rbtdb->quantum,
821                                                                 &start);
822                         if (event == NULL)
823                                 event = isc_event_allocate(rbtdb->common.mctx,
824                                                            NULL,
825                                                          DNS_EVENT_FREESTORAGE,
826                                                            free_rbtdb_callback,
827                                                            rbtdb,
828                                                            sizeof(isc_event_t));
829                         if (event == NULL)
830                                 goto again;
831                         isc_task_send(rbtdb->task, &event);
832                         return;
833                 }
834                 INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
835         }
836         if (event != NULL)
837                 isc_event_free(&event);
838         if (log) {
839                 if (dns_name_dynamic(&rbtdb->common.origin))
840                         dns_name_format(&rbtdb->common.origin, buf,
841                                         sizeof(buf));
842                 else
843                         strcpy(buf, "<UNKNOWN>");
844                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
845                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
846                               "done free_rbtdb(%s)", buf);
847         }
848         if (dns_name_dynamic(&rbtdb->common.origin))
849                 dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
850         for (i = 0; i < rbtdb->node_lock_count; i++) {
851                 isc_refcount_destroy(&rbtdb->node_locks[i].references);
852                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
853         }
854
855         /*
856          * Clean up LRU cache objects.
857          */
858         if (rbtdb->rdatasets != NULL) {
859                 for (i = 0; i < rbtdb->node_lock_count; i++)
860                         INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
861                 isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
862                             rbtdb->node_lock_count *
863                             sizeof(rdatasetheaderlist_t));
864         }
865         /*
866          * Clean up dead node buckets.
867          */
868         if (rbtdb->deadnodes != NULL) {
869                 for (i = 0; i < rbtdb->node_lock_count; i++)
870                         INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
871                 isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
872                     rbtdb->node_lock_count * sizeof(rbtnodelist_t));
873         }
874         /*
875          * Clean up TTL heap cache objects.
876          */
877         if (rbtdb->heaps != NULL) {
878                 for (i = 0; i < rbtdb->node_lock_count; i++)
879                         isc_heap_destroy(&rbtdb->heaps[i]);
880                 isc_mem_put(rbtdb->common.mctx, rbtdb->heaps,
881                             rbtdb->node_lock_count *
882                             sizeof(isc_heap_t *));
883         }
884
885         if (rbtdb->rrsetstats != NULL)
886                 dns_stats_detach(&rbtdb->rrsetstats);
887
888         isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
889                     rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
890         isc_rwlock_destroy(&rbtdb->tree_lock);
891         isc_refcount_destroy(&rbtdb->references);
892         if (rbtdb->task != NULL)
893                 isc_task_detach(&rbtdb->task);
894
895         RBTDB_DESTROYLOCK(&rbtdb->lock);
896         rbtdb->common.magic = 0;
897         rbtdb->common.impmagic = 0;
898         ondest = rbtdb->common.ondest;
899         isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
900         isc_ondestroy_notify(&ondest, rbtdb);
901 }
902
903 static inline void
904 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
905         isc_boolean_t want_free = ISC_FALSE;
906         unsigned int i;
907         unsigned int inactive = 0;
908
909         /* XXX check for open versions here */
910
911         if (rbtdb->soanode != NULL)
912                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
913         if (rbtdb->nsnode != NULL)
914                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
915
916         /*
917          * Even though there are no external direct references, there still
918          * may be nodes in use.
919          */
920         for (i = 0; i < rbtdb->node_lock_count; i++) {
921                 NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
922                 rbtdb->node_locks[i].exiting = ISC_TRUE;
923                 NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
924                 if (isc_refcount_current(&rbtdb->node_locks[i].references)
925                     == 0) {
926                         inactive++;
927                 }
928         }
929
930         if (inactive != 0) {
931                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
932                 rbtdb->active -= inactive;
933                 if (rbtdb->active == 0)
934                         want_free = ISC_TRUE;
935                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
936                 if (want_free) {
937                         char buf[DNS_NAME_FORMATSIZE];
938                         if (dns_name_dynamic(&rbtdb->common.origin))
939                                 dns_name_format(&rbtdb->common.origin, buf,
940                                                 sizeof(buf));
941                         else
942                                 strcpy(buf, "<UNKNOWN>");
943                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
944                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
945                                       "calling free_rbtdb(%s)", buf);
946                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
947                 }
948         }
949 }
950
951 static void
952 detach(dns_db_t **dbp) {
953         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
954         unsigned int refs;
955
956         REQUIRE(VALID_RBTDB(rbtdb));
957
958         isc_refcount_decrement(&rbtdb->references, &refs);
959
960         if (refs == 0)
961                 maybe_free_rbtdb(rbtdb);
962
963         *dbp = NULL;
964 }
965
966 static void
967 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
968         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
969         rbtdb_version_t *version;
970         unsigned int refs;
971
972         REQUIRE(VALID_RBTDB(rbtdb));
973
974         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
975         version = rbtdb->current_version;
976         isc_refcount_increment(&version->references, &refs);
977         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
978
979         *versionp = (dns_dbversion_t *)version;
980 }
981
982 static inline rbtdb_version_t *
983 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
984                  unsigned int references, isc_boolean_t writer)
985 {
986         isc_result_t result;
987         rbtdb_version_t *version;
988
989         version = isc_mem_get(mctx, sizeof(*version));
990         if (version == NULL)
991                 return (NULL);
992         version->serial = serial;
993         result = isc_refcount_init(&version->references, references);
994         if (result != ISC_R_SUCCESS) {
995                 isc_mem_put(mctx, version, sizeof(*version));
996                 return (NULL);
997         }
998         version->writer = writer;
999         version->commit_ok = ISC_FALSE;
1000         ISC_LIST_INIT(version->changed_list);
1001         ISC_LINK_INIT(version, link);
1002
1003         return (version);
1004 }
1005
1006 static isc_result_t
1007 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1008         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1009         rbtdb_version_t *version;
1010
1011         REQUIRE(VALID_RBTDB(rbtdb));
1012         REQUIRE(versionp != NULL && *versionp == NULL);
1013         REQUIRE(rbtdb->future_version == NULL);
1014
1015         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1016         RUNTIME_CHECK(rbtdb->next_serial != 0);         /* XXX Error? */
1017         version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1018                                    ISC_TRUE);
1019         if (version != NULL) {
1020                 version->commit_ok = ISC_TRUE;
1021                 rbtdb->next_serial++;
1022                 rbtdb->future_version = version;
1023         }
1024         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1025
1026         if (version == NULL)
1027                 return (ISC_R_NOMEMORY);
1028
1029         *versionp = version;
1030
1031         return (ISC_R_SUCCESS);
1032 }
1033
1034 static void
1035 attachversion(dns_db_t *db, dns_dbversion_t *source,
1036               dns_dbversion_t **targetp)
1037 {
1038         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1039         rbtdb_version_t *rbtversion = source;
1040         unsigned int refs;
1041
1042         REQUIRE(VALID_RBTDB(rbtdb));
1043
1044         isc_refcount_increment(&rbtversion->references, &refs);
1045         INSIST(refs > 1);
1046
1047         *targetp = rbtversion;
1048 }
1049
1050 static rbtdb_changed_t *
1051 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1052             dns_rbtnode_t *node)
1053 {
1054         rbtdb_changed_t *changed;
1055         unsigned int refs;
1056
1057         /*
1058          * Caller must be holding the node lock if its reference must be
1059          * protected by the lock.
1060          */
1061
1062         changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1063
1064         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1065
1066         REQUIRE(version->writer);
1067
1068         if (changed != NULL) {
1069                 dns_rbtnode_refincrement(node, &refs);
1070                 INSIST(refs != 0);
1071                 changed->node = node;
1072                 changed->dirty = ISC_FALSE;
1073                 ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1074         } else
1075                 version->commit_ok = ISC_FALSE;
1076
1077         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1078
1079         return (changed);
1080 }
1081
1082 static void
1083 free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
1084                  acachectl_t *array)
1085 {
1086         unsigned int count;
1087         unsigned int i;
1088         unsigned char *raw;     /* RDATASLAB */
1089
1090         /*
1091          * The caller must be holding the corresponding node lock.
1092          */
1093
1094         if (array == NULL)
1095                 return;
1096
1097         raw = (unsigned char *)header + sizeof(*header);
1098         count = raw[0] * 256 + raw[1];
1099
1100         /*
1101          * Sanity check: since an additional cache entry has a reference to
1102          * the original DB node (in the callback arg), there should be no
1103          * acache entries when the node can be freed.
1104          */
1105         for (i = 0; i < count; i++)
1106                 INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1107
1108         isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1109 }
1110
1111 static inline void
1112 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1113
1114         if (dns_name_dynamic(&(*noqname)->name))
1115                 dns_name_free(&(*noqname)->name, mctx);
1116         if ((*noqname)->nsec != NULL)
1117                 isc_mem_put(mctx, (*noqname)->nsec,
1118                             dns_rdataslab_size((*noqname)->nsec, 0));
1119         if ((*noqname)->nsecsig != NULL)
1120                 isc_mem_put(mctx, (*noqname)->nsecsig,
1121                             dns_rdataslab_size((*noqname)->nsecsig, 0));
1122         isc_mem_put(mctx, *noqname, sizeof(**noqname));
1123         *noqname = NULL;
1124 }
1125
1126 static inline void
1127 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
1128 {
1129         ISC_LINK_INIT(h, lru_link);
1130         h->heap_index = 0;
1131
1132 #if TRACE_HEADER
1133         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1134                 fprintf(stderr, "initialized header: %p\n", h);
1135 #else
1136         UNUSED(rbtdb);
1137 #endif
1138 }
1139
1140 static inline rdatasetheader_t *
1141 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
1142 {
1143         rdatasetheader_t *h;
1144
1145         h = isc_mem_get(mctx, sizeof(*h));
1146         if (h == NULL)
1147                 return (NULL);
1148
1149 #if TRACE_HEADER
1150         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1151                 fprintf(stderr, "allocated header: %p\n", h);
1152 #endif
1153         init_rdataset(rbtdb, h);
1154         return (h);
1155 }
1156
1157 static inline void
1158 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
1159 {
1160         unsigned int size;
1161
1162         if (EXISTS(rdataset) &&
1163             (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
1164                 update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
1165         }
1166
1167         if (IS_CACHE(rbtdb) && ISC_LINK_LINKED(rdataset, lru_link)) {
1168                 int idx = rdataset->node->locknum;
1169                 ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link);
1170                 if (rdataset->heap_index != 0) {
1171                         isc_heap_delete(rbtdb->heaps[idx],
1172                                         rdataset->heap_index);
1173                 }
1174                 rdataset->heap_index = 0;
1175         }
1176
1177         if (rdataset->noqname != NULL)
1178                 free_noqname(mctx, &rdataset->noqname);
1179
1180         free_acachearray(mctx, rdataset, rdataset->additional_auth);
1181         free_acachearray(mctx, rdataset, rdataset->additional_glue);
1182
1183         if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
1184                 size = sizeof(*rdataset);
1185         else
1186                 size = dns_rdataslab_size((unsigned char *)rdataset,
1187                                           sizeof(*rdataset));
1188         isc_mem_put(mctx, rdataset, size);
1189 }
1190
1191 static inline void
1192 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1193         rdatasetheader_t *header, *dcurrent;
1194         isc_boolean_t make_dirty = ISC_FALSE;
1195
1196         /*
1197          * Caller must hold the node lock.
1198          */
1199
1200         /*
1201          * We set the IGNORE attribute on rdatasets with serial number
1202          * 'serial'.  When the reference count goes to zero, these rdatasets
1203          * will be cleaned up; until that time, they will be ignored.
1204          */
1205         for (header = node->data; header != NULL; header = header->next) {
1206                 if (header->serial == serial) {
1207                         header->attributes |= RDATASET_ATTR_IGNORE;
1208                         make_dirty = ISC_TRUE;
1209                 }
1210                 for (dcurrent = header->down;
1211                      dcurrent != NULL;
1212                      dcurrent = dcurrent->down) {
1213                         if (dcurrent->serial == serial) {
1214                                 dcurrent->attributes |= RDATASET_ATTR_IGNORE;
1215                                 make_dirty = ISC_TRUE;
1216                         }
1217                 }
1218         }
1219         if (make_dirty)
1220                 node->dirty = 1;
1221 }
1222
1223 static inline void
1224 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
1225 {
1226         rdatasetheader_t *d, *down_next;
1227
1228         for (d = top->down; d != NULL; d = down_next) {
1229                 down_next = d->down;
1230                 free_rdataset(rbtdb, mctx, d);
1231         }
1232         top->down = NULL;
1233 }
1234
1235 static inline void
1236 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1237         rdatasetheader_t *current, *top_prev, *top_next;
1238         isc_mem_t *mctx = rbtdb->common.mctx;
1239
1240         /*
1241          * Caller must be holding the node lock.
1242          */
1243
1244         top_prev = NULL;
1245         for (current = node->data; current != NULL; current = top_next) {
1246                 top_next = current->next;
1247                 clean_stale_headers(rbtdb, mctx, current);
1248                 /*
1249                  * If current is nonexistent or stale, we can clean it up.
1250                  */
1251                 if ((current->attributes &
1252                      (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
1253                         if (top_prev != NULL)
1254                                 top_prev->next = current->next;
1255                         else
1256                                 node->data = current->next;
1257                         free_rdataset(rbtdb, mctx, current);
1258                 } else
1259                         top_prev = current;
1260         }
1261         node->dirty = 0;
1262 }
1263
1264 static inline void
1265 clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1266                 rbtdb_serial_t least_serial)
1267 {
1268         rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1269         rdatasetheader_t *top_prev, *top_next;
1270         isc_mem_t *mctx = rbtdb->common.mctx;
1271         isc_boolean_t still_dirty = ISC_FALSE;
1272
1273         /*
1274          * Caller must be holding the node lock.
1275          */
1276         REQUIRE(least_serial != 0);
1277
1278         top_prev = NULL;
1279         for (current = node->data; current != NULL; current = top_next) {
1280                 top_next = current->next;
1281
1282                 /*
1283                  * First, we clean up any instances of multiple rdatasets
1284                  * with the same serial number, or that have the IGNORE
1285                  * attribute.
1286                  */
1287                 dparent = current;
1288                 for (dcurrent = current->down;
1289                      dcurrent != NULL;
1290                      dcurrent = down_next) {
1291                         down_next = dcurrent->down;
1292                         INSIST(dcurrent->serial <= dparent->serial);
1293                         if (dcurrent->serial == dparent->serial ||
1294                             IGNORE(dcurrent)) {
1295                                 if (down_next != NULL)
1296                                         down_next->next = dparent;
1297                                 dparent->down = down_next;
1298                                 free_rdataset(rbtdb, mctx, dcurrent);
1299                         } else
1300                                 dparent = dcurrent;
1301                 }
1302
1303                 /*
1304                  * We've now eliminated all IGNORE datasets with the possible
1305                  * exception of current, which we now check.
1306                  */
1307                 if (IGNORE(current)) {
1308                         down_next = current->down;
1309                         if (down_next == NULL) {
1310                                 if (top_prev != NULL)
1311                                         top_prev->next = current->next;
1312                                 else
1313                                         node->data = current->next;
1314                                 free_rdataset(rbtdb, mctx, current);
1315                                 /*
1316                                  * current no longer exists, so we can
1317                                  * just continue with the loop.
1318                                  */
1319                                 continue;
1320                         } else {
1321                                 /*
1322                                  * Pull up current->down, making it the new
1323                                  * current.
1324                                  */
1325                                 if (top_prev != NULL)
1326                                         top_prev->next = down_next;
1327                                 else
1328                                         node->data = down_next;
1329                                 down_next->next = top_next;
1330                                 free_rdataset(rbtdb, mctx, current);
1331                                 current = down_next;
1332                         }
1333                 }
1334
1335                 /*
1336                  * We now try to find the first down node less than the
1337                  * least serial.
1338                  */
1339                 dparent = current;
1340                 for (dcurrent = current->down;
1341                      dcurrent != NULL;
1342                      dcurrent = down_next) {
1343                         down_next = dcurrent->down;
1344                         if (dcurrent->serial < least_serial)
1345                                 break;
1346                         dparent = dcurrent;
1347                 }
1348
1349                 /*
1350                  * If there is a such an rdataset, delete it and any older
1351                  * versions.
1352                  */
1353                 if (dcurrent != NULL) {
1354                         do {
1355                                 down_next = dcurrent->down;
1356                                 INSIST(dcurrent->serial <= least_serial);
1357                                 free_rdataset(rbtdb, mctx, dcurrent);
1358                                 dcurrent = down_next;
1359                         } while (dcurrent != NULL);
1360                         dparent->down = NULL;
1361                 }
1362
1363                 /*
1364                  * Note.  The serial number of 'current' might be less than
1365                  * least_serial too, but we cannot delete it because it is
1366                  * the most recent version, unless it is a NONEXISTENT
1367                  * rdataset.
1368                  */
1369                 if (current->down != NULL) {
1370                         still_dirty = ISC_TRUE;
1371                         top_prev = current;
1372                 } else {
1373                         /*
1374                          * If this is a NONEXISTENT rdataset, we can delete it.
1375                          */
1376                         if (NONEXISTENT(current)) {
1377                                 if (top_prev != NULL)
1378                                         top_prev->next = current->next;
1379                                 else
1380                                         node->data = current->next;
1381                                 free_rdataset(rbtdb, mctx, current);
1382                         } else
1383                                 top_prev = current;
1384                 }
1385         }
1386         if (!still_dirty)
1387                 node->dirty = 0;
1388 }
1389
1390 /*%
1391  * Clean up dead nodes.  These are nodes which have no references, and
1392  * have no data.  They are dead but we could not or chose not to delete
1393  * them when we deleted all the data at that node because we did not want
1394  * to wait for the tree write lock.
1395  *
1396  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1397  */
1398 static void
1399 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1400         dns_rbtnode_t *node;
1401         isc_result_t result;
1402         int count = 10;         /* XXXJT: should be adjustable */
1403
1404         node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1405         while (node != NULL && count > 0) {
1406                 ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1407
1408                 /*
1409                  * Since we're holding a tree write lock, it should be
1410                  * impossible for this node to be referenced by others.
1411                  */
1412                 INSIST(dns_rbtnode_refcurrent(node) == 0 &&
1413                        node->data == NULL);
1414
1415                 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1416                 if (result != ISC_R_SUCCESS)
1417                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1418                                       DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1419                                       "cleanup_dead_nodes: "
1420                                       "dns_rbt_deletenode: %s",
1421                                       isc_result_totext(result));
1422                 node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1423                 count--;
1424         }
1425 }
1426
1427 /*
1428  * Caller must be holding the node lock if its reference must be protected
1429  * by the lock.
1430  */
1431 static inline void
1432 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1433         unsigned int lockrefs, noderefs;
1434         isc_refcount_t *lockref;
1435
1436         dns_rbtnode_refincrement0(node, &noderefs);
1437         if (noderefs == 1) {    /* this is the first reference to the node */
1438                 lockref = &rbtdb->node_locks[node->locknum].references;
1439                 isc_refcount_increment0(lockref, &lockrefs);
1440                 INSIST(lockrefs != 0);
1441         }
1442         INSIST(noderefs != 0);
1443 }
1444
1445 /*
1446  * This function is assumed to be called when a node is newly referenced
1447  * and can be in the deadnode list.  In that case the node must be retrieved
1448  * from the list because it is going to be used.  In addition, if the caller
1449  * happens to hold a write lock on the tree, it's a good chance to purge dead
1450  * nodes.
1451  * Note: while a new reference is gained in multiple places, there are only very
1452  * few cases where the node can be in the deadnode list (only empty nodes can
1453  * have been added to the list).
1454  */
1455 static inline void
1456 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1457                 isc_rwlocktype_t treelocktype)
1458 {
1459         isc_boolean_t need_relock = ISC_FALSE;
1460
1461         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
1462         new_reference(rbtdb, node);
1463
1464         NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1465                       isc_rwlocktype_read);
1466         if (ISC_LINK_LINKED(node, deadlink))
1467                 need_relock = ISC_TRUE;
1468         else if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1469                  treelocktype == isc_rwlocktype_write)
1470                 need_relock = ISC_TRUE;
1471         NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1472                         isc_rwlocktype_read);
1473         if (need_relock) {
1474                 NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1475                               isc_rwlocktype_write);
1476                 if (ISC_LINK_LINKED(node, deadlink))
1477                         ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
1478                                         node, deadlink);
1479                 if (treelocktype == isc_rwlocktype_write)
1480                         cleanup_dead_nodes(rbtdb, node->locknum);
1481                 NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1482                                 isc_rwlocktype_write);
1483         }
1484
1485         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
1486 }
1487
1488 /*
1489  * Caller must be holding the node lock; either the "strong", read or write
1490  * lock.  Note that the lock must be held even when node references are
1491  * atomically modified; in that case the decrement operation itself does not
1492  * have to be protected, but we must avoid a race condition where multiple
1493  * threads are decreasing the reference to zero simultaneously and at least
1494  * one of them is going to free the node.
1495  * This function returns ISC_TRUE if and only if the node reference decreases
1496  * to zero.
1497  */
1498 static isc_boolean_t
1499 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1500                     rbtdb_serial_t least_serial,
1501                     isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
1502                     isc_boolean_t pruning)
1503 {
1504         isc_result_t result;
1505         isc_boolean_t write_locked;
1506         rbtdb_nodelock_t *nodelock;
1507         unsigned int refs, nrefs;
1508         int bucket = node->locknum;
1509         isc_boolean_t no_reference;
1510
1511         nodelock = &rbtdb->node_locks[bucket];
1512
1513         /* Handle easy and typical case first. */
1514         if (!node->dirty && (node->data != NULL || node->down != NULL)) {
1515                 dns_rbtnode_refdecrement(node, &nrefs);
1516                 INSIST((int)nrefs >= 0);
1517                 if (nrefs == 0) {
1518                         isc_refcount_decrement(&nodelock->references, &refs);
1519                         INSIST((int)refs >= 0);
1520                 }
1521                 return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
1522         }
1523
1524         /* Upgrade the lock? */
1525         if (nlock == isc_rwlocktype_read) {
1526                 NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
1527                 NODE_WEAKLOCK(&nodelock->lock, isc_rwlocktype_write);
1528         }
1529         dns_rbtnode_refdecrement(node, &nrefs);
1530         INSIST((int)nrefs >= 0);
1531         if (nrefs > 0) {
1532                 /* Restore the lock? */
1533                 if (nlock == isc_rwlocktype_read)
1534                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1535                 return (ISC_FALSE);
1536         }
1537
1538         if (node->dirty && dns_rbtnode_refcurrent(node) == 0) {
1539                 if (IS_CACHE(rbtdb))
1540                         clean_cache_node(rbtdb, node);
1541                 else {
1542                         if (least_serial == 0) {
1543                                 /*
1544                                  * Caller doesn't know the least serial.
1545                                  * Get it.
1546                                  */
1547                                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1548                                 least_serial = rbtdb->least_serial;
1549                                 RBTDB_UNLOCK(&rbtdb->lock,
1550                                              isc_rwlocktype_read);
1551                         }
1552                         clean_zone_node(rbtdb, node, least_serial);
1553                 }
1554         }
1555
1556         isc_refcount_decrement(&nodelock->references, &refs);
1557         INSIST((int)refs >= 0);
1558
1559         /*
1560          * XXXDCL should this only be done for cache zones?
1561          */
1562         if (node->data != NULL || node->down != NULL) {
1563                 /* Restore the lock? */
1564                 if (nlock == isc_rwlocktype_read)
1565                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1566                 return (ISC_TRUE);
1567         }
1568
1569         /*
1570          * Attempt to switch to a write lock on the tree.  If this fails,
1571          * we will add this node to a linked list of nodes in this locking
1572          * bucket which we will free later.
1573          */
1574         if (tlock != isc_rwlocktype_write) {
1575                 /*
1576                  * Locking hierarchy notwithstanding, we don't need to free
1577                  * the node lock before acquiring the tree write lock because
1578                  * we only do a trylock.
1579                  */
1580                 if (tlock == isc_rwlocktype_read)
1581                         result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
1582                 else
1583                         result = isc_rwlock_trylock(&rbtdb->tree_lock,
1584                                                     isc_rwlocktype_write);
1585                 RUNTIME_CHECK(result == ISC_R_SUCCESS ||
1586                               result == ISC_R_LOCKBUSY);
1587
1588                 write_locked = ISC_TF(result == ISC_R_SUCCESS);
1589         } else
1590                 write_locked = ISC_TRUE;
1591
1592         no_reference = ISC_TRUE;
1593         if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
1594                 /*
1595                  * We can now delete the node if the reference counter is
1596                  * zero.  This should be typically the case, but a different
1597                  * thread may still gain a (new) reference just before the
1598                  * current thread locks the tree (e.g., in findnode()).
1599                  */
1600
1601                 /*
1602                  * If this node is the only one in the level it's in, deleting
1603                  * this node may recursively make its parent the only node in
1604                  * the parent level; if so, and if no one is currently using
1605                  * the parent node, this is almost the only opportunity to
1606                  * clean it up.  But the recursive cleanup is not that trivial
1607                  * since the child and parent may be in different lock buckets,
1608                  * which would cause a lock order reversal problem.  To avoid
1609                  * the trouble, we'll dispatch a separate event for batch
1610                  * cleaning.  We need to check whether we're deleting the node
1611                  * as a result of pruning to avoid infinite dispatching.
1612                  * Note: pruning happens only when a task has been set for the
1613                  * rbtdb.  If the user of the rbtdb chooses not to set a task,
1614                  * it's their responsibility to purge stale leaves (e.g. by
1615                  * periodic walk-through).
1616                  */
1617                 if (!pruning && node->parent != NULL &&
1618                     node->parent->down == node && node->left == NULL &&
1619                     node->right == NULL && rbtdb->task != NULL) {
1620                         isc_event_t *ev;
1621                         dns_db_t *db;
1622
1623                         ev = isc_event_allocate(rbtdb->common.mctx, NULL,
1624                                                 DNS_EVENT_RBTPRUNE,
1625                                                 prune_tree, node,
1626                                                 sizeof(isc_event_t));
1627                         if (ev != NULL) {
1628                                 new_reference(rbtdb, node);
1629                                 db = NULL;
1630                                 attach((dns_db_t *)rbtdb, &db);
1631                                 ev->ev_sender = db;
1632                                 isc_task_send(rbtdb->task, &ev);
1633                                 no_reference = ISC_FALSE;
1634                         } else {
1635                                 /*
1636                                  * XXX: this is a weird situation.  We could
1637                                  * ignore this error case, but then the stale
1638                                  * node will unlikely be purged except via a
1639                                  * rare condition such as manual cleanup.  So
1640                                  * we queue it in the deadnodes list, hoping
1641                                  * the memory shortage is temporary and the node
1642                                  * will be deleted later.
1643                                  */
1644                                 isc_log_write(dns_lctx,
1645                                               DNS_LOGCATEGORY_DATABASE,
1646                                               DNS_LOGMODULE_CACHE,
1647                                               ISC_LOG_INFO,
1648                                               "decrement_reference: failed to "
1649                                               "allocate pruning event");
1650                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1651                                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
1652                                                 deadlink);
1653                         }
1654                 } else {
1655                         if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1656                                 char printname[DNS_NAME_FORMATSIZE];
1657
1658                                 isc_log_write(dns_lctx,
1659                                               DNS_LOGCATEGORY_DATABASE,
1660                                               DNS_LOGMODULE_CACHE,
1661                                               ISC_LOG_DEBUG(1),
1662                                               "decrement_reference: "
1663                                               "delete from rbt: %p %s",
1664                                               node,
1665                                               dns_rbt_formatnodename(node,
1666                                                         printname,
1667                                                         sizeof(printname)));
1668                         }
1669
1670                         INSIST(!ISC_LINK_LINKED(node, deadlink));
1671                         result = dns_rbt_deletenode(rbtdb->tree, node,
1672                                                     ISC_FALSE);
1673                         if (result != ISC_R_SUCCESS) {
1674                                 isc_log_write(dns_lctx,
1675                                               DNS_LOGCATEGORY_DATABASE,
1676                                               DNS_LOGMODULE_CACHE,
1677                                               ISC_LOG_WARNING,
1678                                               "decrement_reference: "
1679                                               "dns_rbt_deletenode: %s",
1680                                               isc_result_totext(result));
1681                         }
1682                 }
1683         } else if (dns_rbtnode_refcurrent(node) == 0) {
1684                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1685                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
1686         }
1687
1688         /* Restore the lock? */
1689         if (nlock == isc_rwlocktype_read)
1690                 NODE_WEAKDOWNGRADE(&nodelock->lock);
1691
1692         /*
1693          * Relock a read lock, or unlock the write lock if no lock was held.
1694          */
1695         if (tlock == isc_rwlocktype_none)
1696                 if (write_locked)
1697                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1698
1699         if (tlock == isc_rwlocktype_read)
1700                 if (write_locked)
1701                         isc_rwlock_downgrade(&rbtdb->tree_lock);
1702
1703         return (no_reference);
1704 }
1705
1706 /*
1707  * Prune the tree by recursively cleaning-up single leaves.  In the worst
1708  * case, the number of iteration is the number of tree levels, which is at
1709  * most the maximum number of domain name labels, i.e, 127.  In practice, this
1710  * should be much smaller (only a few times), and even the worst case would be
1711  * acceptable for a single event.
1712  */
1713 static void
1714 prune_tree(isc_task_t *task, isc_event_t *event) {
1715         dns_rbtdb_t *rbtdb = event->ev_sender;
1716         dns_rbtnode_t *node = event->ev_arg;
1717         dns_rbtnode_t *parent;
1718         unsigned int locknum;
1719
1720         UNUSED(task);
1721
1722         isc_event_free(&event);
1723
1724         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1725         locknum = node->locknum;
1726         NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1727         do {
1728                 parent = node->parent;
1729                 decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
1730                                     isc_rwlocktype_write, ISC_TRUE);
1731
1732                 if (parent != NULL && parent->down == NULL) {
1733                         /*
1734                          * node was the only down child of the parent and has
1735                          * just been removed.  We'll then need to examine the
1736                          * parent.  Keep the lock if possible; otherwise,
1737                          * release the old lock and acquire one for the parent.
1738                          */
1739                         if (parent->locknum != locknum) {
1740                                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
1741                                             isc_rwlocktype_write);
1742                                 locknum = parent->locknum;
1743                                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
1744                                           isc_rwlocktype_write);
1745                         }
1746
1747                         /*
1748                          * We need to gain a reference to the node before
1749                          * decrementing it in the next iteration.  In addition,
1750                          * if the node is in the dead-nodes list, extract it
1751                          * from the list beforehand as we do in
1752                          * reactivate_node().
1753                          */
1754                         new_reference(rbtdb, parent);
1755                         if (ISC_LINK_LINKED(parent, deadlink)) {
1756                                 ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
1757                                                 parent, deadlink);
1758                         }
1759                 } else
1760                         parent = NULL;
1761
1762                 node = parent;
1763         } while (node != NULL);
1764         NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1765         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1766
1767         detach((dns_db_t **)&rbtdb);
1768 }
1769
1770 static inline void
1771 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1772                    rbtdb_changedlist_t *cleanup_list)
1773 {
1774         /*
1775          * Caller must be holding the database lock.
1776          */
1777
1778         rbtdb->least_serial = version->serial;
1779         *cleanup_list = version->changed_list;
1780         ISC_LIST_INIT(version->changed_list);
1781 }
1782
1783 static inline void
1784 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
1785         rbtdb_changed_t *changed, *next_changed;
1786
1787         /*
1788          * If the changed record is dirty, then
1789          * an update created multiple versions of
1790          * a given rdataset.  We keep this list
1791          * until we're the least open version, at
1792          * which point it's safe to get rid of any
1793          * older versions.
1794          *
1795          * If the changed record isn't dirty, then
1796          * we don't need it anymore since we're
1797          * committing and not rolling back.
1798          *
1799          * The caller must be holding the database lock.
1800          */
1801         for (changed = HEAD(version->changed_list);
1802              changed != NULL;
1803              changed = next_changed) {
1804                 next_changed = NEXT(changed, link);
1805                 if (!changed->dirty) {
1806                         UNLINK(version->changed_list,
1807                                changed, link);
1808                         APPEND(*cleanup_list,
1809                                changed, link);
1810                 }
1811         }
1812 }
1813
1814 static isc_boolean_t
1815 iszonesecure(dns_db_t *db, dns_dbnode_t *origin) {
1816         dns_rdataset_t keyset;
1817         dns_rdataset_t nsecset, signsecset;
1818         isc_boolean_t haszonekey = ISC_FALSE;
1819         isc_boolean_t hasnsec = ISC_FALSE;
1820         isc_result_t result;
1821
1822         dns_rdataset_init(&keyset);
1823         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_dnskey, 0,
1824                                      0, &keyset, NULL);
1825         if (result == ISC_R_SUCCESS) {
1826                 dns_rdata_t keyrdata = DNS_RDATA_INIT;
1827                 result = dns_rdataset_first(&keyset);
1828                 while (result == ISC_R_SUCCESS) {
1829                         dns_rdataset_current(&keyset, &keyrdata);
1830                         if (dns_zonekey_iszonekey(&keyrdata)) {
1831                                 haszonekey = ISC_TRUE;
1832                                 break;
1833                         }
1834                         result = dns_rdataset_next(&keyset);
1835                 }
1836                 dns_rdataset_disassociate(&keyset);
1837         }
1838         if (!haszonekey)
1839                 return (ISC_FALSE);
1840
1841         dns_rdataset_init(&nsecset);
1842         dns_rdataset_init(&signsecset);
1843         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_nsec, 0,
1844                                      0, &nsecset, &signsecset);
1845         if (result == ISC_R_SUCCESS) {
1846                 if (dns_rdataset_isassociated(&signsecset)) {
1847                         hasnsec = ISC_TRUE;
1848                         dns_rdataset_disassociate(&signsecset);
1849                 }
1850                 dns_rdataset_disassociate(&nsecset);
1851         }
1852         return (hasnsec);
1853 }
1854
/*
 * Release the caller's reference to the version '*versionp' and set
 * '*versionp' to NULL.
 *
 * If other references to the version remain, nothing else is done (a
 * commit request in that state must not be for the writer version).
 * When the last reference is dropped, under the database write lock:
 *   - a writer with 'commit' set becomes the current version; the
 *     previous current version loses the reference held by the DB and is
 *     scheduled for freeing if that was its last reference;
 *   - a writer without 'commit' is rolled back and scheduled for freeing;
 *   - a non-writer that is not the current version is scheduled for
 *     freeing, and its changed records are either collected for cleanup
 *     (if it was the least open version) or appended to the next newer
 *     version's list.
 *
 * After the database lock is released, the zone's secure flag is
 * refreshed on a committed write to a non-cache database, the scheduled
 * version is freed, and every collected changed record is processed under
 * the tree write lock: the node is rolled back if required, its reference
 * is dropped via decrement_reference(), and the record is freed.
 */
static void
closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
        dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
        rbtdb_version_t *version, *cleanup_version, *least_greater;
        isc_boolean_t rollback = ISC_FALSE;
        rbtdb_changedlist_t cleanup_list;
        rbtdb_changed_t *changed, *next_changed;
        rbtdb_serial_t serial, least_serial;
        dns_rbtnode_t *rbtnode;
        unsigned int refs;
        isc_boolean_t writer;

        REQUIRE(VALID_RBTDB(rbtdb));
        version = (rbtdb_version_t *)*versionp;

        cleanup_version = NULL;
        ISC_LIST_INIT(cleanup_list);

        /*
         * Drop the caller's reference.  Only the holder of the last
         * reference performs the bookkeeping below.
         */
        isc_refcount_decrement(&version->references, &refs);
        if (refs > 0) {         /* typical and easy case first */
                if (commit) {
                        RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
                        INSIST(!version->writer);
                        RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
                }
                goto end;
        }

        RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
        /*
         * Snapshot these fields now: 'version->writer' is cleared on
         * commit, and on rollback 'version' is freed before 'serial' is
         * used in the cleanup loop below.
         */
        serial = version->serial;
        writer = version->writer;
        if (version->writer) {
                if (commit) {
                        unsigned cur_ref;
                        rbtdb_version_t *cur_version;

                        INSIST(version->commit_ok);
                        INSIST(version == rbtdb->future_version);
                        /*
                         * The current version is going to be replaced.
                         * Release the (likely last) reference to it from the
                         * DB itself and unlink it from the open list.
                         */
                        cur_version = rbtdb->current_version;
                        isc_refcount_decrement(&cur_version->references,
                                               &cur_ref);
                        if (cur_ref == 0) {
                                if (cur_version->serial == rbtdb->least_serial)
                                        INSIST(EMPTY(cur_version->changed_list));
                                UNLINK(rbtdb->open_versions,
                                       cur_version, link);
                        }
                        if (EMPTY(rbtdb->open_versions)) {
                                /*
                                 * We're going to become the least open
                                 * version.
                                 */
                                make_least_version(rbtdb, version,
                                                   &cleanup_list);
                        } else {
                                /*
                                 * Some other open version is the
                                 * least version.  We can't cleanup
                                 * records that were changed in this
                                 * version because the older versions
                                 * may still be in use by an open
                                 * version.
                                 *
                                 * We can, however, discard the
                                 * changed records for things that
                                 * we've added that didn't exist in
                                 * prior versions.
                                 */
                                cleanup_nondirty(version, &cleanup_list);
                        }
                        /*
                         * If the (soon to be former) current version
                         * isn't being used by anyone, we can clean
                         * it up.
                         */
                        if (cur_ref == 0) {
                                cleanup_version = cur_version;
                                APPENDLIST(version->changed_list,
                                           cleanup_version->changed_list,
                                           link);
                        }
                        /*
                         * Become the current version.
                         */
                        version->writer = ISC_FALSE;
                        rbtdb->current_version = version;
                        rbtdb->current_serial = version->serial;
                        rbtdb->future_version = NULL;

                        /*
                         * Keep the current version in the open list, and
                         * gain a reference for the DB itself (see the DB
                         * creation function below).  This must be the only
                         * case where we need to increment the counter from
                         * zero and need to use isc_refcount_increment0().
                         */
                        isc_refcount_increment0(&version->references,
                                                &cur_ref);
                        INSIST(cur_ref == 1);
                        PREPEND(rbtdb->open_versions,
                                rbtdb->current_version, link);
                } else {
                        /*
                         * We're rolling back this transaction.
                         */
                        cleanup_list = version->changed_list;
                        ISC_LIST_INIT(version->changed_list);
                        rollback = ISC_TRUE;
                        cleanup_version = version;
                        rbtdb->future_version = NULL;
                }
        } else {
                if (version != rbtdb->current_version) {
                        /*
                         * There are no external or internal references
                         * to this version and it can be cleaned up.
                         */
                        cleanup_version = version;

                        /*
                         * Find the version with the least serial
                         * number greater than ours.
                         */
                        least_greater = PREV(version, link);
                        if (least_greater == NULL)
                                least_greater = rbtdb->current_version;

                        INSIST(version->serial < least_greater->serial);
                        /*
                         * Is this the least open version?
                         */
                        if (version->serial == rbtdb->least_serial) {
                                /*
                                 * Yes.  Install the new least open
                                 * version.
                                 */
                                make_least_version(rbtdb,
                                                   least_greater,
                                                   &cleanup_list);
                        } else {
                                /*
                                 * Add any unexecuted cleanups to
                                 * those of the least greater version.
                                 */
                                APPENDLIST(least_greater->changed_list,
                                           version->changed_list,
                                           link);
                        }
                } else if (version->serial == rbtdb->least_serial)
                        INSIST(EMPTY(version->changed_list));
                UNLINK(rbtdb->open_versions, version, link);
        }
        /*
         * Capture the least serial while still holding the database lock
         * so it can be passed to decrement_reference() below.
         */
        least_serial = rbtdb->least_serial;
        RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

        /*
         * Update the zone's secure status.
         */
        if (writer && commit && !IS_CACHE(rbtdb))
                rbtdb->secure = iszonesecure(db, rbtdb->origin_node);

        /*
         * Free the version scheduled above; by now all of its changed
         * records must have been moved elsewhere.
         */
        if (cleanup_version != NULL) {
                INSIST(EMPTY(cleanup_version->changed_list));
                isc_mem_put(rbtdb->common.mctx, cleanup_version,
                            sizeof(*cleanup_version));
        }

        if (!EMPTY(cleanup_list)) {
                /*
                 * We acquire a tree write lock here in order to make sure
                 * that stale nodes will be removed in decrement_reference().
                 * If we didn't have the lock, those nodes could miss the
                 * chance to be removed until the server stops.  The write lock
                 * is expensive, but this event should be rare enough to justify
                 * the cost.
                 */
                RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
                for (changed = HEAD(cleanup_list);
                     changed != NULL;
                     changed = next_changed) {
                        nodelock_t *lock;

                        /* Fetch the successor before 'changed' is freed. */
                        next_changed = NEXT(changed, link);
                        rbtnode = changed->node;
                        lock = &rbtdb->node_locks[rbtnode->locknum].lock;

                        NODE_LOCK(lock, isc_rwlocktype_write);
                        /*
                         * This is a good opportunity to purge any dead nodes,
                         * so use it.
                         */
                        cleanup_dead_nodes(rbtdb, rbtnode->locknum);

                        if (rollback)
                                rollback_node(rbtnode, serial);
                        decrement_reference(rbtdb, rbtnode, least_serial,
                                            isc_rwlocktype_write,
                                            isc_rwlocktype_write, ISC_FALSE);

                        NODE_UNLOCK(lock, isc_rwlocktype_write);

                        isc_mem_put(rbtdb->common.mctx, changed,
                                    sizeof(*changed));
                }
                RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
        }

  end:
        *versionp = NULL;
}
2070
2071 /*
2072  * Add the necessary magic for the wildcard name 'name'
2073  * to be found in 'rbtdb'.
2074  *
2075  * In order for wildcard matching to work correctly in
2076  * zone_find(), we must ensure that a node for the wildcarding
2077  * level exists in the database, and has its 'find_callback'
2078  * and 'wild' bits set.
2079  *
2080  * E.g. if the wildcard name is "*.sub.example." then we
2081  * must ensure that "sub.example." exists and is marked as
2082  * a wildcard level.
2083  */
2084 static isc_result_t
2085 add_wildcard_magic(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2086         isc_result_t result;
2087         dns_name_t foundname;
2088         dns_offsets_t offsets;
2089         unsigned int n;
2090         dns_rbtnode_t *node = NULL;
2091
2092         dns_name_init(&foundname, offsets);
2093         n = dns_name_countlabels(name);
2094         INSIST(n >= 2);
2095         n--;
2096         dns_name_getlabelsequence(name, 1, n, &foundname);
2097         result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2098         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2099                 return (result);
2100         node->find_callback = 1;
2101         node->wild = 1;
2102         return (ISC_R_SUCCESS);
2103 }
2104
2105 static isc_result_t
2106 add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2107         isc_result_t result;
2108         dns_name_t foundname;
2109         dns_offsets_t offsets;
2110         unsigned int n, l, i;
2111
2112         dns_name_init(&foundname, offsets);
2113         n = dns_name_countlabels(name);
2114         l = dns_name_countlabels(&rbtdb->common.origin);
2115         i = l + 1;
2116         while (i < n) {
2117                 dns_rbtnode_t *node = NULL;     /* dummy */
2118                 dns_name_getlabelsequence(name, n - i, i, &foundname);
2119                 if (dns_name_iswildcard(&foundname)) {
2120                         result = add_wildcard_magic(rbtdb, &foundname);
2121                         if (result != ISC_R_SUCCESS)
2122                                 return (result);
2123                         result = dns_rbt_addnode(rbtdb->tree, &foundname,
2124                                                  &node);
2125                         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2126                                 return (result);
2127                 }
2128                 i++;
2129         }
2130         return (ISC_R_SUCCESS);
2131 }
2132
2133 static isc_result_t
2134 findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create,
2135          dns_dbnode_t **nodep)
2136 {
2137         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2138         dns_rbtnode_t *node = NULL;
2139         dns_name_t nodename;
2140         isc_result_t result;
2141         isc_rwlocktype_t locktype = isc_rwlocktype_read;
2142
2143         REQUIRE(VALID_RBTDB(rbtdb));
2144
2145         dns_name_init(&nodename, NULL);
2146         RWLOCK(&rbtdb->tree_lock, locktype);
2147         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &node, NULL,
2148                                   DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2149         if (result != ISC_R_SUCCESS) {
2150                 RWUNLOCK(&rbtdb->tree_lock, locktype);
2151                 if (!create) {
2152                         if (result == DNS_R_PARTIALMATCH)
2153                                 result = ISC_R_NOTFOUND;
2154                         return (result);
2155                 }
2156                 /*
2157                  * It would be nice to try to upgrade the lock instead of
2158                  * unlocking then relocking.
2159                  */
2160                 locktype = isc_rwlocktype_write;
2161                 RWLOCK(&rbtdb->tree_lock, locktype);
2162                 node = NULL;
2163                 result = dns_rbt_addnode(rbtdb->tree, name, &node);
2164                 if (result == ISC_R_SUCCESS) {
2165                         dns_rbt_namefromnode(node, &nodename);
2166 #ifdef DNS_RBT_USEHASH
2167                         node->locknum = node->hashval % rbtdb->node_lock_count;
2168 #else
2169                         node->locknum = dns_name_hash(&nodename, ISC_TRUE) %
2170                                 rbtdb->node_lock_count;
2171 #endif
2172                         add_empty_wildcards(rbtdb, name);
2173
2174                         if (dns_name_iswildcard(name)) {
2175                                 result = add_wildcard_magic(rbtdb, name);
2176                                 if (result != ISC_R_SUCCESS) {
2177                                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2178                                         return (result);
2179                                 }
2180                         }
2181                 } else if (result != ISC_R_EXISTS) {
2182                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2183                         return (result);
2184                 }
2185         }
2186         reactivate_node(rbtdb, node, locktype);
2187         RWUNLOCK(&rbtdb->tree_lock, locktype);
2188
2189         *nodep = (dns_dbnode_t *)node;
2190
2191         return (ISC_R_SUCCESS);
2192 }
2193
2194 static isc_result_t
2195 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2196         rbtdb_search_t *search = arg;
2197         rdatasetheader_t *header, *header_next;
2198         rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2199         rdatasetheader_t *found;
2200         isc_result_t result;
2201         dns_rbtnode_t *onode;
2202
2203         /*
2204          * We only want to remember the topmost zone cut, since it's the one
2205          * that counts, so we'll just continue if we've already found a
2206          * zonecut.
2207          */
2208         if (search->zonecut != NULL)
2209                 return (DNS_R_CONTINUE);
2210
2211         found = NULL;
2212         result = DNS_R_CONTINUE;
2213         onode = search->rbtdb->origin_node;
2214
2215         NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2216                   isc_rwlocktype_read);
2217
2218         /*
2219          * Look for an NS or DNAME rdataset active in our version.
2220          */
2221         ns_header = NULL;
2222         dname_header = NULL;
2223         sigdname_header = NULL;
2224         for (header = node->data; header != NULL; header = header_next) {
2225                 header_next = header->next;
2226                 if (header->type == dns_rdatatype_ns ||
2227                     header->type == dns_rdatatype_dname ||
2228                     header->type == RBTDB_RDATATYPE_SIGDNAME) {
2229                         do {
2230                                 if (header->serial <= search->serial &&
2231                                     !IGNORE(header)) {
2232                                         /*
2233                                          * Is this a "this rdataset doesn't
2234                                          * exist" record?
2235                                          */
2236                                         if (NONEXISTENT(header))
2237                                                 header = NULL;
2238                                         break;
2239                                 } else
2240                                         header = header->down;
2241                         } while (header != NULL);
2242                         if (header != NULL) {
2243                                 if (header->type == dns_rdatatype_dname)
2244                                         dname_header = header;
2245                                 else if (header->type ==
2246                                            RBTDB_RDATATYPE_SIGDNAME)
2247                                         sigdname_header = header;
2248                                 else if (node != onode ||
2249                                          IS_STUB(search->rbtdb)) {
2250                                         /*
2251                                          * We've found an NS rdataset that
2252                                          * isn't at the origin node.  We check
2253                                          * that they're not at the origin node,
2254                                          * because otherwise we'd erroneously
2255                                          * treat the zone top as if it were
2256                                          * a delegation.
2257                                          */
2258                                         ns_header = header;
2259                                 }
2260                         }
2261                 }
2262         }
2263
2264         /*
2265          * Did we find anything?
2266          */
2267         if (dname_header != NULL) {
2268                 /*
2269                  * Note that DNAME has precedence over NS if both exist.
2270                  */
2271                 found = dname_header;
2272                 search->zonecut_sigrdataset = sigdname_header;
2273         } else if (ns_header != NULL) {
2274                 found = ns_header;
2275                 search->zonecut_sigrdataset = NULL;
2276         }
2277
2278         if (found != NULL) {
2279                 /*
2280                  * We increment the reference count on node to ensure that
2281                  * search->zonecut_rdataset will still be valid later.
2282                  */
2283                 new_reference(search->rbtdb, node);
2284                 search->zonecut = node;
2285                 search->zonecut_rdataset = found;
2286                 search->need_cleanup = ISC_TRUE;
2287                 /*
2288                  * Since we've found a zonecut, anything beneath it is
2289                  * glue and is not subject to wildcard matching, so we
2290                  * may clear search->wild.
2291                  */
2292                 search->wild = ISC_FALSE;
2293                 if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
2294                         /*
2295                          * If the caller does not want to find glue, then
2296                          * this is the best answer and the search should
2297                          * stop now.
2298                          */
2299                         result = DNS_R_PARTIALMATCH;
2300                 } else {
2301                         dns_name_t *zcname;
2302
2303                         /*
2304                          * The search will continue beneath the zone cut.
2305                          * This may or may not be the best match.  In case it
2306                          * is, we need to remember the node name.
2307                          */
2308                         zcname = dns_fixedname_name(&search->zonecut_name);
2309                         RUNTIME_CHECK(dns_name_copy(name, zcname, NULL) ==
2310                                       ISC_R_SUCCESS);
2311                         search->copy_name = ISC_TRUE;
2312                 }
2313         } else {
2314                 /*
2315                  * There is no zonecut at this node which is active in this
2316                  * version.
2317                  *
2318                  * If this is a "wild" node and the caller hasn't disabled
2319                  * wildcard matching, remember that we've seen a wild node
2320                  * in case we need to go searching for wildcard matches
2321                  * later on.
2322                  */
2323                 if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0)
2324                         search->wild = ISC_TRUE;
2325         }
2326
2327         NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2328                     isc_rwlocktype_read);
2329
2330         return (result);
2331 }
2332
2333 static inline void
2334 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2335               rdatasetheader_t *header, isc_stdtime_t now,
2336               dns_rdataset_t *rdataset)
2337 {
2338         unsigned char *raw;     /* RDATASLAB */
2339
2340         /*
2341          * Caller must be holding the node reader lock.
2342          * XXXJT: technically, we need a writer lock, since we'll increment
2343          * the header count below.  However, since the actual counter value
2344          * doesn't matter, we prioritize performance here.  (We may want to
2345          * use atomic increment when available).
2346          */
2347
2348         if (rdataset == NULL)
2349                 return;
2350
2351         new_reference(rbtdb, node);
2352
2353         INSIST(rdataset->methods == NULL);      /* We must be disassociated. */
2354
2355         rdataset->methods = &rdataset_methods;
2356         rdataset->rdclass = rbtdb->common.rdclass;
2357         rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
2358         rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
2359         rdataset->ttl = header->rdh_ttl - now;
2360         rdataset->trust = header->trust;
2361         if (NXDOMAIN(header))
2362                 rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
2363         rdataset->private1 = rbtdb;
2364         rdataset->private2 = node;
2365         raw = (unsigned char *)header + sizeof(*header);
2366         rdataset->private3 = raw;
2367         rdataset->count = header->count++;
2368         if (rdataset->count == ISC_UINT32_MAX)
2369                 rdataset->count = 0;
2370
2371         /*
2372          * Reset iterator state.
2373          */
2374         rdataset->privateuint4 = 0;
2375         rdataset->private5 = NULL;
2376
2377         /*
2378          * Add noqname proof.
2379          */
2380         rdataset->private6 = header->noqname;
2381         if (rdataset->private6 != NULL)
2382                 rdataset->attributes |=  DNS_RDATASETATTR_NOQNAME;
2383 }
2384
2385 static inline isc_result_t
2386 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
2387                  dns_name_t *foundname, dns_rdataset_t *rdataset,
2388                  dns_rdataset_t *sigrdataset)
2389 {
2390         isc_result_t result;
2391         dns_name_t *zcname;
2392         rbtdb_rdatatype_t type;
2393         dns_rbtnode_t *node;
2394
2395         /*
2396          * The caller MUST NOT be holding any node locks.
2397          */
2398
2399         node = search->zonecut;
2400         type = search->zonecut_rdataset->type;
2401
2402         /*
2403          * If we have to set foundname, we do it before anything else.
2404          * If we were to set foundname after we had set nodep or bound the
2405          * rdataset, then we'd have to undo that work if dns_name_copy()
2406          * failed.  By setting foundname first, there's nothing to undo if
2407          * we have trouble.
2408          */
2409         if (foundname != NULL && search->copy_name) {
2410                 zcname = dns_fixedname_name(&search->zonecut_name);
2411                 result = dns_name_copy(zcname, foundname, NULL);
2412                 if (result != ISC_R_SUCCESS)
2413                         return (result);
2414         }
2415         if (nodep != NULL) {
2416                 /*
2417                  * Note that we don't have to increment the node's reference
2418                  * count here because we're going to use the reference we
2419                  * already have in the search block.
2420                  */
2421                 *nodep = node;
2422                 search->need_cleanup = ISC_FALSE;
2423         }
2424         if (rdataset != NULL) {
2425                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2426                           isc_rwlocktype_read);
2427                 bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
2428                               search->now, rdataset);
2429                 if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
2430                         bind_rdataset(search->rbtdb, node,
2431                                       search->zonecut_sigrdataset,
2432                                       search->now, sigrdataset);
2433                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2434                             isc_rwlocktype_read);
2435         }
2436
2437         if (type == dns_rdatatype_dname)
2438                 return (DNS_R_DNAME);
2439         return (DNS_R_DELEGATION);
2440 }
2441
2442 static inline isc_boolean_t
2443 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
2444            dns_rbtnode_t *node)
2445 {
2446         unsigned char *raw;     /* RDATASLAB */
2447         unsigned int count, size;
2448         dns_name_t ns_name;
2449         isc_boolean_t valid = ISC_FALSE;
2450         dns_offsets_t offsets;
2451         isc_region_t region;
2452         rdatasetheader_t *header;
2453
2454         /*
2455          * No additional locking is required.
2456          */
2457
2458         /*
2459          * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
2460          * if it occurs at a zone cut, but is not valid below it.
2461          */
2462         if (type == dns_rdatatype_ns) {
2463                 if (node != search->zonecut) {
2464                         return (ISC_FALSE);
2465                 }
2466         } else if (type != dns_rdatatype_a &&
2467                    type != dns_rdatatype_aaaa &&
2468                    type != dns_rdatatype_a6) {
2469                 return (ISC_FALSE);
2470         }
2471
2472         header = search->zonecut_rdataset;
2473         raw = (unsigned char *)header + sizeof(*header);
2474         count = raw[0] * 256 + raw[1];
2475 #if DNS_RDATASET_FIXED
2476         raw += 2 + (4 * count);
2477 #else
2478         raw += 2;
2479 #endif
2480
2481         while (count > 0) {
2482                 count--;
2483                 size = raw[0] * 256 + raw[1];
2484 #if DNS_RDATASET_FIXED
2485                 raw += 4;
2486 #else
2487                 raw += 2;
2488 #endif
2489                 region.base = raw;
2490                 region.length = size;
2491                 raw += size;
2492                 /*
2493                  * XXX Until we have rdata structures, we have no choice but
2494                  * to directly access the rdata format.
2495                  */
2496                 dns_name_init(&ns_name, offsets);
2497                 dns_name_fromregion(&ns_name, &region);
2498                 if (dns_name_compare(&ns_name, name) == 0) {
2499                         valid = ISC_TRUE;
2500                         break;
2501                 }
2502         }
2503
2504         return (valid);
2505 }
2506
2507 static inline isc_boolean_t
2508 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
2509             dns_name_t *name)
2510 {
2511         dns_fixedname_t fnext;
2512         dns_fixedname_t forigin;
2513         dns_name_t *next;
2514         dns_name_t *origin;
2515         dns_name_t prefix;
2516         dns_rbtdb_t *rbtdb;
2517         dns_rbtnode_t *node;
2518         isc_result_t result;
2519         isc_boolean_t answer = ISC_FALSE;
2520         rdatasetheader_t *header;
2521
2522         rbtdb = search->rbtdb;
2523
2524         dns_name_init(&prefix, NULL);
2525         dns_fixedname_init(&fnext);
2526         next = dns_fixedname_name(&fnext);
2527         dns_fixedname_init(&forigin);
2528         origin = dns_fixedname_name(&forigin);
2529
2530         result = dns_rbtnodechain_next(chain, NULL, NULL);
2531         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2532                 node = NULL;
2533                 result = dns_rbtnodechain_current(chain, &prefix,
2534                                                   origin, &node);
2535                 if (result != ISC_R_SUCCESS)
2536                         break;
2537                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2538                           isc_rwlocktype_read);
2539                 for (header = node->data;
2540                      header != NULL;
2541                      header = header->next) {
2542                         if (header->serial <= search->serial &&
2543                             !IGNORE(header) && EXISTS(header))
2544                                 break;
2545                 }
2546                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2547                             isc_rwlocktype_read);
2548                 if (header != NULL)
2549                         break;
2550                 result = dns_rbtnodechain_next(chain, NULL, NULL);
2551         }
2552         if (result == ISC_R_SUCCESS)
2553                 result = dns_name_concatenate(&prefix, origin, next, NULL);
2554         if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name))
2555                 answer = ISC_TRUE;
2556         return (answer);
2557 }
2558
2559 static inline isc_boolean_t
2560 activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) {
2561         dns_fixedname_t fnext;
2562         dns_fixedname_t forigin;
2563         dns_fixedname_t fprev;
2564         dns_name_t *next;
2565         dns_name_t *origin;
2566         dns_name_t *prev;
2567         dns_name_t name;
2568         dns_name_t rname;
2569         dns_name_t tname;
2570         dns_rbtdb_t *rbtdb;
2571         dns_rbtnode_t *node;
2572         dns_rbtnodechain_t chain;
2573         isc_boolean_t check_next = ISC_TRUE;
2574         isc_boolean_t check_prev = ISC_TRUE;
2575         isc_boolean_t answer = ISC_FALSE;
2576         isc_result_t result;
2577         rdatasetheader_t *header;
2578         unsigned int n;
2579
2580         rbtdb = search->rbtdb;
2581
2582         dns_name_init(&name, NULL);
2583         dns_name_init(&tname, NULL);
2584         dns_name_init(&rname, NULL);
2585         dns_fixedname_init(&fnext);
2586         next = dns_fixedname_name(&fnext);
2587         dns_fixedname_init(&fprev);
2588         prev = dns_fixedname_name(&fprev);
2589         dns_fixedname_init(&forigin);
2590         origin = dns_fixedname_name(&forigin);
2591
2592         /*
2593          * Find if qname is at or below a empty node.
2594          * Use our own copy of the chain.
2595          */
2596
2597         chain = search->chain;
2598         do {
2599                 node = NULL;
2600                 result = dns_rbtnodechain_current(&chain, &name,
2601                                                   origin, &node);
2602                 if (result != ISC_R_SUCCESS)
2603                         break;
2604                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2605                           isc_rwlocktype_read);
2606                 for (header = node->data;
2607                      header != NULL;
2608                      header = header->next) {
2609                         if (header->serial <= search->serial &&
2610                             !IGNORE(header) && EXISTS(header))
2611                                 break;
2612                 }
2613                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2614                             isc_rwlocktype_read);
2615                 if (header != NULL)
2616                         break;
2617                 result = dns_rbtnodechain_prev(&chain, NULL, NULL);
2618         } while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
2619         if (result == ISC_R_SUCCESS)
2620                 result = dns_name_concatenate(&name, origin, prev, NULL);
2621         if (result != ISC_R_SUCCESS)
2622                 check_prev = ISC_FALSE;
2623
2624         result = dns_rbtnodechain_next(&chain, NULL, NULL);
2625         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2626                 node = NULL;
2627                 result = dns_rbtnodechain_current(&chain, &name,
2628                                                   origin, &node);
2629                 if (result != ISC_R_SUCCESS)
2630                         break;
2631                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2632                           isc_rwlocktype_read);
2633                 for (header = node->data;
2634                      header != NULL;
2635                      header = header->next) {
2636                         if (header->serial <= search->serial &&
2637                             !IGNORE(header) && EXISTS(header))
2638                                 break;
2639                 }
2640                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2641                             isc_rwlocktype_read);
2642                 if (header != NULL)
2643                         break;
2644                 result = dns_rbtnodechain_next(&chain, NULL, NULL);
2645         }
2646         if (result == ISC_R_SUCCESS)
2647                 result = dns_name_concatenate(&name, origin, next, NULL);
2648         if (result != ISC_R_SUCCESS)
2649                 check_next = ISC_FALSE;
2650
2651         dns_name_clone(qname, &rname);
2652
2653         /*
2654          * Remove the wildcard label to find the terminal name.
2655          */
2656         n = dns_name_countlabels(wname);
2657         dns_name_getlabelsequence(wname, 1, n - 1, &tname);
2658
2659         do {
2660                 if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
2661                     (check_next && dns_name_issubdomain(next, &rname))) {
2662                         answer = ISC_TRUE;
2663                         break;
2664                 }
2665                 /*
2666                  * Remove the left hand label.
2667                  */
2668                 n = dns_name_countlabels(&rname);
2669                 dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
2670         } while (!dns_name_equal(&rname, &tname));
2671         return (answer);
2672 }
2673
2674 static inline isc_result_t
2675 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
2676               dns_name_t *qname)
2677 {
2678         unsigned int i, j;
2679         dns_rbtnode_t *node, *level_node, *wnode;
2680         rdatasetheader_t *header;
2681         isc_result_t result = ISC_R_NOTFOUND;
2682         dns_name_t name;
2683         dns_name_t *wname;
2684         dns_fixedname_t fwname;
2685         dns_rbtdb_t *rbtdb;
2686         isc_boolean_t done, wild, active;
2687         dns_rbtnodechain_t wchain;
2688
2689         /*
2690          * Caller must be holding the tree lock and MUST NOT be holding
2691          * any node locks.
2692          */
2693
2694         /*
2695          * Examine each ancestor level.  If the level's wild bit
2696          * is set, then construct the corresponding wildcard name and
2697          * search for it.  If the wildcard node exists, and is active in
2698          * this version, we're done.  If not, then we next check to see
2699          * if the ancestor is active in this version.  If so, then there
2700          * can be no possible wildcard match and again we're done.  If not,
2701          * continue the search.
2702          */
2703
2704         rbtdb = search->rbtdb;
2705         i = search->chain.level_matches;
2706         done = ISC_FALSE;
2707         node = *nodep;
2708         do {
2709                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2710                           isc_rwlocktype_read);
2711
2712                 /*
2713                  * First we try to figure out if this node is active in
2714                  * the search's version.  We do this now, even though we
2715                  * may not need the information, because it simplifies the
2716                  * locking and code flow.
2717                  */
2718                 for (header = node->data;
2719                      header != NULL;
2720                      header = header->next) {
2721                         if (header->serial <= search->serial &&
2722                             !IGNORE(header) && EXISTS(header))
2723                                 break;
2724                 }
2725                 if (header != NULL)
2726                         active = ISC_TRUE;
2727                 else
2728                         active = ISC_FALSE;
2729
2730                 if (node->wild)
2731                         wild = ISC_TRUE;
2732                 else
2733                         wild = ISC_FALSE;
2734
2735                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2736                             isc_rwlocktype_read);
2737
2738                 if (wild) {
2739                         /*
2740                          * Construct the wildcard name for this level.
2741                          */
2742                         dns_name_init(&name, NULL);
2743                         dns_rbt_namefromnode(node, &name);
2744                         dns_fixedname_init(&fwname);
2745                         wname = dns_fixedname_name(&fwname);
2746                         result = dns_name_concatenate(dns_wildcardname, &name,
2747                                                       wname, NULL);
2748                         j = i;
2749                         while (result == ISC_R_SUCCESS && j != 0) {
2750                                 j--;
2751                                 level_node = search->chain.levels[j];
2752                                 dns_name_init(&name, NULL);
2753                                 dns_rbt_namefromnode(level_node, &name);
2754                                 result = dns_name_concatenate(wname,
2755                                                               &name,
2756                                                               wname,
2757                                                               NULL);
2758                         }
2759                         if (result != ISC_R_SUCCESS)
2760                                 break;
2761
2762                         wnode = NULL;
2763                         dns_rbtnodechain_init(&wchain, NULL);
2764                         result = dns_rbt_findnode(rbtdb->tree, wname,
2765                                                   NULL, &wnode, &wchain,
2766                                                   DNS_RBTFIND_EMPTYDATA,
2767                                                   NULL, NULL);
2768                         if (result == ISC_R_SUCCESS) {
2769                                 nodelock_t *lock;
2770
2771                                 /*
2772                                  * We have found the wildcard node.  If it
2773                                  * is active in the search's version, we're
2774                                  * done.
2775                                  */
2776                                 lock = &rbtdb->node_locks[wnode->locknum].lock;
2777                                 NODE_LOCK(lock, isc_rwlocktype_read);
2778                                 for (header = wnode->data;
2779                                      header != NULL;
2780                                      header = header->next) {
2781                                         if (header->serial <= search->serial &&
2782                                             !IGNORE(header) && EXISTS(header))
2783                                                 break;
2784                                 }
2785                                 NODE_UNLOCK(lock, isc_rwlocktype_read);
2786                                 if (header != NULL ||
2787                                     activeempty(search, &wchain, wname)) {
2788                                         if (activeemtpynode(search, qname,
2789                                                             wname)) {
2790                                                 return (ISC_R_NOTFOUND);
2791                                         }
2792                                         /*
2793                                          * The wildcard node is active!
2794                                          *
2795                                          * Note: result is still ISC_R_SUCCESS
2796                                          * so we don't have to set it.
2797                                          */
2798                                         *nodep = wnode;
2799                                         break;
2800                                 }
2801                         } else if (result != ISC_R_NOTFOUND &&
2802                                    result != DNS_R_PARTIALMATCH) {
2803                                 /*
2804                                  * An error has occurred.  Bail out.
2805                                  */
2806                                 break;
2807                         }
2808                 }
2809
2810                 if (active) {
2811                         /*
2812                          * The level node is active.  Any wildcarding
2813                          * present at higher levels has no
2814                          * effect and we're done.
2815                          */
2816                         result = ISC_R_NOTFOUND;
2817                         break;
2818                 }
2819
2820                 if (i > 0) {
2821                         i--;
2822                         node = search->chain.levels[i];
2823                 } else
2824                         done = ISC_TRUE;
2825         } while (!done);
2826
2827         return (result);
2828 }
2829
2830 static inline isc_result_t
2831 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
2832                   dns_name_t *foundname, dns_rdataset_t *rdataset,
2833                   dns_rdataset_t *sigrdataset, isc_boolean_t need_sig)
2834 {
2835         dns_rbtnode_t *node;
2836         rdatasetheader_t *header, *header_next, *found, *foundsig;
2837         isc_boolean_t empty_node;
2838         isc_result_t result;
2839         dns_fixedname_t fname, forigin;
2840         dns_name_t *name, *origin;
2841
2842         do {
2843                 node = NULL;
2844                 dns_fixedname_init(&fname);
2845                 name = dns_fixedname_name(&fname);
2846                 dns_fixedname_init(&forigin);
2847                 origin = dns_fixedname_name(&forigin);
2848                 result = dns_rbtnodechain_current(&search->chain, name,
2849                                                   origin, &node);
2850                 if (result != ISC_R_SUCCESS)
2851                         return (result);
2852                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2853                           isc_rwlocktype_read);
2854                 found = NULL;
2855                 foundsig = NULL;
2856                 empty_node = ISC_TRUE;
2857                 for (header = node->data;
2858                      header != NULL;
2859                      header = header_next) {
2860                         header_next = header->next;
2861                         /*
2862                          * Look for an active, extant NSEC or RRSIG NSEC.
2863                          */
2864                         do {
2865                                 if (header->serial <= search->serial &&
2866                                     !IGNORE(header)) {
2867                                         /*
2868                                          * Is this a "this rdataset doesn't
2869                                          * exist" record?
2870                                          */
2871                                         if (NONEXISTENT(header))
2872                                                 header = NULL;
2873                                         break;
2874                                 } else
2875                                         header = header->down;
2876                         } while (header != NULL);
2877                         if (header != NULL) {
2878                                 /*
2879                                  * We now know that there is at least one
2880                                  * active rdataset at this node.
2881                                  */
2882                                 empty_node = ISC_FALSE;
2883                                 if (header->type == dns_rdatatype_nsec) {
2884                                         found = header;
2885                                         if (foundsig != NULL)
2886                                                 break;
2887                                 } else if (header->type ==
2888                                            RBTDB_RDATATYPE_SIGNSEC) {
2889                                         foundsig = header;
2890                                         if (found != NULL)
2891                                                 break;
2892                                 }
2893                         }
2894                 }
2895                 if (!empty_node) {
2896                         if (found != NULL &&
2897                             (foundsig != NULL || !need_sig))
2898                         {
2899                                 /*
2900                                  * We've found the right NSEC record.
2901                                  *
2902                                  * Note: for this to really be the right
2903                                  * NSEC record, it's essential that the NSEC
2904                                  * records of any nodes obscured by a zone
2905                                  * cut have been removed; we assume this is
2906                                  * the case.
2907                                  */
2908                                 result = dns_name_concatenate(name, origin,
2909                                                               foundname, NULL);
2910                                 if (result == ISC_R_SUCCESS) {
2911                                         if (nodep != NULL) {
2912                                                 new_reference(search->rbtdb,
2913                                                               node);
2914                                                 *nodep = node;
2915                                         }
2916                                         bind_rdataset(search->rbtdb, node,
2917                                                       found, search->now,
2918                                                       rdataset);
2919                                         if (foundsig != NULL)
2920                                                 bind_rdataset(search->rbtdb,
2921                                                               node,
2922                                                               foundsig,
2923                                                               search->now,
2924                                                               sigrdataset);
2925                                 }
2926                         } else if (found == NULL && foundsig == NULL) {
2927                                 /*
2928                                  * This node is active, but has no NSEC or
2929                                  * RRSIG NSEC.  That means it's glue or
2930                                  * other obscured zone data that isn't
2931                                  * relevant for our search.  Treat the
2932                                  * node as if it were empty and keep looking.
2933                                  */
2934                                 empty_node = ISC_TRUE;
2935                                 result = dns_rbtnodechain_prev(&search->chain,
2936                                                                NULL, NULL);
2937                         } else {
2938                                 /*
2939                                  * We found an active node, but either the
2940                                  * NSEC or the RRSIG NSEC is missing.  This
2941                                  * shouldn't happen.
2942                                  */
2943                                 result = DNS_R_BADDB;
2944                         }
2945                 } else {
2946                         /*
2947                          * This node isn't active.  We've got to keep
2948                          * looking.
2949                          */
2950                         result = dns_rbtnodechain_prev(&search->chain, NULL,
2951                                                        NULL);
2952                 }
2953                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2954                             isc_rwlocktype_read);
2955         } while (empty_node && result == ISC_R_SUCCESS);
2956
2957         /*
2958          * If the result is ISC_R_NOMORE, then we got to the beginning of
2959          * the database and didn't find a NSEC record.  This shouldn't
2960          * happen.
2961          */
2962         if (result == ISC_R_NOMORE)
2963                 result = DNS_R_BADDB;
2964
2965         return (result);
2966 }
2967
/*
 * Find the rdataset of type 'type' at 'name' as seen from 'version'.
 *
 * If 'version' is NULL the current version is attached for the duration
 * of the call and closed again before returning.  'now' is ignored.
 *
 * The tree is searched with the tree lock held for reading; the node that
 * is examined is additionally read-locked while its rdatasets are scanned.
 *
 * Results produced directly by this function:
 *	ISC_R_SUCCESS	the requested type was found
 *	DNS_R_CNAME	a CNAME was found instead of the requested type
 *	DNS_R_GLUE	data was found while a zone cut is recorded
 *	DNS_R_ZONECUT	a dns_rdatatype_any lookup matched the zone cut node
 *	DNS_R_NXRRSET	the matched node lacks the requested type
 *	DNS_R_EMPTYNAME, DNS_R_NXDOMAIN
 *			no usable exact match; selected by activeempty()
 *	DNS_R_EMPTYWILD	a wildcard matched in a secure database but the
 *			node lacks NSEC or RRSIG NSEC, and
 *			find_closest_nsec() succeeded
 *	DNS_R_BADDB	a required NSEC or RRSIG NSEC is missing
 * Any other result is passed through unchanged from dns_rbt_findnode(),
 * find_wildcard(), dns_name_copy(), setup_delegation() or
 * find_closest_nsec().
 *
 * When this function itself stores a node in *nodep, a reference is held
 * on behalf of the caller (either a new one or the one taken at the zone
 * cut).  'rdataset' and 'sigrdataset' are filled in through
 * bind_rdataset(); nothing is bound for the answer when 'type' is
 * dns_rdatatype_any.  DNS_NAMEATTR_WILDCARD is set on 'foundname' when
 * the answer came from a wildcard match.
 */
2968 static isc_result_t
2969 zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
2970           dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
2971           dns_dbnode_t **nodep, dns_name_t *foundname,
2972           dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
2973 {
2974         dns_rbtnode_t *node = NULL;
2975         isc_result_t result;
2976         rbtdb_search_t search;
2977         isc_boolean_t cname_ok = ISC_TRUE;
2978         isc_boolean_t close_version = ISC_FALSE;
2979         isc_boolean_t maybe_zonecut = ISC_FALSE;
2980         isc_boolean_t at_zonecut = ISC_FALSE;
2981         isc_boolean_t wild;
2982         isc_boolean_t empty_node;
2983         rdatasetheader_t *header, *header_next, *found, *nsecheader;
2984         rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
2985         rbtdb_rdatatype_t sigtype;
2986         isc_boolean_t active;
2987         dns_rbtnodechain_t chain;
2988         nodelock_t *lock;
2989
2990
2991         search.rbtdb = (dns_rbtdb_t *)db;
2992
2993         REQUIRE(VALID_RBTDB(search.rbtdb));
2994
2995         /*
2996          * We don't care about 'now'.
2997          */
2998         UNUSED(now);
2999
3000         /*
3001          * If the caller didn't supply a version, attach to the current
3002          * version.
3003          */
3004         if (version == NULL) {
3005                 currentversion(db, &version);
3006                 close_version = ISC_TRUE;
3007         }
3008
3009         search.rbtversion = version;
3010         search.serial = search.rbtversion->serial;
3011         search.options = options;
3012         search.copy_name = ISC_FALSE;
3013         search.need_cleanup = ISC_FALSE;
3014         search.wild = ISC_FALSE;
3015         search.zonecut = NULL;
3016         dns_fixedname_init(&search.zonecut_name);
3017         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3018         search.now = 0;
3019
3020         /*
3021          * 'wild' will be true iff. we've matched a wildcard.
3022          */
3023         wild = ISC_FALSE;
3024
3025         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3026
3027         /*
3028          * Search down from the root of the tree.  If, while going down, we
3029          * encounter a callback node, zone_zonecut_callback() will search the
3030          * rdatasets at the zone cut for active DNAME or NS rdatasets.
3031          */
3032         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3033                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3034                                   zone_zonecut_callback, &search);
3035
3036         if (result == DNS_R_PARTIALMATCH) {
3037         partial_match:
3038                 if (search.zonecut != NULL) {
3039                     result = setup_delegation(&search, nodep, foundname,
3040                                               rdataset, sigrdataset);
3041                     goto tree_exit;
3042                 }
3043
3044                 if (search.wild) {
3045                         /*
3046                          * At least one of the levels in the search chain
3047                          * potentially has a wildcard.  For each such level,
3048                          * we must see if there's a matching wildcard active
3049                          * in the current version.
3050                          */
3051                         result = find_wildcard(&search, &node, name);
3052                         if (result == ISC_R_SUCCESS) {
3053                                 result = dns_name_copy(name, foundname, NULL);
3054                                 if (result != ISC_R_SUCCESS)
3055                                         goto tree_exit;
3056                                 wild = ISC_TRUE;
3057                                 goto found;
3058                         }
3059                         else if (result != ISC_R_NOTFOUND)
3060                                 goto tree_exit;
3061                 }
3062
                     /*
                      * activeempty() is handed a copy of the chain rather
                      * than search.chain itself.
                      * NOTE(review): presumably so that search.chain is
                      * still positioned for find_closest_nsec() below --
                      * confirm against activeempty().
                      */
3063                 chain = search.chain;
3064                 active = activeempty(&search, &chain, name);
3065
3066                 /*
3067                  * If we're here, then the name does not exist, is not
3068                  * beneath a zonecut, and there's no matching wildcard.
3069                  */
3070                 if (search.rbtdb->secure ||
3071                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3072                 {
3073                         result = find_closest_nsec(&search, nodep, foundname,
3074                                                    rdataset, sigrdataset,
3075                                                    search.rbtdb->secure);
3076                         if (result == ISC_R_SUCCESS)
3077                                 result = active ? DNS_R_EMPTYNAME :
3078                                                   DNS_R_NXDOMAIN;
3079                 } else
3080                         result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
3081                 goto tree_exit;
3082         } else if (result != ISC_R_SUCCESS)
3083                 goto tree_exit;
3084
3085  found:
3086         /*
3087          * We have found a node whose name is the desired name, or we
3088          * have matched a wildcard.
3089          */
3090
3091         if (search.zonecut != NULL) {
3092                 /*
3093                  * If we're beneath a zone cut, we don't want to look for
3094                  * CNAMEs because they're not legitimate zone glue.
3095                  */
3096                 cname_ok = ISC_FALSE;
3097         } else {
3098                 /*
3099                  * The node may be a zone cut itself.  If it might be one,
3100                  * make sure we check for it later.
3101                  */
3102                 if (node->find_callback &&
3103                     (node != search.rbtdb->origin_node ||
3104                      IS_STUB(search.rbtdb)) &&
3105                     !dns_rdatatype_atparent(type))
3106                         maybe_zonecut = ISC_TRUE;
3107         }
3108
3109         /*
3110          * Certain DNSSEC types are not subject to CNAME matching
3111          * (RFC4035, section 2.5 and RFC3007).
3112          *
3113          * We don't check for RRSIG, because we don't store RRSIG records
3114          * directly.
3115          */
3116         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3117                 cname_ok = ISC_FALSE;
3118
3119         /*
3120          * We now go looking for rdata...
3121          */
3122
3123         NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3124                   isc_rwlocktype_read);
3125
3126         found = NULL;
3127         foundsig = NULL;
3128         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3129         nsecheader = NULL;
3130         nsecsig = NULL;
3131         cnamesig = NULL;
3132         empty_node = ISC_TRUE;
3133         for (header = node->data; header != NULL; header = header_next) {
3134                 header_next = header->next;
3135                 /*
3136                  * Look for an active, extant rdataset.
3137                  */
                     /*
                      * Follow header->down until a header whose serial is
                      * not newer than search.serial and that is not marked
                      * IGNORE is reached; a NONEXISTENT header at that
                      * point means the type is absent in this version.
                      */
3138                 do {
3139                         if (header->serial <= search.serial &&
3140                             !IGNORE(header)) {
3141                                 /*
3142                                  * Is this a "this rdataset doesn't
3143                                  * exist" record?
3144                                  */
3145                                 if (NONEXISTENT(header))
3146                                         header = NULL;
3147                                 break;
3148                         } else
3149                                 header = header->down;
3150                 } while (header != NULL);
3151                 if (header != NULL) {
3152                         /*
3153                          * We now know that there is at least one active
3154                          * rdataset at this node.
3155                          */
3156                         empty_node = ISC_FALSE;
3157
3158                         /*
3159                          * Do special zone cut handling, if requested.
3160                          */
3161                         if (maybe_zonecut &&
3162                             header->type == dns_rdatatype_ns) {
3163                                 /*
3164                                  * We increment the reference count on node to
3165                                  * ensure that search.zonecut_rdataset will
3166                                  * still be valid later.
3167                                  */
3168                                 new_reference(search.rbtdb, node);
3169                                 search.zonecut = node;
3170                                 search.zonecut_rdataset = header;
3171                                 search.zonecut_sigrdataset = NULL;
3172                                 search.need_cleanup = ISC_TRUE;
3173                                 maybe_zonecut = ISC_FALSE;
3174                                 at_zonecut = ISC_TRUE;
3175                                 /*
3176                                  * It is not clear if KEY should still be
3177                                  * allowed at the parent side of the zone
3178                                  * cut or not.  It is needed for RFC3007
3179                                  * validated updates.
3180                                  */
3181                                 if ((search.options & DNS_DBFIND_GLUEOK) == 0
3182                                     && type != dns_rdatatype_nsec
3183                                     && type != dns_rdatatype_key) {
3184                                         /*
3185                                          * Glue is not OK, but any answer we
3186                                          * could return would be glue.  Return
3187                                          * the delegation.
3188                                          */
3189                                         found = NULL;
3190                                         break;
3191                                 }
3192                                 if (found != NULL && foundsig != NULL)
3193                                         break;
3194                         }
3195
3196                         /*
3197                          * If we found a type we were looking for,
3198                          * remember it.
3199                          */
3200                         if (header->type == type ||
3201                             type == dns_rdatatype_any ||
3202                             (header->type == dns_rdatatype_cname &&
3203                              cname_ok)) {
3204                                 /*
3205                                  * We've found the answer!
3206                                  */
3207                                 found = header;
3208                                 if (header->type == dns_rdatatype_cname &&
3209                                     cname_ok) {
3210                                         /*
3211                                          * We may be finding a CNAME instead
3212                                          * of the desired type.
3213                                          *
3214                                          * If we've already got the CNAME RRSIG,
3215                                          * use it, otherwise change sigtype
3216                                          * so that we find it.
3217                                          */
3218                                         if (cnamesig != NULL)
3219                                                 foundsig = cnamesig;
3220                                         else
3221                                                 sigtype =
3222                                                     RBTDB_RDATATYPE_SIGCNAME;
3223                                 }
3224                                 /*
3225                                  * If we've got all we need, end the search.
3226                                  */
3227                                 if (!maybe_zonecut && foundsig != NULL)
3228                                         break;
3229                         } else if (header->type == sigtype) {
3230                                 /*
3231                                  * We've found the RRSIG rdataset for our
3232                                  * target type.  Remember it.
3233                                  */
3234                                 foundsig = header;
3235                                 /*
3236                                  * If we've got all we need, end the search.
3237                                  */
3238                                 if (!maybe_zonecut && found != NULL)
3239                                         break;
3240                         } else if (header->type == dns_rdatatype_nsec) {
3241                                 /*
3242                                  * Remember a NSEC rdataset even if we're
3243                                  * not specifically looking for it, because
3244                                  * we might need it later.
3245                                  */
3246                                 nsecheader = header;
3247                         } else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
3248                                 /*
3249                                  * If we need the NSEC rdataset, we'll also
3250                                  * need its signature.
3251                                  */
3252                                 nsecsig = header;
3253                         } else if (cname_ok &&
3254                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
3255                                 /*
3256                                  * If we get a CNAME match, we'll also need
3257                                  * its signature.
3258                                  */
3259                                 cnamesig = header;
3260                         }
3261                 }
3262         }
3263
3264         if (empty_node) {
3265                 /*
3266                  * We have an exact match for the name, but there are no
3267                  * active rdatasets in the desired version.  That means that
3268                  * this node doesn't exist in the desired version, and that
3269                  * we really have a partial match.
3270                  */
3271                 if (!wild) {
                             /*
                              * The node lock is released before jumping:
                              * the partial_match path leaves through
                              * tree_exit, which releases only the tree lock.
                              */
3272                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3273                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3274                         goto partial_match;
3275                 }
3276         }
3277
3278         /*
3279          * If we didn't find what we were looking for...
3280          */
3281         if (found == NULL) {
3282                 if (search.zonecut != NULL) {
3283                         /*
3284                          * We were trying to find glue at a node beneath a
3285                          * zone cut, but didn't.
3286                          *
3287                          * Return the delegation.
3288                          */
3289                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3290                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3291                         result = setup_delegation(&search, nodep, foundname,
3292                                                   rdataset, sigrdataset);
3293                         goto tree_exit;
3294                 }
3295                 /*
3296                  * The desired type doesn't exist.
3297                  */
3298                 result = DNS_R_NXRRSET;
3299                 if (search.rbtdb->secure &&
3300                     (nsecheader == NULL || nsecsig == NULL)) {
3301                         /*
3302                          * The zone is secure but there's no NSEC,
3303                          * or the NSEC has no signature!
3304                          */
3305                         if (!wild) {
3306                                 result = DNS_R_BADDB;
3307                                 goto node_exit;
3308                         }
3309
3310                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3311                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3312                         result = find_closest_nsec(&search, nodep, foundname,
3313                                                    rdataset, sigrdataset,
3314                                                    search.rbtdb->secure);
3315                         if (result == ISC_R_SUCCESS)
3316                                 result = DNS_R_EMPTYWILD;
3317                         goto tree_exit;
3318                 }
3319                 if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
3320                     nsecheader == NULL)
3321                 {
3322                         /*
3323                          * There's no NSEC record, and we were told
3324                          * to find one.
3325                          */
3326                         result = DNS_R_BADDB;
3327                         goto node_exit;
3328                 }
3329                 if (nodep != NULL) {
3330                         new_reference(search.rbtdb, node);
3331                         *nodep = node;
3332                 }
                     /*
                      * nsecheader cannot be NULL here: both the secure case
                      * and the DNS_DBFIND_FORCENSEC case left the function
                      * above when it was missing.
                      */
3333                 if (search.rbtdb->secure ||
3334                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3335                 {
3336                         bind_rdataset(search.rbtdb, node, nsecheader,
3337                                       0, rdataset);
3338                         if (nsecsig != NULL)
3339                                 bind_rdataset(search.rbtdb, node,
3340                                               nsecsig, 0, sigrdataset);
3341                 }
3342                 if (wild)
3343                         foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3344                 goto node_exit;
3345         }
3346
3347         /*
3348          * We found what we were looking for, or we found a CNAME.
3349          */
3350
3351         if (type != found->type &&
3352             type != dns_rdatatype_any &&
3353             found->type == dns_rdatatype_cname) {
3354                 /*
3355                  * We weren't doing an ANY query and we found a CNAME instead
3356                  * of the type we were looking for, so we need to indicate
3357                  * that result to the caller.
3358                  */
3359                 result = DNS_R_CNAME;
3360         } else if (search.zonecut != NULL) {
3361                 /*
3362                  * If we're beneath a zone cut, we must indicate that the
3363                  * result is glue, unless we're actually at the zone cut
3364                  * and the type is NSEC or KEY.
3365                  */
3366                 if (search.zonecut == node) {
3367                         /*
3368                          * It is not clear if KEY should still be
3369                          * allowed at the parent side of the zone
3370                          * cut or not.  It is needed for RFC3007
3371                          * validated updates.
3372                          */
3373                         if (type == dns_rdatatype_nsec ||
3374                             type == dns_rdatatype_key)
3375                                 result = ISC_R_SUCCESS;
3376                         else if (type == dns_rdatatype_any)
3377                                 result = DNS_R_ZONECUT;
3378                         else
3379                                 result = DNS_R_GLUE;
3380                 } else
3381                         result = DNS_R_GLUE;
3382                 /*
3383                  * We might have found data that isn't glue, but was occluded
3384                  * by a dynamic update.  If the caller cares about this, they
3385                  * will have told us to validate glue.
3386                  *
3387                  * XXX We should cache the glue validity state!
3388                  */
3389                 if (result == DNS_R_GLUE &&
3390                     (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
3391                     !valid_glue(&search, foundname, type, node)) {
3392                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3393                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3394                         result = setup_delegation(&search, nodep, foundname,
3395                                                   rdataset, sigrdataset);
3396                     goto tree_exit;
3397                 }
3398         } else {
3399                 /*
3400                  * An ordinary successful query!
3401                  */
3402                 result = ISC_R_SUCCESS;
3403         }
3404
             /*
              * At a zone cut the scan loop already took a reference on this
              * node (tracked by need_cleanup); that reference is handed to
              * the caller instead of taking a second one.
              */
3405         if (nodep != NULL) {
3406                 if (!at_zonecut)
3407                         new_reference(search.rbtdb, node);
3408                 else
3409                         search.need_cleanup = ISC_FALSE;
3410                 *nodep = node;
3411         }
3412
             /*
              * For dns_rdatatype_any no rdataset is bound here; only the
              * node (if requested) and the result code are returned.
              */
3413         if (type != dns_rdatatype_any) {
3414                 bind_rdataset(search.rbtdb, node, found, 0, rdataset);
3415                 if (foundsig != NULL)
3416                         bind_rdataset(search.rbtdb, node, foundsig, 0,
3417                                       sigrdataset);
3418         }
3419
3420         if (wild)
3421                 foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3422
3423  node_exit:
3424         NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3425                     isc_rwlocktype_read);
3426
3427  tree_exit:
3428         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3429
3430         /*
3431          * If we found a zonecut but aren't going to use it, we have to
3432          * let go of it.
3433          */
3434         if (search.need_cleanup) {
3435                 node = search.zonecut;
3436                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
3437
3438                 NODE_LOCK(lock, isc_rwlocktype_read);
3439                 decrement_reference(search.rbtdb, node, 0,
3440                                     isc_rwlocktype_read, isc_rwlocktype_none,
3441                                     ISC_FALSE);
3442                 NODE_UNLOCK(lock, isc_rwlocktype_read);
3443         }
3444
3445         if (close_version)
3446                 closeversion(db, &version, ISC_FALSE);
3447
3448         dns_rbtnodechain_reset(&search.chain);
3449
3450         return (result);
3451 }
3452
/*
 * Placeholder with the findzonecut signature.  Every argument is ignored;
 * the function reports a fatal error naming itself and, if FATAL_ERROR()
 * returns (it is defined elsewhere), yields ISC_R_NOTIMPLEMENTED.
 *
 * NOTE(review): judging by the zone_ prefix this is the zone-database
 * variant, which is never expected to be called -- confirm against the
 * method table that references it.
 */
3453 static isc_result_t
3454 zone_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
3455                  isc_stdtime_t now, dns_dbnode_t **nodep,
3456                  dns_name_t *foundname,
3457                  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3458 {
3459         UNUSED(db);
3460         UNUSED(name);
3461         UNUSED(options);
3462         UNUSED(now);
3463         UNUSED(nodep);
3464         UNUSED(foundname);
3465         UNUSED(rdataset);
3466         UNUSED(sigrdataset);
3467
3468         FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
3469
3470         return (ISC_R_NOTIMPLEMENTED);
3471 }
3472
/*
 * Inspect 'node' for a usable DNAME on behalf of the search passed in
 * 'arg' (an rbtdb_search_t).  'name' is not used.
 *
 * NOTE(review): presumably handed to dns_rbt_findnode() by the cache
 * lookup code, the way zone_zonecut_callback() is used by zone_find() --
 * the call site is outside this excerpt.
 *
 * The node is read-locked and its header list is walked once:
 *   - a header whose rdh_ttl is <= search->now is never used as an answer.
 *     If it is also older than search->now - RBTDB_VIRTUAL and the node
 *     lock is (or can be upgraded to) a write lock, the header is unlinked
 *     and freed when the node has no references, or otherwise marked
 *     RDATASET_ATTR_STALE with the node flagged dirty;
 *   - otherwise an existing DNAME or RRSIG DNAME header is remembered.
 *
 * Returns DNS_R_PARTIALMATCH after recording the node and headers in
 * 'search' (with a new node reference and need_cleanup set) when a DNAME
 * was found and its trust is not dns_trust_pending or the search allows
 * DNS_DBFIND_PENDINGOK; returns DNS_R_CONTINUE otherwise.
 */
3473 static isc_result_t
3474 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
3475         rbtdb_search_t *search = arg;
3476         rdatasetheader_t *header, *header_prev, *header_next;
3477         rdatasetheader_t *dname_header, *sigdname_header;
3478         isc_result_t result;
3479         nodelock_t *lock;
3480         isc_rwlocktype_t locktype;
3481
3482         /* No zone cut may have been recorded in this search yet. */
3483
3484         REQUIRE(search->zonecut == NULL);
3485
3486         /*
3487          * Keep compiler silent.
3488          */
3489         UNUSED(name);
3490
3491         lock = &(search->rbtdb->node_locks[node->locknum].lock);
3492         locktype = isc_rwlocktype_read;
3493         NODE_LOCK(lock, locktype);
3494
3495         /*
3496          * Look for a DNAME or RRSIG DNAME rdataset.
3497          */
3498         dname_header = NULL;
3499         sigdname_header = NULL;
3500         header_prev = NULL;
             /*
              * header_prev trails the walk so that a freed header can be
              * unlinked; it is advanced in every branch except the one
              * that frees the current header.
              */
3501         for (header = node->data; header != NULL; header = header_next) {
3502                 header_next = header->next;
3503                 if (header->rdh_ttl <= search->now) {
3504                         /*
3505                          * This rdataset is stale.  If no one else is
3506                          * using the node, we can clean it up right
3507                          * now, otherwise we mark it as stale, and
3508                          * the node as dirty, so it will get cleaned
3509                          * up later.
3510                          */
3511                         if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) &&
3512                             (locktype == isc_rwlocktype_write ||
3513                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3514                                 /*
3515                                  * We update the node's status only when we
3516                                  * can get write access; otherwise, we leave
3517                                  * others to this work.  Periodical cleaning
3518                                  * will eventually take the job as the last
3519                                  * resort.
3520                                  * We won't downgrade the lock, since other
3521                                  * rdatasets are probably stale, too.
3522                                  */
3523                                 locktype = isc_rwlocktype_write;
3524
3525                                 if (dns_rbtnode_refcurrent(node) == 0) {
3526                                         isc_mem_t *mctx;
3527
3528                                         /*
3529                                          * header->down can be non-NULL if the
3530                                          * refcount has just decremented to 0
3531                                          * but decrement_reference() has not
3532                                          * performed clean_cache_node(), in
3533                                          * which case we need to purge the
3534                                          * stale headers first.
3535                                          */
3536                                         mctx = search->rbtdb->common.mctx;
3537                                         clean_stale_headers(search->rbtdb,
3538                                                             mctx,
3539                                                             header);
3540                                         if (header_prev != NULL)
3541                                                 header_prev->next =
3542                                                         header->next;
3543                                         else
3544                                                 node->data = header->next;
3545                                         free_rdataset(search->rbtdb, mctx,
3546                                                       header);
3547                                 } else {
3548                                         header->attributes |=
3549                                                 RDATASET_ATTR_STALE;
3550                                         node->dirty = 1;
3551                                         header_prev = header;
3552                                 }
3553                         } else
3554                                 header_prev = header;
3555                 } else if (header->type == dns_rdatatype_dname &&
3556                            EXISTS(header)) {
3557                         dname_header = header;
3558                         header_prev = header;
3559                 } else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
3560                          EXISTS(header)) {
3561                         sigdname_header = header;
3562                         header_prev = header;
3563                 } else
3564                         header_prev = header;
3565         }
3566
             /*
              * A DNAME with pending trust is used only when the search
              * options include DNS_DBFIND_PENDINGOK.
              */
3567         if (dname_header != NULL &&
3568             (dname_header->trust != dns_trust_pending ||
3569              (search->options & DNS_DBFIND_PENDINGOK) != 0)) {
3570                 /*
3571                  * We increment the reference count on node to ensure that
3572                  * search->zonecut_rdataset will still be valid later.
3573                  */
3574                 new_reference(search->rbtdb, node);
3575                 INSIST(!ISC_LINK_LINKED(node, deadlink));
3576                 search->zonecut = node;
3577                 search->zonecut_rdataset = dname_header;
3578                 search->zonecut_sigrdataset = sigdname_header;
3579                 search->need_cleanup = ISC_TRUE;
3580                 result = DNS_R_PARTIALMATCH;
3581         } else
3582                 result = DNS_R_CONTINUE;
3583
             /*
              * locktype is isc_rwlocktype_write here if the lock was
              * upgraded inside the loop, isc_rwlocktype_read otherwise.
              */
3584         NODE_UNLOCK(lock, locktype);
3585
3586         return (result);
3587 }
3588
3589 static inline isc_result_t
3590 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
3591                      dns_dbnode_t **nodep, dns_name_t *foundname,
3592                      dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3593 {
3594         unsigned int i;
3595         dns_rbtnode_t *level_node;
3596         rdatasetheader_t *header, *header_prev, *header_next;
3597         rdatasetheader_t *found, *foundsig;
3598         isc_result_t result = ISC_R_NOTFOUND;
3599         dns_name_t name;
3600         dns_rbtdb_t *rbtdb;
3601         isc_boolean_t done;
3602         nodelock_t *lock;
3603         isc_rwlocktype_t locktype;
3604
3605         /*
3606          * Caller must be holding the tree lock.
3607          */
3608
3609         rbtdb = search->rbtdb;
3610         i = search->chain.level_matches;
3611         done = ISC_FALSE;
3612         do {
3613                 locktype = isc_rwlocktype_read;
3614                 lock = &rbtdb->node_locks[node->locknum].lock;
3615                 NODE_LOCK(lock, locktype);
3616
3617                 /*
3618                  * Look for NS and RRSIG NS rdatasets.
3619                  */
3620                 found = NULL;
3621                 foundsig = NULL;
3622                 header_prev = NULL;
3623                 for (header = node->data;
3624                      header != NULL;
3625                      header = header_next) {
3626                         header_next = header->next;
3627                         if (header->rdh_ttl <= search->now) {
3628                                 /*
3629                                  * This rdataset is stale.  If no one else is
3630                                  * using the node, we can clean it up right
3631                                  * now, otherwise we mark it as stale, and
3632                                  * the node as dirty, so it will get cleaned
3633                                  * up later.
3634                                  */
3635                                 if ((header->rdh_ttl <= search->now -
3636                                                     RBTDB_VIRTUAL) &&
3637                                     (locktype == isc_rwlocktype_write ||
3638                                      NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3639                                         /*
3640                                          * We update the node's status only
3641                                          * when we can get write access.
3642                                          */
3643                                         locktype = isc_rwlocktype_write;
3644
3645                                         if (dns_rbtnode_refcurrent(node)
3646                                             == 0) {
3647                                                 isc_mem_t *m;
3648
3649                                                 m = search->rbtdb->common.mctx;
3650                                                 clean_stale_headers(
3651                                                         search->rbtdb,
3652                                                         m, header);
3653                                                 if (header_prev != NULL)
3654                                                         header_prev->next =
3655                                                                 header->next;
3656                                                 else
3657                                                         node->data =
3658                                                                 header->next;
3659                                                 free_rdataset(rbtdb, m,
3660                                                               header);
3661                                         } else {
3662                                                 header->attributes |=
3663                                                         RDATASET_ATTR_STALE;
3664                                                 node->dirty = 1;
3665                                                 header_prev = header;
3666                                         }
3667                                 } else
3668                                         header_prev = header;
3669                         } else if (EXISTS(header)) {
3670                                 /*
3671                                  * We've found an extant rdataset.  See if
3672                                  * we're interested in it.
3673                                  */
3674                                 if (header->type == dns_rdatatype_ns) {
3675                                         found = header;
3676                                         if (foundsig != NULL)
3677                                                 break;
3678                                 } else if (header->type ==
3679                                            RBTDB_RDATATYPE_SIGNS) {
3680                                         foundsig = header;
3681                                         if (found != NULL)
3682                                                 break;
3683                                 }
3684                                 header_prev = header;
3685                         } else
3686                                 header_prev = header;
3687                 }
3688
3689                 if (found != NULL) {
3690                         /*
3691                          * If we have to set foundname, we do it before
3692                          * anything else.  If we were to set foundname after
3693                          * we had set nodep or bound the rdataset, then we'd
3694                          * have to undo that work if dns_name_concatenate()
3695                          * failed.  By setting foundname first, there's
3696                          * nothing to undo if we have trouble.
3697                          */
3698                         if (foundname != NULL) {
3699                                 dns_name_init(&name, NULL);
3700                                 dns_rbt_namefromnode(node, &name);
3701                                 result = dns_name_copy(&name, foundname, NULL);
3702                                 while (result == ISC_R_SUCCESS && i > 0) {
3703                                         i--;
3704                                         level_node = search->chain.levels[i];
3705                                         dns_name_init(&name, NULL);
3706                                         dns_rbt_namefromnode(level_node,
3707                                                              &name);
3708                                         result =
3709                                                 dns_name_concatenate(foundname,
3710                                                                      &name,
3711                                                                      foundname,
3712                                                                      NULL);
3713                                 }
3714                                 if (result != ISC_R_SUCCESS) {
3715                                         *nodep = NULL;
3716                                         goto node_exit;
3717                                 }
3718                         }
3719                         result = DNS_R_DELEGATION;
3720                         if (nodep != NULL) {
3721                                 new_reference(search->rbtdb, node);
3722                                 *nodep = node;
3723                         }
3724                         bind_rdataset(search->rbtdb, node, found, search->now,
3725                                       rdataset);
3726                         if (foundsig != NULL)
3727                                 bind_rdataset(search->rbtdb, node, foundsig,
3728                                               search->now, sigrdataset);
3729                         if (need_headerupdate(found, search->now) ||
3730                             (foundsig != NULL &&
3731                              need_headerupdate(foundsig, search->now))) {
3732                                 if (locktype != isc_rwlocktype_write) {
3733                                         NODE_UNLOCK(lock, locktype);
3734                                         NODE_LOCK(lock, isc_rwlocktype_write);
3735                                         locktype = isc_rwlocktype_write;
3736                                 }
3737                                 if (need_headerupdate(found, search->now))
3738                                         update_header(search->rbtdb, found,
3739                                                       search->now);
3740                                 if (foundsig != NULL &&
3741                                     need_headerupdate(foundsig, search->now)) {
3742                                         update_header(search->rbtdb, foundsig,
3743                                                       search->now);
3744                                 }
3745                         }
3746                 }
3747
3748         node_exit:
3749                 NODE_UNLOCK(lock, locktype);
3750
3751                 if (found == NULL && i > 0) {
3752                         i--;
3753                         node = search->chain.levels[i];
3754                 } else
3755                         done = ISC_TRUE;
3756
3757         } while (!done);
3758
3759         return (result);
3760 }
3761
3762 static isc_result_t
3763 find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3764                   isc_stdtime_t now, dns_name_t *foundname,
3765                   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3766 {
3767         dns_rbtnode_t *node;
3768         rdatasetheader_t *header, *header_next, *header_prev;
3769         rdatasetheader_t *found, *foundsig;
3770         isc_boolean_t empty_node;
3771         isc_result_t result;
3772         dns_fixedname_t fname, forigin;
3773         dns_name_t *name, *origin;
3774         rbtdb_rdatatype_t matchtype, sigmatchtype;
3775         nodelock_t *lock;
3776         isc_rwlocktype_t locktype;
3777
3778         matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
3779         sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
3780                                              dns_rdatatype_nsec);
3781
3782         do {
3783                 node = NULL;
3784                 dns_fixedname_init(&fname);
3785                 name = dns_fixedname_name(&fname);
3786                 dns_fixedname_init(&forigin);
3787                 origin = dns_fixedname_name(&forigin);
3788                 result = dns_rbtnodechain_current(&search->chain, name,
3789                                                   origin, &node);
3790                 if (result != ISC_R_SUCCESS)
3791                         return (result);
3792                 locktype = isc_rwlocktype_read;
3793                 lock = &(search->rbtdb->node_locks[node->locknum].lock);
3794                 NODE_LOCK(lock, locktype);
3795                 found = NULL;
3796                 foundsig = NULL;
3797                 empty_node = ISC_TRUE;
3798                 header_prev = NULL;
3799                 for (header = node->data;
3800                      header != NULL;
3801                      header = header_next) {
3802                         header_next = header->next;
3803                         if (header->rdh_ttl <= now) {
3804                                 /*
3805                                  * This rdataset is stale.  If no one else is
3806                                  * using the node, we can clean it up right
3807                                  * now, otherwise we mark it as stale, and the
3808                                  * node as dirty, so it will get cleaned up
3809                                  * later.
3810                                  */
3811                                 if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3812                                     (locktype == isc_rwlocktype_write ||
3813                                      NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3814                                         /*
3815                                          * We update the node's status only
3816                                          * when we can get write access.
3817                                          */
3818                                         locktype = isc_rwlocktype_write;
3819
3820                                         if (dns_rbtnode_refcurrent(node)
3821                                             == 0) {
3822                                                 isc_mem_t *m;
3823
3824                                                 m = search->rbtdb->common.mctx;
3825                                                 clean_stale_headers(
3826                                                         search->rbtdb,
3827                                                         m, header);
3828                                                 if (header_prev != NULL)
3829                                                         header_prev->next =
3830                                                                 header->next;
3831                                                 else
3832                                                         node->data = header->next;
3833                                                 free_rdataset(search->rbtdb, m,
3834                                                               header);
3835                                         } else {
3836                                                 header->attributes |=
3837                                                         RDATASET_ATTR_STALE;
3838                                                 node->dirty = 1;
3839                                                 header_prev = header;
3840                                         }
3841                                 } else
3842                                         header_prev = header;
3843                                 continue;
3844                         }
3845                         if (NONEXISTENT(header) ||
3846                             RBTDB_RDATATYPE_BASE(header->type) == 0) {
3847                                 header_prev = header;
3848                                 continue;
3849                         }
3850                         empty_node = ISC_FALSE;
3851                         if (header->type == matchtype)
3852                                 found = header;
3853                         else if (header->type == sigmatchtype)
3854                                 foundsig = header;
3855                         header_prev = header;
3856                 }
3857                 if (found != NULL) {
3858                         result = dns_name_concatenate(name, origin,
3859                                                       foundname, NULL);
3860                         if (result != ISC_R_SUCCESS)
3861                                 goto unlock_node;
3862                         bind_rdataset(search->rbtdb, node, found,
3863                                       now, rdataset);
3864                         if (foundsig != NULL)
3865                                 bind_rdataset(search->rbtdb, node, foundsig,
3866                                               now, sigrdataset);
3867                         new_reference(search->rbtdb, node);
3868                         *nodep = node;
3869                         result = DNS_R_COVERINGNSEC;
3870                 } else if (!empty_node) {
3871                         result = ISC_R_NOTFOUND;
3872                 } else
3873                         result = dns_rbtnodechain_prev(&search->chain, NULL,
3874                                                        NULL);
3875  unlock_node:
3876                 NODE_UNLOCK(lock, locktype);
3877         } while (empty_node && result == ISC_R_SUCCESS);
3878         return (result);
3879 }
3880
3881 static isc_result_t
3882 cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
3883            dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
3884            dns_dbnode_t **nodep, dns_name_t *foundname,
3885            dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3886 {
3887         dns_rbtnode_t *node = NULL;
3888         isc_result_t result;
3889         rbtdb_search_t search;
3890         isc_boolean_t cname_ok = ISC_TRUE;
3891         isc_boolean_t empty_node;
3892         nodelock_t *lock;
3893         isc_rwlocktype_t locktype;
3894         rdatasetheader_t *header, *header_prev, *header_next;
3895         rdatasetheader_t *found, *nsheader;
3896         rdatasetheader_t *foundsig, *nssig, *cnamesig;
3897         rdatasetheader_t *update, *updatesig;
3898         rbtdb_rdatatype_t sigtype, negtype;
3899
3900         UNUSED(version);
3901
3902         search.rbtdb = (dns_rbtdb_t *)db;
3903
3904         REQUIRE(VALID_RBTDB(search.rbtdb));
3905         REQUIRE(version == NULL);
3906
3907         if (now == 0)
3908                 isc_stdtime_get(&now);
3909
3910         search.rbtversion = NULL;
3911         search.serial = 1;
3912         search.options = options;
3913         search.copy_name = ISC_FALSE;
3914         search.need_cleanup = ISC_FALSE;
3915         search.wild = ISC_FALSE;
3916         search.zonecut = NULL;
3917         dns_fixedname_init(&search.zonecut_name);
3918         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3919         search.now = now;
3920         update = NULL;
3921         updatesig = NULL;
3922
3923         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3924
3925         /*
3926          * Search down from the root of the tree.  If, while going down, we
3927          * encounter a callback node, cache_zonecut_callback() will search the
3928          * rdatasets at the zone cut for a DNAME rdataset.
3929          */
3930         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3931                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3932                                   cache_zonecut_callback, &search);
3933
3934         if (result == DNS_R_PARTIALMATCH) {
3935                 if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
3936                         result = find_coveringnsec(&search, nodep, now,
3937                                                    foundname, rdataset,
3938                                                    sigrdataset);
3939                         if (result == DNS_R_COVERINGNSEC)
3940                                 goto tree_exit;
3941                 }
3942                 if (search.zonecut != NULL) {
3943                     result = setup_delegation(&search, nodep, foundname,
3944                                               rdataset, sigrdataset);
3945                     goto tree_exit;
3946                 } else {
3947                 find_ns:
3948                         result = find_deepest_zonecut(&search, node, nodep,
3949                                                       foundname, rdataset,
3950                                                       sigrdataset);
3951                         goto tree_exit;
3952                 }
3953         } else if (result != ISC_R_SUCCESS)
3954                 goto tree_exit;
3955
3956         /*
3957          * Certain DNSSEC types are not subject to CNAME matching
3958          * (RFC4035, section 2.5 and RFC3007).
3959          *
3960          * We don't check for RRSIG, because we don't store RRSIG records
3961          * directly.
3962          */
3963         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3964                 cname_ok = ISC_FALSE;
3965
3966         /*
3967          * We now go looking for rdata...
3968          */
3969
3970         lock = &(search.rbtdb->node_locks[node->locknum].lock);
3971         locktype = isc_rwlocktype_read;
3972         NODE_LOCK(lock, locktype);
3973
3974         found = NULL;
3975         foundsig = NULL;
3976         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3977         negtype = RBTDB_RDATATYPE_VALUE(0, type);
3978         nsheader = NULL;
3979         nssig = NULL;
3980         cnamesig = NULL;
3981         empty_node = ISC_TRUE;
3982         header_prev = NULL;
3983         for (header = node->data; header != NULL; header = header_next) {
3984                 header_next = header->next;
3985                 if (header->rdh_ttl <= now) {
3986                         /*
3987                          * This rdataset is stale.  If no one else is using the
3988                          * node, we can clean it up right now, otherwise we
3989                          * mark it as stale, and the node as dirty, so it will
3990                          * get cleaned up later.
3991                          */
3992                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3993                             (locktype == isc_rwlocktype_write ||
3994                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3995                                 /*
3996                                  * We update the node's status only when we
3997                                  * can get write access.
3998                                  */
3999                                 locktype = isc_rwlocktype_write;
4000
4001                                 if (dns_rbtnode_refcurrent(node) == 0) {
4002                                         isc_mem_t *mctx;
4003
4004                                         mctx = search.rbtdb->common.mctx;
4005                                         clean_stale_headers(search.rbtdb, mctx,
4006                                                             header);
4007                                         if (header_prev != NULL)
4008                                                 header_prev->next =
4009                                                         header->next;
4010                                         else
4011                                                 node->data = header->next;
4012                                         free_rdataset(search.rbtdb, mctx,
4013                                                       header);
4014                                 } else {
4015                                         header->attributes |=
4016                                                 RDATASET_ATTR_STALE;
4017                                         node->dirty = 1;
4018                                         header_prev = header;
4019                                 }
4020                         } else
4021                                 header_prev = header;
4022                 } else if (EXISTS(header)) {
4023                         /*
4024                          * We now know that there is at least one active
4025                          * non-stale rdataset at this node.
4026                          */
4027                         empty_node = ISC_FALSE;
4028
4029                         /*
4030                          * If we found a type we were looking for, remember
4031                          * it.
4032                          */
4033                         if (header->type == type ||
4034                             (type == dns_rdatatype_any &&
4035                              RBTDB_RDATATYPE_BASE(header->type) != 0) ||
4036                             (cname_ok && header->type ==
4037                              dns_rdatatype_cname)) {
4038                                 /*
4039                                  * We've found the answer.
4040                                  */
4041                                 found = header;
4042                                 if (header->type == dns_rdatatype_cname &&
4043                                     cname_ok &&
4044                                     cnamesig != NULL) {
4045                                         /*
4046                                          * If we've already got the CNAME RRSIG,
4047                                          * use it, otherwise change sigtype
4048                                          * so that we find it.
4049                                          */
4050                                         if (cnamesig != NULL)
4051                                                 foundsig = cnamesig;
4052                                         else
4053                                                 sigtype =
4054                                                     RBTDB_RDATATYPE_SIGCNAME;
4055                                         foundsig = cnamesig;
4056                                 }
4057                         } else if (header->type == sigtype) {
4058                                 /*
4059                                  * We've found the RRSIG rdataset for our
4060                                  * target type.  Remember it.
4061                                  */
4062                                 foundsig = header;
4063                         } else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
4064                                    header->type == negtype) {
4065                                 /*
4066                                  * We've found a negative cache entry.
4067                                  */
4068                                 found = header;
4069                         } else if (header->type == dns_rdatatype_ns) {
4070                                 /*
4071                                  * Remember a NS rdataset even if we're
4072                                  * not specifically looking for it, because
4073                                  * we might need it later.
4074                                  */
4075                                 nsheader = header;
4076                         } else if (header->type == RBTDB_RDATATYPE_SIGNS) {
4077                                 /*
4078                                  * If we need the NS rdataset, we'll also
4079                                  * need its signature.
4080                                  */
4081                                 nssig = header;
4082                         } else if (cname_ok &&
4083                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
4084                                 /*
4085                                  * If we get a CNAME match, we'll also need
4086                                  * its signature.
4087                                  */
4088                                 cnamesig = header;
4089                         }
4090                         header_prev = header;
4091                 } else
4092                         header_prev = header;
4093         }
4094
4095         if (empty_node) {
4096                 /*
4097                  * We have an exact match for the name, but there are no
4098                  * extant rdatasets.  That means that this node doesn't
4099                  * meaningfully exist, and that we really have a partial match.
4100                  */
4101                 NODE_UNLOCK(lock, locktype);
4102                 goto find_ns;
4103         }
4104
4105         /*
4106          * If we didn't find what we were looking for...
4107          */
4108         if (found == NULL ||
4109             (found->trust == dns_trust_glue &&
4110              ((options & DNS_DBFIND_GLUEOK) == 0)) ||
4111             (found->trust == dns_trust_pending &&
4112              ((options & DNS_DBFIND_PENDINGOK) == 0))) {
4113                 /*
4114                  * If there is an NS rdataset at this node, then this is the
4115                  * deepest zone cut.
4116                  */
4117                 if (nsheader != NULL) {
4118                         if (nodep != NULL) {
4119                                 new_reference(search.rbtdb, node);
4120                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4121                                 *nodep = node;
4122                         }
4123                         bind_rdataset(search.rbtdb, node, nsheader, search.now,
4124                                       rdataset);
4125                         if (need_headerupdate(nsheader, search.now))
4126                                 update = nsheader;
4127                         if (nssig != NULL) {
4128                                 bind_rdataset(search.rbtdb, node, nssig,
4129                                               search.now, sigrdataset);
4130                                 if (need_headerupdate(nssig, search.now))
4131                                         updatesig = nssig;
4132                         }
4133                         result = DNS_R_DELEGATION;
4134                         goto node_exit;
4135                 }
4136
4137                 /*
4138                  * Go find the deepest zone cut.
4139                  */
4140                 NODE_UNLOCK(lock, locktype);
4141                 goto find_ns;
4142         }
4143
4144         /*
4145          * We found what we were looking for, or we found a CNAME.
4146          */
4147
4148         if (nodep != NULL) {
4149                 new_reference(search.rbtdb, node);
4150                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4151                 *nodep = node;
4152         }
4153
4154         if (RBTDB_RDATATYPE_BASE(found->type) == 0) {
4155                 /*
4156                  * We found a negative cache entry.
4157                  */
4158                 if (NXDOMAIN(found))
4159                         result = DNS_R_NCACHENXDOMAIN;
4160                 else
4161                         result = DNS_R_NCACHENXRRSET;
4162         } else if (type != found->type &&
4163                    type != dns_rdatatype_any &&
4164                    found->type == dns_rdatatype_cname) {
4165                 /*
4166                  * We weren't doing an ANY query and we found a CNAME instead
4167                  * of the type we were looking for, so we need to indicate
4168                  * that result to the caller.
4169                  */
4170                 result = DNS_R_CNAME;
4171         } else {
4172                 /*
4173                  * An ordinary successful query!
4174                  */
4175                 result = ISC_R_SUCCESS;
4176         }
4177
4178         if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
4179             result == DNS_R_NCACHENXRRSET) {
4180                 bind_rdataset(search.rbtdb, node, found, search.now,
4181                               rdataset);
4182                 if (need_headerupdate(found, search.now))
4183                         update = found;
4184                 if (foundsig != NULL) {
4185                         bind_rdataset(search.rbtdb, node, foundsig, search.now,
4186                                       sigrdataset);
4187                         if (need_headerupdate(foundsig, search.now))
4188                                 updatesig = foundsig;
4189                 }
4190         }
4191
4192  node_exit:
4193         if ((update != NULL || updatesig != NULL) &&
4194             locktype != isc_rwlocktype_write) {
4195                 NODE_UNLOCK(lock, locktype);
4196                 NODE_LOCK(lock, isc_rwlocktype_write);
4197                 locktype = isc_rwlocktype_write;
4198         }
4199         if (update != NULL && need_headerupdate(update, search.now))
4200                 update_header(search.rbtdb, update, search.now);
4201         if (updatesig != NULL && need_headerupdate(updatesig, search.now))
4202                 update_header(search.rbtdb, updatesig, search.now);
4203
4204         NODE_UNLOCK(lock, locktype);
4205
4206  tree_exit:
4207         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4208
4209         /*
4210          * If we found a zonecut but aren't going to use it, we have to
4211          * let go of it.
4212          */
4213         if (search.need_cleanup) {
4214                 node = search.zonecut;
4215                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
4216
4217                 NODE_LOCK(lock, isc_rwlocktype_read);
4218                 decrement_reference(search.rbtdb, node, 0,
4219                                     isc_rwlocktype_read, isc_rwlocktype_none,
4220                                     ISC_FALSE);
4221                 NODE_UNLOCK(lock, isc_rwlocktype_read);
4222         }
4223
4224         dns_rbtnodechain_reset(&search.chain);
4225
4226         return (result);
4227 }
4228
     /*
      * Look up 'name' in the cache tree and bind the NS rdataset found at the
      * matching node to 'rdataset' (and its signature, when one is present,
      * to 'sigrdataset').  On a partial match, or when the exactly matching
      * node holds no usable NS rdataset, the search is handed over to
      * find_deepest_zonecut().
      *
      * If 'now' is 0 the current time is used.  When 'nodep' is non-NULL and
      * the NS rdataset is found at the exact node, a new reference to that
      * node is stored in '*nodep'.  A DNS_R_DELEGATION result is converted to
      * ISC_R_SUCCESS before returning; other results are passed through.
      */
4229 static isc_result_t
4230 cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
4231                   isc_stdtime_t now, dns_dbnode_t **nodep,
4232                   dns_name_t *foundname,
4233                   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
4234 {
4235         dns_rbtnode_t *node = NULL;
4236         nodelock_t *lock;
4237         isc_result_t result;
4238         rbtdb_search_t search;
4239         rdatasetheader_t *header, *header_prev, *header_next;
4240         rdatasetheader_t *found, *foundsig;
4241         unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
4242         isc_rwlocktype_t locktype;
4243
4244         search.rbtdb = (dns_rbtdb_t *)db;
4245
4246         REQUIRE(VALID_RBTDB(search.rbtdb));
4247
4248         if (now == 0)
4249                 isc_stdtime_get(&now);
4250
4251         search.rbtversion = NULL;
4252         search.serial = 1;
4253         search.options = options;
4254         search.copy_name = ISC_FALSE;
4255         search.need_cleanup = ISC_FALSE;
4256         search.wild = ISC_FALSE;
4257         search.zonecut = NULL;
4258         dns_fixedname_init(&search.zonecut_name);
4259         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
4260         search.now = now;
4261
4262         if ((options & DNS_DBFIND_NOEXACT) != 0)
4263                 rbtoptions |= DNS_RBTFIND_NOEXACT;
4264
4265         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4266
4267         /*
4268          * Search down from the root of the tree.
4269          */
4270         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
4271                                   &search.chain, rbtoptions, NULL, &search);
4272
4273         if (result == DNS_R_PARTIALMATCH) {
                     /*
                      * This label is also entered by 'goto find_ns' further
                      * down, when the exactly matching node has no NS
                      * rdataset (the node lock has been released by then).
                      */
4274         find_ns:
4275                 result = find_deepest_zonecut(&search, node, nodep, foundname,
4276                                               rdataset, sigrdataset);
4277                 goto tree_exit;
4278         } else if (result != ISC_R_SUCCESS)
4279                 goto tree_exit;
4280
4281         /*
4282          * We now go looking for an NS rdataset at the node.
4283          */
4284
4285         lock = &(search.rbtdb->node_locks[node->locknum].lock);
4286         locktype = isc_rwlocktype_read;
4287         NODE_LOCK(lock, locktype);
4288
4289         found = NULL;
4290         foundsig = NULL;
4291         header_prev = NULL;
             /*
              * 'header_next' is saved before the body runs because the body
              * may free 'header'.  'header_prev' trails the walk and always
              * names the last header still linked, so a stale header can be
              * unlinked in place.
              */
4292         for (header = node->data; header != NULL; header = header_next) {
4293                 header_next = header->next;
4294                 if (header->rdh_ttl <= now) {
4295                         /*
4296                          * This rdataset is stale.  If no one else is using the
4297                          * node, we can clean it up right now, otherwise we
4298                          * mark it as stale, and the node as dirty, so it will
4299                          * get cleaned up later.
4300                          */
4301                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
4302                             (locktype == isc_rwlocktype_write ||
4303                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
4304                                 /*
4305                                  * We update the node's status only when we
4306                                  * can get write access.
4307                                  */
4308                                 locktype = isc_rwlocktype_write;
4309
4310                                 if (dns_rbtnode_refcurrent(node) == 0) {
4311                                         isc_mem_t *mctx;
4312
4313                                         mctx = search.rbtdb->common.mctx;
4314                                         clean_stale_headers(search.rbtdb, mctx,
4315                                                             header);
4316                                         if (header_prev != NULL)
4317                                                 header_prev->next =
4318                                                         header->next;
4319                                         else
4320                                                 node->data = header->next;
4321                                         free_rdataset(search.rbtdb, mctx,
4322                                                       header);
4323                                 } else {
4324                                         header->attributes |=
4325                                                 RDATASET_ATTR_STALE;
4326                                         node->dirty = 1;
4327                                         header_prev = header;
4328                                 }
4329                         } else
4330                                 header_prev = header;
4331                 } else if (EXISTS(header)) {
4332                         /*
4333                          * If we found a type we were looking for, remember
4334                          * it.
4335                          */
4336                         if (header->type == dns_rdatatype_ns) {
4337                                 /*
4338                                  * Remember a NS rdataset even if we're
4339                                  * not specifically looking for it, because
4340                                  * we might need it later.
4341                                  */
4342                                 found = header;
4343                         } else if (header->type == RBTDB_RDATATYPE_SIGNS) {
4344                                 /*
4345                                  * If we need the NS rdataset, we'll also
4346                                  * need its signature.
4347                                  */
4348                                 foundsig = header;
4349                         }
4350                         header_prev = header;
4351                 } else
4352                         header_prev = header;
4353         }
4354
4355         if (found == NULL) {
4356                 /*
4357                  * No NS records here.
4358                  */
4359                 NODE_UNLOCK(lock, locktype);
4360                 goto find_ns;
4361         }
4362
4363         if (nodep != NULL) {
4364                 new_reference(search.rbtdb, node);
4365                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4366                 *nodep = node;
4367         }
4368
4369         bind_rdataset(search.rbtdb, node, found, search.now, rdataset);
4370         if (foundsig != NULL)
4371                 bind_rdataset(search.rbtdb, node, foundsig, search.now,
4372                               sigrdataset);
4373
             /*
              * Header updates are done under the write lock.  If we only hold
              * the read lock it is released before the write lock is taken, so
              * need_headerupdate() is evaluated again once the write lock is
              * held.
              */
4374         if (need_headerupdate(found, search.now) ||
4375             (foundsig != NULL &&  need_headerupdate(foundsig, search.now))) {
4376                 if (locktype != isc_rwlocktype_write) {
4377                         NODE_UNLOCK(lock, locktype);
4378                         NODE_LOCK(lock, isc_rwlocktype_write);
4379                         locktype = isc_rwlocktype_write;
4380                 }
4381                 if (need_headerupdate(found, search.now))
4382                         update_header(search.rbtdb, found, search.now);
4383                 if (foundsig != NULL &&
4384                     need_headerupdate(foundsig, search.now)) {
4385                         update_header(search.rbtdb, foundsig, search.now);
4386                 }
4387         }
4388
4389         NODE_UNLOCK(lock, locktype);
4390
4391  tree_exit:
4392         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4393
4394         INSIST(!search.need_cleanup);
4395
4396         dns_rbtnodechain_reset(&search.chain);
4397
             /*
              * Callers of this function see a delegation as success.
              */
4398         if (result == DNS_R_DELEGATION)
4399                 result = ISC_R_SUCCESS;
4400
4401         return (result);
4402 }
4403
     /*
      * Take an additional reference on node 'source' and store the node in
      * '*targetp', which must be non-NULL and point to NULL on entry.
      *
      * The reference count is incremented while holding the node's lock via
      * NODE_STRONGLOCK, and the resulting count is asserted to be non-zero.
      */
4404 static void
4405 attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) {
4406         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4407         dns_rbtnode_t *node = (dns_rbtnode_t *)source;
4408         unsigned int refs;
4409
4410         REQUIRE(VALID_RBTDB(rbtdb));
4411         REQUIRE(targetp != NULL && *targetp == NULL);
4412
4413         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
4414         dns_rbtnode_refincrement(node, &refs);
4415         INSIST(refs != 0);
4416         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
4417
4418         *targetp = source;
4419 }
4420
     /*
      * Release the node reference held in '*targetp' and set '*targetp' to
      * NULL.  '*targetp' must be non-NULL on entry.
      *
      * If decrement_reference() returns true and the node's lock bucket has no
      * references left while it is marked 'exiting', the database's 'active'
      * count is decremented under the database write lock; when that count
      * reaches zero the database is freed with free_rbtdb().
      */
4421 static void
4422 detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
4423         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4424         dns_rbtnode_t *node;
4425         isc_boolean_t want_free = ISC_FALSE;
4426         isc_boolean_t inactive = ISC_FALSE;
4427         rbtdb_nodelock_t *nodelock;
4428
4429         REQUIRE(VALID_RBTDB(rbtdb));
4430         REQUIRE(targetp != NULL && *targetp != NULL);
4431
4432         node = (dns_rbtnode_t *)(*targetp);
4433         nodelock = &rbtdb->node_locks[node->locknum];
4434
4435         NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
4436
4437         if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
4438                                 isc_rwlocktype_none, ISC_FALSE)) {
4439                 if (isc_refcount_current(&nodelock->references) == 0 &&
4440                     nodelock->exiting) {
4441                         inactive = ISC_TRUE;
4442                 }
4443         }
4444
4445         NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
4446
4447         *targetp = NULL;
4448
             /*
              * The node lock has been released; the 'active' bookkeeping below
              * is done under the database lock instead.
              */
4449         if (inactive) {
4450                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
4451                 rbtdb->active--;
4452                 if (rbtdb->active == 0)
4453                         want_free = ISC_TRUE;
4454                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
4455                 if (want_free) {
                             /*
                              * Log which database is being freed.  The origin
                              * name is formatted only when dns_name_dynamic()
                              * is true for it; otherwise a placeholder is used.
                              */
4456                         char buf[DNS_NAME_FORMATSIZE];
4457                         if (dns_name_dynamic(&rbtdb->common.origin))
4458                                 dns_name_format(&rbtdb->common.origin, buf,
4459                                                 sizeof(buf));
4460                         else
4461                                 strcpy(buf, "<UNKNOWN>");
4462                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
4463                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
4464                                       "calling free_rbtdb(%s)", buf);
4465                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
4466                 }
4467         }
4468 }
4469
     /*
      * Mark expired rdatasets at 'node' as stale and flag the node dirty.
      *
      * A header is marked stale when its TTL is at or before
      * 'now - RBTDB_VIRTUAL' ('now' of 0 means the current time).  When the
      * database is flagged overmem, a node with no 'down' pointer is
      * additionally selected for forced expiry whenever a random value is a
      * multiple of 4; in that case every remaining header for which RETAIN()
      * is false has its TTL set to 0 and is marked stale.
      *
      * Nothing is freed here.  Always returns ISC_R_SUCCESS.
      */
4470 static isc_result_t
4471 expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
4472         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4473         dns_rbtnode_t *rbtnode = node;
4474         rdatasetheader_t *header;
4475         isc_boolean_t force_expire = ISC_FALSE;
4476         /*
4477          * These are the category and module used by the cache cleaner.
4478          */
4479         isc_boolean_t log = ISC_FALSE;
4480         isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
4481         isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
4482         int level = ISC_LOG_DEBUG(2);
4483         char printname[DNS_NAME_FORMATSIZE];
4484
4485         REQUIRE(VALID_RBTDB(rbtdb));
4486
4487         /*
4488          * Caller must hold a tree lock.
4489          */
4490
4491         if (now == 0)
4492                 isc_stdtime_get(&now);
4493
4494         if (rbtdb->overmem) {
4495                 isc_uint32_t val;
4496
4497                 isc_random_get(&val);
4498                 /*
4499                  * XXXDCL Could stand to have a better policy, like LRU.
4500                  */
4501                 force_expire = ISC_TF(rbtnode->down == NULL && val % 4 == 0);
4502
4503                 /*
4504                  * Note that 'log' can be true IFF rbtdb->overmem is also true.
4505                  * rbtdb->overmem can currently only be true for cache
4506                  * databases -- hence all of the "overmem cache" log strings.
4507                  */
4508                 log = ISC_TF(isc_log_wouldlog(dns_lctx, level));
                     /*
                      * 'printname' is filled in only on this path, as an
                      * argument of the call below; every later use of it is
                      * guarded by 'log'.
                      */
4509                 if (log)
4510                         isc_log_write(dns_lctx, category, module, level,
4511                                       "overmem cache: %s %s",
4512                                       force_expire ? "FORCE" : "check",
4513                                       dns_rbt_formatnodename(rbtnode,
4514                                                            printname,
4515                                                            sizeof(printname)));
4516         }
4517
4518         /*
4519          * We may not need write access, but this code path is not performance
4520          * sensitive, so it should be okay to always lock as a writer.
4521          */
4522         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4523                   isc_rwlocktype_write);
4524
4525         for (header = rbtnode->data; header != NULL; header = header->next)
4526                 if (header->rdh_ttl <= now - RBTDB_VIRTUAL) {
4527                         /*
4528                          * We don't check if refcurrent(rbtnode) == 0 and try
4529                          * to free like we do in cache_find(), because
4530                          * refcurrent(rbtnode) must be non-zero.  This is so
4531                          * because 'node' is an argument to the function.
4532                          */
4533                         header->attributes |= RDATASET_ATTR_STALE;
4534                         rbtnode->dirty = 1;
4535                         if (log)
4536                                 isc_log_write(dns_lctx, category, module,
4537                                               level, "overmem cache: stale %s",
4538                                               printname);
4539                 } else if (force_expire) {
                             /*
                              * Forced expiry: zero the TTL and mark stale
                              * unless RETAIN() holds for this header.
                              */
4540                         if (! RETAIN(header)) {
4541                                 set_ttl(rbtdb, header, 0);
4542                                 header->attributes |= RDATASET_ATTR_STALE;
4543                                 rbtnode->dirty = 1;
4544                         } else if (log) {
4545                                 isc_log_write(dns_lctx, category, module,
4546                                               level, "overmem cache: "
4547                                               "reprieve by RETAIN() %s",
4548                                               printname);
4549                         }
4550                 } else if (rbtdb->overmem && log)
4551                         isc_log_write(dns_lctx, category, module, level,
4552                                       "overmem cache: saved %s", printname);
4553
4554         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4555                     isc_rwlocktype_write);
4556
4557         return (ISC_R_SUCCESS);
4558 }
4559
     /*
      * Record the over-memory state 'overmem' on the database.  The flag is
      * stored only when IS_CACHE() is true for the database; otherwise the
      * call has no effect.
      */
4560 static void
4561 overmem(dns_db_t *db, isc_boolean_t overmem) {
4562         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4563
4564         if (IS_CACHE(rbtdb))
4565                 rbtdb->overmem = overmem;
4566 }
4567
     /*
      * Write a human-readable dump of 'node' to 'out'.
      *
      * The first line gives the node's address, its current reference count
      * and its lock number.  It is followed, for every rdataset type at the
      * node, by one line per header in that type's 'down' chain showing the
      * serial, TTL, trust and attributes, or by "(empty)" when the node has
      * no data.  The node's lock is held for reading while printing.
      */
4568 static void
4569 printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) {
4570         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4571         dns_rbtnode_t *rbtnode = node;
4572         isc_boolean_t first;
4573
4574         REQUIRE(VALID_RBTDB(rbtdb));
4575
4576         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4577                   isc_rwlocktype_read);
4578
             /*
              * The "%p" conversion requires a pointer to void, so the node
              * pointer is cast explicitly.
              */
4579         fprintf(out, "node %p, %u references, locknum = %u\n",
4580                 (void *)rbtnode, dns_rbtnode_refcurrent(rbtnode),
4581                 rbtnode->locknum);
4582         if (rbtnode->data != NULL) {
4583                 rdatasetheader_t *current, *top_next;
4584
                     /*
                      * 'current' is advanced down the per-type chain by the
                      * inner loop, so the next top-level header is saved in
                      * 'top_next' first.
                      */
4585                 for (current = rbtnode->data; current != NULL;
4586                      current = top_next) {
4587                         top_next = current->next;
4588                         first = ISC_TRUE;
4589                         fprintf(out, "\ttype %u", current->type);
4590                         do {
                                     /*
                                      * Entries after the first get an extra
                                      * leading tab since they have no
                                      * "type" prefix on their line.
                                      */
4591                                 if (!first)
4592                                         fprintf(out, "\t");
4593                                 first = ISC_FALSE;
4594                                 fprintf(out,
4595                                         "\tserial = %lu, ttl = %u, "
4596                                         "trust = %u, attributes = %u\n",
4597                                         (unsigned long)current->serial,
4598                                         current->rdh_ttl,
4599                                         current->trust,
4600                                         current->attributes);
4601                                 current = current->down;
4602                         } while (current != NULL);
4603                 }
4604         } else
4605                 fprintf(out, "(empty)\n");
4606
4607         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4608                     isc_rwlocktype_read);
4609 }
4610
     /*
      * Allocate and initialize a database iterator for 'db' and return it in
      * '*iteratorp'.
      *
      * The iterator attaches to 'db', records 'relative_names', and starts
      * out paused, holding no tree lock, with no current node and an empty
      * deletion list.
      *
      * Returns ISC_R_NOMEMORY if the iterator cannot be allocated and
      * ISC_R_SUCCESS otherwise.
      */
4611 static isc_result_t
4612 createiterator(dns_db_t *db, isc_boolean_t relative_names,
4613                dns_dbiterator_t **iteratorp)
4614 {
4615         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4616         rbtdb_dbiterator_t *rbtdbiter;
4617
4618         REQUIRE(VALID_RBTDB(rbtdb));
4619
4620         rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
4621         if (rbtdbiter == NULL)
4622                 return (ISC_R_NOMEMORY);
4623
4624         rbtdbiter->common.methods = &dbiterator_methods;
             /* The attach target is cleared first, then attached to 'db'. */
4625         rbtdbiter->common.db = NULL;
4626         dns_db_attach(db, &rbtdbiter->common.db);
4627         rbtdbiter->common.relative_names = relative_names;
4628         rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
4629         rbtdbiter->common.cleaning = ISC_FALSE;
4630         rbtdbiter->paused = ISC_TRUE;
4631         rbtdbiter->tree_locked = isc_rwlocktype_none;
4632         rbtdbiter->result = ISC_R_SUCCESS;
4633         dns_fixedname_init(&rbtdbiter->name);
4634         dns_fixedname_init(&rbtdbiter->origin);
4635         rbtdbiter->node = NULL;
4636         rbtdbiter->delete = 0;
4637         memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions));
4638         dns_rbtnodechain_init(&rbtdbiter->chain, db->mctx);
4639
4640         *iteratorp = (dns_dbiterator_t *)rbtdbiter;
4641
4642         return (ISC_R_SUCCESS);
4643 }
4644
     /*
      * Find the rdataset of type 'type' (covering 'covers') at 'node' as seen
      * by 'version', and bind it to 'rdataset'.  When 'covers' is 0 the RRSIG
      * rdataset covering 'type' is looked for as well and, if found together
      * with the data, bound to 'sigrdataset'.
      *
      * 'type' must not be dns_rdatatype_any.  If 'version' is NULL the current
      * version is used for the duration of the call.  The 'now' argument is
      * ignored: it is overwritten with 0.
      *
      * Returns ISC_R_SUCCESS if the rdataset was found and ISC_R_NOTFOUND
      * otherwise.
      */
4645 static isc_result_t
4646 zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4647                   dns_rdatatype_t type, dns_rdatatype_t covers,
4648                   isc_stdtime_t now, dns_rdataset_t *rdataset,
4649                   dns_rdataset_t *sigrdataset)
4650 {
4651         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4652         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4653         rdatasetheader_t *header, *header_next, *found, *foundsig;
4654         rbtdb_serial_t serial;
4655         rbtdb_version_t *rbtversion = version;
4656         isc_boolean_t close_version = ISC_FALSE;
4657         rbtdb_rdatatype_t matchtype, sigmatchtype;
4658
4659         REQUIRE(VALID_RBTDB(rbtdb));
4660         REQUIRE(type != dns_rdatatype_any);
4661
             /*
              * A version obtained here is closed again before returning.
              */
4662         if (rbtversion == NULL) {
4663                 currentversion(db, (dns_dbversion_t **) (void *)(&rbtversion));
4664                 close_version = ISC_TRUE;
4665         }
4666         serial = rbtversion->serial;
4667         now = 0;
4668
4669         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4670                   isc_rwlocktype_read);
4671
4672         found = NULL;
4673         foundsig = NULL;
4674         matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
             /*
              * A signature is looked for only when 'covers' is 0.  Otherwise
              * 'sigmatchtype' is 0, presumably a value no header type has
              * (confirm).
              */
4675         if (covers == 0)
4676                 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4677         else
4678                 sigmatchtype = 0;
4679
4680         for (header = rbtnode->data; header != NULL; header = header_next) {
4681                 header_next = header->next;
                     /*
                      * Walk down this type's chain to the first header whose
                      * serial is not newer than ours and which is not
                      * IGNOREd.  'header' ends up NULL if there is none or if
                      * that header says the rdataset does not exist.
                      */
4682                 do {
4683                         if (header->serial <= serial &&
4684                             !IGNORE(header)) {
4685                                 /*
4686                                  * Is this a "this rdataset doesn't
4687                                  * exist" record?
4688                                  */
4689                                 if (NONEXISTENT(header))
4690                                         header = NULL;
4691                                 break;
4692                         } else
4693                                 header = header->down;
4694                 } while (header != NULL);
4695                 if (header != NULL) {
4696                         /*
4697                          * We have an active, extant rdataset.  If it's a
4698                          * type we're looking for, remember it.
4699                          */
4700                         if (header->type == matchtype) {
4701                                 found = header;
4702                                 if (foundsig != NULL)
4703                                         break;
4704                         } else if (header->type == sigmatchtype) {
4705                                 foundsig = header;
4706                                 if (found != NULL)
4707                                         break;
4708                         }
4709                 }
4710         }
4711         if (found != NULL) {
4712                 bind_rdataset(rbtdb, rbtnode, found, now, rdataset);
4713                 if (foundsig != NULL)
4714                         bind_rdataset(rbtdb, rbtnode, foundsig, now,
4715                                       sigrdataset);
4716         }
4717
4718         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4719                     isc_rwlocktype_read);
4720
4721         if (close_version)
4722                 closeversion(db, (dns_dbversion_t **) (void *)(&rbtversion),
4723                              ISC_FALSE);
4724
4725         if (found == NULL)
4726                 return (ISC_R_NOTFOUND);
4727
4728         return (ISC_R_SUCCESS);
4729 }
4730
     /*
      * Find the unexpired rdataset of type 'type' (covering 'covers') at cache
      * node 'node' and bind it to 'rdataset'.  A negative cache entry for
      * 'type', or one of type RBTDB_RDATATYPE_NCACHEANY, is accepted as a
      * match as well.  When 'covers' is 0 the RRSIG covering 'type' is also
      * looked for and, if data was found, bound to 'sigrdataset'.
      *
      * 'type' must not be dns_rdatatype_any.  'version' is unused.  If 'now'
      * is 0 the current time is used.
      *
      * Returns ISC_R_NOTFOUND if nothing matched, DNS_R_NCACHENXDOMAIN or
      * DNS_R_NCACHENXRRSET if the match is a negative cache entry, and
      * ISC_R_SUCCESS otherwise.
      */
4731 static isc_result_t
4732 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4733                    dns_rdatatype_t type, dns_rdatatype_t covers,
4734                    isc_stdtime_t now, dns_rdataset_t *rdataset,
4735                    dns_rdataset_t *sigrdataset)
4736 {
4737         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4738         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4739         rdatasetheader_t *header, *header_next, *found, *foundsig;
4740         rbtdb_rdatatype_t matchtype, sigmatchtype, negtype;
4741         isc_result_t result;
4742         nodelock_t *lock;
4743         isc_rwlocktype_t locktype;
4744
4745         REQUIRE(VALID_RBTDB(rbtdb));
4746         REQUIRE(type != dns_rdatatype_any);
4747
4748         UNUSED(version);
4749
4750         result = ISC_R_SUCCESS;
4751
4752         if (now == 0)
4753                 isc_stdtime_get(&now);
4754
4755         lock = &rbtdb->node_locks[rbtnode->locknum].lock;
4756         locktype = isc_rwlocktype_read;
4757         NODE_LOCK(lock, locktype);
4758
4759         found = NULL;
4760         foundsig = NULL;
4761         matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
             /*
              * 'negtype' has base type 0 and covers 'type'; headers with base
              * type 0 are treated as negative cache entries further down.
              */
4762         negtype = RBTDB_RDATATYPE_VALUE(0, type);
4763         if (covers == 0)
4764                 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4765         else
4766                 sigmatchtype = 0;
4767
             /*
              * The loop has no early exit, so when several headers match, the
              * last one in the list is the one remembered.
              */
4768         for (header = rbtnode->data; header != NULL; header = header_next) {
4769                 header_next = header->next;
4770                 if (header->rdh_ttl <= now) {
4771                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
4772                             (locktype == isc_rwlocktype_write ||
4773                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
4774                                 /*
4775                                  * We update the node's status only when we
4776                                  * can get write access.
4777                                  */
4778                                 locktype = isc_rwlocktype_write;
4779
4780                                 /*
4781                                  * We don't check if refcurrent(rbtnode) == 0
4782                                  * and try to free like we do in cache_find(),
4783                                  * because refcurrent(rbtnode) must be
4784                                  * non-zero.  This is so because 'node' is an
4785                                  * argument to the function.
4786                                  */
4787                                 header->attributes |= RDATASET_ATTR_STALE;
4788                                 rbtnode->dirty = 1;
4789                         }
4790                 } else if (EXISTS(header)) {
4791                         if (header->type == matchtype)
4792                                 found = header;
4793                         else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
4794                                  header->type == negtype)
4795                                 found = header;
4796                         else if (header->type == sigmatchtype)
4797                                 foundsig = header;
4798                 }
4799         }
4800         if (found != NULL) {
4801                 bind_rdataset(rbtdb, rbtnode, found, now, rdataset);
4802                 if (foundsig != NULL)
4803                         bind_rdataset(rbtdb, rbtnode, foundsig, now,
4804                                       sigrdataset);
4805         }
4806
4807         NODE_UNLOCK(lock, locktype);
4808
4809         if (found == NULL)
4810                 return (ISC_R_NOTFOUND);
4811
             /*
              * NOTE(review): 'found' is read below after the node lock has
              * been released; presumably safe because the caller holds a
              * reference on the node -- confirm.
              */
4812         if (RBTDB_RDATATYPE_BASE(found->type) == 0) {
4813                 /*
4814                  * We found a negative cache entry.
4815                  */
4816                 if (NXDOMAIN(found))
4817                         result = DNS_R_NCACHENXDOMAIN;
4818                 else
4819                         result = DNS_R_NCACHENXRRSET;
4820         }
4821
4822         return (result);
4823 }
4824
     /*
      * Create an rdataset iterator over 'node' and return it in '*iteratorp'.
      *
      * For a database without DNS_DBATTR_CACHE set, 'now' is forced to 0 and
      * the iterator records a version: the one obtained from currentversion()
      * when 'version' is NULL, otherwise 'version' itself after incrementing
      * its reference count.  For a cache database no version is recorded and
      * a 'now' of 0 is replaced by the current time.
      *
      * The node's reference count is incremented under the node lock.
      *
      * Returns ISC_R_NOMEMORY if the iterator cannot be allocated and
      * ISC_R_SUCCESS otherwise.
      */
4825 static isc_result_t
4826 allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4827              isc_stdtime_t now, dns_rdatasetiter_t **iteratorp)
4828 {
4829         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4830         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4831         rbtdb_version_t *rbtversion = version;
4832         rbtdb_rdatasetiter_t *iterator;
4833         unsigned int refs;
4834
4835         REQUIRE(VALID_RBTDB(rbtdb));
4836
4837         iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
4838         if (iterator == NULL)
4839                 return (ISC_R_NOMEMORY);
4840
4841         if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
4842                 now = 0;
4843                 if (rbtversion == NULL)
4844                         currentversion(db,
4845                                  (dns_dbversion_t **) (void *)(&rbtversion));
4846                 else {
                             /*
                              * The function-level 'refs' is used here rather
                              * than a second, shadowing declaration; it is
                              * written again by the node reference increment
                              * below.
                              */
4849                         isc_refcount_increment(&rbtversion->references,
4850                                                &refs);
4851                         INSIST(refs > 1);
4852                 }
4853         } else {
4854                 if (now == 0)
4855                         isc_stdtime_get(&now);
4856                 rbtversion = NULL;
4857         }
4858
4859         iterator->common.magic = DNS_RDATASETITER_MAGIC;
4860         iterator->common.methods = &rdatasetiter_methods;
4861         iterator->common.db = db;
4862         iterator->common.node = node;
4863         iterator->common.version = (dns_dbversion_t *)rbtversion;
4864         iterator->common.now = now;
4865
4866         NODE_STRONGLOCK(&rbtdb->node_locks[rbtnode->locknum].lock);
4867
4868         dns_rbtnode_refincrement(rbtnode, &refs);
4869         INSIST(refs != 0);
4870
4871         iterator->current = NULL;
4872
4873         NODE_STRONGUNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock);
4874
4875         *iteratorp = (dns_rdatasetiter_t *)iterator;
4876
4877         return (ISC_R_SUCCESS);
4878 }
4879
     /*
      * Return ISC_TRUE if, as of version 'serial', 'node' holds both an active
      * CNAME rdataset and an active rdataset of some other type that is not
      * KEY, SIG, NSEC or RRSIG; return ISC_FALSE otherwise.
      *
      * An rdataset counts as active when the first header in its 'down' chain
      * with a serial not newer than 'serial', and not IGNOREd, is not a
      * NONEXISTENT marker.
      */
4880 static isc_boolean_t
4881 cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
4882         rdatasetheader_t *header, *header_next;
4883         isc_boolean_t cname, other_data;
4884         dns_rdatatype_t rdtype;
4885
4886         /*
4887          * The caller must hold the node lock.
4888          */
4889
4890         /*
4891          * Look for CNAME and "other data" rdatasets active in our version.
4892          */
4893         cname = ISC_FALSE;
4894         other_data = ISC_FALSE;
             /*
              * 'header' is reassigned while walking down a type's chain, so
              * the next top-level header is saved in 'header_next' first.
              */
4895         for (header = node->data; header != NULL; header = header_next) {
4896                 header_next = header->next;
4897                 if (header->type == dns_rdatatype_cname) {
4898                         /*
4899                          * Look for an active extant CNAME.
4900                          */
4901                         do {
4902                                 if (header->serial <= serial &&
4903                                     !IGNORE(header)) {
4904                                         /*
4905                                          * Is this a "this rdataset doesn't
4906                                          * exist" record?
4907                                          */
4908                                         if (NONEXISTENT(header))
4909                                                 header = NULL;
4910                                         break;
4911                                 } else
4912                                         header = header->down;
4913                         } while (header != NULL);
4914                         if (header != NULL)
4915                                 cname = ISC_TRUE;
4916                 } else {
4917                         /*
4918                          * Look for active extant "other data".
4919                          *
4920                          * "Other data" is any rdataset whose type is not
4921                          * KEY, NSEC, SIG or RRSIG.
4922                          */
4923                         rdtype = RBTDB_RDATATYPE_BASE(header->type);
4924                         if (rdtype != dns_rdatatype_key &&
4925                             rdtype != dns_rdatatype_sig &&
4926                             rdtype != dns_rdatatype_nsec &&
4927                             rdtype != dns_rdatatype_rrsig) {
4928                                 /*
4929                                  * Is it active and extant?
4930                                  */
4931                                 do {
4932                                         if (header->serial <= serial &&
4933                                             !IGNORE(header)) {
4934                                                 /*
4935                                                  * Is this a "this rdataset
4936                                                  * doesn't exist" record?
4937                                                  */
4938                                                 if (NONEXISTENT(header))
4939                                                         header = NULL;
4940                                                 break;
4941                                         } else
4942                                                 header = header->down;
4943                                 } while (header != NULL);
4944                                 if (header != NULL)
4945                                         other_data = ISC_TRUE;
4946                         }
4947                 }
4948         }
4949
4950         if (cname && other_data)
4951                 return (ISC_TRUE);
4952
4953         return (ISC_FALSE);
4954 }
4955
/*
 * Add the rdataset header 'newheader' to 'rbtnode'.
 *
 * The caller must be holding the node lock.
 *
 * Arguments:
 *   rbtversion     Version being written, or NULL.  The NULL case is the
 *                  one that applies the negative-cache and trust rules
 *                  below (the existing comments refer to it as the cache
 *                  DB case).
 *   options        DNS_DBADD_MERGE (requires a non-NULL 'rbtversion'),
 *                  DNS_DBADD_FORCE, DNS_DBADD_EXACT and
 *                  DNS_DBADD_EXACTTTL are examined here.
 *   loading        When true no "changed" record is created and a header
 *                  that is being replaced is freed immediately.
 *   addedrdataset  If non-NULL, bound to the header that is in effect
 *                  when the function returns (either 'newheader' or the
 *                  pre-existing header that was kept).
 *   now            Time used for the TTL comparisons and passed to
 *                  bind_rdataset().
 *
 * Ownership: 'newheader' is consumed.  It is either linked into the node
 * or released with free_rdataset() on every path that does not link it.
 *
 * Returns:
 *   ISC_R_SUCCESS         the data was added, or equal NS/A/AAAA data was
 *                         already present and has been kept.
 *   DNS_R_UNCHANGED       the request had no effect.
 *   ISC_R_NOMEMORY        add_changed() failed.
 *   DNS_R_NOTEXACT        DNS_DBADD_EXACTTTL was set and the TTLs differ.
 *   DNS_R_CNAMEANDOTHER   the node now holds a CNAME and other data.  Note
 *                         that 'newheader' has already been linked into
 *                         the node when this is returned.
 *   Any failure result from dns_rdataslab_merge().
 */
static isc_result_t
add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion,
    rdatasetheader_t *newheader, unsigned int options, isc_boolean_t loading,
    dns_rdataset_t *addedrdataset, isc_stdtime_t now)
{
        rbtdb_changed_t *changed = NULL;
        rdatasetheader_t *topheader, *topheader_prev, *header;
        unsigned char *merged;
        isc_result_t result;
        isc_boolean_t header_nx;
        isc_boolean_t newheader_nx;
        isc_boolean_t merge;
        dns_rdatatype_t rdtype, covers;
        rbtdb_rdatatype_t negtype;
        dns_trust_t trust;

        /*
         * Add an rdatasetheader_t to a node.
         */

        /*
         * Caller must be holding the node lock.
         */

        if ((options & DNS_DBADD_MERGE) != 0) {
                REQUIRE(rbtversion != NULL);
                merge = ISC_TRUE;
        } else
                merge = ISC_FALSE;

        /*
         * 'trust' is only used for the comparisons against existing data
         * below; newheader->trust itself is not modified.  A forced add
         * compares as dns_trust_ultimate.
         */
        if ((options & DNS_DBADD_FORCE) != 0)
                trust = dns_trust_ultimate;
        else
                trust = newheader->trust;

        if (rbtversion != NULL && !loading) {
                /*
                 * We always add a changed record, even if no changes end up
                 * being made to this node, because it's harmless and
                 * simplifies the code.
                 */
                changed = add_changed(rbtdb, rbtversion, rbtnode);
                if (changed == NULL) {
                        free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
                        return (ISC_R_NOMEMORY);
                }
        }

        newheader_nx = NONEXISTENT(newheader) ? ISC_TRUE : ISC_FALSE;
        topheader_prev = NULL;

        /*
         * 'negtype' stays 0 unless one of the branches below sets it.  When
         * set, it names the counterpart type that the new header should
         * also match in the type search: the covered positive type when a
         * negative entry is being added, or the negative entry covering
         * this type when positive data is being added.
         */
        negtype = 0;
        if (rbtversion == NULL && !newheader_nx) {
                rdtype = RBTDB_RDATATYPE_BASE(newheader->type);
                if (rdtype == 0) {
                        /*
                         * We're adding a negative cache entry.
                         */
                        covers = RBTDB_RDATATYPE_EXT(newheader->type);
                        if (covers == dns_rdatatype_any) {
                                /*
                                 * We're adding a negative cache entry
                                 * which covers all types (NXDOMAIN,
                                 * NODATA(QTYPE=ANY)).
                                 *
                                 * We make all other data stale so that the
                                 * only rdataset that can be found at this
                                 * node is the negative cache entry.
                                 */
                                for (topheader = rbtnode->data;
                                     topheader != NULL;
                                     topheader = topheader->next) {
                                        set_ttl(rbtdb, topheader, 0);
                                        topheader->attributes |=
                                                RDATASET_ATTR_STALE;
                                }
                                rbtnode->dirty = 1;
                                /*
                                 * The loop above ran to completion, so
                                 * 'topheader' is NULL here and the new
                                 * header will be inserted at the head of
                                 * the node's list.
                                 */
                                goto find_header;
                        }
                        negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
                } else {
                        /*
                         * We're adding something that isn't a
                         * negative cache entry.  Look for an extant
                         * non-stale NXDOMAIN/NODATA(QTYPE=ANY) negative
                         * cache entry.
                         */
                        for (topheader = rbtnode->data;
                             topheader != NULL;
                             topheader = topheader->next) {
                                if (topheader->type ==
                                    RBTDB_RDATATYPE_NCACHEANY)
                                        break;
                        }
                        if (topheader != NULL && EXISTS(topheader) &&
                            topheader->rdh_ttl > now) {
                                /*
                                 * Found one.
                                 */
                                if (trust < topheader->trust) {
                                        /*
                                         * The NXDOMAIN/NODATA(QTYPE=ANY)
                                         * is more trusted.
                                         */
                                        free_rdataset(rbtdb,
                                                      rbtdb->common.mctx,
                                                      newheader);
                                        if (addedrdataset != NULL)
                                                bind_rdataset(rbtdb, rbtnode,
                                                              topheader, now,
                                                              addedrdataset);
                                        return (DNS_R_UNCHANGED);
                                }
                                /*
                                 * The new rdataset is better.  Expire the
                                 * NXDOMAIN/NODATA(QTYPE=ANY).
                                 */
                                set_ttl(rbtdb, topheader, 0);
                                topheader->attributes |= RDATASET_ATTR_STALE;
                                rbtnode->dirty = 1;
                                /*
                                 * Clear 'topheader' so that the code after
                                 * the label inserts the new header at the
                                 * head of the node's list.
                                 */
                                topheader = NULL;
                                goto find_header;
                        }
                        negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
                }
        }

        /*
         * Find the top-level header of the same type (or of the counterpart
         * type 'negtype'), remembering its predecessor so that the new
         * header can be spliced into the 'next' list in its place.
         */
        for (topheader = rbtnode->data;
             topheader != NULL;
             topheader = topheader->next) {
                if (topheader->type == newheader->type ||
                    topheader->type == negtype)
                        break;
                topheader_prev = topheader;
        }

 find_header:
        /*
         * If header isn't NULL, we've found the right type.  There may be
         * IGNORE rdatasets between the top of the chain and the first real
         * data.  We skip over them.
         */
        header = topheader;
        while (header != NULL && IGNORE(header))
                header = header->down;
        if (header != NULL) {
                header_nx = NONEXISTENT(header) ? ISC_TRUE : ISC_FALSE;

                /*
                 * Deleting an already non-existent rdataset has no effect.
                 */
                if (header_nx && newheader_nx) {
                        free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
                        return (DNS_R_UNCHANGED);
                }

                /*
                 * Trying to add an rdataset with lower trust to a cache DB
                 * has no effect, provided that the cache data isn't stale.
                 */
                if (rbtversion == NULL && trust < header->trust &&
                    (header->rdh_ttl > now || header_nx)) {
                        free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
                        if (addedrdataset != NULL)
                                bind_rdataset(rbtdb, rbtnode, header, now,
                                              addedrdataset);
                        return (DNS_R_UNCHANGED);
                }

                /*
                 * Don't merge if a nonexistent rdataset is involved.
                 */
                if (merge && (header_nx || newheader_nx))
                        merge = ISC_FALSE;

                /*
                 * If 'merge' is ISC_TRUE, we'll try to create a new rdataset
                 * that is the union of 'newheader' and 'header'.
                 */
                if (merge) {
                        unsigned int flags = 0;
                        INSIST(rbtversion->serial >= header->serial);
                        merged = NULL;
                        result = ISC_R_SUCCESS;

                        /*
                         * A TTL mismatch is an error when an exact TTL was
                         * requested; otherwise the merge is asked to go
                         * ahead with DNS_RDATASLAB_FORCE set.
                         */
                        if ((options & DNS_DBADD_EXACT) != 0)
                                flags |= DNS_RDATASLAB_EXACT;
                        if ((options & DNS_DBADD_EXACTTTL) != 0 &&
                             newheader->rdh_ttl != header->rdh_ttl)
                                        result = DNS_R_NOTEXACT;
                        else if (newheader->rdh_ttl != header->rdh_ttl)
                                flags |= DNS_RDATASLAB_FORCE;
                        if (result == ISC_R_SUCCESS)
                                result = dns_rdataslab_merge(
                                             (unsigned char *)header,
                                             (unsigned char *)newheader,
                                             (unsigned int)(sizeof(*newheader)),
                                             rbtdb->common.mctx,
                                             rbtdb->common.rdclass,
                                             (dns_rdatatype_t)header->type,
                                             flags, &merged);
                        if (result == ISC_R_SUCCESS) {
                                /*
                                 * If 'header' has the same serial number as
                                 * we do, we could clean it up now if we knew
                                 * that our caller had no references to it.
                                 * We don't know this, however, so we leave it
                                 * alone.  It will get cleaned up when
                                 * clean_zone_node() runs.
                                 */
                                free_rdataset(rbtdb, rbtdb->common.mctx,
                                              newheader);
                                newheader = (rdatasetheader_t *)merged;
                        } else {
                                free_rdataset(rbtdb, rbtdb->common.mctx,
                                              newheader);
                                return (result);
                        }
                }
                /*
                 * Don't replace existing NS, A and AAAA RRsets
                 * in the cache if they already exist.  This
                 * prevents named being locked to old servers.
                 * Don't lower trust of existing record if the
                 * update is forced.
                 */
                if (IS_CACHE(rbtdb) && header->rdh_ttl > now &&
                    header->type == dns_rdatatype_ns &&
                    !header_nx && !newheader_nx &&
                    header->trust >= newheader->trust &&
                    dns_rdataslab_equalx((unsigned char *)header,
                                         (unsigned char *)newheader,
                                         (unsigned int)(sizeof(*newheader)),
                                         rbtdb->common.rdclass,
                                         (dns_rdatatype_t)header->type)) {
                        /*
                         * Honour the new ttl if it is less than the
                         * older one.
                         */
                        if (header->rdh_ttl > newheader->rdh_ttl)
                                set_ttl(rbtdb, header, newheader->rdh_ttl);
                        /*
                         * Move the noqname data across rather than letting
                         * it be released together with 'newheader'.
                         */
                        if (header->noqname == NULL &&
                            newheader->noqname != NULL) {
                                header->noqname = newheader->noqname;
                                newheader->noqname = NULL;
                        }
                        free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
                        if (addedrdataset != NULL)
                                bind_rdataset(rbtdb, rbtnode, header, now,
                                              addedrdataset);
                        return (ISC_R_SUCCESS);
                }
                /*
                 * Same treatment for A and AAAA; the only difference from
                 * the NS case above is the comparison routine that is used.
                 */
                if (IS_CACHE(rbtdb) && header->rdh_ttl > now &&
                    (header->type == dns_rdatatype_a ||
                     header->type == dns_rdatatype_aaaa) &&
                    !header_nx && !newheader_nx &&
                    header->trust >= newheader->trust &&
                    dns_rdataslab_equal((unsigned char *)header,
                                        (unsigned char *)newheader,
                                        (unsigned int)(sizeof(*newheader)))) {
                        /*
                         * Honour the new ttl if it is less than the
                         * older one.
                         */
                        if (header->rdh_ttl > newheader->rdh_ttl)
                                set_ttl(rbtdb, header, newheader->rdh_ttl);
                        if (header->noqname == NULL &&
                            newheader->noqname != NULL) {
                                header->noqname = newheader->noqname;
                                newheader->noqname = NULL;
                        }
                        free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
                        if (addedrdataset != NULL)
                                bind_rdataset(rbtdb, rbtnode, header, now,
                                              addedrdataset);
                        return (ISC_R_SUCCESS);
                }
                INSIST(rbtversion == NULL ||
                       rbtversion->serial >= topheader->serial);
                /*
                 * Splice 'newheader' into the node's 'next' list in the
                 * position currently occupied by 'topheader'.
                 */
                if (topheader_prev != NULL)
                        topheader_prev->next = newheader;
                else
                        rbtnode->data = newheader;
                newheader->next = topheader->next;
                if (loading) {
                        /*
                         * There are no other references to 'header' when
                         * loading, so we MAY clean up 'header' now.
                         * Since we don't generate changed records when
                         * loading, we MUST clean up 'header' now.
                         */
                        newheader->down = NULL;
                        free_rdataset(rbtdb, rbtdb->common.mctx, header);
                } else {
                        /*
                         * Keep the old chain reachable below the new header.
                         *
                         * NOTE(review): topheader->next is redirected to
                         * 'newheader' here; presumably this is for the
                         * benefit of code still positioned on the old top
                         * header -- confirm.
                         */
                        newheader->down = topheader;
                        topheader->next = newheader;
                        rbtnode->dirty = 1;
                        if (changed != NULL)
                                changed->dirty = ISC_TRUE;
                        if (rbtversion == NULL) {
                                set_ttl(rbtdb, header, 0);
                                header->attributes |= RDATASET_ATTR_STALE;
                        }
                        if (IS_CACHE(rbtdb)) {
                                int idx = newheader->node->locknum;

                                ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
                                                 newheader, lru_link);

                                /*
                                 * XXXMLG We don't check the return value
                                 * here.  If it fails, we will not do TTL
                                 * based expiry on this node.  However, we
                                 * will do it on the LRU side, so memory
                                 * will not leak... for long.
                                 */
                                isc_heap_insert(rbtdb->heaps[idx], newheader);
                        }
                }
        } else {
                /*
                 * No non-IGNORED rdatasets of the given type exist at
                 * this node.
                 */

                /*
                 * If we're trying to delete the type, don't bother.
                 */
                if (newheader_nx) {
                        free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
                        return (DNS_R_UNCHANGED);
                }

                if (topheader != NULL) {
                        /*
                         * We have a list of rdatasets of the given type,
                         * but they're all marked IGNORE.  We simply insert
                         * the new rdataset at the head of the list.
                         *
                         * Ignored rdatasets cannot occur during loading, so
                         * we INSIST on it.
                         */
                        INSIST(!loading);
                        INSIST(rbtversion == NULL ||
                               rbtversion->serial >= topheader->serial);
                        if (topheader_prev != NULL)
                                topheader_prev->next = newheader;
                        else
                                rbtnode->data = newheader;
                        newheader->next = topheader->next;
                        newheader->down = topheader;
                        topheader->next = newheader;
                        rbtnode->dirty = 1;
                        if (changed != NULL)
                                changed->dirty = ISC_TRUE;
                } else {
                        /*
                         * No rdatasets of the given type exist at the node.
                         */
                        newheader->next = rbtnode->data;
                        newheader->down = NULL;
                        rbtnode->data = newheader;
                }
                if (IS_CACHE(rbtdb)) {
                        /*
                         * As in the replacement case above, the result of
                         * isc_heap_insert() is not checked.
                         */
                        int idx = newheader->node->locknum;
                        ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
                                         newheader, lru_link);
                        isc_heap_insert(rbtdb->heaps[idx], newheader);
                }
        }

        /*
         * Check if the node now contains CNAME and other data.
         *
         * 'newheader' is already linked into the node at this point; it is
         * not unlinked or freed here when the check fails.
         */
        if (rbtversion != NULL &&
            cname_and_other_data(rbtnode, rbtversion->serial))
                return (DNS_R_CNAMEANDOTHER);

        if (addedrdataset != NULL)
                bind_rdataset(rbtdb, rbtnode, newheader, now, addedrdataset);

        return (ISC_R_SUCCESS);
}
5339
5340 static inline isc_boolean_t
5341 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
5342                 rbtdb_rdatatype_t type)
5343 {
5344         if (IS_CACHE(rbtdb)) {
5345                 if (type == dns_rdatatype_dname)
5346                         return (ISC_TRUE);
5347                 else
5348                         return (ISC_FALSE);
5349         } else if (type == dns_rdatatype_dname ||
5350                    (type == dns_rdatatype_ns &&
5351                     (node != rbtdb->origin_node || IS_STUB(rbtdb))))
5352                 return (ISC_TRUE);
5353         return (ISC_FALSE);
5354 }
5355
5356 static inline isc_result_t
5357 addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
5358            dns_rdataset_t *rdataset)
5359 {
5360         struct noqname *noqname;
5361         isc_mem_t *mctx = rbtdb->common.mctx;
5362         dns_name_t name;
5363         dns_rdataset_t nsec, nsecsig;
5364         isc_result_t result;
5365         isc_region_t r;
5366
5367         dns_name_init(&name, NULL);
5368         dns_rdataset_init(&nsec);
5369         dns_rdataset_init(&nsecsig);
5370
5371         result = dns_rdataset_getnoqname(rdataset, &name, &nsec, &nsecsig);
5372         RUNTIME_CHECK(result == ISC_R_SUCCESS);
5373
5374         noqname = isc_mem_get(mctx, sizeof(*noqname));
5375         if (noqname == NULL) {
5376                 result = ISC_R_NOMEMORY;
5377                 goto cleanup;
5378         }
5379         dns_name_init(&noqname->name, NULL);
5380         noqname->nsec = NULL;
5381         noqname->nsecsig = NULL;
5382         result = dns_name_dup(&name, mctx, &noqname->name);
5383         if (result != ISC_R_SUCCESS)
5384                 goto cleanup;
5385         result = dns_rdataslab_fromrdataset(&nsec, mctx, &r, 0);
5386         if (result != ISC_R_SUCCESS)
5387                 goto cleanup;
5388         noqname->nsec = r.base;
5389         result = dns_rdataslab_fromrdataset(&nsecsig, mctx, &r, 0);
5390         if (result != ISC_R_SUCCESS)
5391                 goto cleanup;
5392         noqname->nsecsig = r.base;
5393         dns_rdataset_disassociate(&nsec);
5394         dns_rdataset_disassociate(&nsecsig);
5395         newheader->noqname = noqname;
5396         return (ISC_R_SUCCESS);
5397
5398 cleanup:
5399         dns_rdataset_disassociate(&nsec);
5400         dns_rdataset_disassociate(&nsecsig);
5401         free_noqname(mctx, &noqname);
5402         return(result);
5403 }
5404
/*
 * Add 'rdataset' to 'node'.
 *
 * A new rdataset header plus slab is built from 'rdataset' and handed to
 * add(), which consumes it.  When 'version' is NULL the current time is
 * used if 'now' is 0, and the stored TTL is rdataset->ttl + now; when a
 * version is supplied 'now' is forced to 0, so the stored TTL is
 * rdataset->ttl unchanged.
 *
 * 'options' and 'addedrdataset' are passed through to add().
 *
 * Locking: the tree write lock is taken first when a delegating type is
 * being added or when the database is a cache in the overmem state; the
 * node write lock is then taken around the call to add().  Both are
 * released before returning.
 *
 * Returns the failure result of dns_rdataslab_fromrdataset() or
 * addnoqname() if either fails, otherwise the result of add().
 */
static isc_result_t
addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
            isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
            dns_rdataset_t *addedrdataset)
{
        dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
        dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
        rbtdb_version_t *rbtversion = version;
        isc_region_t region;
        rdatasetheader_t *newheader;
        rdatasetheader_t *header;
        isc_result_t result;
        isc_boolean_t delegating;
        isc_boolean_t tree_locked = ISC_FALSE;

        REQUIRE(VALID_RBTDB(rbtdb));

        /*
         * Absolute time is only used when no version is supplied.
         */
        if (rbtversion == NULL) {
                if (now == 0)
                        isc_stdtime_get(&now);
        } else
                now = 0;

        /*
         * Build the slab, reserving room in front of it for the header.
         */
        result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
                                            &region,
                                            sizeof(rdatasetheader_t));
        if (result != ISC_R_SUCCESS)
                return (result);

        newheader = (rdatasetheader_t *)region.base;
        init_rdataset(rbtdb, newheader);
        set_ttl(rbtdb, newheader, rdataset->ttl + now);
        newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
                                                rdataset->covers);
        newheader->attributes = 0;
        newheader->noqname = NULL;
        newheader->count = init_count++;
        newheader->trust = rdataset->trust;
        newheader->additional_auth = NULL;
        newheader->additional_glue = NULL;
        newheader->last_used = now;
        newheader->node = rbtnode;
        if (rbtversion != NULL) {
                newheader->serial = rbtversion->serial;
                /* 'now' is already 0 on this path (see above). */
                now = 0;
        } else {
                /*
                 * The NXDOMAIN attribute and the NOQNAME proof are only
                 * carried over when no version is supplied.
                 */
                newheader->serial = 1;
                if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0)
                        newheader->attributes |= RDATASET_ATTR_NXDOMAIN;
                if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
                        result = addnoqname(rbtdb, newheader, rdataset);
                        if (result != ISC_R_SUCCESS) {
                                free_rdataset(rbtdb, rbtdb->common.mctx,
                                              newheader);
                                return (result);
                        }
                }
        }

        /*
         * If we're adding a delegation type (e.g. NS or DNAME for a zone,
         * just DNAME for the cache), then we need to set the callback bit
         * on the node.
         */
        if (delegating_type(rbtdb, rbtnode, rdataset->type))
                delegating = ISC_TRUE;
        else
                delegating = ISC_FALSE;

        /*
         * If we're adding a delegation type or the DB is a cache in an overmem
         * state, hold an exclusive lock on the tree.  In the latter case
         * the lock does not necessarily have to be acquired but it will help
         * purge stale entries more effectively.
         */
        if (delegating || (IS_CACHE(rbtdb) && rbtdb->overmem)) {
                tree_locked = ISC_TRUE;
                RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
        }

        /*
         * The purge runs before the node lock is taken.
         */
        if (IS_CACHE(rbtdb) && rbtdb->overmem)
                overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked);

        NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
                  isc_rwlocktype_write);

        if (rbtdb->rrsetstats != NULL) {
                newheader->attributes |= RDATASET_ATTR_STATCOUNT;
                update_rrsetstats(rbtdb, newheader, ISC_TRUE);
        }

        if (IS_CACHE(rbtdb)) {
                if (tree_locked)
                        cleanup_dead_nodes(rbtdb, rbtnode->locknum);

                /*
                 * Look at element 1 of this lock bucket's heap (presumably
                 * the entry that expires first -- confirm against the heap
                 * comparison function) and expire it if its TTL is at
                 * least RBTDB_VIRTUAL in the past.
                 */
                header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
                if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL)
                        expire_header(rbtdb, header, tree_locked);

                /*
                 * If we've been holding a write lock on the tree just for
                 * cleaning, we can release it now.  However, we still need the
                 * node lock.
                 */
                if (tree_locked && !delegating) {
                        RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
                        tree_locked = ISC_FALSE;
                }
        }

        /*
         * add() takes ownership of 'newheader' from here on.
         */
        result = add(rbtdb, rbtnode, rbtversion, newheader, options, ISC_FALSE,
                     addedrdataset, now);
        if (result == ISC_R_SUCCESS && delegating)
                rbtnode->find_callback = 1;

        NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
                    isc_rwlocktype_write);

        if (tree_locked)
                RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);

        /*
         * Update the zone's secure status.  If version is non-NULL
         * this is deferred until closeversion() is called.
         */
        if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
                rbtdb->secure = iszonesecure(db, rbtdb->origin_node);

        return (result);
}
5535
5536 static isc_result_t
5537 subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5538                  dns_rdataset_t *rdataset, unsigned int options,
5539                  dns_rdataset_t *newrdataset)
5540 {
5541         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5542         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5543         rbtdb_version_t *rbtversion = version;
5544         rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
5545         unsigned char *subresult;
5546         isc_region_t region;
5547         isc_result_t result;
5548         rbtdb_changed_t *changed;
5549
5550         REQUIRE(VALID_RBTDB(rbtdb));
5551
5552         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5553                                             &region,
5554                                             sizeof(rdatasetheader_t));
5555         if (result != ISC_R_SUCCESS)
5556                 return (result);
5557         newheader = (rdatasetheader_t *)region.base;
5558         init_rdataset(rbtdb, newheader);
5559         set_ttl(rbtdb, newheader, rdataset->ttl);
5560         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5561                                                 rdataset->covers);
5562         newheader->attributes = 0;
5563         newheader->serial = rbtversion->serial;
5564         newheader->trust = 0;
5565         newheader->noqname = NULL;
5566         newheader->count = init_count++;
5567         newheader->additional_auth = NULL;
5568         newheader->additional_glue = NULL;
5569         newheader->last_used = 0;
5570         newheader->node = rbtnode;
5571
5572         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5573                   isc_rwlocktype_write);
5574
5575         changed = add_changed(rbtdb, rbtversion, rbtnode);
5576         if (changed == NULL) {
5577                 free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5578                 NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5579                             isc_rwlocktype_write);
5580                 return (ISC_R_NOMEMORY);
5581         }
5582
5583         topheader_prev = NULL;
5584         for (topheader = rbtnode->data;
5585              topheader != NULL;
5586              topheader = topheader->next) {
5587                 if (topheader->type == newheader->type)
5588                         break;
5589                 topheader_prev = topheader;
5590         }
5591         /*
5592          * If header isn't NULL, we've found the right type.  There may be
5593          * IGNORE rdatasets between the top of the chain and the first real
5594          * data.  We skip over them.
5595          */
5596         header = topheader;
5597         while (header != NULL && IGNORE(header))
5598                 header = header->down;
5599         if (header != NULL && EXISTS(header)) {
5600                 unsigned int flags = 0;
5601                 subresult = NULL;
5602                 result = ISC_R_SUCCESS;
5603                 if ((options & DNS_DBSUB_EXACT) != 0) {
5604                         flags |= DNS_RDATASLAB_EXACT;
5605                         if (newheader->rdh_ttl != header->rdh_ttl)
5606                                 result = DNS_R_NOTEXACT;
5607                 }
5608                 if (result == ISC_R_SUCCESS)
5609                         result = dns_rdataslab_subtract(
5610                                         (unsigned char *)header,
5611                                         (unsigned char *)newheader,
5612                                         (unsigned int)(sizeof(*newheader)),
5613                                         rbtdb->common.mctx,
5614                                         rbtdb->common.rdclass,
5615                                         (dns_rdatatype_t)header->type,
5616                                         flags, &subresult);
5617                 if (result == ISC_R_SUCCESS) {
5618                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5619                         newheader = (rdatasetheader_t *)subresult;
5620                         /*
5621                          * We have to set the serial since the rdataslab
5622                          * subtraction routine copies the reserved portion of
5623                          * header, not newheader.
5624                          */
5625                         newheader->serial = rbtversion->serial;
5626                         /*
5627                          * XXXJT: dns_rdataslab_subtract() copied the pointers
5628                          * to additional info.  We need to clear these fields
5629                          * to avoid having duplicated references.
5630                          */
5631                         newheader->additional_auth = NULL;
5632                         newheader->additional_glue = NULL;
5633                 } else if (result == DNS_R_NXRRSET) {
5634                         /*
5635                          * This subtraction would remove all of the rdata;
5636                          * add a nonexistent header instead.
5637                          */
5638                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5639                         newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
5640                         if (newheader == NULL) {
5641                                 result = ISC_R_NOMEMORY;
5642                                 goto unlock;
5643                         }
5644                         set_ttl(rbtdb, newheader, 0);
5645                         newheader->type = topheader->type;
5646                         newheader->attributes = RDATASET_ATTR_NONEXISTENT;
5647                         newheader->trust = 0;
5648                         newheader->serial = rbtversion->serial;
5649                         newheader->noqname = NULL;
5650                         newheader->count = 0;
5651                         newheader->additional_auth = NULL;
5652                         newheader->additional_glue = NULL;
5653                         newheader->node = rbtnode;
5654                         newheader->last_used = 0;
5655                 } else {
5656                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5657                         goto unlock;
5658                 }
5659
5660                 /*
5661                  * If we're here, we want to link newheader in front of
5662                  * topheader.
5663                  */
5664                 INSIST(rbtversion->serial >= topheader->serial);
5665                 if (topheader_prev != NULL)
5666                         topheader_prev->next = newheader;
5667                 else
5668                         rbtnode->data = newheader;
5669                 newheader->next = topheader->next;
5670                 newheader->down = topheader;
5671                 topheader->next = newheader;
5672                 rbtnode->dirty = 1;
5673                 changed->dirty = ISC_TRUE;
5674         } else {
5675                 /*
5676                  * The rdataset doesn't exist, so we don't need to do anything
5677                  * to satisfy the deletion request.
5678                  */
5679                 free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5680                 if ((options & DNS_DBSUB_EXACT) != 0)
5681                         result = DNS_R_NOTEXACT;
5682                 else
5683                         result = DNS_R_UNCHANGED;
5684         }
5685
5686         if (result == ISC_R_SUCCESS && newrdataset != NULL)
5687                 bind_rdataset(rbtdb, rbtnode, newheader, 0, newrdataset);
5688
5689  unlock:
5690         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5691                     isc_rwlocktype_write);
5692
5693         /*
5694          * Update the zone's secure status.  If version is non-NULL
5695          * this is deferred until closeversion() is called.
5696          */
5697         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5698                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5699
5700         return (result);
5701 }
5702
5703 static isc_result_t
5704 deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5705                dns_rdatatype_t type, dns_rdatatype_t covers)
5706 {
5707         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5708         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5709         rbtdb_version_t *rbtversion = version;
5710         isc_result_t result;
5711         rdatasetheader_t *newheader;
5712
5713         REQUIRE(VALID_RBTDB(rbtdb));
5714
5715         if (type == dns_rdatatype_any)
5716                 return (ISC_R_NOTIMPLEMENTED);
5717         if (type == dns_rdatatype_rrsig && covers == 0)
5718                 return (ISC_R_NOTIMPLEMENTED);
5719
5720         newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
5721         if (newheader == NULL)
5722                 return (ISC_R_NOMEMORY);
5723         set_ttl(rbtdb, newheader, 0);
5724         newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
5725         newheader->attributes = RDATASET_ATTR_NONEXISTENT;
5726         newheader->trust = 0;
5727         newheader->noqname = NULL;
5728         newheader->additional_auth = NULL;
5729         newheader->additional_glue = NULL;
5730         if (rbtversion != NULL)
5731                 newheader->serial = rbtversion->serial;
5732         else
5733                 newheader->serial = 0;
5734         newheader->count = 0;
5735         newheader->last_used = 0;
5736         newheader->node = rbtnode;
5737
5738         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5739                   isc_rwlocktype_write);
5740
5741         result = add(rbtdb, rbtnode, rbtversion, newheader, DNS_DBADD_FORCE,
5742                      ISC_FALSE, NULL, 0);
5743
5744         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5745                     isc_rwlocktype_write);
5746
5747         /*
5748          * Update the zone's secure status.  If version is non-NULL
5749          * this is deferred until closeversion() is called.
5750          */
5751         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5752                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5753
5754         return (result);
5755 }
5756
5757 static isc_result_t
5758 loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) {
5759         rbtdb_load_t *loadctx = arg;
5760         dns_rbtdb_t *rbtdb = loadctx->rbtdb;
5761         dns_rbtnode_t *node;
5762         isc_result_t result;
5763         isc_region_t region;
5764         rdatasetheader_t *newheader;
5765
5766         /*
5767          * This routine does no node locking.  See comments in
5768          * 'load' below for more information on loading and
5769          * locking.
5770          */
5771
5772
5773         /*
5774          * SOA records are only allowed at top of zone.
5775          */
5776         if (rdataset->type == dns_rdatatype_soa &&
5777             !IS_CACHE(rbtdb) && !dns_name_equal(name, &rbtdb->common.origin))
5778                 return (DNS_R_NOTZONETOP);
5779
5780         add_empty_wildcards(rbtdb, name);
5781
5782         if (dns_name_iswildcard(name)) {
5783                 /*
5784                  * NS record owners cannot legally be wild cards.
5785                  */
5786                 if (rdataset->type == dns_rdatatype_ns)
5787                         return (DNS_R_INVALIDNS);
5788                 result = add_wildcard_magic(rbtdb, name);
5789                 if (result != ISC_R_SUCCESS)
5790                         return (result);
5791         }
5792
5793         node = NULL;
5794         result = dns_rbt_addnode(rbtdb->tree, name, &node);
5795         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
5796                 return (result);
5797         if (result != ISC_R_EXISTS) {
5798                 dns_name_t foundname;
5799                 dns_name_init(&foundname, NULL);
5800                 dns_rbt_namefromnode(node, &foundname);
5801 #ifdef DNS_RBT_USEHASH
5802                 node->locknum = node->hashval % rbtdb->node_lock_count;
5803 #else
5804                 node->locknum = dns_name_hash(&foundname, ISC_TRUE) %
5805                         rbtdb->node_lock_count;
5806 #endif
5807         }
5808
5809         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5810                                             &region,
5811                                             sizeof(rdatasetheader_t));
5812         if (result != ISC_R_SUCCESS)
5813                 return (result);
5814         newheader = (rdatasetheader_t *)region.base;
5815         init_rdataset(rbtdb, newheader);
5816         set_ttl(rbtdb, newheader,
5817                 rdataset->ttl + loadctx->now); /* XXX overflow check */
5818         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5819                                                 rdataset->covers);
5820         newheader->attributes = 0;
5821         newheader->trust = rdataset->trust;
5822         newheader->serial = 1;
5823         newheader->noqname = NULL;
5824         newheader->count = init_count++;
5825         newheader->additional_auth = NULL;
5826         newheader->additional_glue = NULL;
5827         /* won't be used, but initialize anyway */
5828         newheader->last_used = 0;
5829         newheader->node = node;
5830
5831         result = add(rbtdb, node, rbtdb->current_version, newheader,
5832                      DNS_DBADD_MERGE, ISC_TRUE, NULL, 0);
5833         if (result == ISC_R_SUCCESS &&
5834             delegating_type(rbtdb, node, rdataset->type))
5835                 node->find_callback = 1;
5836         else if (result == DNS_R_UNCHANGED)
5837                 result = ISC_R_SUCCESS;
5838
5839         return (result);
5840 }
5841
5842 static isc_result_t
5843 beginload(dns_db_t *db, dns_addrdatasetfunc_t *addp, dns_dbload_t **dbloadp) {
5844         rbtdb_load_t *loadctx;
5845         dns_rbtdb_t *rbtdb;
5846
5847         rbtdb = (dns_rbtdb_t *)db;
5848
5849         REQUIRE(VALID_RBTDB(rbtdb));
5850
5851         loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));
5852         if (loadctx == NULL)
5853                 return (ISC_R_NOMEMORY);
5854
5855         loadctx->rbtdb = rbtdb;
5856         if (IS_CACHE(rbtdb))
5857                 isc_stdtime_get(&loadctx->now);
5858         else
5859                 loadctx->now = 0;
5860
5861         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5862
5863         REQUIRE((rbtdb->attributes & (RBTDB_ATTR_LOADED|RBTDB_ATTR_LOADING))
5864                 == 0);
5865         rbtdb->attributes |= RBTDB_ATTR_LOADING;
5866
5867         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5868
5869         *addp = loading_addrdataset;
5870         *dbloadp = loadctx;
5871
5872         return (ISC_R_SUCCESS);
5873 }
5874
5875 static isc_result_t
5876 endload(dns_db_t *db, dns_dbload_t **dbloadp) {
5877         rbtdb_load_t *loadctx;
5878         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5879
5880         REQUIRE(VALID_RBTDB(rbtdb));
5881         REQUIRE(dbloadp != NULL);
5882         loadctx = *dbloadp;
5883         REQUIRE(loadctx->rbtdb == rbtdb);
5884
5885         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5886
5887         REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
5888         REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);
5889
5890         rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
5891         rbtdb->attributes |= RBTDB_ATTR_LOADED;
5892
5893         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5894
5895         /*
5896          * If there's a KEY rdataset at the zone origin containing a
5897          * zone key, we consider the zone secure.
5898          */
5899         if (! IS_CACHE(rbtdb))
5900                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5901
5902         *dbloadp = NULL;
5903
5904         isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));
5905
5906         return (ISC_R_SUCCESS);
5907 }
5908
5909 static isc_result_t
5910 dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
5911      dns_masterformat_t masterformat) {
5912         dns_rbtdb_t *rbtdb;
5913
5914         rbtdb = (dns_rbtdb_t *)db;
5915
5916         REQUIRE(VALID_RBTDB(rbtdb));
5917
5918         return (dns_master_dump2(rbtdb->common.mctx, db, version,
5919                                  &dns_master_style_default,
5920                                  filename, masterformat));
5921 }
5922
5923 static void
5924 delete_callback(void *data, void *arg) {
5925         dns_rbtdb_t *rbtdb = arg;
5926         rdatasetheader_t *current, *next;
5927
5928         for (current = data; current != NULL; current = next) {
5929                 next = current->next;
5930                 free_rdataset(rbtdb, rbtdb->common.mctx, current);
5931         }
5932 }
5933
5934 static isc_boolean_t
5935 issecure(dns_db_t *db) {
5936         dns_rbtdb_t *rbtdb;
5937         isc_boolean_t secure;
5938
5939         rbtdb = (dns_rbtdb_t *)db;
5940
5941         REQUIRE(VALID_RBTDB(rbtdb));
5942
5943         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5944         secure = rbtdb->secure;
5945         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5946
5947         return (secure);
5948 }
5949
5950 static unsigned int
5951 nodecount(dns_db_t *db) {
5952         dns_rbtdb_t *rbtdb;
5953         unsigned int count;
5954
5955         rbtdb = (dns_rbtdb_t *)db;
5956
5957         REQUIRE(VALID_RBTDB(rbtdb));
5958
5959         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5960         count = dns_rbt_nodecount(rbtdb->tree);
5961         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5962
5963         return (count);
5964 }
5965
5966 static void
5967 settask(dns_db_t *db, isc_task_t *task) {
5968         dns_rbtdb_t *rbtdb;
5969
5970         rbtdb = (dns_rbtdb_t *)db;
5971
5972         REQUIRE(VALID_RBTDB(rbtdb));
5973
5974         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5975         if (rbtdb->task != NULL)
5976                 isc_task_detach(&rbtdb->task);
5977         if (task != NULL)
5978                 isc_task_attach(task, &rbtdb->task);
5979         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5980 }
5981
5982 static isc_boolean_t
5983 ispersistent(dns_db_t *db) {
5984         UNUSED(db);
5985         return (ISC_FALSE);
5986 }
5987
5988 static isc_result_t
5989 getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
5990         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5991         dns_rbtnode_t *onode;
5992         isc_result_t result = ISC_R_SUCCESS;
5993
5994         REQUIRE(VALID_RBTDB(rbtdb));
5995         REQUIRE(nodep != NULL && *nodep == NULL);
5996
5997         /* Note that the access to origin_node doesn't require a DB lock */
5998         onode = (dns_rbtnode_t *)rbtdb->origin_node;
5999         if (onode != NULL) {
6000                 NODE_STRONGLOCK(&rbtdb->node_locks[onode->locknum].lock);
6001                 new_reference(rbtdb, onode);
6002                 NODE_STRONGUNLOCK(&rbtdb->node_locks[onode->locknum].lock);
6003
6004                 *nodep = rbtdb->origin_node;
6005         } else {
6006                 INSIST(!IS_CACHE(rbtdb));
6007                 result = ISC_R_NOTFOUND;
6008         }
6009
6010         return (result);
6011 }
6012
6013 static dns_stats_t *
6014 getrrsetstats(dns_db_t *db) {
6015         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6016
6017         REQUIRE(VALID_RBTDB(rbtdb));
6018         REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
6019
6020         return (rbtdb->rrsetstats);
6021 }
6022
6023 static dns_dbmethods_t zone_methods = {
6024         attach,
6025         detach,
6026         beginload,
6027         endload,
6028         dump,
6029         currentversion,
6030         newversion,
6031         attachversion,
6032         closeversion,
6033         findnode,
6034         zone_find,
6035         zone_findzonecut,
6036         attachnode,
6037         detachnode,
6038         expirenode,
6039         printnode,
6040         createiterator,
6041         zone_findrdataset,
6042         allrdatasets,
6043         addrdataset,
6044         subtractrdataset,
6045         deleterdataset,
6046         issecure,
6047         nodecount,
6048         ispersistent,
6049         overmem,
6050         settask,
6051         getoriginnode,
6052         NULL,
6053         NULL
6054 };
6055
6056 static dns_dbmethods_t cache_methods = {
6057         attach,
6058         detach,
6059         beginload,
6060         endload,
6061         dump,
6062         currentversion,
6063         newversion,
6064         attachversion,
6065         closeversion,
6066         findnode,
6067         cache_find,
6068         cache_findzonecut,
6069         attachnode,
6070         detachnode,
6071         expirenode,
6072         printnode,
6073         createiterator,
6074         cache_findrdataset,
6075         allrdatasets,
6076         addrdataset,
6077         subtractrdataset,
6078         deleterdataset,
6079         issecure,
6080         nodecount,
6081         ispersistent,
6082         overmem,
6083         settask,
6084         getoriginnode,
6085         NULL,
6086         getrrsetstats
6087 };
6088
/*
 * Create a red-black-tree database of the given 'type' and class
 * 'rdclass' rooted at 'origin', allocating from 'mctx', and return it
 * in '*dbp'.  The function is compiled as dns_rbtdb64_create when
 * DNS_RBTDB_VERSION64 is defined.
 *
 * 'argc', 'argv' and 'driverarg' are accepted but not used.
 *
 * Cache databases get cache_methods plus per-lock-bucket rdataset lists,
 * TTL heaps and an RRset statistics object; all other types get
 * zone_methods and an explicit origin node.
 *
 * Returns ISC_R_SUCCESS with '*dbp' set, or ISC_R_NOMEMORY, ISC_R_RANGE
 * or the failure result of a helper.  '*dbp' is not written on failure.
 *
 * Error handling comes in two phases: until the memory context has been
 * attached, failures unwind through the cleanup_* labels at the bottom;
 * after that point they are handed to free_rbtdb().
 */
isc_result_t
#ifdef DNS_RBTDB_VERSION64
dns_rbtdb64_create
#else
dns_rbtdb_create
#endif
                (isc_mem_t *mctx, dns_name_t *origin, dns_dbtype_t type,
                 dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
                 void *driverarg, dns_db_t **dbp)
{
        dns_rbtdb_t *rbtdb;
        isc_result_t result;
        int i;
        dns_name_t name;

        /* Keep the compiler happy. */
        UNUSED(argc);
        UNUSED(argv);
        UNUSED(driverarg);

        rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
        if (rbtdb == NULL)
                return (ISC_R_NOMEMORY);

        /*
         * Start from an all-zero structure; later code relies on fields
         * such as node_lock_count being zero at this point.
         */
        memset(rbtdb, '\0', sizeof(*rbtdb));
        dns_name_init(&rbtdb->common.origin, NULL);
        rbtdb->common.attributes = 0;
        if (type == dns_dbtype_cache) {
                rbtdb->common.methods = &cache_methods;
                rbtdb->common.attributes |= DNS_DBATTR_CACHE;
        } else if (type == dns_dbtype_stub) {
                rbtdb->common.methods = &zone_methods;
                rbtdb->common.attributes |= DNS_DBATTR_STUB;
        } else
                rbtdb->common.methods = &zone_methods;
        rbtdb->common.rdclass = rdclass;
        /* Attached later, once the fallible allocations below are done. */
        rbtdb->common.mctx = NULL;

        result = RBTDB_INITLOCK(&rbtdb->lock);
        if (result != ISC_R_SUCCESS)
                goto cleanup_rbtdb;

        result = isc_rwlock_init(&rbtdb->tree_lock, 0, 0);
        if (result != ISC_R_SUCCESS)
                goto cleanup_lock;

        /*
         * Initialize node_lock_count in a generic way to support future
         * extension which allows the user to specify this value on creation.
         * Note that when specified for a cache DB it must be larger than 1
         * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
         *
         * As the code stands, node_lock_count is still zero from the
         * memset above, so the default branch is the one taken.
         */
        if (rbtdb->node_lock_count == 0) {
                if (IS_CACHE(rbtdb))
                        rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
                else
                        rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
        } else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
                result = ISC_R_RANGE;
                goto cleanup_tree_lock;
        }
        INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
        rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
                                        sizeof(rbtdb_nodelock_t));
        if (rbtdb->node_locks == NULL) {
                result = ISC_R_NOMEMORY;
                goto cleanup_tree_lock;
        }

        /*
         * Cache-only state: statistics, one rdataset list and one TTL
         * heap per node lock bucket.
         */
        rbtdb->rrsetstats = NULL;
        if (IS_CACHE(rbtdb)) {
                result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
                if (result != ISC_R_SUCCESS)
                        goto cleanup_node_locks;
                rbtdb->rdatasets = isc_mem_get(mctx, rbtdb->node_lock_count *
                                               sizeof(rdatasetheaderlist_t));
                if (rbtdb->rdatasets == NULL) {
                        result = ISC_R_NOMEMORY;
                        goto cleanup_rrsetstats;
                }
                for (i = 0; i < (int)rbtdb->node_lock_count; i++)
                        ISC_LIST_INIT(rbtdb->rdatasets[i]);

                /*
                 * Create the heaps.
                 */
                rbtdb->heaps = isc_mem_get(mctx, rbtdb->node_lock_count *
                                           sizeof(isc_heap_t *));
                if (rbtdb->heaps == NULL) {
                        result = ISC_R_NOMEMORY;
                        goto cleanup_rdatasets;
                }
                /*
                 * Clear every slot first so that cleanup_heaps can tell
                 * created heaps from untouched ones if creation fails
                 * part-way through.
                 */
                for (i = 0; i < (int)rbtdb->node_lock_count; i++)
                        rbtdb->heaps[i] = NULL;
                for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
                        result = isc_heap_create(mctx, ttl_sooner,
                                                 ttl_set_index, 0,
                                                 &rbtdb->heaps[i]);
                        if (result != ISC_R_SUCCESS)
                                goto cleanup_heaps;
                }
        } else {
                rbtdb->rdatasets = NULL;
                rbtdb->heaps = NULL;
        }

        rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
                                       sizeof(rbtnodelist_t));
        if (rbtdb->deadnodes == NULL) {
                result = ISC_R_NOMEMORY;
                goto cleanup_heaps;
        }
        for (i = 0; i < (int)rbtdb->node_lock_count; i++)
                ISC_LIST_INIT(rbtdb->deadnodes[i]);

        rbtdb->active = rbtdb->node_lock_count;

        /*
         * Set up each node lock bucket: its lock, a reference counter
         * starting at zero, and the 'exiting' flag.
         */
        for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
                result = NODE_INITLOCK(&rbtdb->node_locks[i].lock);
                if (result == ISC_R_SUCCESS) {
                        result = isc_refcount_init(&rbtdb->node_locks[i].references, 0);
                        if (result != ISC_R_SUCCESS)
                                NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
                }
                if (result != ISC_R_SUCCESS) {
                        /*
                         * Bucket 'i' has already been undone above (or
                         * was never set up); tear down buckets i-1 .. 0.
                         *
                         * NOTE(review): the counters were initialized to
                         * 0, yet they are decremented before being
                         * destroyed here -- confirm this is what the
                         * isc_refcount API expects.
                         */
                        while (i-- > 0) {
                                NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
                                isc_refcount_decrement(&rbtdb->node_locks[i].references, NULL);
                                isc_refcount_destroy(&rbtdb->node_locks[i].references);
                        }
                        goto cleanup_deadnodes;
                }
                rbtdb->node_locks[i].exiting = ISC_FALSE;
        }

        /*
         * Attach to the mctx.  The database will persist so long as there
         * are references to it, and attaching to the mctx ensures that our
         * mctx won't disappear out from under us.
         *
         * From here on, failures are cleaned up with free_rbtdb() rather
         * than through the cleanup_* labels.
         */
        isc_mem_attach(mctx, &rbtdb->common.mctx);

        /*
         * Must be initialized before free_rbtdb() is called.
         */
        isc_ondestroy_init(&rbtdb->common.ondest);

        /*
         * Make a copy of the origin name.
         */
        result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
        if (result != ISC_R_SUCCESS) {
                free_rbtdb(rbtdb, ISC_FALSE, NULL);
                return (result);
        }

        /*
         * Make the Red-Black Tree.
         */
        result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
        if (result != ISC_R_SUCCESS) {
                free_rbtdb(rbtdb, ISC_FALSE, NULL);
                return (result);
        }
        /*
         * In order to set the node callback bit correctly in zone databases,
         * we need to know if the node has the origin name of the zone.
         * In loading_addrdataset() we could simply compare the new name
         * to the origin name, but this is expensive.  Also, we don't know the
         * node name in addrdataset(), so we need another way of knowing the
         * zone's top.
         *
         * We now explicitly create a node for the zone's origin, and then
         * we simply remember the node's address.  This is safe, because
         * the top-of-zone node can never be deleted, nor can its address
         * change.
         *
         * Cache databases skip this block, so their origin_node stays
         * NULL.
         */
        if (!IS_CACHE(rbtdb)) {
                rbtdb->origin_node = NULL;
                result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
                                         &rbtdb->origin_node);
                if (result != ISC_R_SUCCESS) {
                        /* The tree was just created, so nothing can exist. */
                        INSIST(result != ISC_R_EXISTS);
                        free_rbtdb(rbtdb, ISC_FALSE, NULL);
                        return (result);
                }
                /*
                 * We need to give the origin node the right locknum.
                 */
                dns_name_init(&name, NULL);
                dns_rbt_namefromnode(rbtdb->origin_node, &name);
#ifdef DNS_RBT_USEHASH
                rbtdb->origin_node->locknum =
                        rbtdb->origin_node->hashval %
                        rbtdb->node_lock_count;
#else
                rbtdb->origin_node->locknum =
                        dns_name_hash(&name, ISC_TRUE) %
                        rbtdb->node_lock_count;
#endif
        }

        /*
         * Misc. Initialization.
         */
        result = isc_refcount_init(&rbtdb->references, 1);
        if (result != ISC_R_SUCCESS) {
                free_rbtdb(rbtdb, ISC_FALSE, NULL);
                return (result);
        }
        rbtdb->attributes = 0;
        rbtdb->secure = ISC_FALSE;
        rbtdb->overmem = ISC_FALSE;
        rbtdb->task = NULL;

        /*
         * Version Initialization.
         */
        rbtdb->current_serial = 1;
        rbtdb->least_serial = 1;
        rbtdb->next_serial = 2;
        rbtdb->current_version = allocate_version(mctx, 1, 1, ISC_FALSE);
        if (rbtdb->current_version == NULL) {
                /* Drop the single reference taken above before freeing. */
                isc_refcount_decrement(&rbtdb->references, NULL);
                isc_refcount_destroy(&rbtdb->references);
                free_rbtdb(rbtdb, ISC_FALSE, NULL);
                return (ISC_R_NOMEMORY);
        }
        rbtdb->future_version = NULL;
        ISC_LIST_INIT(rbtdb->open_versions);
        /*
         * Keep the current version in the open list so that list operation
         * won't happen in normal lookup operations.
         */
        PREPEND(rbtdb->open_versions, rbtdb->current_version, link);

        /* Set the magic numbers last, once everything else is in place. */
        rbtdb->common.magic = DNS_DB_MAGIC;
        rbtdb->common.impmagic = RBTDB_MAGIC;

        *dbp = (dns_db_t *)rbtdb;

        return (ISC_R_SUCCESS);

        /*
         * Early-failure unwinding.  Each label releases one resource and
         * falls through to the labels below it, so resources are released
         * in the reverse order of their acquisition.
         */
 cleanup_deadnodes:
        isc_mem_put(mctx, rbtdb->deadnodes,
                    rbtdb->node_lock_count * sizeof(rbtnodelist_t));

 cleanup_heaps:
        if (rbtdb->heaps != NULL) {
                for (i = 0 ; i < (int)rbtdb->node_lock_count ; i++)
                        if (rbtdb->heaps[i] != NULL)
                                isc_heap_destroy(&rbtdb->heaps[i]);
                isc_mem_put(mctx, rbtdb->heaps,
                            rbtdb->node_lock_count * sizeof(isc_heap_t *));
        }

 cleanup_rdatasets:
        if (rbtdb->rdatasets != NULL)
                isc_mem_put(mctx, rbtdb->rdatasets, rbtdb->node_lock_count *
                            sizeof(rdatasetheaderlist_t));
 cleanup_rrsetstats:
        if (rbtdb->rrsetstats != NULL)
                dns_stats_detach(&rbtdb->rrsetstats);

 cleanup_node_locks:
        isc_mem_put(mctx, rbtdb->node_locks,
                    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));

 cleanup_tree_lock:
        isc_rwlock_destroy(&rbtdb->tree_lock);

 cleanup_lock:
        RBTDB_DESTROYLOCK(&rbtdb->lock);

 cleanup_rbtdb:
        isc_mem_put(mctx, rbtdb,  sizeof(*rbtdb));
        return (result);
}
6367
6368
6369 /*
6370  * Slabbed Rdataset Methods
6371  */
6372
6373 static void
6374 rdataset_disassociate(dns_rdataset_t *rdataset) {
6375         dns_db_t *db = rdataset->private1;
6376         dns_dbnode_t *node = rdataset->private2;
6377
6378         detachnode(db, &node);
6379 }
6380
6381 static isc_result_t
6382 rdataset_first(dns_rdataset_t *rdataset) {
6383         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
6384         unsigned int count;
6385
6386         count = raw[0] * 256 + raw[1];
6387         if (count == 0) {
6388                 rdataset->private5 = NULL;
6389                 return (ISC_R_NOMORE);
6390         }
6391
6392 #if DNS_RDATASET_FIXED
6393         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0)
6394                 raw += 2 + (4 * count);
6395         else
6396 #endif
6397                 raw += 2;
6398
6399         /*
6400          * The privateuint4 field is the number of rdata beyond the
6401          * cursor position, so we decrement the total count by one
6402          * before storing it.
6403          *
6404          * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the
6405          * first record.  If DNS_RDATASETATTR_LOADORDER is set 'raw' points
6406          * to the first entry in the offset table.
6407          */
6408         count--;
6409         rdataset->privateuint4 = count;
6410         rdataset->private5 = raw;
6411
6412         return (ISC_R_SUCCESS);
6413 }
6414
6415 static isc_result_t
6416 rdataset_next(dns_rdataset_t *rdataset) {
6417         unsigned int count;
6418         unsigned int length;
6419         unsigned char *raw;     /* RDATASLAB */
6420
6421         count = rdataset->privateuint4;
6422         if (count == 0)
6423                 return (ISC_R_NOMORE);
6424         count--;
6425         rdataset->privateuint4 = count;
6426
6427         /*
6428          * Skip forward one record (length + 4) or one offset (4).
6429          */
6430         raw = rdataset->private5;
6431 #if DNS_RDATASET_FIXED
6432         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) {
6433 #endif
6434                 length = raw[0] * 256 + raw[1];
6435                 raw += length;
6436 #if DNS_RDATASET_FIXED
6437         }
6438         rdataset->private5 = raw + 4;           /* length(2) + order(2) */
6439 #else
6440         rdataset->private5 = raw + 2;           /* length(2) */
6441 #endif
6442
6443         return (ISC_R_SUCCESS);
6444 }
6445
6446 static void
6447 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) {
6448         unsigned char *raw = rdataset->private5;        /* RDATASLAB */
6449 #if DNS_RDATASET_FIXED
6450         unsigned int offset;
6451 #endif
6452         isc_region_t r;
6453
6454         REQUIRE(raw != NULL);
6455
6456         /*
6457          * Find the start of the record if not already in private5
6458          * then skip the length and order fields.
6459          */
6460 #if DNS_RDATASET_FIXED
6461         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) {
6462                 offset = (raw[0] << 24) + (raw[1] << 16) +
6463                          (raw[2] << 8) + raw[3];
6464                 raw = rdataset->private3;
6465                 raw += offset;
6466         }
6467 #endif
6468         r.length = raw[0] * 256 + raw[1];
6469
6470 #if DNS_RDATASET_FIXED
6471         raw += 4;
6472 #else
6473         raw += 2;
6474 #endif
6475         r.base = raw;
6476         dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r);
6477 }
6478
6479 static void
6480 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) {
6481         dns_db_t *db = source->private1;
6482         dns_dbnode_t *node = source->private2;
6483         dns_dbnode_t *cloned_node = NULL;
6484
6485         attachnode(db, node, &cloned_node);
6486         *target = *source;
6487
6488         /*
6489          * Reset iterator state.
6490          */
6491         target->privateuint4 = 0;
6492         target->private5 = NULL;
6493 }
6494
6495 static unsigned int
6496 rdataset_count(dns_rdataset_t *rdataset) {
6497         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
6498         unsigned int count;
6499
6500         count = raw[0] * 256 + raw[1];
6501
6502         return (count);
6503 }
6504
6505 static isc_result_t
6506 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
6507                     dns_rdataset_t *nsec, dns_rdataset_t *nsecsig)
6508 {
6509         dns_db_t *db = rdataset->private1;
6510         dns_dbnode_t *node = rdataset->private2;
6511         dns_dbnode_t *cloned_node;
6512         struct noqname *noqname = rdataset->private6;
6513
6514         cloned_node = NULL;
6515         attachnode(db, node, &cloned_node);
6516         nsec->methods = &rdataset_methods;
6517         nsec->rdclass = db->rdclass;
6518         nsec->type = dns_rdatatype_nsec;
6519         nsec->covers = 0;
6520         nsec->ttl = rdataset->ttl;
6521         nsec->trust = rdataset->trust;
6522         nsec->private1 = rdataset->private1;
6523         nsec->private2 = rdataset->private2;
6524         nsec->private3 = noqname->nsec;
6525         nsec->privateuint4 = 0;
6526         nsec->private5 = NULL;
6527         nsec->private6 = NULL;
6528
6529         cloned_node = NULL;
6530         attachnode(db, node, &cloned_node);
6531         nsecsig->methods = &rdataset_methods;
6532         nsecsig->rdclass = db->rdclass;
6533         nsecsig->type = dns_rdatatype_rrsig;
6534         nsecsig->covers = dns_rdatatype_nsec;
6535         nsecsig->ttl = rdataset->ttl;
6536         nsecsig->trust = rdataset->trust;
6537         nsecsig->private1 = rdataset->private1;
6538         nsecsig->private2 = rdataset->private2;
6539         nsecsig->private3 = noqname->nsecsig;
6540         nsecsig->privateuint4 = 0;
6541         nsecsig->private5 = NULL;
6542         nsec->private6 = NULL;
6543
6544         dns_name_clone(&noqname->name, name);
6545
6546         return (ISC_R_SUCCESS);
6547 }
6548
6549 /*
6550  * Rdataset Iterator Methods
6551  */
6552
6553 static void
6554 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) {
6555         rbtdb_rdatasetiter_t *rbtiterator;
6556
6557         rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
6558
6559         if (rbtiterator->common.version != NULL)
6560                 closeversion(rbtiterator->common.db,
6561                              &rbtiterator->common.version, ISC_FALSE);
6562         detachnode(rbtiterator->common.db, &rbtiterator->common.node);
6563         isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
6564                     sizeof(*rbtiterator));
6565
6566         *iteratorp = NULL;
6567 }
6568
6569 static isc_result_t
6570 rdatasetiter_first(dns_rdatasetiter_t *iterator) {
6571         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6572         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6573         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6574         rbtdb_version_t *rbtversion = rbtiterator->common.version;
6575         rdatasetheader_t *header, *top_next;
6576         rbtdb_serial_t serial;
6577         isc_stdtime_t now;
6578
6579         if (IS_CACHE(rbtdb)) {
6580                 serial = 1;
6581                 now = rbtiterator->common.now;
6582         } else {
6583                 serial = rbtversion->serial;
6584                 now = 0;
6585         }
6586
6587         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6588                   isc_rwlocktype_read);
6589
6590         for (header = rbtnode->data; header != NULL; header = top_next) {
6591                 top_next = header->next;
6592                 do {
6593                         if (header->serial <= serial && !IGNORE(header)) {
6594                                 /*
6595                                  * Is this a "this rdataset doesn't exist"
6596                                  * record?  Or is it too old in the cache?
6597                                  *
6598                                  * Note: unlike everywhere else, we
6599                                  * check for now > header->ttl instead
6600                                  * of now >= header->ttl.  This allows
6601                                  * ANY and RRSIG queries for 0 TTL
6602                                  * rdatasets to work.
6603                                  */
6604                                 if (NONEXISTENT(header) ||
6605                                     (now != 0 && now > header->rdh_ttl))
6606                                         header = NULL;
6607                                 break;
6608                         } else
6609                                 header = header->down;
6610                 } while (header != NULL);
6611                 if (header != NULL)
6612                         break;
6613         }
6614
6615         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6616                     isc_rwlocktype_read);
6617
6618         rbtiterator->current = header;
6619
6620         if (header == NULL)
6621                 return (ISC_R_NOMORE);
6622
6623         return (ISC_R_SUCCESS);
6624 }
6625
6626 static isc_result_t
6627 rdatasetiter_next(dns_rdatasetiter_t *iterator) {
6628         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6629         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6630         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6631         rbtdb_version_t *rbtversion = rbtiterator->common.version;
6632         rdatasetheader_t *header, *top_next;
6633         rbtdb_serial_t serial;
6634         isc_stdtime_t now;
6635         rbtdb_rdatatype_t type, negtype;
6636         dns_rdatatype_t rdtype, covers;
6637
6638         header = rbtiterator->current;
6639         if (header == NULL)
6640                 return (ISC_R_NOMORE);
6641
6642         if (IS_CACHE(rbtdb)) {
6643                 serial = 1;
6644                 now = rbtiterator->common.now;
6645         } else {
6646                 serial = rbtversion->serial;
6647                 now = 0;
6648         }
6649
6650         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6651                   isc_rwlocktype_read);
6652
6653         type = header->type;
6654         rdtype = RBTDB_RDATATYPE_BASE(header->type);
6655         if (rdtype == 0) {
6656                 covers = RBTDB_RDATATYPE_EXT(header->type);
6657                 negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
6658         } else
6659                 negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
6660         for (header = header->next; header != NULL; header = top_next) {
6661                 top_next = header->next;
6662                 /*
6663                  * If not walking back up the down list.
6664                  */
6665                 if (header->type != type && header->type != negtype) {
6666                         do {
6667                                 if (header->serial <= serial &&
6668                                     !IGNORE(header)) {
6669                                         /*
6670                                          * Is this a "this rdataset doesn't
6671                                          * exist" record?
6672                                          *
6673                                          * Note: unlike everywhere else, we
6674                                          * check for now > header->ttl instead
6675                                          * of now >= header->ttl.  This allows
6676                                          * ANY and RRSIG queries for 0 TTL
6677                                          * rdatasets to work.
6678                                          */
6679                                         if ((header->attributes &
6680                                              RDATASET_ATTR_NONEXISTENT) != 0 ||
6681                                             (now != 0 && now > header->rdh_ttl))
6682                                                 header = NULL;
6683                                         break;
6684                                 } else
6685                                         header = header->down;
6686                         } while (header != NULL);
6687                         if (header != NULL)
6688                                 break;
6689                 }
6690         }
6691
6692         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6693                     isc_rwlocktype_read);
6694
6695         rbtiterator->current = header;
6696
6697         if (header == NULL)
6698                 return (ISC_R_NOMORE);
6699
6700         return (ISC_R_SUCCESS);
6701 }
6702
6703 static void
6704 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) {
6705         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6706         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6707         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6708         rdatasetheader_t *header;
6709
6710         header = rbtiterator->current;
6711         REQUIRE(header != NULL);
6712
6713         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6714                   isc_rwlocktype_read);
6715
6716         bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
6717                       rdataset);
6718
6719         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6720                     isc_rwlocktype_read);
6721 }
6722
6723
6724 /*
6725  * Database Iterator Methods
6726  */
6727
6728 static inline void
6729 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
6730         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6731         dns_rbtnode_t *node = rbtdbiter->node;
6732
6733         if (node == NULL)
6734                 return;
6735
6736         INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
6737         reactivate_node(rbtdb, node, rbtdbiter->tree_locked);
6738 }
6739
6740 static inline void
6741 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
6742         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6743         dns_rbtnode_t *node = rbtdbiter->node;
6744         nodelock_t *lock;
6745
6746         if (node == NULL)
6747                 return;
6748
6749         lock = &rbtdb->node_locks[node->locknum].lock;
6750         NODE_LOCK(lock, isc_rwlocktype_read);
6751         decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
6752                             rbtdbiter->tree_locked, ISC_FALSE);
6753         NODE_UNLOCK(lock, isc_rwlocktype_read);
6754
6755         rbtdbiter->node = NULL;
6756 }
6757
6758 static void
6759 flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
6760         dns_rbtnode_t *node;
6761         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6762         isc_boolean_t was_read_locked = ISC_FALSE;
6763         nodelock_t *lock;
6764         int i;
6765
6766         if (rbtdbiter->delete != 0) {
6767                 /*
6768                  * Note that "%d node of %d in tree" can report things like
6769                  * "flush_deletions: 59 nodes of 41 in tree".  This means
6770                  * That some nodes appear on the deletions list more than
6771                  * once.  Only the last occurence will actually be deleted.
6772                  */
6773                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
6774                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
6775                               "flush_deletions: %d nodes of %d in tree",
6776                               rbtdbiter->delete,
6777                               dns_rbt_nodecount(rbtdb->tree));
6778
6779                 if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
6780                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6781                         was_read_locked = ISC_TRUE;
6782                 }
6783                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6784                 rbtdbiter->tree_locked = isc_rwlocktype_write;
6785
6786                 for (i = 0; i < rbtdbiter->delete; i++) {
6787                         node = rbtdbiter->deletions[i];
6788                         lock = &rbtdb->node_locks[node->locknum].lock;
6789
6790                         NODE_LOCK(lock, isc_rwlocktype_read);
6791                         decrement_reference(rbtdb, node, 0,
6792                                             isc_rwlocktype_read,
6793                                             rbtdbiter->tree_locked, ISC_FALSE);
6794                         NODE_UNLOCK(lock, isc_rwlocktype_read);
6795                 }
6796
6797                 rbtdbiter->delete = 0;
6798
6799                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6800                 if (was_read_locked) {
6801                         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6802                         rbtdbiter->tree_locked = isc_rwlocktype_read;
6803
6804                 } else {
6805                         rbtdbiter->tree_locked = isc_rwlocktype_none;
6806                 }
6807         }
6808 }
6809
6810 static inline void
6811 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
6812         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6813
6814         REQUIRE(rbtdbiter->paused);
6815         REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
6816
6817         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6818         rbtdbiter->tree_locked = isc_rwlocktype_read;
6819
6820         rbtdbiter->paused = ISC_FALSE;
6821 }
6822
6823 static void
6824 dbiterator_destroy(dns_dbiterator_t **iteratorp) {
6825         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
6826         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6827         dns_db_t *db = NULL;
6828
6829         if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
6830                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6831                 rbtdbiter->tree_locked = isc_rwlocktype_none;
6832         } else
6833                 INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
6834
6835         dereference_iter_node(rbtdbiter);
6836
6837         flush_deletions(rbtdbiter);
6838
6839         dns_db_attach(rbtdbiter->common.db, &db);
6840         dns_db_detach(&rbtdbiter->common.db);
6841
6842         dns_rbtnodechain_reset(&rbtdbiter->chain);
6843         isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
6844         dns_db_detach(&db);
6845
6846         *iteratorp = NULL;
6847 }
6848
6849 static isc_result_t
6850 dbiterator_first(dns_dbiterator_t *iterator) {
6851         isc_result_t result;
6852         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6853         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6854         dns_name_t *name, *origin;
6855
6856         if (rbtdbiter->result != ISC_R_SUCCESS &&
6857             rbtdbiter->result != ISC_R_NOMORE)
6858                 return (rbtdbiter->result);
6859
6860         if (rbtdbiter->paused)
6861                 resume_iteration(rbtdbiter);
6862
6863         dereference_iter_node(rbtdbiter);
6864
6865         name = dns_fixedname_name(&rbtdbiter->name);
6866         origin = dns_fixedname_name(&rbtdbiter->origin);
6867         dns_rbtnodechain_reset(&rbtdbiter->chain);
6868
6869         result = dns_rbtnodechain_first(&rbtdbiter->chain, rbtdb->tree, name,
6870                                         origin);
6871
6872         if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
6873                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6874                                                   NULL, &rbtdbiter->node);
6875                 if (result == ISC_R_SUCCESS) {
6876                         rbtdbiter->new_origin = ISC_TRUE;
6877                         reference_iter_node(rbtdbiter);
6878                 }
6879         } else {
6880                 INSIST(result == ISC_R_NOTFOUND);
6881                 result = ISC_R_NOMORE; /* The tree is empty. */
6882         }
6883
6884         rbtdbiter->result = result;
6885
6886         return (result);
6887 }
6888
6889 static isc_result_t
6890 dbiterator_last(dns_dbiterator_t *iterator) {
6891         isc_result_t result;
6892         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6893         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6894         dns_name_t *name, *origin;
6895
6896         if (rbtdbiter->result != ISC_R_SUCCESS &&
6897             rbtdbiter->result != ISC_R_NOMORE)
6898                 return (rbtdbiter->result);
6899
6900         if (rbtdbiter->paused)
6901                 resume_iteration(rbtdbiter);
6902
6903         dereference_iter_node(rbtdbiter);
6904
6905         name = dns_fixedname_name(&rbtdbiter->name);
6906         origin = dns_fixedname_name(&rbtdbiter->origin);
6907         dns_rbtnodechain_reset(&rbtdbiter->chain);
6908
6909         result = dns_rbtnodechain_last(&rbtdbiter->chain, rbtdb->tree, name,
6910                                        origin);
6911         if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
6912                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6913                                                   NULL, &rbtdbiter->node);
6914                 if (result == ISC_R_SUCCESS) {
6915                         rbtdbiter->new_origin = ISC_TRUE;
6916                         reference_iter_node(rbtdbiter);
6917                 }
6918         } else {
6919                 INSIST(result == ISC_R_NOTFOUND);
6920                 result = ISC_R_NOMORE; /* The tree is empty. */
6921         }
6922
6923         rbtdbiter->result = result;
6924
6925         return (result);
6926 }
6927
6928 static isc_result_t
6929 dbiterator_seek(dns_dbiterator_t *iterator, dns_name_t *name) {
6930         isc_result_t result;
6931         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6932         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6933         dns_name_t *iname, *origin;
6934
6935         if (rbtdbiter->result != ISC_R_SUCCESS &&
6936             rbtdbiter->result != ISC_R_NOMORE)
6937                 return (rbtdbiter->result);
6938
6939         if (rbtdbiter->paused)
6940                 resume_iteration(rbtdbiter);
6941
6942         dereference_iter_node(rbtdbiter);
6943
6944         iname = dns_fixedname_name(&rbtdbiter->name);
6945         origin = dns_fixedname_name(&rbtdbiter->origin);
6946         dns_rbtnodechain_reset(&rbtdbiter->chain);
6947
6948         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &rbtdbiter->node,
6949                                   &rbtdbiter->chain, DNS_RBTFIND_EMPTYDATA,
6950                                   NULL, NULL);
6951         if (result == ISC_R_SUCCESS) {
6952                 result = dns_rbtnodechain_current(&rbtdbiter->chain, iname,
6953                                                   origin, NULL);
6954                 if (result == ISC_R_SUCCESS) {
6955                         rbtdbiter->new_origin = ISC_TRUE;
6956                         reference_iter_node(rbtdbiter);
6957                 }
6958
6959         } else if (result == DNS_R_PARTIALMATCH)
6960                 result = ISC_R_NOTFOUND;
6961
6962         rbtdbiter->result = result;
6963
6964         return (result);
6965 }
6966
6967 static isc_result_t
6968 dbiterator_prev(dns_dbiterator_t *iterator) {
6969         isc_result_t result;
6970         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6971         dns_name_t *name, *origin;
6972
6973         REQUIRE(rbtdbiter->node != NULL);
6974
6975         if (rbtdbiter->result != ISC_R_SUCCESS)
6976                 return (rbtdbiter->result);
6977
6978         if (rbtdbiter->paused)
6979                 resume_iteration(rbtdbiter);
6980
6981         name = dns_fixedname_name(&rbtdbiter->name);
6982         origin = dns_fixedname_name(&rbtdbiter->origin);
6983         result = dns_rbtnodechain_prev(&rbtdbiter->chain, name, origin);
6984
6985         dereference_iter_node(rbtdbiter);
6986
6987         if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
6988                 rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN);
6989                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6990                                                   NULL, &rbtdbiter->node);
6991         }
6992
6993         if (result == ISC_R_SUCCESS)
6994                 reference_iter_node(rbtdbiter);
6995
6996         rbtdbiter->result = result;
6997
6998         return (result);
6999 }
7000
7001 static isc_result_t
7002 dbiterator_next(dns_dbiterator_t *iterator) {
7003         isc_result_t result;
7004         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7005         dns_name_t *name, *origin;
7006
7007         REQUIRE(rbtdbiter->node != NULL);
7008
7009         if (rbtdbiter->result != ISC_R_SUCCESS)
7010                 return (rbtdbiter->result);
7011
7012         if (rbtdbiter->paused)
7013                 resume_iteration(rbtdbiter);
7014
7015         name = dns_fixedname_name(&rbtdbiter->name);
7016         origin = dns_fixedname_name(&rbtdbiter->origin);
7017         result = dns_rbtnodechain_next(&rbtdbiter->chain, name, origin);
7018
7019         dereference_iter_node(rbtdbiter);
7020
7021         if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
7022                 rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN);
7023                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
7024                                                   NULL, &rbtdbiter->node);
7025         }
7026         if (result == ISC_R_SUCCESS)
7027                 reference_iter_node(rbtdbiter);
7028
7029         rbtdbiter->result = result;
7030
7031         return (result);
7032 }
7033
7034 static isc_result_t
7035 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
7036                    dns_name_t *name)
7037 {
7038         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
7039         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7040         dns_rbtnode_t *node = rbtdbiter->node;
7041         isc_result_t result;
7042         dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
7043         dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
7044
7045         REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
7046         REQUIRE(rbtdbiter->node != NULL);
7047
7048         if (rbtdbiter->paused)
7049                 resume_iteration(rbtdbiter);
7050
7051         if (name != NULL) {
7052                 if (rbtdbiter->common.relative_names)
7053                         origin = NULL;
7054                 result = dns_name_concatenate(nodename, origin, name, NULL);
7055                 if (result != ISC_R_SUCCESS)
7056                         return (result);
7057                 if (rbtdbiter->common.relative_names && rbtdbiter->new_origin)
7058                         result = DNS_R_NEWORIGIN;
7059         } else
7060                 result = ISC_R_SUCCESS;
7061
7062         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
7063         new_reference(rbtdb, node);
7064         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
7065
7066         *nodep = rbtdbiter->node;
7067
7068         if (iterator->cleaning && result == ISC_R_SUCCESS) {
7069                 isc_result_t expire_result;
7070
7071                 /*
7072                  * If the deletion array is full, flush it before trying
7073                  * to expire the current node.  The current node can't
7074                  * fully deleted while the iteration cursor is still on it.
7075                  */
7076                 if (rbtdbiter->delete == DELETION_BATCH_MAX)
7077                         flush_deletions(rbtdbiter);
7078
7079                 expire_result = expirenode(iterator->db, *nodep, 0);
7080
7081                 /*
7082                  * expirenode() currently always returns success.
7083                  */
7084                 if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
7085                         unsigned int refs;
7086
7087                         rbtdbiter->deletions[rbtdbiter->delete++] = node;
7088                         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
7089                         dns_rbtnode_refincrement(node, &refs);
7090                         INSIST(refs != 0);
7091                         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
7092                 }
7093         }
7094
7095         return (result);
7096 }
7097
7098 static isc_result_t
7099 dbiterator_pause(dns_dbiterator_t *iterator) {
7100         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
7101         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7102
7103         if (rbtdbiter->result != ISC_R_SUCCESS &&
7104             rbtdbiter->result != ISC_R_NOMORE)
7105                 return (rbtdbiter->result);
7106
7107         if (rbtdbiter->paused)
7108                 return (ISC_R_SUCCESS);
7109
7110         rbtdbiter->paused = ISC_TRUE;
7111
7112         if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
7113                 INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
7114                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7115                 rbtdbiter->tree_locked = isc_rwlocktype_none;
7116         }
7117
7118         flush_deletions(rbtdbiter);
7119
7120         return (ISC_R_SUCCESS);
7121 }
7122
7123 static isc_result_t
7124 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
7125         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7126         dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
7127
7128         if (rbtdbiter->result != ISC_R_SUCCESS)
7129                 return (rbtdbiter->result);
7130
7131         return (dns_name_copy(origin, name, NULL));
7132 }
7133
7134 /*%
7135  * Additional cache routines.
7136  */
7137 static isc_result_t
7138 rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type,
7139                        dns_rdatatype_t qtype, dns_acache_t *acache,
7140                        dns_zone_t **zonep, dns_db_t **dbp,
7141                        dns_dbversion_t **versionp, dns_dbnode_t **nodep,
7142                        dns_name_t *fname, dns_message_t *msg,
7143                        isc_stdtime_t now)
7144 {
7145         dns_rbtdb_t *rbtdb = rdataset->private1;
7146         dns_rbtnode_t *rbtnode = rdataset->private2;
7147         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7148         unsigned int current_count = rdataset->privateuint4;
7149         unsigned int count;
7150         rdatasetheader_t *header;
7151         nodelock_t *nodelock;
7152         unsigned int total_count;
7153         acachectl_t *acarray;
7154         dns_acacheentry_t *entry;
7155         isc_result_t result;
7156
7157         UNUSED(qtype); /* we do not use this value at least for now */
7158         UNUSED(acache);
7159
7160         header = (struct rdatasetheader *)(raw - sizeof(*header));
7161
7162         total_count = raw[0] * 256 + raw[1];
7163         INSIST(total_count > current_count);
7164         count = total_count - current_count - 1;
7165
7166         acarray = NULL;
7167
7168         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7169         NODE_LOCK(nodelock, isc_rwlocktype_read);
7170
7171         switch (type) {
7172         case dns_rdatasetadditional_fromauth:
7173                 acarray = header->additional_auth;
7174                 break;
7175         case dns_rdatasetadditional_fromcache:
7176                 acarray = NULL;
7177                 break;
7178         case dns_rdatasetadditional_fromglue:
7179                 acarray = header->additional_glue;
7180                 break;
7181         default:
7182                 INSIST(0);
7183         }
7184
7185         if (acarray == NULL) {
7186                 if (type != dns_rdatasetadditional_fromcache)
7187                         dns_acache_countquerymiss(acache);
7188                 NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7189                 return (ISC_R_NOTFOUND);
7190         }
7191
7192         if (acarray[count].entry == NULL) {
7193                 dns_acache_countquerymiss(acache);
7194                 NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7195                 return (ISC_R_NOTFOUND);
7196         }
7197
7198         entry = NULL;
7199         dns_acache_attachentry(acarray[count].entry, &entry);
7200
7201         NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7202
7203         result = dns_acache_getentry(entry, zonep, dbp, versionp,
7204                                      nodep, fname, msg, now);
7205
7206         dns_acache_detachentry(&entry);
7207
7208         return (result);
7209 }
7210
7211 static void
7212 acache_callback(dns_acacheentry_t *entry, void **arg) {
7213         dns_rbtdb_t *rbtdb;
7214         dns_rbtnode_t *rbtnode;
7215         nodelock_t *nodelock;
7216         acachectl_t *acarray = NULL;
7217         acache_cbarg_t *cbarg;
7218         unsigned int count;
7219
7220         REQUIRE(arg != NULL);
7221         cbarg = *arg;
7222
7223         /*
7224          * The caller must hold the entry lock.
7225          */
7226
7227         rbtdb = (dns_rbtdb_t *)cbarg->db;
7228         rbtnode = (dns_rbtnode_t *)cbarg->node;
7229
7230         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7231         NODE_LOCK(nodelock, isc_rwlocktype_write);
7232
7233         switch (cbarg->type) {
7234         case dns_rdatasetadditional_fromauth:
7235                 acarray = cbarg->header->additional_auth;
7236                 break;
7237         case dns_rdatasetadditional_fromglue:
7238                 acarray = cbarg->header->additional_glue;
7239                 break;
7240         default:
7241                 INSIST(0);
7242         }
7243
7244         count = cbarg->count;
7245         if (acarray != NULL && acarray[count].entry == entry) {
7246                 acarray[count].entry = NULL;
7247                 INSIST(acarray[count].cbarg == cbarg);
7248                 isc_mem_put(rbtdb->common.mctx, cbarg, sizeof(acache_cbarg_t));
7249                 acarray[count].cbarg = NULL;
7250         } else
7251                 isc_mem_put(rbtdb->common.mctx, cbarg, sizeof(acache_cbarg_t));
7252
7253         dns_acache_detachentry(&entry);
7254
7255         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7256
7257         dns_db_detachnode((dns_db_t *)rbtdb, (dns_dbnode_t **)(void*)&rbtnode);
7258         dns_db_detach((dns_db_t **)(void*)&rbtdb);
7259
7260         *arg = NULL;
7261 }
7262
7263 static void
7264 acache_cancelentry(isc_mem_t *mctx, dns_acacheentry_t *entry,
7265                       acache_cbarg_t **cbargp)
7266 {
7267         acache_cbarg_t *cbarg;
7268
7269         REQUIRE(mctx != NULL);
7270         REQUIRE(entry != NULL);
7271         REQUIRE(cbargp != NULL && *cbargp != NULL);
7272
7273         cbarg = *cbargp;
7274
7275         dns_acache_cancelentry(entry);
7276         dns_db_detachnode(cbarg->db, &cbarg->node);
7277         dns_db_detach(&cbarg->db);
7278
7279         isc_mem_put(mctx, cbarg, sizeof(acache_cbarg_t));
7280
7281         *cbargp = NULL;
7282 }
7283
7284 static isc_result_t
7285 rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type,
7286                        dns_rdatatype_t qtype, dns_acache_t *acache,
7287                        dns_zone_t *zone, dns_db_t *db,
7288                        dns_dbversion_t *version, dns_dbnode_t *node,
7289                        dns_name_t *fname)
7290 {
7291         dns_rbtdb_t *rbtdb = rdataset->private1;
7292         dns_rbtnode_t *rbtnode = rdataset->private2;
7293         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7294         unsigned int current_count = rdataset->privateuint4;
7295         rdatasetheader_t *header;
7296         unsigned int total_count, count;
7297         nodelock_t *nodelock;
7298         isc_result_t result;
7299         acachectl_t *acarray;
7300         dns_acacheentry_t *newentry, *oldentry = NULL;
7301         acache_cbarg_t *newcbarg, *oldcbarg = NULL;
7302
7303         UNUSED(qtype);
7304
7305         if (type == dns_rdatasetadditional_fromcache)
7306                 return (ISC_R_SUCCESS);
7307
7308         header = (struct rdatasetheader *)(raw - sizeof(*header));
7309
7310         total_count = raw[0] * 256 + raw[1];
7311         INSIST(total_count > current_count);
7312         count = total_count - current_count - 1; /* should be private data */
7313
7314         newcbarg = isc_mem_get(rbtdb->common.mctx, sizeof(*newcbarg));
7315         if (newcbarg == NULL)
7316                 return (ISC_R_NOMEMORY);
7317         newcbarg->type = type;
7318         newcbarg->count = count;
7319         newcbarg->header = header;
7320         newcbarg->db = NULL;
7321         dns_db_attach((dns_db_t *)rbtdb, &newcbarg->db);
7322         newcbarg->node = NULL;
7323         dns_db_attachnode((dns_db_t *)rbtdb, (dns_dbnode_t *)rbtnode,
7324                           &newcbarg->node);
7325         newentry = NULL;
7326         result = dns_acache_createentry(acache, (dns_db_t *)rbtdb,
7327                                         acache_callback, newcbarg, &newentry);
7328         if (result != ISC_R_SUCCESS)
7329                 goto fail;
7330         /* Set cache data in the new entry. */
7331         result = dns_acache_setentry(acache, newentry, zone, db,
7332                                      version, node, fname);
7333         if (result != ISC_R_SUCCESS)
7334                 goto fail;
7335
7336         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7337         NODE_LOCK(nodelock, isc_rwlocktype_write);
7338
7339         acarray = NULL;
7340         switch (type) {
7341         case dns_rdatasetadditional_fromauth:
7342                 acarray = header->additional_auth;
7343                 break;
7344         case dns_rdatasetadditional_fromglue:
7345                 acarray = header->additional_glue;
7346                 break;
7347         default:
7348                 INSIST(0);
7349         }
7350
7351         if (acarray == NULL) {
7352                 unsigned int i;
7353
7354                 acarray = isc_mem_get(rbtdb->common.mctx, total_count *
7355                                       sizeof(acachectl_t));
7356
7357                 if (acarray == NULL) {
7358                         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7359                         goto fail;
7360                 }
7361
7362                 for (i = 0; i < total_count; i++) {
7363                         acarray[i].entry = NULL;
7364                         acarray[i].cbarg = NULL;
7365                 }
7366         }
7367         switch (type) {
7368         case dns_rdatasetadditional_fromauth:
7369                 header->additional_auth = acarray;
7370                 break;
7371         case dns_rdatasetadditional_fromglue:
7372                 header->additional_glue = acarray;
7373                 break;
7374         default:
7375                 INSIST(0);
7376         }
7377
7378         if (acarray[count].entry != NULL) {
7379                 /*
7380                  * Swap the entry.  Delay cleaning-up the old entry since
7381                  * it would require a node lock.
7382                  */
7383                 oldentry = acarray[count].entry;
7384                 INSIST(acarray[count].cbarg != NULL);
7385                 oldcbarg = acarray[count].cbarg;
7386         }
7387         acarray[count].entry = newentry;
7388         acarray[count].cbarg = newcbarg;
7389
7390         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7391
7392         if (oldentry != NULL) {
7393                 acache_cancelentry(rbtdb->common.mctx, oldentry, &oldcbarg);
7394                 dns_acache_detachentry(&oldentry);
7395         }
7396
7397         return (ISC_R_SUCCESS);
7398
7399   fail:
7400         if (newcbarg != NULL) {
7401                 if (newentry != NULL) {
7402                         acache_cancelentry(rbtdb->common.mctx, newentry,
7403                                            &newcbarg);
7404                         dns_acache_detachentry(&newentry);
7405                 } else {
7406                         dns_db_detachnode((dns_db_t *)rbtdb, &newcbarg->node);
7407                         dns_db_detach(&newcbarg->db);
7408                         isc_mem_put(rbtdb->common.mctx, newcbarg,
7409                             sizeof(*newcbarg));
7410                 }
7411         }
7412
7413         return (result);
7414 }
7415
7416 static isc_result_t
7417 rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset,
7418                        dns_rdatasetadditional_t type, dns_rdatatype_t qtype)
7419 {
7420         dns_rbtdb_t *rbtdb = rdataset->private1;
7421         dns_rbtnode_t *rbtnode = rdataset->private2;
7422         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7423         unsigned int current_count = rdataset->privateuint4;
7424         rdatasetheader_t *header;
7425         nodelock_t *nodelock;
7426         unsigned int total_count, count;
7427         acachectl_t *acarray;
7428         dns_acacheentry_t *entry;
7429         acache_cbarg_t *cbarg;
7430
7431         UNUSED(qtype);          /* we do not use this value at least for now */
7432         UNUSED(acache);
7433
7434         if (type == dns_rdatasetadditional_fromcache)
7435                 return (ISC_R_SUCCESS);
7436
7437         header = (struct rdatasetheader *)(raw - sizeof(*header));
7438
7439         total_count = raw[0] * 256 + raw[1];
7440         INSIST(total_count > current_count);
7441         count = total_count - current_count - 1;
7442
7443         acarray = NULL;
7444         entry = NULL;
7445
7446         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7447         NODE_LOCK(nodelock, isc_rwlocktype_write);
7448
7449         switch (type) {
7450         case dns_rdatasetadditional_fromauth:
7451                 acarray = header->additional_auth;
7452                 break;
7453         case dns_rdatasetadditional_fromglue:
7454                 acarray = header->additional_glue;
7455                 break;
7456         default:
7457                 INSIST(0);
7458         }
7459
7460         if (acarray == NULL) {
7461                 NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7462                 return (ISC_R_NOTFOUND);
7463         }
7464
7465         entry = acarray[count].entry;
7466         if (entry == NULL) {
7467                 NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7468                 return (ISC_R_NOTFOUND);
7469         }
7470
7471         acarray[count].entry = NULL;
7472         cbarg = acarray[count].cbarg;
7473         acarray[count].cbarg = NULL;
7474
7475         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7476
7477         if (entry != NULL) {
7478                 if (cbarg != NULL)
7479                         acache_cancelentry(rbtdb->common.mctx, entry, &cbarg);
7480                 dns_acache_detachentry(&entry);
7481         }
7482
7483         return (ISC_R_SUCCESS);
7484 }
7485
7486 /*%
7487  * Routines for LRU-based cache management.
7488  */
7489
7490 /*%
7491  * See if a given cache entry that is being reused needs to be updated
7492  * in the LRU-list.  From the LRU management point of view, this function is
7493  * expected to return true for almost all cases.  When used with threads,
7494  * however, this may cause a non-negligible performance penalty because a
7495  * writer lock will have to be acquired before updating the list.
7496  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
7497  * function returns true if the entry has not been updated for some period of
7498  * time.  We differentiate the NS or glue address case and the others since
7499  * experiments have shown that the former tends to be accessed relatively
7500  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
7501  * may cause external queries at a higher level zone, involving more
7502  * transactions).
7503  *
7504  * Caller must hold the node (read or write) lock.
7505  */
7506 static inline isc_boolean_t
7507 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
7508         if ((header->attributes &
7509              (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0)
7510                 return (ISC_FALSE);
7511
7512 #if DNS_RBTDB_LIMITLRUUPDATE
7513         if (header->type == dns_rdatatype_ns ||
7514             (header->trust == dns_trust_glue &&
7515              (header->type == dns_rdatatype_a ||
7516               header->type == dns_rdatatype_aaaa))) {
7517                 /*
7518                  * Glue records are updated if at least 60 seconds have passed
7519                  * since the previous update time.
7520                  */
7521                 return (header->last_used + 60 <= now);
7522         }
7523
7524         /* Other records are updated if 5 minutes have passed. */
7525         return (header->last_used + 300 <= now);
7526 #else
7527         UNUSED(now);
7528
7529         return (ISC_TRUE);
7530 #endif
7531 }
7532
7533 /*%
7534  * Update the timestamp of a given cache entry and move it to the head
7535  * of the corresponding LRU list.
7536  *
7537  * Caller must hold the node (write) lock.
7538  *
7539  * Note that the we do NOT touch the heap here, as the TTL has not changed.
7540  */
7541 static void
7542 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
7543               isc_stdtime_t now)
7544 {
7545         /* To be checked: can we really assume this? XXXMLG */
7546         INSIST(ISC_LINK_LINKED(header, lru_link));
7547
7548         ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum],
7549                         header, lru_link);
7550         header->last_used = now;
7551         ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum],
7552                          header, lru_link);
7553 }
7554
7555 /*%
7556  * Purge some expired and/or stale (i.e. unused for some period) cache entries
7557  * under an overmem condition.  To recover from this condition quickly, up to
7558  * 2 entries will be purged.  This process is triggered while adding a new
7559  * entry, and we specifically avoid purging entries in the same LRU bucket as
7560  * the one to which the new entry will belong.  Otherwise, we might purge
7561  * entries of the same name of different RR types while adding RRsets from a
7562  * single response (consider the case where we're adding A and AAAA glue records
7563  * of the same NS name).
7564  */
7565 static void
7566 overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
7567               isc_stdtime_t now, isc_boolean_t tree_locked)
7568 {
7569         rdatasetheader_t *header, *header_prev;
7570         unsigned int locknum;
7571         int purgecount = 2;
7572
7573         for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
7574              locknum != locknum_start && purgecount > 0;
7575              locknum = (locknum + 1) % rbtdb->node_lock_count) {
7576                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
7577                           isc_rwlocktype_write);
7578
7579                 header = isc_heap_element(rbtdb->heaps[locknum], 1);
7580                 if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) {
7581                         expire_header(rbtdb, header, tree_locked);
7582                         purgecount--;
7583                 }
7584
7585                 for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
7586                      header != NULL && purgecount > 0;
7587                      header = header_prev) {
7588                         header_prev = ISC_LIST_PREV(header, lru_link);
7589                         /*
7590                          * Unlink the entry at this point to avoid checking it
7591                          * again even if it's currently used someone else and
7592                          * cannot be purged at this moment.  This entry won't be
7593                          * referenced any more (so unlinking is safe) since the
7594                          * TTL was reset to 0.
7595                          */
7596                         ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header,
7597                                         lru_link);
7598                         expire_header(rbtdb, header, tree_locked);
7599                         purgecount--;
7600                 }
7601
7602                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
7603                                     isc_rwlocktype_write);
7604         }
7605 }
7606
7607 static void
7608 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
7609               isc_boolean_t tree_locked)
7610 {
7611         set_ttl(rbtdb, header, 0);
7612         header->attributes |= RDATASET_ATTR_STALE;
7613         header->node->dirty = 1;
7614
7615         /*
7616          * Caller must hold the node (write) lock.
7617          */
7618
7619         if (dns_rbtnode_refcurrent(header->node) == 0) {
7620                 /*
7621                  * If no one else is using the node, we can clean it up now.
7622                  * We first need to gain a new reference to the node to meet a
7623                  * requirement of decrement_reference().
7624                  */
7625                 new_reference(rbtdb, header->node);
7626                 decrement_reference(rbtdb, header->node, 0,
7627                                     isc_rwlocktype_write,
7628                                     tree_locked ? isc_rwlocktype_write :
7629                                     isc_rwlocktype_none, ISC_FALSE);
7630         }
7631 }