Merge branch 'master' of ssh://crater.dragonflybsd.org/repository/git/dragonfly
[dragonfly.git] / contrib / bind / lib / dns / rbtdb.c
1 /*
2  * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: rbtdb.c,v 1.248.12.18.2.2 2009/12/31 21:45:53 each Exp $ */
19
20 /*! \file */
21
22 /*
23  * Principal Author: Bob Halley
24  */
25
26 #include <config.h>
27
28 #include <isc/heap.h>
29 #include <isc/event.h>
30 #include <isc/mem.h>
31 #include <isc/platform.h>
32 #include <isc/print.h>
33 #include <isc/mutex.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/rwlock.h>
37 #include <isc/string.h>
38 #include <isc/task.h>
39 #include <isc/time.h>
40 #include <isc/util.h>
41
42 #include <dns/acache.h>
43 #include <dns/db.h>
44 #include <dns/dbiterator.h>
45 #include <dns/events.h>
46 #include <dns/fixedname.h>
47 #include <dns/lib.h>
48 #include <dns/log.h>
49 #include <dns/masterdump.h>
50 #include <dns/rbt.h>
51 #include <dns/rdata.h>
52 #include <dns/rdataset.h>
53 #include <dns/rdatasetiter.h>
54 #include <dns/rdataslab.h>
55 #include <dns/result.h>
56 #include <dns/stats.h>
57 #include <dns/view.h>
58 #include <dns/zone.h>
59 #include <dns/zonekey.h>
60
61 #ifdef DNS_RBTDB_VERSION64
62 #include "rbtdb64.h"
63 #else
64 #include "rbtdb.h"
65 #endif
66
67 #ifdef DNS_RBTDB_VERSION64
68 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
69 #else
70 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
71 #endif
72
73 /*%
74  * Note that "impmagic" is not the first four bytes of the struct, so
75  * ISC_MAGIC_VALID cannot be used.
76  */
77 #define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
78                                  (rbtdb)->common.impmagic == RBTDB_MAGIC)
79
80 #ifdef DNS_RBTDB_VERSION64
81 typedef isc_uint64_t                    rbtdb_serial_t;
82 /*%
83  * Make casting easier in symbolic debuggers by using different names
84  * for the 64 bit version.
85  */
86 #define dns_rbtdb_t dns_rbtdb64_t
87 #define rdatasetheader_t rdatasetheader64_t
88 #define rbtdb_version_t rbtdb_version64_t
89 #else
90 typedef isc_uint32_t                    rbtdb_serial_t;
91 #endif
92
93 typedef isc_uint32_t                    rbtdb_rdatatype_t;
94
95 #define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
96 #define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
97 #define RBTDB_RDATATYPE_VALUE(b, e)     (((e) << 16) | (b))
98
99 #define RBTDB_RDATATYPE_SIGNSEC \
100                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
101 #define RBTDB_RDATATYPE_SIGNS \
102                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
103 #define RBTDB_RDATATYPE_SIGCNAME \
104                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
105 #define RBTDB_RDATATYPE_SIGDNAME \
106                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
107 #define RBTDB_RDATATYPE_NCACHEANY \
108                 RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
109
110 /*
111  * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
112  * Using rwlock is effective with regard to lookup performance only when
113  * it is implemented in an efficient way.
114  * Otherwise, it is generally wise to stick to the simple locking since rwlock
115  * would require more memory or can even make lookups slower due to its own
116  * overhead (when it internally calls mutex locks).
117  */
118 #ifdef ISC_RWLOCK_USEATOMIC
119 #define DNS_RBTDB_USERWLOCK 1
120 #else
121 #define DNS_RBTDB_USERWLOCK 0
122 #endif
123
124 #if DNS_RBTDB_USERWLOCK
125 #define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
126 #define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
127 #define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
128 #define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
129 #else
130 #define RBTDB_INITLOCK(l)       isc_mutex_init(l)
131 #define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
132 #define RBTDB_LOCK(l, t)        LOCK(l)
133 #define RBTDB_UNLOCK(l, t)      UNLOCK(l)
134 #endif
135
136 /*
137  * Since node locking is sensitive to both performance and memory footprint,
138  * we need some trick here.  If we have both high-performance rwlock and
139  * high performance and small-memory reference counters, we use rwlock for
140  * node lock and isc_refcount for node references.  In this case, we don't have
141  * to protect the access to the counters by locks.
142  * Otherwise, we simply use ordinary mutex lock for node locking, and use
143  * simple integers as reference counters which is protected by the lock.
144  * In most cases, we can simply use wrapper macros such as NODE_LOCK and
145  * NODE_UNLOCK.  In some other cases, however, we need to protect reference
146  * counters first and then protect other parts of a node as read-only data.
147  * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
148  * provided for these special cases.  When we can use the efficient backend
149  * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
150  * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
151  * section including the access to the reference counter.
152  * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
153  * section is also protected by NODE_STRONGLOCK().
154  */
155 #if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
156 typedef isc_rwlock_t nodelock_t;
157
158 #define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
159 #define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
160 #define NODE_LOCK(l, t)         RWLOCK((l), (t))
161 #define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
162 #define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)
163
164 #define NODE_STRONGLOCK(l)      ((void)0)
165 #define NODE_STRONGUNLOCK(l)    ((void)0)
166 #define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
167 #define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
168 #define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
169 #else
170 typedef isc_mutex_t nodelock_t;
171
172 #define NODE_INITLOCK(l)        isc_mutex_init(l)
173 #define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
174 #define NODE_LOCK(l, t)         LOCK(l)
175 #define NODE_UNLOCK(l, t)       UNLOCK(l)
176 #define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS
177
178 #define NODE_STRONGLOCK(l)      LOCK(l)
179 #define NODE_STRONGUNLOCK(l)    UNLOCK(l)
180 #define NODE_WEAKLOCK(l, t)     ((void)0)
181 #define NODE_WEAKUNLOCK(l, t)   ((void)0)
182 #define NODE_WEAKDOWNGRADE(l)   ((void)0)
183 #endif
184
185 /*%
186  * Whether to rate-limit updating the LRU to avoid possible thread contention.
187  * Our performance measurement has shown the cost is marginal, so it's defined
188  * to be 0 by default either with or without threads.
189  */
190 #ifndef DNS_RBTDB_LIMITLRUUPDATE
191 #define DNS_RBTDB_LIMITLRUUPDATE 0
192 #endif
193
194 /*
195  * Allow clients with a virtual time of up to 5 minutes in the past to see
196  * records that would have otherwise have expired.
197  */
198 #define RBTDB_VIRTUAL 300
199
200 struct noqname {
201         dns_name_t name;
202         void *     nsec;
203         void *     nsecsig;
204 };
205
206 typedef struct acachectl acachectl_t;
207
208 typedef struct rdatasetheader {
209         /*%
210          * Locked by the owning node's lock.
211          */
212         rbtdb_serial_t                  serial;
213         dns_ttl_t                       rdh_ttl;
214         rbtdb_rdatatype_t               type;
215         isc_uint16_t                    attributes;
216         dns_trust_t                     trust;
217         struct noqname                  *noqname;
218         /*%<
219          * We don't use the LIST macros, because the LIST structure has
220          * both head and tail pointers, and is doubly linked.
221          */
222
223         struct rdatasetheader           *next;
224         /*%<
225          * If this is the top header for an rdataset, 'next' points
226          * to the top header for the next rdataset (i.e., the next type).
227          * Otherwise, it points up to the header whose down pointer points
228          * at this header.
229          */
230
231         struct rdatasetheader           *down;
232         /*%<
233          * Points to the header for the next older version of
234          * this rdataset.
235          */
236
237         isc_uint32_t                    count;
238         /*%<
239          * Monotonously increased every time this rdataset is bound so that
240          * it is used as the base of the starting point in DNS responses
241          * when the "cyclic" rrset-order is required.  Since the ordering
242          * should not be so crucial, no lock is set for the counter for
243          * performance reasons.
244          */
245
246         acachectl_t                     *additional_auth;
247         acachectl_t                     *additional_glue;
248
249         dns_rbtnode_t                   *node;
250         isc_stdtime_t                   last_used;
251         ISC_LINK(struct rdatasetheader) lru_link;
252         /*%<
253          * Used for LRU-based cache management.  We should probably make
254          * these cache-DB specific.  We might also make it a pointer and
255          * ensure only the top header has a valid link to save memory.
256          * The linked-list is locked by the rbtdb->lrulock.
257          */
258
259         /*
260          * It's possible this should not be here anymore, but instead
261          * referenced from the bucket's heap directly.
262          */
263 #if 0
264         isc_heap_t                      *heap;
265 #endif
266         unsigned int                    heap_index;
267         /*%<
268          * Used for TTL-based cache cleaning.
269          */
270 } rdatasetheader_t;
271
272 typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
273 typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;
274
275 #define RDATASET_ATTR_NONEXISTENT       0x0001
276 #define RDATASET_ATTR_STALE             0x0002
277 #define RDATASET_ATTR_IGNORE            0x0004
278 #define RDATASET_ATTR_RETAIN            0x0008
279 #define RDATASET_ATTR_NXDOMAIN          0x0010
280 #define RDATASET_ATTR_RESIGN            0x0020
281 #define RDATASET_ATTR_STATCOUNT         0x0040
282
283 typedef struct acache_cbarg {
284         dns_rdatasetadditional_t        type;
285         unsigned int                    count;
286         dns_db_t                        *db;
287         dns_dbnode_t                    *node;
288         rdatasetheader_t                *header;
289 } acache_cbarg_t;
290
291 struct acachectl {
292         dns_acacheentry_t               *entry;
293         acache_cbarg_t                  *cbarg;
294 };
295
296 /*
297  * XXX
298  * When the cache will pre-expire data (due to memory low or other
299  * situations) before the rdataset's TTL has expired, it MUST
300  * respect the RETAIN bit and not expire the data until its TTL is
301  * expired.
302  */
303
304 #undef IGNORE                   /* WIN32 winbase.h defines this. */
305
306 #define EXISTS(header) \
307         (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
308 #define NONEXISTENT(header) \
309         (((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
310 #define IGNORE(header) \
311         (((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
312 #define RETAIN(header) \
313         (((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
314 #define NXDOMAIN(header) \
315         (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
316
317 #define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
318
319 /*%
320  * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
321  * There is a tradeoff issue about configuring this value: if this is too
322  * small, it may cause heavier contention between threads; if this is too large,
323  * LRU purge algorithm won't work well (entries tend to be purged prematurely).
324  * The default value should work well for most environments, but this can
325  * also be configurable at compilation time via the
326  * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
327  * 1 due to the assumption of overmem_purge().
328  */
329 #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
330 #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
331 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
332 #else
333 #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
334 #endif
335 #else
336 #define DEFAULT_CACHE_NODE_LOCK_COUNT   16
337 #endif  /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
338
339 typedef struct {
340         nodelock_t                      lock;
341         /* Protected in the refcount routines. */
342         isc_refcount_t                  references;
343         /* Locked by lock. */
344         isc_boolean_t                   exiting;
345 } rbtdb_nodelock_t;
346
347 typedef struct rbtdb_changed {
348         dns_rbtnode_t *                 node;
349         isc_boolean_t                   dirty;
350         ISC_LINK(struct rbtdb_changed)  link;
351 } rbtdb_changed_t;
352
353 typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
354
355 typedef struct rbtdb_version {
356         /* Not locked */
357         rbtdb_serial_t                  serial;
358         /*
359          * Protected in the refcount routines.
360          * XXXJT: should we change the lock policy based on the refcount
361          * performance?
362          */
363         isc_refcount_t                  references;
364         /* Locked by database lock. */
365         isc_boolean_t                   writer;
366         isc_boolean_t                   commit_ok;
367         rbtdb_changedlist_t             changed_list;
368         ISC_LINK(struct rbtdb_version)  link;
369 } rbtdb_version_t;
370
371 typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;
372
373 typedef struct {
374         /* Unlocked. */
375         dns_db_t                        common;
376 #if DNS_RBTDB_USERWLOCK
377         isc_rwlock_t                    lock;
378 #else
379         isc_mutex_t                     lock;
380 #endif
381         isc_rwlock_t                    tree_lock;
382         unsigned int                    node_lock_count;
383         rbtdb_nodelock_t *              node_locks;
384         dns_rbtnode_t *                 origin_node;
385         dns_stats_t *                   rrsetstats; /* cache DB only */
386         /* Locked by lock. */
387         unsigned int                    active;
388         isc_refcount_t                  references;
389         unsigned int                    attributes;
390         rbtdb_serial_t                  current_serial;
391         rbtdb_serial_t                  least_serial;
392         rbtdb_serial_t                  next_serial;
393         rbtdb_version_t *               current_version;
394         rbtdb_version_t *               future_version;
395         rbtdb_versionlist_t             open_versions;
396         isc_boolean_t                   overmem;
397         isc_task_t *                    task;
398         dns_dbnode_t                    *soanode;
399         dns_dbnode_t                    *nsnode;
400
401         /*
402          * This is a linked list used to implement the LRU cache.  There will
403          * be node_lock_count linked lists here.  Nodes in bucket 1 will be
404          * placed on the linked list rdatasets[1].
405          */
406         rdatasetheaderlist_t            *rdatasets;
407
408         /*%
409          * Temporary storage for stale cache nodes and dynamically deleted
410          * nodes that await being cleaned up.
411          */
412         rbtnodelist_t                   *deadnodes;
413
414         /*
415          * Heaps.  Each of these is used for TTL based expiry.
416          */
417         isc_heap_t                      **heaps;
418
419         /* Locked by tree_lock. */
420         dns_rbt_t *                     tree;
421         isc_boolean_t                   secure;
422
423         /* Unlocked */
424         unsigned int                    quantum;
425 } dns_rbtdb_t;
426
427 #define RBTDB_ATTR_LOADED               0x01
428 #define RBTDB_ATTR_LOADING              0x02
429
430 /*%
431  * Search Context
432  */
433 typedef struct {
434         dns_rbtdb_t *           rbtdb;
435         rbtdb_version_t *       rbtversion;
436         rbtdb_serial_t          serial;
437         unsigned int            options;
438         dns_rbtnodechain_t      chain;
439         isc_boolean_t           copy_name;
440         isc_boolean_t           need_cleanup;
441         isc_boolean_t           wild;
442         dns_rbtnode_t *         zonecut;
443         rdatasetheader_t *      zonecut_rdataset;
444         rdatasetheader_t *      zonecut_sigrdataset;
445         dns_fixedname_t         zonecut_name;
446         isc_stdtime_t           now;
447 } rbtdb_search_t;
448
449 /*%
450  * Load Context
451  */
452 typedef struct {
453         dns_rbtdb_t *           rbtdb;
454         isc_stdtime_t           now;
455 } rbtdb_load_t;
456
457 static void rdataset_disassociate(dns_rdataset_t *rdataset);
458 static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
459 static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
460 static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
461 static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
462 static unsigned int rdataset_count(dns_rdataset_t *rdataset);
463 static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
464                                         dns_name_t *name,
465                                         dns_rdataset_t *nsec,
466                                         dns_rdataset_t *nsecsig);
467 static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
468                                            dns_rdatasetadditional_t type,
469                                            dns_rdatatype_t qtype,
470                                            dns_acache_t *acache,
471                                            dns_zone_t **zonep,
472                                            dns_db_t **dbp,
473                                            dns_dbversion_t **versionp,
474                                            dns_dbnode_t **nodep,
475                                            dns_name_t *fname,
476                                            dns_message_t *msg,
477                                            isc_stdtime_t now);
478 static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
479                                            dns_rdatasetadditional_t type,
480                                            dns_rdatatype_t qtype,
481                                            dns_acache_t *acache,
482                                            dns_zone_t *zone,
483                                            dns_db_t *db,
484                                            dns_dbversion_t *version,
485                                            dns_dbnode_t *node,
486                                            dns_name_t *fname);
487 static isc_result_t rdataset_putadditional(dns_acache_t *acache,
488                                            dns_rdataset_t *rdataset,
489                                            dns_rdatasetadditional_t type,
490                                            dns_rdatatype_t qtype);
491 static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
492                                               isc_stdtime_t now);
493 static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
494                           isc_stdtime_t now);
495 static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
496                           isc_boolean_t tree_locked);
497 static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
498                           isc_stdtime_t now, isc_boolean_t tree_locked);
499 static void prune_tree(isc_task_t *task, isc_event_t *event);
500
501 static dns_rdatasetmethods_t rdataset_methods = {
502         rdataset_disassociate,
503         rdataset_first,
504         rdataset_next,
505         rdataset_current,
506         rdataset_clone,
507         rdataset_count,
508         NULL,
509         rdataset_getnoqname,
510         rdataset_getadditional,
511         rdataset_setadditional,
512         rdataset_putadditional
513 };
514
515 static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
516 static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
517 static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
518 static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
519                                  dns_rdataset_t *rdataset);
520
521 static dns_rdatasetitermethods_t rdatasetiter_methods = {
522         rdatasetiter_destroy,
523         rdatasetiter_first,
524         rdatasetiter_next,
525         rdatasetiter_current
526 };
527
528 typedef struct rbtdb_rdatasetiter {
529         dns_rdatasetiter_t              common;
530         rdatasetheader_t *              current;
531 } rbtdb_rdatasetiter_t;
532
533 static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
534 static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
535 static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
536 static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
537                                         dns_name_t *name);
538 static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
539 static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
540 static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
541                                            dns_dbnode_t **nodep,
542                                            dns_name_t *name);
543 static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
544 static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
545                                           dns_name_t *name);
546
547 static dns_dbiteratormethods_t dbiterator_methods = {
548         dbiterator_destroy,
549         dbiterator_first,
550         dbiterator_last,
551         dbiterator_seek,
552         dbiterator_prev,
553         dbiterator_next,
554         dbiterator_current,
555         dbiterator_pause,
556         dbiterator_origin
557 };
558
559 #define DELETION_BATCH_MAX 64
560
561 /*
562  * If 'paused' is ISC_TRUE, then the tree lock is not being held.
563  */
564 typedef struct rbtdb_dbiterator {
565         dns_dbiterator_t                common;
566         isc_boolean_t                   paused;
567         isc_boolean_t                   new_origin;
568         isc_rwlocktype_t                tree_locked;
569         isc_result_t                    result;
570         dns_fixedname_t                 name;
571         dns_fixedname_t                 origin;
572         dns_rbtnodechain_t              chain;
573         dns_rbtnode_t                   *node;
574         dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
575         int                             delete;
576 } rbtdb_dbiterator_t;
577
578
579 #define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
580 #define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
581
582 static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
583                        isc_event_t *event);
584 static void overmem(dns_db_t *db, isc_boolean_t overmem);
585
586 /*%
587  * 'init_count' is used to initialize 'newheader->count' which inturn
588  * is used to determine where in the cycle rrset-order cyclic starts.
589  * We don't lock this as we don't care about simultaneous updates.
590  *
591  * Note:
592  *      Both init_count and header->count can be ISC_UINT32_MAX.
593  *      The count on the returned rdataset however can't be as
594  *      that indicates that the database does not implement cyclic
595  *      processing.
596  */
597 static unsigned int init_count;
598
599 /*
600  * Locking
601  *
602  * If a routine is going to lock more than one lock in this module, then
603  * the locking must be done in the following order:
604  *
605  *      Tree Lock
606  *
607  *      Node Lock       (Only one from the set may be locked at one time by
608  *                       any caller)
609  *
610  *      Database Lock
611  *
612  * Failure to follow this hierarchy can result in deadlock.
613  */
614
615 /*
616  * Deleting Nodes
617  *
618  * For zone databases the node for the origin of the zone MUST NOT be deleted.
619  */
620
621
622 /*
623  * DB Routines
624  */
625
626 static void
627 attach(dns_db_t *source, dns_db_t **targetp) {
628         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
629
630         REQUIRE(VALID_RBTDB(rbtdb));
631
632         isc_refcount_increment(&rbtdb->references, NULL);
633
634         *targetp = source;
635 }
636
637 static void
638 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
639         dns_rbtdb_t *rbtdb = event->ev_arg;
640
641         UNUSED(task);
642
643         free_rbtdb(rbtdb, ISC_TRUE, event);
644 }
645
646 static void
647 update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
648                   isc_boolean_t increment)
649 {
650         dns_rdatastatstype_t statattributes = 0;
651         dns_rdatastatstype_t base = 0;
652         dns_rdatastatstype_t type;
653
654         /* At the moment we count statistics only for cache DB */
655         INSIST(IS_CACHE(rbtdb));
656
657         if (NXDOMAIN(header))
658                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
659         else if (RBTDB_RDATATYPE_BASE(header->type) == 0) {
660                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
661                 base = RBTDB_RDATATYPE_EXT(header->type);
662         } else
663                 base = RBTDB_RDATATYPE_BASE(header->type);
664
665         type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
666         if (increment)
667                 dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
668         else
669                 dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
670 }
671
672 static void
673 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
674         int idx;
675         isc_heap_t *heap;
676         dns_ttl_t oldttl;
677
678         oldttl = header->rdh_ttl;
679         header->rdh_ttl = newttl;
680
681         /*
682          * It's possible the rbtdb is not a cache.  If this is the case,
683          * we will not have a heap, and we move on.  If we do, though,
684          * we might need to adjust things.
685          */
686         if (header->heap_index == 0 || newttl == oldttl)
687                 return;
688         idx = header->node->locknum;
689         if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
690             return;
691         heap = rbtdb->heaps[idx];
692
693         if (newttl < oldttl)
694                 isc_heap_increased(heap, header->heap_index);
695         else
696                 isc_heap_decreased(heap, header->heap_index);
697 }
698
699 /*%
700  * This function allows the heap code to rank the priority of each
701  * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
702  */
703 static isc_boolean_t
704 ttl_sooner(void *v1, void *v2) {
705         rdatasetheader_t *h1 = v1;
706         rdatasetheader_t *h2 = v2;
707
708         if (h1->rdh_ttl < h2->rdh_ttl)
709                 return (ISC_TRUE);
710         return (ISC_FALSE);
711 }
712
713 /*%
714  * This function sets the heap index into the header.
715  */
716 static void
717 ttl_set_index(void *what, unsigned int index) {
718         rdatasetheader_t *h = what;
719
720         h->heap_index = index;
721 }
722
723 /*%
724  * Work out how many nodes can be deleted in the time between two
725  * requests to the nameserver.  Smooth the resulting number and use it
726  * as a estimate for the number of nodes to be deleted in the next
727  * iteration.
728  */
729 static unsigned int
730 adjust_quantum(unsigned int old, isc_time_t *start) {
731         unsigned int pps = dns_pps;     /* packets per second */
732         unsigned int interval;
733         isc_uint64_t usecs;
734         isc_time_t end;
735         unsigned int new;
736
737         if (pps < 100)
738                 pps = 100;
739         isc_time_now(&end);
740
741         interval = 1000000 / pps;       /* interval in usec */
742         if (interval == 0)
743                 interval = 1;
744         usecs = isc_time_microdiff(&end, start);
745         if (usecs == 0) {
746                 /*
747                  * We were unable to measure the amount of time taken.
748                  * Double the nodes deleted next time.
749                  */
750                 old *= 2;
751                 if (old > 1000)
752                         old = 1000;
753                 return (old);
754         }
755         new = old * interval;
756         new /= (unsigned int)usecs;
757         if (new == 0)
758                 new = 1;
759         else if (new > 1000)
760                 new = 1000;
761
762         /* Smooth */
763         new = (new + old * 3) / 4;
764
765         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
766                       ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
767
768         return (new);
769 }
770
771 static void
772 free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
773         unsigned int i;
774         isc_ondestroy_t ondest;
775         isc_result_t result;
776         char buf[DNS_NAME_FORMATSIZE];
777         isc_time_t start;
778
779         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
780                 overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);
781
782         REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
783         REQUIRE(rbtdb->future_version == NULL);
784
785         if (rbtdb->current_version != NULL) {
786                 unsigned int refs;
787
788                 isc_refcount_decrement(&rbtdb->current_version->references,
789                                        &refs);
790                 INSIST(refs == 0);
791                 UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
792                 isc_refcount_destroy(&rbtdb->current_version->references);
793                 isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
794                             sizeof(rbtdb_version_t));
795         }
796
797         /*
798          * We assume the number of remaining dead nodes is reasonably small;
799          * the overhead of unlinking all nodes here should be negligible.
800          */
801         for (i = 0; i < rbtdb->node_lock_count; i++) {
802                 dns_rbtnode_t *node;
803
804                 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
805                 while (node != NULL) {
806                         ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
807                         node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
808                 }
809         }
810
811         if (event == NULL)
812                 rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
813  again:
814         if (rbtdb->tree != NULL) {
815                 isc_time_now(&start);
816                 result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
817                 if (result == ISC_R_QUOTA) {
818                         INSIST(rbtdb->task != NULL);
819                         if (rbtdb->quantum != 0)
820                                 rbtdb->quantum = adjust_quantum(rbtdb->quantum,
821                                                                 &start);
822                         if (event == NULL)
823                                 event = isc_event_allocate(rbtdb->common.mctx,
824                                                            NULL,
825                                                          DNS_EVENT_FREESTORAGE,
826                                                            free_rbtdb_callback,
827                                                            rbtdb,
828                                                            sizeof(isc_event_t));
829                         if (event == NULL)
830                                 goto again;
831                         isc_task_send(rbtdb->task, &event);
832                         return;
833                 }
834                 INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
835         }
836         if (event != NULL)
837                 isc_event_free(&event);
838         if (log) {
839                 if (dns_name_dynamic(&rbtdb->common.origin))
840                         dns_name_format(&rbtdb->common.origin, buf,
841                                         sizeof(buf));
842                 else
843                         strcpy(buf, "<UNKNOWN>");
844                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
845                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
846                               "done free_rbtdb(%s)", buf);
847         }
848         if (dns_name_dynamic(&rbtdb->common.origin))
849                 dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
850         for (i = 0; i < rbtdb->node_lock_count; i++) {
851                 isc_refcount_destroy(&rbtdb->node_locks[i].references);
852                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
853         }
854
855         /*
856          * Clean up LRU cache objects.
857          */
858         if (rbtdb->rdatasets != NULL) {
859                 for (i = 0; i < rbtdb->node_lock_count; i++)
860                         INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
861                 isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
862                             rbtdb->node_lock_count *
863                             sizeof(rdatasetheaderlist_t));
864         }
865         /*
866          * Clean up dead node buckets.
867          */
868         if (rbtdb->deadnodes != NULL) {
869                 for (i = 0; i < rbtdb->node_lock_count; i++)
870                         INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
871                 isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
872                     rbtdb->node_lock_count * sizeof(rbtnodelist_t));
873         }
874         /*
875          * Clean up TTL heap cache objects.
876          */
877         if (rbtdb->heaps != NULL) {
878                 for (i = 0; i < rbtdb->node_lock_count; i++)
879                         isc_heap_destroy(&rbtdb->heaps[i]);
880                 isc_mem_put(rbtdb->common.mctx, rbtdb->heaps,
881                             rbtdb->node_lock_count *
882                             sizeof(isc_heap_t *));
883         }
884
885         if (rbtdb->rrsetstats != NULL)
886                 dns_stats_detach(&rbtdb->rrsetstats);
887
888         isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
889                     rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
890         isc_rwlock_destroy(&rbtdb->tree_lock);
891         isc_refcount_destroy(&rbtdb->references);
892         if (rbtdb->task != NULL)
893                 isc_task_detach(&rbtdb->task);
894
895         RBTDB_DESTROYLOCK(&rbtdb->lock);
896         rbtdb->common.magic = 0;
897         rbtdb->common.impmagic = 0;
898         ondest = rbtdb->common.ondest;
899         isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
900         isc_ondestroy_notify(&ondest, rbtdb);
901 }
902
903 static inline void
904 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
905         isc_boolean_t want_free = ISC_FALSE;
906         unsigned int i;
907         unsigned int inactive = 0;
908
909         /* XXX check for open versions here */
910
911         if (rbtdb->soanode != NULL)
912                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
913         if (rbtdb->nsnode != NULL)
914                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
915
916         /*
917          * Even though there are no external direct references, there still
918          * may be nodes in use.
919          */
920         for (i = 0; i < rbtdb->node_lock_count; i++) {
921                 NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
922                 rbtdb->node_locks[i].exiting = ISC_TRUE;
923                 NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
924                 if (isc_refcount_current(&rbtdb->node_locks[i].references)
925                     == 0) {
926                         inactive++;
927                 }
928         }
929
930         if (inactive != 0) {
931                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
932                 rbtdb->active -= inactive;
933                 if (rbtdb->active == 0)
934                         want_free = ISC_TRUE;
935                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
936                 if (want_free) {
937                         char buf[DNS_NAME_FORMATSIZE];
938                         if (dns_name_dynamic(&rbtdb->common.origin))
939                                 dns_name_format(&rbtdb->common.origin, buf,
940                                                 sizeof(buf));
941                         else
942                                 strcpy(buf, "<UNKNOWN>");
943                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
944                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
945                                       "calling free_rbtdb(%s)", buf);
946                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
947                 }
948         }
949 }
950
951 static void
952 detach(dns_db_t **dbp) {
953         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
954         unsigned int refs;
955
956         REQUIRE(VALID_RBTDB(rbtdb));
957
958         isc_refcount_decrement(&rbtdb->references, &refs);
959
960         if (refs == 0)
961                 maybe_free_rbtdb(rbtdb);
962
963         *dbp = NULL;
964 }
965
966 static void
967 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
968         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
969         rbtdb_version_t *version;
970         unsigned int refs;
971
972         REQUIRE(VALID_RBTDB(rbtdb));
973
974         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
975         version = rbtdb->current_version;
976         isc_refcount_increment(&version->references, &refs);
977         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
978
979         *versionp = (dns_dbversion_t *)version;
980 }
981
982 static inline rbtdb_version_t *
983 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
984                  unsigned int references, isc_boolean_t writer)
985 {
986         isc_result_t result;
987         rbtdb_version_t *version;
988
989         version = isc_mem_get(mctx, sizeof(*version));
990         if (version == NULL)
991                 return (NULL);
992         version->serial = serial;
993         result = isc_refcount_init(&version->references, references);
994         if (result != ISC_R_SUCCESS) {
995                 isc_mem_put(mctx, version, sizeof(*version));
996                 return (NULL);
997         }
998         version->writer = writer;
999         version->commit_ok = ISC_FALSE;
1000         ISC_LIST_INIT(version->changed_list);
1001         ISC_LINK_INIT(version, link);
1002
1003         return (version);
1004 }
1005
1006 static isc_result_t
1007 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1008         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1009         rbtdb_version_t *version;
1010
1011         REQUIRE(VALID_RBTDB(rbtdb));
1012         REQUIRE(versionp != NULL && *versionp == NULL);
1013         REQUIRE(rbtdb->future_version == NULL);
1014
1015         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1016         RUNTIME_CHECK(rbtdb->next_serial != 0);         /* XXX Error? */
1017         version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1018                                    ISC_TRUE);
1019         if (version != NULL) {
1020                 version->commit_ok = ISC_TRUE;
1021                 rbtdb->next_serial++;
1022                 rbtdb->future_version = version;
1023         }
1024         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1025
1026         if (version == NULL)
1027                 return (ISC_R_NOMEMORY);
1028
1029         *versionp = version;
1030
1031         return (ISC_R_SUCCESS);
1032 }
1033
1034 static void
1035 attachversion(dns_db_t *db, dns_dbversion_t *source,
1036               dns_dbversion_t **targetp)
1037 {
1038         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1039         rbtdb_version_t *rbtversion = source;
1040         unsigned int refs;
1041
1042         REQUIRE(VALID_RBTDB(rbtdb));
1043
1044         isc_refcount_increment(&rbtversion->references, &refs);
1045         INSIST(refs > 1);
1046
1047         *targetp = rbtversion;
1048 }
1049
1050 static rbtdb_changed_t *
1051 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1052             dns_rbtnode_t *node)
1053 {
1054         rbtdb_changed_t *changed;
1055         unsigned int refs;
1056
1057         /*
1058          * Caller must be holding the node lock if its reference must be
1059          * protected by the lock.
1060          */
1061
1062         changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1063
1064         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1065
1066         REQUIRE(version->writer);
1067
1068         if (changed != NULL) {
1069                 dns_rbtnode_refincrement(node, &refs);
1070                 INSIST(refs != 0);
1071                 changed->node = node;
1072                 changed->dirty = ISC_FALSE;
1073                 ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1074         } else
1075                 version->commit_ok = ISC_FALSE;
1076
1077         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1078
1079         return (changed);
1080 }
1081
1082 static void
1083 free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
1084                  acachectl_t *array)
1085 {
1086         unsigned int count;
1087         unsigned int i;
1088         unsigned char *raw;     /* RDATASLAB */
1089
1090         /*
1091          * The caller must be holding the corresponding node lock.
1092          */
1093
1094         if (array == NULL)
1095                 return;
1096
1097         raw = (unsigned char *)header + sizeof(*header);
1098         count = raw[0] * 256 + raw[1];
1099
1100         /*
1101          * Sanity check: since an additional cache entry has a reference to
1102          * the original DB node (in the callback arg), there should be no
1103          * acache entries when the node can be freed.
1104          */
1105         for (i = 0; i < count; i++)
1106                 INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1107
1108         isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1109 }
1110
1111 static inline void
1112 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1113
1114         if (dns_name_dynamic(&(*noqname)->name))
1115                 dns_name_free(&(*noqname)->name, mctx);
1116         if ((*noqname)->nsec != NULL)
1117                 isc_mem_put(mctx, (*noqname)->nsec,
1118                             dns_rdataslab_size((*noqname)->nsec, 0));
1119         if ((*noqname)->nsecsig != NULL)
1120                 isc_mem_put(mctx, (*noqname)->nsecsig,
1121                             dns_rdataslab_size((*noqname)->nsecsig, 0));
1122         isc_mem_put(mctx, *noqname, sizeof(**noqname));
1123         *noqname = NULL;
1124 }
1125
1126 static inline void
1127 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
1128 {
1129         ISC_LINK_INIT(h, lru_link);
1130         h->heap_index = 0;
1131
1132 #if TRACE_HEADER
1133         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1134                 fprintf(stderr, "initialized header: %p\n", h);
1135 #else
1136         UNUSED(rbtdb);
1137 #endif
1138 }
1139
1140 static inline rdatasetheader_t *
1141 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
1142 {
1143         rdatasetheader_t *h;
1144
1145         h = isc_mem_get(mctx, sizeof(*h));
1146         if (h == NULL)
1147                 return (NULL);
1148
1149 #if TRACE_HEADER
1150         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1151                 fprintf(stderr, "allocated header: %p\n", h);
1152 #endif
1153         init_rdataset(rbtdb, h);
1154         return (h);
1155 }
1156
1157 static inline void
1158 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
1159 {
1160         unsigned int size;
1161
1162         if (EXISTS(rdataset) &&
1163             (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
1164                 update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
1165         }
1166
1167         if (IS_CACHE(rbtdb) && ISC_LINK_LINKED(rdataset, lru_link)) {
1168                 int idx = rdataset->node->locknum;
1169                 ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link);
1170                 if (rdataset->heap_index != 0) {
1171                         isc_heap_delete(rbtdb->heaps[idx],
1172                                         rdataset->heap_index);
1173                 }
1174                 rdataset->heap_index = 0;
1175         }
1176
1177         if (rdataset->noqname != NULL)
1178                 free_noqname(mctx, &rdataset->noqname);
1179
1180         free_acachearray(mctx, rdataset, rdataset->additional_auth);
1181         free_acachearray(mctx, rdataset, rdataset->additional_glue);
1182
1183         if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
1184                 size = sizeof(*rdataset);
1185         else
1186                 size = dns_rdataslab_size((unsigned char *)rdataset,
1187                                           sizeof(*rdataset));
1188         isc_mem_put(mctx, rdataset, size);
1189 }
1190
1191 static inline void
1192 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1193         rdatasetheader_t *header, *dcurrent;
1194         isc_boolean_t make_dirty = ISC_FALSE;
1195
1196         /*
1197          * Caller must hold the node lock.
1198          */
1199
1200         /*
1201          * We set the IGNORE attribute on rdatasets with serial number
1202          * 'serial'.  When the reference count goes to zero, these rdatasets
1203          * will be cleaned up; until that time, they will be ignored.
1204          */
1205         for (header = node->data; header != NULL; header = header->next) {
1206                 if (header->serial == serial) {
1207                         header->attributes |= RDATASET_ATTR_IGNORE;
1208                         make_dirty = ISC_TRUE;
1209                 }
1210                 for (dcurrent = header->down;
1211                      dcurrent != NULL;
1212                      dcurrent = dcurrent->down) {
1213                         if (dcurrent->serial == serial) {
1214                                 dcurrent->attributes |= RDATASET_ATTR_IGNORE;
1215                                 make_dirty = ISC_TRUE;
1216                         }
1217                 }
1218         }
1219         if (make_dirty)
1220                 node->dirty = 1;
1221 }
1222
1223 static inline void
1224 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
1225 {
1226         rdatasetheader_t *d, *down_next;
1227
1228         for (d = top->down; d != NULL; d = down_next) {
1229                 down_next = d->down;
1230                 free_rdataset(rbtdb, mctx, d);
1231         }
1232         top->down = NULL;
1233 }
1234
1235 static inline void
1236 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1237         rdatasetheader_t *current, *top_prev, *top_next;
1238         isc_mem_t *mctx = rbtdb->common.mctx;
1239
1240         /*
1241          * Caller must be holding the node lock.
1242          */
1243
1244         top_prev = NULL;
1245         for (current = node->data; current != NULL; current = top_next) {
1246                 top_next = current->next;
1247                 clean_stale_headers(rbtdb, mctx, current);
1248                 /*
1249                  * If current is nonexistent or stale, we can clean it up.
1250                  */
1251                 if ((current->attributes &
1252                      (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
1253                         if (top_prev != NULL)
1254                                 top_prev->next = current->next;
1255                         else
1256                                 node->data = current->next;
1257                         free_rdataset(rbtdb, mctx, current);
1258                 } else
1259                         top_prev = current;
1260         }
1261         node->dirty = 0;
1262 }
1263
1264 static inline void
1265 clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1266                 rbtdb_serial_t least_serial)
1267 {
1268         rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1269         rdatasetheader_t *top_prev, *top_next;
1270         isc_mem_t *mctx = rbtdb->common.mctx;
1271         isc_boolean_t still_dirty = ISC_FALSE;
1272
1273         /*
1274          * Caller must be holding the node lock.
1275          */
1276         REQUIRE(least_serial != 0);
1277
1278         top_prev = NULL;
1279         for (current = node->data; current != NULL; current = top_next) {
1280                 top_next = current->next;
1281
1282                 /*
1283                  * First, we clean up any instances of multiple rdatasets
1284                  * with the same serial number, or that have the IGNORE
1285                  * attribute.
1286                  */
1287                 dparent = current;
1288                 for (dcurrent = current->down;
1289                      dcurrent != NULL;
1290                      dcurrent = down_next) {
1291                         down_next = dcurrent->down;
1292                         INSIST(dcurrent->serial <= dparent->serial);
1293                         if (dcurrent->serial == dparent->serial ||
1294                             IGNORE(dcurrent)) {
1295                                 if (down_next != NULL)
1296                                         down_next->next = dparent;
1297                                 dparent->down = down_next;
1298                                 free_rdataset(rbtdb, mctx, dcurrent);
1299                         } else
1300                                 dparent = dcurrent;
1301                 }
1302
1303                 /*
1304                  * We've now eliminated all IGNORE datasets with the possible
1305                  * exception of current, which we now check.
1306                  */
1307                 if (IGNORE(current)) {
1308                         down_next = current->down;
1309                         if (down_next == NULL) {
1310                                 if (top_prev != NULL)
1311                                         top_prev->next = current->next;
1312                                 else
1313                                         node->data = current->next;
1314                                 free_rdataset(rbtdb, mctx, current);
1315                                 /*
1316                                  * current no longer exists, so we can
1317                                  * just continue with the loop.
1318                                  */
1319                                 continue;
1320                         } else {
1321                                 /*
1322                                  * Pull up current->down, making it the new
1323                                  * current.
1324                                  */
1325                                 if (top_prev != NULL)
1326                                         top_prev->next = down_next;
1327                                 else
1328                                         node->data = down_next;
1329                                 down_next->next = top_next;
1330                                 free_rdataset(rbtdb, mctx, current);
1331                                 current = down_next;
1332                         }
1333                 }
1334
1335                 /*
1336                  * We now try to find the first down node less than the
1337                  * least serial.
1338                  */
1339                 dparent = current;
1340                 for (dcurrent = current->down;
1341                      dcurrent != NULL;
1342                      dcurrent = down_next) {
1343                         down_next = dcurrent->down;
1344                         if (dcurrent->serial < least_serial)
1345                                 break;
1346                         dparent = dcurrent;
1347                 }
1348
1349                 /*
1350                  * If there is a such an rdataset, delete it and any older
1351                  * versions.
1352                  */
1353                 if (dcurrent != NULL) {
1354                         do {
1355                                 down_next = dcurrent->down;
1356                                 INSIST(dcurrent->serial <= least_serial);
1357                                 free_rdataset(rbtdb, mctx, dcurrent);
1358                                 dcurrent = down_next;
1359                         } while (dcurrent != NULL);
1360                         dparent->down = NULL;
1361                 }
1362
1363                 /*
1364                  * Note.  The serial number of 'current' might be less than
1365                  * least_serial too, but we cannot delete it because it is
1366                  * the most recent version, unless it is a NONEXISTENT
1367                  * rdataset.
1368                  */
1369                 if (current->down != NULL) {
1370                         still_dirty = ISC_TRUE;
1371                         top_prev = current;
1372                 } else {
1373                         /*
1374                          * If this is a NONEXISTENT rdataset, we can delete it.
1375                          */
1376                         if (NONEXISTENT(current)) {
1377                                 if (top_prev != NULL)
1378                                         top_prev->next = current->next;
1379                                 else
1380                                         node->data = current->next;
1381                                 free_rdataset(rbtdb, mctx, current);
1382                         } else
1383                                 top_prev = current;
1384                 }
1385         }
1386         if (!still_dirty)
1387                 node->dirty = 0;
1388 }
1389
1390 /*%
1391  * Clean up dead nodes.  These are nodes which have no references, and
1392  * have no data.  They are dead but we could not or chose not to delete
1393  * them when we deleted all the data at that node because we did not want
1394  * to wait for the tree write lock.
1395  *
1396  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1397  */
1398 static void
1399 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1400         dns_rbtnode_t *node;
1401         isc_result_t result;
1402         int count = 10;         /* XXXJT: should be adjustable */
1403
1404         node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1405         while (node != NULL && count > 0) {
1406                 ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1407
1408                 /*
1409                  * Since we're holding a tree write lock, it should be
1410                  * impossible for this node to be referenced by others.
1411                  */
1412                 INSIST(dns_rbtnode_refcurrent(node) == 0 &&
1413                        node->data == NULL);
1414
1415                 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1416                 if (result != ISC_R_SUCCESS)
1417                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1418                                       DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1419                                       "cleanup_dead_nodes: "
1420                                       "dns_rbt_deletenode: %s",
1421                                       isc_result_totext(result));
1422                 node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1423                 count--;
1424         }
1425 }
1426
1427 /*
1428  * Caller must be holding the node lock if its reference must be protected
1429  * by the lock.
1430  */
1431 static inline void
1432 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1433         unsigned int lockrefs, noderefs;
1434         isc_refcount_t *lockref;
1435
1436         dns_rbtnode_refincrement0(node, &noderefs);
1437         if (noderefs == 1) {    /* this is the first reference to the node */
1438                 lockref = &rbtdb->node_locks[node->locknum].references;
1439                 isc_refcount_increment0(lockref, &lockrefs);
1440                 INSIST(lockrefs != 0);
1441         }
1442         INSIST(noderefs != 0);
1443 }
1444
1445 /*
1446  * This function is assumed to be called when a node is newly referenced
1447  * and can be in the deadnode list.  In that case the node must be retrieved
1448  * from the list because it is going to be used.  In addition, if the caller
1449  * happens to hold a write lock on the tree, it's a good chance to purge dead
1450  * nodes.
1451  * Note: while a new reference is gained in multiple places, there are only very
1452  * few cases where the node can be in the deadnode list (only empty nodes can
1453  * have been added to the list).
1454  */
1455 static inline void
1456 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1457                 isc_rwlocktype_t treelocktype)
1458 {
1459         isc_boolean_t need_relock = ISC_FALSE;
1460
1461         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
1462         new_reference(rbtdb, node);
1463
1464         NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1465                       isc_rwlocktype_read);
1466         if (ISC_LINK_LINKED(node, deadlink))
1467                 need_relock = ISC_TRUE;
1468         else if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1469                  treelocktype == isc_rwlocktype_write)
1470                 need_relock = ISC_TRUE;
1471         NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1472                         isc_rwlocktype_read);
1473         if (need_relock) {
1474                 NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1475                               isc_rwlocktype_write);
1476                 if (ISC_LINK_LINKED(node, deadlink))
1477                         ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
1478                                         node, deadlink);
1479                 if (treelocktype == isc_rwlocktype_write)
1480                         cleanup_dead_nodes(rbtdb, node->locknum);
1481                 NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1482                                 isc_rwlocktype_write);
1483         }
1484
1485         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
1486 }
1487
1488 /*
1489  * Caller must be holding the node lock; either the "strong", read or write
1490  * lock.  Note that the lock must be held even when node references are
1491  * atomically modified; in that case the decrement operation itself does not
1492  * have to be protected, but we must avoid a race condition where multiple
1493  * threads are decreasing the reference to zero simultaneously and at least
1494  * one of them is going to free the node.
1495  * This function returns ISC_TRUE if and only if the node reference decreases
1496  * to zero.
1497  */
1498 static isc_boolean_t
1499 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1500                     rbtdb_serial_t least_serial,
1501                     isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
1502                     isc_boolean_t pruning)
1503 {
1504         isc_result_t result;
1505         isc_boolean_t write_locked;
1506         rbtdb_nodelock_t *nodelock;
1507         unsigned int refs, nrefs;
1508         int bucket = node->locknum;
1509         isc_boolean_t no_reference;
1510
1511         nodelock = &rbtdb->node_locks[bucket];
1512
1513         /* Handle easy and typical case first. */
1514         if (!node->dirty && (node->data != NULL || node->down != NULL)) {
1515                 dns_rbtnode_refdecrement(node, &nrefs);
1516                 INSIST((int)nrefs >= 0);
1517                 if (nrefs == 0) {
1518                         isc_refcount_decrement(&nodelock->references, &refs);
1519                         INSIST((int)refs >= 0);
1520                 }
1521                 return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
1522         }
1523
1524         /* Upgrade the lock? */
1525         if (nlock == isc_rwlocktype_read) {
1526                 NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
1527                 NODE_WEAKLOCK(&nodelock->lock, isc_rwlocktype_write);
1528         }
1529         dns_rbtnode_refdecrement(node, &nrefs);
1530         INSIST((int)nrefs >= 0);
1531         if (nrefs > 0) {
1532                 /* Restore the lock? */
1533                 if (nlock == isc_rwlocktype_read)
1534                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1535                 return (ISC_FALSE);
1536         }
1537
1538         if (node->dirty && dns_rbtnode_refcurrent(node) == 0) {
1539                 if (IS_CACHE(rbtdb))
1540                         clean_cache_node(rbtdb, node);
1541                 else {
1542                         if (least_serial == 0) {
1543                                 /*
1544                                  * Caller doesn't know the least serial.
1545                                  * Get it.
1546                                  */
1547                                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1548                                 least_serial = rbtdb->least_serial;
1549                                 RBTDB_UNLOCK(&rbtdb->lock,
1550                                              isc_rwlocktype_read);
1551                         }
1552                         clean_zone_node(rbtdb, node, least_serial);
1553                 }
1554         }
1555
1556         isc_refcount_decrement(&nodelock->references, &refs);
1557         INSIST((int)refs >= 0);
1558
1559         /*
1560          * XXXDCL should this only be done for cache zones?
1561          */
1562         if (node->data != NULL || node->down != NULL) {
1563                 /* Restore the lock? */
1564                 if (nlock == isc_rwlocktype_read)
1565                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1566                 return (ISC_TRUE);
1567         }
1568
1569         /*
1570          * Attempt to switch to a write lock on the tree.  If this fails,
1571          * we will add this node to a linked list of nodes in this locking
1572          * bucket which we will free later.
1573          */
1574         if (tlock != isc_rwlocktype_write) {
1575                 /*
1576                  * Locking hierarchy notwithstanding, we don't need to free
1577                  * the node lock before acquiring the tree write lock because
1578                  * we only do a trylock.
1579                  */
1580                 if (tlock == isc_rwlocktype_read)
1581                         result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
1582                 else
1583                         result = isc_rwlock_trylock(&rbtdb->tree_lock,
1584                                                     isc_rwlocktype_write);
1585                 RUNTIME_CHECK(result == ISC_R_SUCCESS ||
1586                               result == ISC_R_LOCKBUSY);
1587
1588                 write_locked = ISC_TF(result == ISC_R_SUCCESS);
1589         } else
1590                 write_locked = ISC_TRUE;
1591
1592         no_reference = ISC_TRUE;
1593         if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
1594                 /*
1595                  * We can now delete the node if the reference counter is
1596                  * zero.  This should be typically the case, but a different
1597                  * thread may still gain a (new) reference just before the
1598                  * current thread locks the tree (e.g., in findnode()).
1599                  */
1600
1601                 /*
1602                  * If this node is the only one in the level it's in, deleting
1603                  * this node may recursively make its parent the only node in
1604                  * the parent level; if so, and if no one is currently using
1605                  * the parent node, this is almost the only opportunity to
1606                  * clean it up.  But the recursive cleanup is not that trivial
1607                  * since the child and parent may be in different lock buckets,
1608                  * which would cause a lock order reversal problem.  To avoid
1609                  * the trouble, we'll dispatch a separate event for batch
1610                  * cleaning.  We need to check whether we're deleting the node
1611                  * as a result of pruning to avoid infinite dispatching.
1612                  * Note: pruning happens only when a task has been set for the
1613                  * rbtdb.  If the user of the rbtdb chooses not to set a task,
1614                  * it's their responsibility to purge stale leaves (e.g. by
1615                  * periodic walk-through).
1616                  */
1617                 if (!pruning && node->parent != NULL &&
1618                     node->parent->down == node && node->left == NULL &&
1619                     node->right == NULL && rbtdb->task != NULL) {
1620                         isc_event_t *ev;
1621                         dns_db_t *db;
1622
1623                         ev = isc_event_allocate(rbtdb->common.mctx, NULL,
1624                                                 DNS_EVENT_RBTPRUNE,
1625                                                 prune_tree, node,
1626                                                 sizeof(isc_event_t));
1627                         if (ev != NULL) {
1628                                 new_reference(rbtdb, node);
1629                                 db = NULL;
1630                                 attach((dns_db_t *)rbtdb, &db);
1631                                 ev->ev_sender = db;
1632                                 isc_task_send(rbtdb->task, &ev);
1633                                 no_reference = ISC_FALSE;
1634                         } else {
1635                                 /*
1636                                  * XXX: this is a weird situation.  We could
1637                                  * ignore this error case, but then the stale
1638                                  * node will unlikely be purged except via a
1639                                  * rare condition such as manual cleanup.  So
1640                                  * we queue it in the deadnodes list, hoping
1641                                  * the memory shortage is temporary and the node
1642                                  * will be deleted later.
1643                                  */
1644                                 isc_log_write(dns_lctx,
1645                                               DNS_LOGCATEGORY_DATABASE,
1646                                               DNS_LOGMODULE_CACHE,
1647                                               ISC_LOG_INFO,
1648                                               "decrement_reference: failed to "
1649                                               "allocate pruning event");
1650                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1651                                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
1652                                                 deadlink);
1653                         }
1654                 } else {
1655                         if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1656                                 char printname[DNS_NAME_FORMATSIZE];
1657
1658                                 isc_log_write(dns_lctx,
1659                                               DNS_LOGCATEGORY_DATABASE,
1660                                               DNS_LOGMODULE_CACHE,
1661                                               ISC_LOG_DEBUG(1),
1662                                               "decrement_reference: "
1663                                               "delete from rbt: %p %s",
1664                                               node,
1665                                               dns_rbt_formatnodename(node,
1666                                                         printname,
1667                                                         sizeof(printname)));
1668                         }
1669
1670                         INSIST(!ISC_LINK_LINKED(node, deadlink));
1671                         result = dns_rbt_deletenode(rbtdb->tree, node,
1672                                                     ISC_FALSE);
1673                         if (result != ISC_R_SUCCESS) {
1674                                 isc_log_write(dns_lctx,
1675                                               DNS_LOGCATEGORY_DATABASE,
1676                                               DNS_LOGMODULE_CACHE,
1677                                               ISC_LOG_WARNING,
1678                                               "decrement_reference: "
1679                                               "dns_rbt_deletenode: %s",
1680                                               isc_result_totext(result));
1681                         }
1682                 }
1683         } else if (dns_rbtnode_refcurrent(node) == 0) {
1684                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1685                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
1686         }
1687
1688         /* Restore the lock? */
1689         if (nlock == isc_rwlocktype_read)
1690                 NODE_WEAKDOWNGRADE(&nodelock->lock);
1691
1692         /*
1693          * Relock a read lock, or unlock the write lock if no lock was held.
1694          */
1695         if (tlock == isc_rwlocktype_none)
1696                 if (write_locked)
1697                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1698
1699         if (tlock == isc_rwlocktype_read)
1700                 if (write_locked)
1701                         isc_rwlock_downgrade(&rbtdb->tree_lock);
1702
1703         return (no_reference);
1704 }
1705
1706 /*
1707  * Prune the tree by recursively cleaning-up single leaves.  In the worst
1708  * case, the number of iteration is the number of tree levels, which is at
1709  * most the maximum number of domain name labels, i.e, 127.  In practice, this
1710  * should be much smaller (only a few times), and even the worst case would be
1711  * acceptable for a single event.
1712  */
1713 static void
1714 prune_tree(isc_task_t *task, isc_event_t *event) {
1715         dns_rbtdb_t *rbtdb = event->ev_sender;
1716         dns_rbtnode_t *node = event->ev_arg;
1717         dns_rbtnode_t *parent;
1718         unsigned int locknum;
1719
1720         UNUSED(task);
1721
1722         isc_event_free(&event);
1723
1724         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1725         locknum = node->locknum;
1726         NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1727         do {
1728                 parent = node->parent;
1729                 decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
1730                                     isc_rwlocktype_write, ISC_TRUE);
1731
1732                 if (parent != NULL && parent->down == NULL) {
1733                         /*
1734                          * node was the only down child of the parent and has
1735                          * just been removed.  We'll then need to examine the
1736                          * parent.  Keep the lock if possible; otherwise,
1737                          * release the old lock and acquire one for the parent.
1738                          */
1739                         if (parent->locknum != locknum) {
1740                                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
1741                                             isc_rwlocktype_write);
1742                                 locknum = parent->locknum;
1743                                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
1744                                           isc_rwlocktype_write);
1745                         }
1746
1747                         /*
1748                          * We need to gain a reference to the node before
1749                          * decrementing it in the next iteration.  In addition,
1750                          * if the node is in the dead-nodes list, extract it
1751                          * from the list beforehand as we do in
1752                          * reactivate_node().
1753                          */
1754                         new_reference(rbtdb, parent);
1755                         if (ISC_LINK_LINKED(parent, deadlink)) {
1756                                 ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
1757                                                 parent, deadlink);
1758                         }
1759                 } else
1760                         parent = NULL;
1761
1762                 node = parent;
1763         } while (node != NULL);
1764         NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1765         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1766
1767         detach((dns_db_t **)&rbtdb);
1768 }
1769
1770 static inline void
1771 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1772                    rbtdb_changedlist_t *cleanup_list)
1773 {
1774         /*
1775          * Caller must be holding the database lock.
1776          */
1777
1778         rbtdb->least_serial = version->serial;
1779         *cleanup_list = version->changed_list;
1780         ISC_LIST_INIT(version->changed_list);
1781 }
1782
1783 static inline void
1784 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
1785         rbtdb_changed_t *changed, *next_changed;
1786
1787         /*
1788          * If the changed record is dirty, then
1789          * an update created multiple versions of
1790          * a given rdataset.  We keep this list
1791          * until we're the least open version, at
1792          * which point it's safe to get rid of any
1793          * older versions.
1794          *
1795          * If the changed record isn't dirty, then
1796          * we don't need it anymore since we're
1797          * committing and not rolling back.
1798          *
1799          * The caller must be holding the database lock.
1800          */
1801         for (changed = HEAD(version->changed_list);
1802              changed != NULL;
1803              changed = next_changed) {
1804                 next_changed = NEXT(changed, link);
1805                 if (!changed->dirty) {
1806                         UNLINK(version->changed_list,
1807                                changed, link);
1808                         APPEND(*cleanup_list,
1809                                changed, link);
1810                 }
1811         }
1812 }
1813
1814 static isc_boolean_t
1815 iszonesecure(dns_db_t *db, dns_dbnode_t *origin) {
1816         dns_rdataset_t keyset;
1817         dns_rdataset_t nsecset, signsecset;
1818         isc_boolean_t haszonekey = ISC_FALSE;
1819         isc_boolean_t hasnsec = ISC_FALSE;
1820         isc_result_t result;
1821
1822         dns_rdataset_init(&keyset);
1823         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_dnskey, 0,
1824                                      0, &keyset, NULL);
1825         if (result == ISC_R_SUCCESS) {
1826                 dns_rdata_t keyrdata = DNS_RDATA_INIT;
1827                 result = dns_rdataset_first(&keyset);
1828                 while (result == ISC_R_SUCCESS) {
1829                         dns_rdataset_current(&keyset, &keyrdata);
1830                         if (dns_zonekey_iszonekey(&keyrdata)) {
1831                                 haszonekey = ISC_TRUE;
1832                                 break;
1833                         }
1834                         result = dns_rdataset_next(&keyset);
1835                 }
1836                 dns_rdataset_disassociate(&keyset);
1837         }
1838         if (!haszonekey)
1839                 return (ISC_FALSE);
1840
1841         dns_rdataset_init(&nsecset);
1842         dns_rdataset_init(&signsecset);
1843         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_nsec, 0,
1844                                      0, &nsecset, &signsecset);
1845         if (result == ISC_R_SUCCESS) {
1846                 if (dns_rdataset_isassociated(&signsecset)) {
1847                         hasnsec = ISC_TRUE;
1848                         dns_rdataset_disassociate(&signsecset);
1849                 }
1850                 dns_rdataset_disassociate(&nsecset);
1851         }
1852         return (hasnsec);
1853 }
1854
1855 static void
1856 closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
1857         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1858         rbtdb_version_t *version, *cleanup_version, *least_greater;
1859         isc_boolean_t rollback = ISC_FALSE;
1860         rbtdb_changedlist_t cleanup_list;
1861         rbtdb_changed_t *changed, *next_changed;
1862         rbtdb_serial_t serial, least_serial;
1863         dns_rbtnode_t *rbtnode;
1864         unsigned int refs;
1865         isc_boolean_t writer;
1866
1867         REQUIRE(VALID_RBTDB(rbtdb));
1868         version = (rbtdb_version_t *)*versionp;
1869
1870         cleanup_version = NULL;
1871         ISC_LIST_INIT(cleanup_list);
1872
1873         isc_refcount_decrement(&version->references, &refs);
1874         if (refs > 0) {         /* typical and easy case first */
1875                 if (commit) {
1876                         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1877                         INSIST(!version->writer);
1878                         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1879                 }
1880                 goto end;
1881         }
1882
1883         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1884         serial = version->serial;
1885         writer = version->writer;
1886         if (version->writer) {
1887                 if (commit) {
1888                         unsigned cur_ref;
1889                         rbtdb_version_t *cur_version;
1890
1891                         INSIST(version->commit_ok);
1892                         INSIST(version == rbtdb->future_version);
1893                         /*
1894                          * The current version is going to be replaced.
1895                          * Release the (likely last) reference to it from the
1896                          * DB itself and unlink it from the open list.
1897                          */
1898                         cur_version = rbtdb->current_version;
1899                         isc_refcount_decrement(&cur_version->references,
1900                                                &cur_ref);
1901                         if (cur_ref == 0) {
1902                                 if (cur_version->serial == rbtdb->least_serial)
1903                                         INSIST(EMPTY(cur_version->changed_list));
1904                                 UNLINK(rbtdb->open_versions,
1905                                        cur_version, link);
1906                         }
1907                         if (EMPTY(rbtdb->open_versions)) {
1908                                 /*
1909                                  * We're going to become the least open
1910                                  * version.
1911                                  */
1912                                 make_least_version(rbtdb, version,
1913                                                    &cleanup_list);
1914                         } else {
1915                                 /*
1916                                  * Some other open version is the
1917                                  * least version.  We can't cleanup
1918                                  * records that were changed in this
1919                                  * version because the older versions
1920                                  * may still be in use by an open
1921                                  * version.
1922                                  *
1923                                  * We can, however, discard the
1924                                  * changed records for things that
1925                                  * we've added that didn't exist in
1926                                  * prior versions.
1927                                  */
1928                                 cleanup_nondirty(version, &cleanup_list);
1929                         }
1930                         /*
1931                          * If the (soon to be former) current version
1932                          * isn't being used by anyone, we can clean
1933                          * it up.
1934                          */
1935                         if (cur_ref == 0) {
1936                                 cleanup_version = cur_version;
1937                                 APPENDLIST(version->changed_list,
1938                                            cleanup_version->changed_list,
1939                                            link);
1940                         }
1941                         /*
1942                          * Become the current version.
1943                          */
1944                         version->writer = ISC_FALSE;
1945                         rbtdb->current_version = version;
1946                         rbtdb->current_serial = version->serial;
1947                         rbtdb->future_version = NULL;
1948
1949                         /*
1950                          * Keep the current version in the open list, and
1951                          * gain a reference for the DB itself (see the DB
1952                          * creation function below).  This must be the only
1953                          * case where we need to increment the counter from
1954                          * zero and need to use isc_refcount_increment0().
1955                          */
1956                         isc_refcount_increment0(&version->references,
1957                                                 &cur_ref);
1958                         INSIST(cur_ref == 1);
1959                         PREPEND(rbtdb->open_versions,
1960                                 rbtdb->current_version, link);
1961                 } else {
1962                         /*
1963                          * We're rolling back this transaction.
1964                          */
1965                         cleanup_list = version->changed_list;
1966                         ISC_LIST_INIT(version->changed_list);
1967                         rollback = ISC_TRUE;
1968                         cleanup_version = version;
1969                         rbtdb->future_version = NULL;
1970                 }
1971         } else {
1972                 if (version != rbtdb->current_version) {
1973                         /*
1974                          * There are no external or internal references
1975                          * to this version and it can be cleaned up.
1976                          */
1977                         cleanup_version = version;
1978
1979                         /*
1980                          * Find the version with the least serial
1981                          * number greater than ours.
1982                          */
1983                         least_greater = PREV(version, link);
1984                         if (least_greater == NULL)
1985                                 least_greater = rbtdb->current_version;
1986
1987                         INSIST(version->serial < least_greater->serial);
1988                         /*
1989                          * Is this the least open version?
1990                          */
1991                         if (version->serial == rbtdb->least_serial) {
1992                                 /*
1993                                  * Yes.  Install the new least open
1994                                  * version.
1995                                  */
1996                                 make_least_version(rbtdb,
1997                                                    least_greater,
1998                                                    &cleanup_list);
1999                         } else {
2000                                 /*
2001                                  * Add any unexecuted cleanups to
2002                                  * those of the least greater version.
2003                                  */
2004                                 APPENDLIST(least_greater->changed_list,
2005                                            version->changed_list,
2006                                            link);
2007                         }
2008                 } else if (version->serial == rbtdb->least_serial)
2009                         INSIST(EMPTY(version->changed_list));
2010                 UNLINK(rbtdb->open_versions, version, link);
2011         }
2012         least_serial = rbtdb->least_serial;
2013         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2014
2015         /*
2016          * Update the zone's secure status.
2017          */
2018         if (writer && commit && !IS_CACHE(rbtdb))
2019                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
2020
2021         if (cleanup_version != NULL) {
2022                 INSIST(EMPTY(cleanup_version->changed_list));
2023                 isc_mem_put(rbtdb->common.mctx, cleanup_version,
2024                             sizeof(*cleanup_version));
2025         }
2026
2027         if (!EMPTY(cleanup_list)) {
2028                 /*
2029                  * We acquire a tree write lock here in order to make sure
2030                  * that stale nodes will be removed in decrement_reference().
2031                  * If we didn't have the lock, those nodes could miss the
2032                  * chance to be removed until the server stops.  The write lock
2033                  * is expensive, but this event should be rare enough to justify
2034                  * the cost.
2035                  */
2036                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2037                 for (changed = HEAD(cleanup_list);
2038                      changed != NULL;
2039                      changed = next_changed) {
2040                         nodelock_t *lock;
2041
2042                         next_changed = NEXT(changed, link);
2043                         rbtnode = changed->node;
2044                         lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2045
2046                         NODE_LOCK(lock, isc_rwlocktype_write);
2047                         /*
2048                          * This is a good opportunity to purge any dead nodes,
2049                          * so use it.
2050                          */
2051                         cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2052
2053                         if (rollback)
2054                                 rollback_node(rbtnode, serial);
2055                         decrement_reference(rbtdb, rbtnode, least_serial,
2056                                             isc_rwlocktype_write,
2057                                             isc_rwlocktype_write, ISC_FALSE);
2058
2059                         NODE_UNLOCK(lock, isc_rwlocktype_write);
2060
2061                         isc_mem_put(rbtdb->common.mctx, changed,
2062                                     sizeof(*changed));
2063                 }
2064                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2065         }
2066
2067   end:
2068         *versionp = NULL;
2069 }
2070
2071 /*
2072  * Add the necessary magic for the wildcard name 'name'
2073  * to be found in 'rbtdb'.
2074  *
2075  * In order for wildcard matching to work correctly in
2076  * zone_find(), we must ensure that a node for the wildcarding
2077  * level exists in the database, and has its 'find_callback'
2078  * and 'wild' bits set.
2079  *
2080  * E.g. if the wildcard name is "*.sub.example." then we
2081  * must ensure that "sub.example." exists and is marked as
2082  * a wildcard level.
2083  */
2084 static isc_result_t
2085 add_wildcard_magic(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2086         isc_result_t result;
2087         dns_name_t foundname;
2088         dns_offsets_t offsets;
2089         unsigned int n;
2090         dns_rbtnode_t *node = NULL;
2091
2092         dns_name_init(&foundname, offsets);
2093         n = dns_name_countlabels(name);
2094         INSIST(n >= 2);
2095         n--;
2096         dns_name_getlabelsequence(name, 1, n, &foundname);
2097         result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2098         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2099                 return (result);
2100         node->find_callback = 1;
2101         node->wild = 1;
2102         return (ISC_R_SUCCESS);
2103 }
2104
2105 static isc_result_t
2106 add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2107         isc_result_t result;
2108         dns_name_t foundname;
2109         dns_offsets_t offsets;
2110         unsigned int n, l, i;
2111
2112         dns_name_init(&foundname, offsets);
2113         n = dns_name_countlabels(name);
2114         l = dns_name_countlabels(&rbtdb->common.origin);
2115         i = l + 1;
2116         while (i < n) {
2117                 dns_rbtnode_t *node = NULL;     /* dummy */
2118                 dns_name_getlabelsequence(name, n - i, i, &foundname);
2119                 if (dns_name_iswildcard(&foundname)) {
2120                         result = add_wildcard_magic(rbtdb, &foundname);
2121                         if (result != ISC_R_SUCCESS)
2122                                 return (result);
2123                         result = dns_rbt_addnode(rbtdb->tree, &foundname,
2124                                                  &node);
2125                         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2126                                 return (result);
2127                 }
2128                 i++;
2129         }
2130         return (ISC_R_SUCCESS);
2131 }
2132
2133 static isc_result_t
2134 findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create,
2135          dns_dbnode_t **nodep)
2136 {
2137         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2138         dns_rbtnode_t *node = NULL;
2139         dns_name_t nodename;
2140         isc_result_t result;
2141         isc_rwlocktype_t locktype = isc_rwlocktype_read;
2142
2143         REQUIRE(VALID_RBTDB(rbtdb));
2144
2145         dns_name_init(&nodename, NULL);
2146         RWLOCK(&rbtdb->tree_lock, locktype);
2147         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &node, NULL,
2148                                   DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2149         if (result != ISC_R_SUCCESS) {
2150                 RWUNLOCK(&rbtdb->tree_lock, locktype);
2151                 if (!create) {
2152                         if (result == DNS_R_PARTIALMATCH)
2153                                 result = ISC_R_NOTFOUND;
2154                         return (result);
2155                 }
2156                 /*
2157                  * It would be nice to try to upgrade the lock instead of
2158                  * unlocking then relocking.
2159                  */
2160                 locktype = isc_rwlocktype_write;
2161                 RWLOCK(&rbtdb->tree_lock, locktype);
2162                 node = NULL;
2163                 result = dns_rbt_addnode(rbtdb->tree, name, &node);
2164                 if (result == ISC_R_SUCCESS) {
2165                         dns_rbt_namefromnode(node, &nodename);
2166 #ifdef DNS_RBT_USEHASH
2167                         node->locknum = node->hashval % rbtdb->node_lock_count;
2168 #else
2169                         node->locknum = dns_name_hash(&nodename, ISC_TRUE) %
2170                                 rbtdb->node_lock_count;
2171 #endif
2172                         add_empty_wildcards(rbtdb, name);
2173
2174                         if (dns_name_iswildcard(name)) {
2175                                 result = add_wildcard_magic(rbtdb, name);
2176                                 if (result != ISC_R_SUCCESS) {
2177                                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2178                                         return (result);
2179                                 }
2180                         }
2181                 } else if (result != ISC_R_EXISTS) {
2182                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2183                         return (result);
2184                 }
2185         }
2186         reactivate_node(rbtdb, node, locktype);
2187         RWUNLOCK(&rbtdb->tree_lock, locktype);
2188
2189         *nodep = (dns_dbnode_t *)node;
2190
2191         return (ISC_R_SUCCESS);
2192 }
2193
2194 static isc_result_t
2195 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2196         rbtdb_search_t *search = arg;
2197         rdatasetheader_t *header, *header_next;
2198         rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2199         rdatasetheader_t *found;
2200         isc_result_t result;
2201         dns_rbtnode_t *onode;
2202
2203         /*
2204          * We only want to remember the topmost zone cut, since it's the one
2205          * that counts, so we'll just continue if we've already found a
2206          * zonecut.
2207          */
2208         if (search->zonecut != NULL)
2209                 return (DNS_R_CONTINUE);
2210
2211         found = NULL;
2212         result = DNS_R_CONTINUE;
2213         onode = search->rbtdb->origin_node;
2214
2215         NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2216                   isc_rwlocktype_read);
2217
2218         /*
2219          * Look for an NS or DNAME rdataset active in our version.
2220          */
2221         ns_header = NULL;
2222         dname_header = NULL;
2223         sigdname_header = NULL;
2224         for (header = node->data; header != NULL; header = header_next) {
2225                 header_next = header->next;
2226                 if (header->type == dns_rdatatype_ns ||
2227                     header->type == dns_rdatatype_dname ||
2228                     header->type == RBTDB_RDATATYPE_SIGDNAME) {
2229                         do {
2230                                 if (header->serial <= search->serial &&
2231                                     !IGNORE(header)) {
2232                                         /*
2233                                          * Is this a "this rdataset doesn't
2234                                          * exist" record?
2235                                          */
2236                                         if (NONEXISTENT(header))
2237                                                 header = NULL;
2238                                         break;
2239                                 } else
2240                                         header = header->down;
2241                         } while (header != NULL);
2242                         if (header != NULL) {
2243                                 if (header->type == dns_rdatatype_dname)
2244                                         dname_header = header;
2245                                 else if (header->type ==
2246                                            RBTDB_RDATATYPE_SIGDNAME)
2247                                         sigdname_header = header;
2248                                 else if (node != onode ||
2249                                          IS_STUB(search->rbtdb)) {
2250                                         /*
2251                                          * We've found an NS rdataset that
2252                                          * isn't at the origin node.  We check
2253                                          * that they're not at the origin node,
2254                                          * because otherwise we'd erroneously
2255                                          * treat the zone top as if it were
2256                                          * a delegation.
2257                                          */
2258                                         ns_header = header;
2259                                 }
2260                         }
2261                 }
2262         }
2263
2264         /*
2265          * Did we find anything?
2266          */
2267         if (dname_header != NULL) {
2268                 /*
2269                  * Note that DNAME has precedence over NS if both exist.
2270                  */
2271                 found = dname_header;
2272                 search->zonecut_sigrdataset = sigdname_header;
2273         } else if (ns_header != NULL) {
2274                 found = ns_header;
2275                 search->zonecut_sigrdataset = NULL;
2276         }
2277
2278         if (found != NULL) {
2279                 /*
2280                  * We increment the reference count on node to ensure that
2281                  * search->zonecut_rdataset will still be valid later.
2282                  */
2283                 new_reference(search->rbtdb, node);
2284                 search->zonecut = node;
2285                 search->zonecut_rdataset = found;
2286                 search->need_cleanup = ISC_TRUE;
2287                 /*
2288                  * Since we've found a zonecut, anything beneath it is
2289                  * glue and is not subject to wildcard matching, so we
2290                  * may clear search->wild.
2291                  */
2292                 search->wild = ISC_FALSE;
2293                 if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
2294                         /*
2295                          * If the caller does not want to find glue, then
2296                          * this is the best answer and the search should
2297                          * stop now.
2298                          */
2299                         result = DNS_R_PARTIALMATCH;
2300                 } else {
2301                         dns_name_t *zcname;
2302
2303                         /*
2304                          * The search will continue beneath the zone cut.
2305                          * This may or may not be the best match.  In case it
2306                          * is, we need to remember the node name.
2307                          */
2308                         zcname = dns_fixedname_name(&search->zonecut_name);
2309                         RUNTIME_CHECK(dns_name_copy(name, zcname, NULL) ==
2310                                       ISC_R_SUCCESS);
2311                         search->copy_name = ISC_TRUE;
2312                 }
2313         } else {
2314                 /*
2315                  * There is no zonecut at this node which is active in this
2316                  * version.
2317                  *
2318                  * If this is a "wild" node and the caller hasn't disabled
2319                  * wildcard matching, remember that we've seen a wild node
2320                  * in case we need to go searching for wildcard matches
2321                  * later on.
2322                  */
2323                 if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0)
2324                         search->wild = ISC_TRUE;
2325         }
2326
2327         NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2328                     isc_rwlocktype_read);
2329
2330         return (result);
2331 }
2332
2333 static inline void
2334 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2335               rdatasetheader_t *header, isc_stdtime_t now,
2336               dns_rdataset_t *rdataset)
2337 {
2338         unsigned char *raw;     /* RDATASLAB */
2339
2340         /*
2341          * Caller must be holding the node reader lock.
2342          * XXXJT: technically, we need a writer lock, since we'll increment
2343          * the header count below.  However, since the actual counter value
2344          * doesn't matter, we prioritize performance here.  (We may want to
2345          * use atomic increment when available).
2346          */
2347
2348         if (rdataset == NULL)
2349                 return;
2350
2351         new_reference(rbtdb, node);
2352
2353         INSIST(rdataset->methods == NULL);      /* We must be disassociated. */
2354
2355         rdataset->methods = &rdataset_methods;
2356         rdataset->rdclass = rbtdb->common.rdclass;
2357         rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
2358         rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
2359         rdataset->ttl = header->rdh_ttl - now;
2360         rdataset->trust = header->trust;
2361         if (NXDOMAIN(header))
2362                 rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
2363         rdataset->private1 = rbtdb;
2364         rdataset->private2 = node;
2365         raw = (unsigned char *)header + sizeof(*header);
2366         rdataset->private3 = raw;
2367         rdataset->count = header->count++;
2368         if (rdataset->count == ISC_UINT32_MAX)
2369                 rdataset->count = 0;
2370
2371         /*
2372          * Reset iterator state.
2373          */
2374         rdataset->privateuint4 = 0;
2375         rdataset->private5 = NULL;
2376
2377         /*
2378          * Add noqname proof.
2379          */
2380         rdataset->private6 = header->noqname;
2381         if (rdataset->private6 != NULL)
2382                 rdataset->attributes |=  DNS_RDATASETATTR_NOQNAME;
2383 }
2384
2385 static inline isc_result_t
2386 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
2387                  dns_name_t *foundname, dns_rdataset_t *rdataset,
2388                  dns_rdataset_t *sigrdataset)
2389 {
2390         isc_result_t result;
2391         dns_name_t *zcname;
2392         rbtdb_rdatatype_t type;
2393         dns_rbtnode_t *node;
2394
2395         /*
2396          * The caller MUST NOT be holding any node locks.
2397          */
2398
2399         node = search->zonecut;
2400         type = search->zonecut_rdataset->type;
2401
2402         /*
2403          * If we have to set foundname, we do it before anything else.
2404          * If we were to set foundname after we had set nodep or bound the
2405          * rdataset, then we'd have to undo that work if dns_name_copy()
2406          * failed.  By setting foundname first, there's nothing to undo if
2407          * we have trouble.
2408          */
2409         if (foundname != NULL && search->copy_name) {
2410                 zcname = dns_fixedname_name(&search->zonecut_name);
2411                 result = dns_name_copy(zcname, foundname, NULL);
2412                 if (result != ISC_R_SUCCESS)
2413                         return (result);
2414         }
2415         if (nodep != NULL) {
2416                 /*
2417                  * Note that we don't have to increment the node's reference
2418                  * count here because we're going to use the reference we
2419                  * already have in the search block.
2420                  */
2421                 *nodep = node;
2422                 search->need_cleanup = ISC_FALSE;
2423         }
2424         if (rdataset != NULL) {
2425                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2426                           isc_rwlocktype_read);
2427                 bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
2428                               search->now, rdataset);
2429                 if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
2430                         bind_rdataset(search->rbtdb, node,
2431                                       search->zonecut_sigrdataset,
2432                                       search->now, sigrdataset);
2433                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2434                             isc_rwlocktype_read);
2435         }
2436
2437         if (type == dns_rdatatype_dname)
2438                 return (DNS_R_DNAME);
2439         return (DNS_R_DELEGATION);
2440 }
2441
2442 static inline isc_boolean_t
2443 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
2444            dns_rbtnode_t *node)
2445 {
2446         unsigned char *raw;     /* RDATASLAB */
2447         unsigned int count, size;
2448         dns_name_t ns_name;
2449         isc_boolean_t valid = ISC_FALSE;
2450         dns_offsets_t offsets;
2451         isc_region_t region;
2452         rdatasetheader_t *header;
2453
2454         /*
2455          * No additional locking is required.
2456          */
2457
2458         /*
2459          * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
2460          * if it occurs at a zone cut, but is not valid below it.
2461          */
2462         if (type == dns_rdatatype_ns) {
2463                 if (node != search->zonecut) {
2464                         return (ISC_FALSE);
2465                 }
2466         } else if (type != dns_rdatatype_a &&
2467                    type != dns_rdatatype_aaaa &&
2468                    type != dns_rdatatype_a6) {
2469                 return (ISC_FALSE);
2470         }
2471
2472         header = search->zonecut_rdataset;
2473         raw = (unsigned char *)header + sizeof(*header);
2474         count = raw[0] * 256 + raw[1];
2475 #if DNS_RDATASET_FIXED
2476         raw += 2 + (4 * count);
2477 #else
2478         raw += 2;
2479 #endif
2480
2481         while (count > 0) {
2482                 count--;
2483                 size = raw[0] * 256 + raw[1];
2484 #if DNS_RDATASET_FIXED
2485                 raw += 4;
2486 #else
2487                 raw += 2;
2488 #endif
2489                 region.base = raw;
2490                 region.length = size;
2491                 raw += size;
2492                 /*
2493                  * XXX Until we have rdata structures, we have no choice but
2494                  * to directly access the rdata format.
2495                  */
2496                 dns_name_init(&ns_name, offsets);
2497                 dns_name_fromregion(&ns_name, &region);
2498                 if (dns_name_compare(&ns_name, name) == 0) {
2499                         valid = ISC_TRUE;
2500                         break;
2501                 }
2502         }
2503
2504         return (valid);
2505 }
2506
2507 static inline isc_boolean_t
2508 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
2509             dns_name_t *name)
2510 {
2511         dns_fixedname_t fnext;
2512         dns_fixedname_t forigin;
2513         dns_name_t *next;
2514         dns_name_t *origin;
2515         dns_name_t prefix;
2516         dns_rbtdb_t *rbtdb;
2517         dns_rbtnode_t *node;
2518         isc_result_t result;
2519         isc_boolean_t answer = ISC_FALSE;
2520         rdatasetheader_t *header;
2521
2522         rbtdb = search->rbtdb;
2523
2524         dns_name_init(&prefix, NULL);
2525         dns_fixedname_init(&fnext);
2526         next = dns_fixedname_name(&fnext);
2527         dns_fixedname_init(&forigin);
2528         origin = dns_fixedname_name(&forigin);
2529
2530         result = dns_rbtnodechain_next(chain, NULL, NULL);
2531         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2532                 node = NULL;
2533                 result = dns_rbtnodechain_current(chain, &prefix,
2534                                                   origin, &node);
2535                 if (result != ISC_R_SUCCESS)
2536                         break;
2537                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2538                           isc_rwlocktype_read);
2539                 for (header = node->data;
2540                      header != NULL;
2541                      header = header->next) {
2542                         if (header->serial <= search->serial &&
2543                             !IGNORE(header) && EXISTS(header))
2544                                 break;
2545                 }
2546                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2547                             isc_rwlocktype_read);
2548                 if (header != NULL)
2549                         break;
2550                 result = dns_rbtnodechain_next(chain, NULL, NULL);
2551         }
2552         if (result == ISC_R_SUCCESS)
2553                 result = dns_name_concatenate(&prefix, origin, next, NULL);
2554         if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name))
2555                 answer = ISC_TRUE;
2556         return (answer);
2557 }
2558
2559 static inline isc_boolean_t
2560 activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) {
2561         dns_fixedname_t fnext;
2562         dns_fixedname_t forigin;
2563         dns_fixedname_t fprev;
2564         dns_name_t *next;
2565         dns_name_t *origin;
2566         dns_name_t *prev;
2567         dns_name_t name;
2568         dns_name_t rname;
2569         dns_name_t tname;
2570         dns_rbtdb_t *rbtdb;
2571         dns_rbtnode_t *node;
2572         dns_rbtnodechain_t chain;
2573         isc_boolean_t check_next = ISC_TRUE;
2574         isc_boolean_t check_prev = ISC_TRUE;
2575         isc_boolean_t answer = ISC_FALSE;
2576         isc_result_t result;
2577         rdatasetheader_t *header;
2578         unsigned int n;
2579
2580         rbtdb = search->rbtdb;
2581
2582         dns_name_init(&name, NULL);
2583         dns_name_init(&tname, NULL);
2584         dns_name_init(&rname, NULL);
2585         dns_fixedname_init(&fnext);
2586         next = dns_fixedname_name(&fnext);
2587         dns_fixedname_init(&fprev);
2588         prev = dns_fixedname_name(&fprev);
2589         dns_fixedname_init(&forigin);
2590         origin = dns_fixedname_name(&forigin);
2591
2592         /*
2593          * Find if qname is at or below a empty node.
2594          * Use our own copy of the chain.
2595          */
2596
2597         chain = search->chain;
2598         do {
2599                 node = NULL;
2600                 result = dns_rbtnodechain_current(&chain, &name,
2601                                                   origin, &node);
2602                 if (result != ISC_R_SUCCESS)
2603                         break;
2604                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2605                           isc_rwlocktype_read);
2606                 for (header = node->data;
2607                      header != NULL;
2608                      header = header->next) {
2609                         if (header->serial <= search->serial &&
2610                             !IGNORE(header) && EXISTS(header))
2611                                 break;
2612                 }
2613                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2614                             isc_rwlocktype_read);
2615                 if (header != NULL)
2616                         break;
2617                 result = dns_rbtnodechain_prev(&chain, NULL, NULL);
2618         } while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
2619         if (result == ISC_R_SUCCESS)
2620                 result = dns_name_concatenate(&name, origin, prev, NULL);
2621         if (result != ISC_R_SUCCESS)
2622                 check_prev = ISC_FALSE;
2623
2624         result = dns_rbtnodechain_next(&chain, NULL, NULL);
2625         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2626                 node = NULL;
2627                 result = dns_rbtnodechain_current(&chain, &name,
2628                                                   origin, &node);
2629                 if (result != ISC_R_SUCCESS)
2630                         break;
2631                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2632                           isc_rwlocktype_read);
2633                 for (header = node->data;
2634                      header != NULL;
2635                      header = header->next) {
2636                         if (header->serial <= search->serial &&
2637                             !IGNORE(header) && EXISTS(header))
2638                                 break;
2639                 }
2640                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2641                             isc_rwlocktype_read);
2642                 if (header != NULL)
2643                         break;
2644                 result = dns_rbtnodechain_next(&chain, NULL, NULL);
2645         }
2646         if (result == ISC_R_SUCCESS)
2647                 result = dns_name_concatenate(&name, origin, next, NULL);
2648         if (result != ISC_R_SUCCESS)
2649                 check_next = ISC_FALSE;
2650
2651         dns_name_clone(qname, &rname);
2652
2653         /*
2654          * Remove the wildcard label to find the terminal name.
2655          */
2656         n = dns_name_countlabels(wname);
2657         dns_name_getlabelsequence(wname, 1, n - 1, &tname);
2658
2659         do {
2660                 if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
2661                     (check_next && dns_name_issubdomain(next, &rname))) {
2662                         answer = ISC_TRUE;
2663                         break;
2664                 }
2665                 /*
2666                  * Remove the left hand label.
2667                  */
2668                 n = dns_name_countlabels(&rname);
2669                 dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
2670         } while (!dns_name_equal(&rname, &tname));
2671         return (answer);
2672 }
2673
2674 static inline isc_result_t
2675 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
2676               dns_name_t *qname)
2677 {
2678         unsigned int i, j;
2679         dns_rbtnode_t *node, *level_node, *wnode;
2680         rdatasetheader_t *header;
2681         isc_result_t result = ISC_R_NOTFOUND;
2682         dns_name_t name;
2683         dns_name_t *wname;
2684         dns_fixedname_t fwname;
2685         dns_rbtdb_t *rbtdb;
2686         isc_boolean_t done, wild, active;
2687         dns_rbtnodechain_t wchain;
2688
2689         /*
2690          * Caller must be holding the tree lock and MUST NOT be holding
2691          * any node locks.
2692          */
2693
2694         /*
2695          * Examine each ancestor level.  If the level's wild bit
2696          * is set, then construct the corresponding wildcard name and
2697          * search for it.  If the wildcard node exists, and is active in
2698          * this version, we're done.  If not, then we next check to see
2699          * if the ancestor is active in this version.  If so, then there
2700          * can be no possible wildcard match and again we're done.  If not,
2701          * continue the search.
2702          */
2703
2704         rbtdb = search->rbtdb;
2705         i = search->chain.level_matches;
2706         done = ISC_FALSE;
2707         node = *nodep;
2708         do {
2709                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2710                           isc_rwlocktype_read);
2711
2712                 /*
2713                  * First we try to figure out if this node is active in
2714                  * the search's version.  We do this now, even though we
2715                  * may not need the information, because it simplifies the
2716                  * locking and code flow.
2717                  */
2718                 for (header = node->data;
2719                      header != NULL;
2720                      header = header->next) {
2721                         if (header->serial <= search->serial &&
2722                             !IGNORE(header) && EXISTS(header))
2723                                 break;
2724                 }
2725                 if (header != NULL)
2726                         active = ISC_TRUE;
2727                 else
2728                         active = ISC_FALSE;
2729
2730                 if (node->wild)
2731                         wild = ISC_TRUE;
2732                 else
2733                         wild = ISC_FALSE;
2734
2735                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2736                             isc_rwlocktype_read);
2737
2738                 if (wild) {
2739                         /*
2740                          * Construct the wildcard name for this level.
2741                          */
2742                         dns_name_init(&name, NULL);
2743                         dns_rbt_namefromnode(node, &name);
2744                         dns_fixedname_init(&fwname);
2745                         wname = dns_fixedname_name(&fwname);
2746                         result = dns_name_concatenate(dns_wildcardname, &name,
2747                                                       wname, NULL);
2748                         j = i;
2749                         while (result == ISC_R_SUCCESS && j != 0) {
2750                                 j--;
2751                                 level_node = search->chain.levels[j];
2752                                 dns_name_init(&name, NULL);
2753                                 dns_rbt_namefromnode(level_node, &name);
2754                                 result = dns_name_concatenate(wname,
2755                                                               &name,
2756                                                               wname,
2757                                                               NULL);
2758                         }
2759                         if (result != ISC_R_SUCCESS)
2760                                 break;
2761
2762                         wnode = NULL;
2763                         dns_rbtnodechain_init(&wchain, NULL);
2764                         result = dns_rbt_findnode(rbtdb->tree, wname,
2765                                                   NULL, &wnode, &wchain,
2766                                                   DNS_RBTFIND_EMPTYDATA,
2767                                                   NULL, NULL);
2768                         if (result == ISC_R_SUCCESS) {
2769                                 nodelock_t *lock;
2770
2771                                 /*
2772                                  * We have found the wildcard node.  If it
2773                                  * is active in the search's version, we're
2774                                  * done.
2775                                  */
2776                                 lock = &rbtdb->node_locks[wnode->locknum].lock;
2777                                 NODE_LOCK(lock, isc_rwlocktype_read);
2778                                 for (header = wnode->data;
2779                                      header != NULL;
2780                                      header = header->next) {
2781                                         if (header->serial <= search->serial &&
2782                                             !IGNORE(header) && EXISTS(header))
2783                                                 break;
2784                                 }
2785                                 NODE_UNLOCK(lock, isc_rwlocktype_read);
2786                                 if (header != NULL ||
2787                                     activeempty(search, &wchain, wname)) {
2788                                         if (activeemtpynode(search, qname,
2789                                                             wname)) {
2790                                                 return (ISC_R_NOTFOUND);
2791                                         }
2792                                         /*
2793                                          * The wildcard node is active!
2794                                          *
2795                                          * Note: result is still ISC_R_SUCCESS
2796                                          * so we don't have to set it.
2797                                          */
2798                                         *nodep = wnode;
2799                                         break;
2800                                 }
2801                         } else if (result != ISC_R_NOTFOUND &&
2802                                    result != DNS_R_PARTIALMATCH) {
2803                                 /*
2804                                  * An error has occurred.  Bail out.
2805                                  */
2806                                 break;
2807                         }
2808                 }
2809
2810                 if (active) {
2811                         /*
2812                          * The level node is active.  Any wildcarding
2813                          * present at higher levels has no
2814                          * effect and we're done.
2815                          */
2816                         result = ISC_R_NOTFOUND;
2817                         break;
2818                 }
2819
2820                 if (i > 0) {
2821                         i--;
2822                         node = search->chain.levels[i];
2823                 } else
2824                         done = ISC_TRUE;
2825         } while (!done);
2826
2827         return (result);
2828 }
2829
2830 static inline isc_result_t
2831 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
2832                   dns_name_t *foundname, dns_rdataset_t *rdataset,
2833                   dns_rdataset_t *sigrdataset, isc_boolean_t need_sig)
2834 {
2835         dns_rbtnode_t *node;
2836         rdatasetheader_t *header, *header_next, *found, *foundsig;
2837         isc_boolean_t empty_node;
2838         isc_result_t result;
2839         dns_fixedname_t fname, forigin;
2840         dns_name_t *name, *origin;
2841
2842         do {
2843                 node = NULL;
2844                 dns_fixedname_init(&fname);
2845                 name = dns_fixedname_name(&fname);
2846                 dns_fixedname_init(&forigin);
2847                 origin = dns_fixedname_name(&forigin);
2848                 result = dns_rbtnodechain_current(&search->chain, name,
2849                                                   origin, &node);
2850                 if (result != ISC_R_SUCCESS)
2851                         return (result);
2852                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2853                           isc_rwlocktype_read);
2854                 found = NULL;
2855                 foundsig = NULL;
2856                 empty_node = ISC_TRUE;
2857                 for (header = node->data;
2858                      header != NULL;
2859                      header = header_next) {
2860                         header_next = header->next;
2861                         /*
2862                          * Look for an active, extant NSEC or RRSIG NSEC.
2863                          */
2864                         do {
2865                                 if (header->serial <= search->serial &&
2866                                     !IGNORE(header)) {
2867                                         /*
2868                                          * Is this a "this rdataset doesn't
2869                                          * exist" record?
2870                                          */
2871                                         if (NONEXISTENT(header))
2872                                                 header = NULL;
2873                                         break;
2874                                 } else
2875                                         header = header->down;
2876                         } while (header != NULL);
2877                         if (header != NULL) {
2878                                 /*
2879                                  * We now know that there is at least one
2880                                  * active rdataset at this node.
2881                                  */
2882                                 empty_node = ISC_FALSE;
2883                                 if (header->type == dns_rdatatype_nsec) {
2884                                         found = header;
2885                                         if (foundsig != NULL)
2886                                                 break;
2887                                 } else if (header->type ==
2888                                            RBTDB_RDATATYPE_SIGNSEC) {
2889                                         foundsig = header;
2890                                         if (found != NULL)
2891                                                 break;
2892                                 }
2893                         }
2894                 }
2895                 if (!empty_node) {
2896                         if (found != NULL &&
2897                             (foundsig != NULL || !need_sig))
2898                         {
2899                                 /*
2900                                  * We've found the right NSEC record.
2901                                  *
2902                                  * Note: for this to really be the right
2903                                  * NSEC record, it's essential that the NSEC
2904                                  * records of any nodes obscured by a zone
2905                                  * cut have been removed; we assume this is
2906                                  * the case.
2907                                  */
2908                                 result = dns_name_concatenate(name, origin,
2909                                                               foundname, NULL);
2910                                 if (result == ISC_R_SUCCESS) {
2911                                         if (nodep != NULL) {
2912                                                 new_reference(search->rbtdb,
2913                                                               node);
2914                                                 *nodep = node;
2915                                         }
2916                                         bind_rdataset(search->rbtdb, node,
2917                                                       found, search->now,
2918                                                       rdataset);
2919                                         if (foundsig != NULL)
2920                                                 bind_rdataset(search->rbtdb,
2921                                                               node,
2922                                                               foundsig,
2923                                                               search->now,
2924                                                               sigrdataset);
2925                                 }
2926                         } else if (found == NULL && foundsig == NULL) {
2927                                 /*
2928                                  * This node is active, but has no NSEC or
2929                                  * RRSIG NSEC.  That means it's glue or
2930                                  * other obscured zone data that isn't
2931                                  * relevant for our search.  Treat the
2932                                  * node as if it were empty and keep looking.
2933                                  */
2934                                 empty_node = ISC_TRUE;
2935                                 result = dns_rbtnodechain_prev(&search->chain,
2936                                                                NULL, NULL);
2937                         } else {
2938                                 /*
2939                                  * We found an active node, but either the
2940                                  * NSEC or the RRSIG NSEC is missing.  This
2941                                  * shouldn't happen.
2942                                  */
2943                                 result = DNS_R_BADDB;
2944                         }
2945                 } else {
2946                         /*
2947                          * This node isn't active.  We've got to keep
2948                          * looking.
2949                          */
2950                         result = dns_rbtnodechain_prev(&search->chain, NULL,
2951                                                        NULL);
2952                 }
2953                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2954                             isc_rwlocktype_read);
2955         } while (empty_node && result == ISC_R_SUCCESS);
2956
2957         /*
2958          * If the result is ISC_R_NOMORE, then we got to the beginning of
2959          * the database and didn't find a NSEC record.  This shouldn't
2960          * happen.
2961          */
2962         if (result == ISC_R_NOMORE)
2963                 result = DNS_R_BADDB;
2964
2965         return (result);
2966 }
2967
2968 static isc_result_t
2969 zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
2970           dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
2971           dns_dbnode_t **nodep, dns_name_t *foundname,
2972           dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
2973 {
2974         dns_rbtnode_t *node = NULL;
2975         isc_result_t result;
2976         rbtdb_search_t search;
2977         isc_boolean_t cname_ok = ISC_TRUE;
2978         isc_boolean_t close_version = ISC_FALSE;
2979         isc_boolean_t maybe_zonecut = ISC_FALSE;
2980         isc_boolean_t at_zonecut = ISC_FALSE;
2981         isc_boolean_t wild;
2982         isc_boolean_t empty_node;
2983         rdatasetheader_t *header, *header_next, *found, *nsecheader;
2984         rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
2985         rbtdb_rdatatype_t sigtype;
2986         isc_boolean_t active;
2987         dns_rbtnodechain_t chain;
2988         nodelock_t *lock;
2989
2990
2991         search.rbtdb = (dns_rbtdb_t *)db;
2992
2993         REQUIRE(VALID_RBTDB(search.rbtdb));
2994
2995         /*
2996          * We don't care about 'now'.
2997          */
2998         UNUSED(now);
2999
3000         /*
3001          * If the caller didn't supply a version, attach to the current
3002          * version.
3003          */
3004         if (version == NULL) {
3005                 currentversion(db, &version);
3006                 close_version = ISC_TRUE;
3007         }
3008
3009         search.rbtversion = version;
3010         search.serial = search.rbtversion->serial;
3011         search.options = options;
3012         search.copy_name = ISC_FALSE;
3013         search.need_cleanup = ISC_FALSE;
3014         search.wild = ISC_FALSE;
3015         search.zonecut = NULL;
3016         dns_fixedname_init(&search.zonecut_name);
3017         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3018         search.now = 0;
3019
3020         /*
3021          * 'wild' will be true iff. we've matched a wildcard.
3022          */
3023         wild = ISC_FALSE;
3024
3025         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3026
3027         /*
3028          * Search down from the root of the tree.  If, while going down, we
3029          * encounter a callback node, zone_zonecut_callback() will search the
3030          * rdatasets at the zone cut for active DNAME or NS rdatasets.
3031          */
3032         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3033                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3034                                   zone_zonecut_callback, &search);
3035
3036         if (result == DNS_R_PARTIALMATCH) {
3037         partial_match:
3038                 if (search.zonecut != NULL) {
3039                     result = setup_delegation(&search, nodep, foundname,
3040                                               rdataset, sigrdataset);
3041                     goto tree_exit;
3042                 }
3043
3044                 if (search.wild) {
3045                         /*
3046                          * At least one of the levels in the search chain
3047                          * potentially has a wildcard.  For each such level,
3048                          * we must see if there's a matching wildcard active
3049                          * in the current version.
3050                          */
3051                         result = find_wildcard(&search, &node, name);
3052                         if (result == ISC_R_SUCCESS) {
3053                                 result = dns_name_copy(name, foundname, NULL);
3054                                 if (result != ISC_R_SUCCESS)
3055                                         goto tree_exit;
3056                                 wild = ISC_TRUE;
3057                                 goto found;
3058                         }
3059                         else if (result != ISC_R_NOTFOUND)
3060                                 goto tree_exit;
3061                 }
3062
3063                 chain = search.chain;
3064                 active = activeempty(&search, &chain, name);
3065
3066                 /*
3067                  * If we're here, then the name does not exist, is not
3068                  * beneath a zonecut, and there's no matching wildcard.
3069                  */
3070                 if (search.rbtdb->secure ||
3071                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3072                 {
3073                         result = find_closest_nsec(&search, nodep, foundname,
3074                                                    rdataset, sigrdataset,
3075                                                    search.rbtdb->secure);
3076                         if (result == ISC_R_SUCCESS)
3077                                 result = active ? DNS_R_EMPTYNAME :
3078                                                   DNS_R_NXDOMAIN;
3079                 } else
3080                         result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
3081                 goto tree_exit;
3082         } else if (result != ISC_R_SUCCESS)
3083                 goto tree_exit;
3084
3085  found:
3086         /*
3087          * We have found a node whose name is the desired name, or we
3088          * have matched a wildcard.
3089          */
3090
3091         if (search.zonecut != NULL) {
3092                 /*
3093                  * If we're beneath a zone cut, we don't want to look for
3094                  * CNAMEs because they're not legitimate zone glue.
3095                  */
3096                 cname_ok = ISC_FALSE;
3097         } else {
3098                 /*
3099                  * The node may be a zone cut itself.  If it might be one,
3100                  * make sure we check for it later.
3101                  */
3102                 if (node->find_callback &&
3103                     (node != search.rbtdb->origin_node ||
3104                      IS_STUB(search.rbtdb)) &&
3105                     !dns_rdatatype_atparent(type))
3106                         maybe_zonecut = ISC_TRUE;
3107         }
3108
3109         /*
3110          * Certain DNSSEC types are not subject to CNAME matching
3111          * (RFC4035, section 2.5 and RFC3007).
3112          *
3113          * We don't check for RRSIG, because we don't store RRSIG records
3114          * directly.
3115          */
3116         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3117                 cname_ok = ISC_FALSE;
3118
3119         /*
3120          * We now go looking for rdata...
3121          */
3122
3123         NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3124                   isc_rwlocktype_read);
3125
3126         found = NULL;
3127         foundsig = NULL;
3128         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3129         nsecheader = NULL;
3130         nsecsig = NULL;
3131         cnamesig = NULL;
3132         empty_node = ISC_TRUE;
3133         for (header = node->data; header != NULL; header = header_next) {
3134                 header_next = header->next;
3135                 /*
3136                  * Look for an active, extant rdataset.
3137                  */
3138                 do {
3139                         if (header->serial <= search.serial &&
3140                             !IGNORE(header)) {
3141                                 /*
3142                                  * Is this a "this rdataset doesn't
3143                                  * exist" record?
3144                                  */
3145                                 if (NONEXISTENT(header))
3146                                         header = NULL;
3147                                 break;
3148                         } else
3149                                 header = header->down;
3150                 } while (header != NULL);
3151                 if (header != NULL) {
3152                         /*
3153                          * We now know that there is at least one active
3154                          * rdataset at this node.
3155                          */
3156                         empty_node = ISC_FALSE;
3157
3158                         /*
3159                          * Do special zone cut handling, if requested.
3160                          */
3161                         if (maybe_zonecut &&
3162                             header->type == dns_rdatatype_ns) {
3163                                 /*
3164                                  * We increment the reference count on node to
3165                                  * ensure that search->zonecut_rdataset will
3166                                  * still be valid later.
3167                                  */
3168                                 new_reference(search.rbtdb, node);
3169                                 search.zonecut = node;
3170                                 search.zonecut_rdataset = header;
3171                                 search.zonecut_sigrdataset = NULL;
3172                                 search.need_cleanup = ISC_TRUE;
3173                                 maybe_zonecut = ISC_FALSE;
3174                                 at_zonecut = ISC_TRUE;
3175                                 /*
3176                                  * It is not clear if KEY should still be
3177                                  * allowed at the parent side of the zone
3178                                  * cut or not.  It is needed for RFC3007
3179                                  * validated updates.
3180                                  */
3181                                 if ((search.options & DNS_DBFIND_GLUEOK) == 0
3182                                     && type != dns_rdatatype_nsec
3183                                     && type != dns_rdatatype_key) {
3184                                         /*
3185                                          * Glue is not OK, but any answer we
3186                                          * could return would be glue.  Return
3187                                          * the delegation.
3188                                          */
3189                                         found = NULL;
3190                                         break;
3191                                 }
3192                                 if (found != NULL && foundsig != NULL)
3193                                         break;
3194                         }
3195
3196                         /*
3197                          * If we found a type we were looking for,
3198                          * remember it.
3199                          */
3200                         if (header->type == type ||
3201                             type == dns_rdatatype_any ||
3202                             (header->type == dns_rdatatype_cname &&
3203                              cname_ok)) {
3204                                 /*
3205                                  * We've found the answer!
3206                                  */
3207                                 found = header;
3208                                 if (header->type == dns_rdatatype_cname &&
3209                                     cname_ok) {
3210                                         /*
3211                                          * We may be finding a CNAME instead
3212                                          * of the desired type.
3213                                          *
3214                                          * If we've already got the CNAME RRSIG,
3215                                          * use it, otherwise change sigtype
3216                                          * so that we find it.
3217                                          */
3218                                         if (cnamesig != NULL)
3219                                                 foundsig = cnamesig;
3220                                         else
3221                                                 sigtype =
3222                                                     RBTDB_RDATATYPE_SIGCNAME;
3223                                 }
3224                                 /*
3225                                  * If we've got all we need, end the search.
3226                                  */
3227                                 if (!maybe_zonecut && foundsig != NULL)
3228                                         break;
3229                         } else if (header->type == sigtype) {
3230                                 /*
3231                                  * We've found the RRSIG rdataset for our
3232                                  * target type.  Remember it.
3233                                  */
3234                                 foundsig = header;
3235                                 /*
3236                                  * If we've got all we need, end the search.
3237                                  */
3238                                 if (!maybe_zonecut && found != NULL)
3239                                         break;
3240                         } else if (header->type == dns_rdatatype_nsec) {
3241                                 /*
3242                                  * Remember a NSEC rdataset even if we're
3243                                  * not specifically looking for it, because
3244                                  * we might need it later.
3245                                  */
3246                                 nsecheader = header;
3247                         } else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
3248                                 /*
3249                                  * If we need the NSEC rdataset, we'll also
3250                                  * need its signature.
3251                                  */
3252                                 nsecsig = header;
3253                         } else if (cname_ok &&
3254                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
3255                                 /*
3256                                  * If we get a CNAME match, we'll also need
3257                                  * its signature.
3258                                  */
3259                                 cnamesig = header;
3260                         }
3261                 }
3262         }
3263
3264         if (empty_node) {
3265                 /*
3266                  * We have an exact match for the name, but there are no
3267                  * active rdatasets in the desired version.  That means that
3268                  * this node doesn't exist in the desired version, and that
3269                  * we really have a partial match.
3270                  */
3271                 if (!wild) {
3272                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3273                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3274                         goto partial_match;
3275                 }
3276         }
3277
3278         /*
3279          * If we didn't find what we were looking for...
3280          */
3281         if (found == NULL) {
3282                 if (search.zonecut != NULL) {
3283                         /*
3284                          * We were trying to find glue at a node beneath a
3285                          * zone cut, but didn't.
3286                          *
3287                          * Return the delegation.
3288                          */
3289                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3290                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3291                         result = setup_delegation(&search, nodep, foundname,
3292                                                   rdataset, sigrdataset);
3293                         goto tree_exit;
3294                 }
3295                 /*
3296                  * The desired type doesn't exist.
3297                  */
3298                 result = DNS_R_NXRRSET;
3299                 if (search.rbtdb->secure &&
3300                     (nsecheader == NULL || nsecsig == NULL)) {
3301                         /*
3302                          * The zone is secure but there's no NSEC,
3303                          * or the NSEC has no signature!
3304                          */
3305                         if (!wild) {
3306                                 result = DNS_R_BADDB;
3307                                 goto node_exit;
3308                         }
3309
3310                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3311                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3312                         result = find_closest_nsec(&search, nodep, foundname,
3313                                                    rdataset, sigrdataset,
3314                                                    search.rbtdb->secure);
3315                         if (result == ISC_R_SUCCESS)
3316                                 result = DNS_R_EMPTYWILD;
3317                         goto tree_exit;
3318                 }
3319                 if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
3320                     nsecheader == NULL)
3321                 {
3322                         /*
3323                          * There's no NSEC record, and we were told
3324                          * to find one.
3325                          */
3326                         result = DNS_R_BADDB;
3327                         goto node_exit;
3328                 }
3329                 if (nodep != NULL) {
3330                         new_reference(search.rbtdb, node);
3331                         *nodep = node;
3332                 }
3333                 if (search.rbtdb->secure ||
3334                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3335                 {
3336                         bind_rdataset(search.rbtdb, node, nsecheader,
3337                                       0, rdataset);
3338                         if (nsecsig != NULL)
3339                                 bind_rdataset(search.rbtdb, node,
3340                                               nsecsig, 0, sigrdataset);
3341                 }
3342                 if (wild)
3343                         foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3344                 goto node_exit;
3345         }
3346
3347         /*
3348          * We found what we were looking for, or we found a CNAME.
3349          */
3350
3351         if (type != found->type &&
3352             type != dns_rdatatype_any &&
3353             found->type == dns_rdatatype_cname) {
3354                 /*
3355                  * We weren't doing an ANY query and we found a CNAME instead
3356                  * of the type we were looking for, so we need to indicate
3357                  * that result to the caller.
3358                  */
3359                 result = DNS_R_CNAME;
3360         } else if (search.zonecut != NULL) {
3361                 /*
3362                  * If we're beneath a zone cut, we must indicate that the
3363                  * result is glue, unless we're actually at the zone cut
3364                  * and the type is NSEC or KEY.
3365                  */
3366                 if (search.zonecut == node) {
3367                         /*
3368                          * It is not clear if KEY should still be
3369                          * allowed at the parent side of the zone
3370                          * cut or not.  It is needed for RFC3007
3371                          * validated updates.
3372                          */
3373                         if (type == dns_rdatatype_nsec ||
3374                             type == dns_rdatatype_key)
3375                                 result = ISC_R_SUCCESS;
3376                         else if (type == dns_rdatatype_any)
3377                                 result = DNS_R_ZONECUT;
3378                         else
3379                                 result = DNS_R_GLUE;
3380                 } else
3381                         result = DNS_R_GLUE;
3382                 /*
3383                  * We might have found data that isn't glue, but was occluded
3384                  * by a dynamic update.  If the caller cares about this, they
3385                  * will have told us to validate glue.
3386                  *
3387                  * XXX We should cache the glue validity state!
3388                  */
3389                 if (result == DNS_R_GLUE &&
3390                     (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
3391                     !valid_glue(&search, foundname, type, node)) {
3392                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3393                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3394                         result = setup_delegation(&search, nodep, foundname,
3395                                                   rdataset, sigrdataset);
3396                     goto tree_exit;
3397                 }
3398         } else {
3399                 /*
3400                  * An ordinary successful query!
3401                  */
3402                 result = ISC_R_SUCCESS;
3403         }
3404
3405         if (nodep != NULL) {
3406                 if (!at_zonecut)
3407                         new_reference(search.rbtdb, node);
3408                 else
3409                         search.need_cleanup = ISC_FALSE;
3410                 *nodep = node;
3411         }
3412
3413         if (type != dns_rdatatype_any) {
3414                 bind_rdataset(search.rbtdb, node, found, 0, rdataset);
3415                 if (foundsig != NULL)
3416                         bind_rdataset(search.rbtdb, node, foundsig, 0,
3417                                       sigrdataset);
3418         }
3419
3420         if (wild)
3421                 foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3422
3423  node_exit:
3424         NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3425                     isc_rwlocktype_read);
3426
3427  tree_exit:
3428         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3429
3430         /*
3431          * If we found a zonecut but aren't going to use it, we have to
3432          * let go of it.
3433          */
3434         if (search.need_cleanup) {
3435                 node = search.zonecut;
3436                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
3437
3438                 NODE_LOCK(lock, isc_rwlocktype_read);
3439                 decrement_reference(search.rbtdb, node, 0,
3440                                     isc_rwlocktype_read, isc_rwlocktype_none,
3441                                     ISC_FALSE);
3442                 NODE_UNLOCK(lock, isc_rwlocktype_read);
3443         }
3444
3445         if (close_version)
3446                 closeversion(db, &version, ISC_FALSE);
3447
3448         dns_rbtnodechain_reset(&search.chain);
3449
3450         return (result);
3451 }
3452
3453 static isc_result_t
3454 zone_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
3455                  isc_stdtime_t now, dns_dbnode_t **nodep,
3456                  dns_name_t *foundname,
3457                  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3458 {
3459         UNUSED(db);
3460         UNUSED(name);
3461         UNUSED(options);
3462         UNUSED(now);
3463         UNUSED(nodep);
3464         UNUSED(foundname);
3465         UNUSED(rdataset);
3466         UNUSED(sigrdataset);
3467
3468         FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
3469
3470         return (ISC_R_NOTIMPLEMENTED);
3471 }
3472
3473 static isc_result_t
3474 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
3475         rbtdb_search_t *search = arg;
3476         rdatasetheader_t *header, *header_prev, *header_next;
3477         rdatasetheader_t *dname_header, *sigdname_header;
3478         isc_result_t result;
3479         nodelock_t *lock;
3480         isc_rwlocktype_t locktype;
3481
3482         /* XXX comment */
3483
3484         REQUIRE(search->zonecut == NULL);
3485
3486         /*
3487          * Keep compiler silent.
3488          */
3489         UNUSED(name);
3490
3491         lock = &(search->rbtdb->node_locks[node->locknum].lock);
3492         locktype = isc_rwlocktype_read;
3493         NODE_LOCK(lock, locktype);
3494
3495         /*
3496          * Look for a DNAME or RRSIG DNAME rdataset.
3497          */
3498         dname_header = NULL;
3499         sigdname_header = NULL;
3500         header_prev = NULL;
3501         for (header = node->data; header != NULL; header = header_next) {
3502                 header_next = header->next;
3503                 if (header->rdh_ttl <= search->now) {
3504                         /*
3505                          * This rdataset is stale.  If no one else is
3506                          * using the node, we can clean it up right
3507                          * now, otherwise we mark it as stale, and
3508                          * the node as dirty, so it will get cleaned
3509                          * up later.
3510                          */
3511                         if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) &&
3512                             (locktype == isc_rwlocktype_write ||
3513                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3514                                 /*
3515                                  * We update the node's status only when we
3516                                  * can get write access; otherwise, we leave
3517                                  * others to this work.  Periodical cleaning
3518                                  * will eventually take the job as the last
3519                                  * resort.
3520                                  * We won't downgrade the lock, since other
3521                                  * rdatasets are probably stale, too.
3522                                  */
3523                                 locktype = isc_rwlocktype_write;
3524
3525                                 if (dns_rbtnode_refcurrent(node) == 0) {
3526                                         isc_mem_t *mctx;
3527
3528                                         /*
3529                                          * header->down can be non-NULL if the
3530                                          * refcount has just decremented to 0
3531                                          * but decrement_reference() has not
3532                                          * performed clean_cache_node(), in
3533                                          * which case we need to purge the
3534                                          * stale headers first.
3535                                          */
3536                                         mctx = search->rbtdb->common.mctx;
3537                                         clean_stale_headers(search->rbtdb,
3538                                                             mctx,
3539                                                             header);
3540                                         if (header_prev != NULL)
3541                                                 header_prev->next =
3542                                                         header->next;
3543                                         else
3544                                                 node->data = header->next;
3545                                         free_rdataset(search->rbtdb, mctx,
3546                                                       header);
3547                                 } else {
3548                                         header->attributes |=
3549                                                 RDATASET_ATTR_STALE;
3550                                         node->dirty = 1;
3551                                         header_prev = header;
3552                                 }
3553                         } else
3554                                 header_prev = header;
3555                 } else if (header->type == dns_rdatatype_dname &&
3556                            EXISTS(header)) {
3557                         dname_header = header;
3558                         header_prev = header;
3559                 } else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
3560                          EXISTS(header)) {
3561                         sigdname_header = header;
3562                         header_prev = header;
3563                 } else
3564                         header_prev = header;
3565         }
3566
3567         if (dname_header != NULL &&
3568             (!DNS_TRUST_PENDING(dname_header->trust) ||
3569              (search->options & DNS_DBFIND_PENDINGOK) != 0)) {
3570                 /*
3571                  * We increment the reference count on node to ensure that
3572                  * search->zonecut_rdataset will still be valid later.
3573                  */
3574                 new_reference(search->rbtdb, node);
3575                 INSIST(!ISC_LINK_LINKED(node, deadlink));
3576                 search->zonecut = node;
3577                 search->zonecut_rdataset = dname_header;
3578                 search->zonecut_sigrdataset = sigdname_header;
3579                 search->need_cleanup = ISC_TRUE;
3580                 result = DNS_R_PARTIALMATCH;
3581         } else
3582                 result = DNS_R_CONTINUE;
3583
3584         NODE_UNLOCK(lock, locktype);
3585
3586         return (result);
3587 }
3588
3589 static inline isc_result_t
3590 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
3591                      dns_dbnode_t **nodep, dns_name_t *foundname,
3592                      dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3593 {
3594         unsigned int i;
3595         dns_rbtnode_t *level_node;
3596         rdatasetheader_t *header, *header_prev, *header_next;
3597         rdatasetheader_t *found, *foundsig;
3598         isc_result_t result = ISC_R_NOTFOUND;
3599         dns_name_t name;
3600         dns_rbtdb_t *rbtdb;
3601         isc_boolean_t done;
3602         nodelock_t *lock;
3603         isc_rwlocktype_t locktype;
3604
3605         /*
3606          * Caller must be holding the tree lock.
3607          */
3608
3609         rbtdb = search->rbtdb;
3610         i = search->chain.level_matches;
3611         done = ISC_FALSE;
3612         do {
3613                 locktype = isc_rwlocktype_read;
3614                 lock = &rbtdb->node_locks[node->locknum].lock;
3615                 NODE_LOCK(lock, locktype);
3616
3617                 /*
3618                  * Look for NS and RRSIG NS rdatasets.
3619                  */
3620                 found = NULL;
3621                 foundsig = NULL;
3622                 header_prev = NULL;
3623                 for (header = node->data;
3624                      header != NULL;
3625                      header = header_next) {
3626                         header_next = header->next;
3627                         if (header->rdh_ttl <= search->now) {
3628                                 /*
3629                                  * This rdataset is stale.  If no one else is
3630                                  * using the node, we can clean it up right
3631                                  * now, otherwise we mark it as stale, and
3632                                  * the node as dirty, so it will get cleaned
3633                                  * up later.
3634                                  */
3635                                 if ((header->rdh_ttl <= search->now -
3636                                                     RBTDB_VIRTUAL) &&
3637                                     (locktype == isc_rwlocktype_write ||
3638                                      NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3639                                         /*
3640                                          * We update the node's status only
3641                                          * when we can get write access.
3642                                          */
3643                                         locktype = isc_rwlocktype_write;
3644
3645                                         if (dns_rbtnode_refcurrent(node)
3646                                             == 0) {
3647                                                 isc_mem_t *m;
3648
3649                                                 m = search->rbtdb->common.mctx;
3650                                                 clean_stale_headers(
3651                                                         search->rbtdb,
3652                                                         m, header);
3653                                                 if (header_prev != NULL)
3654                                                         header_prev->next =
3655                                                                 header->next;
3656                                                 else
3657                                                         node->data =
3658                                                                 header->next;
3659                                                 free_rdataset(rbtdb, m,
3660                                                               header);
3661                                         } else {
3662                                                 header->attributes |=
3663                                                         RDATASET_ATTR_STALE;
3664                                                 node->dirty = 1;
3665                                                 header_prev = header;
3666                                         }
3667                                 } else
3668                                         header_prev = header;
3669                         } else if (EXISTS(header)) {
3670                                 /*
3671                                  * We've found an extant rdataset.  See if
3672                                  * we're interested in it.
3673                                  */
3674                                 if (header->type == dns_rdatatype_ns) {
3675                                         found = header;
3676                                         if (foundsig != NULL)
3677                                                 break;
3678                                 } else if (header->type ==
3679                                            RBTDB_RDATATYPE_SIGNS) {
3680                                         foundsig = header;
3681                                         if (found != NULL)
3682                                                 break;
3683                                 }
3684                                 header_prev = header;
3685                         } else
3686                                 header_prev = header;
3687                 }
3688
3689                 if (found != NULL) {
3690                         /*
3691                          * If we have to set foundname, we do it before
3692                          * anything else.  If we were to set foundname after
3693                          * we had set nodep or bound the rdataset, then we'd
3694                          * have to undo that work if dns_name_concatenate()
3695                          * failed.  By setting foundname first, there's
3696                          * nothing to undo if we have trouble.
3697                          */
3698                         if (foundname != NULL) {
3699                                 dns_name_init(&name, NULL);
3700                                 dns_rbt_namefromnode(node, &name);
3701                                 result = dns_name_copy(&name, foundname, NULL);
3702                                 while (result == ISC_R_SUCCESS && i > 0) {
3703                                         i--;
3704                                         level_node = search->chain.levels[i];
3705                                         dns_name_init(&name, NULL);
3706                                         dns_rbt_namefromnode(level_node,
3707                                                              &name);
3708                                         result =
3709                                                 dns_name_concatenate(foundname,
3710                                                                      &name,
3711                                                                      foundname,
3712                                                                      NULL);
3713                                 }
3714                                 if (result != ISC_R_SUCCESS) {
3715                                         *nodep = NULL;
3716                                         goto node_exit;
3717                                 }
3718                         }
3719                         result = DNS_R_DELEGATION;
3720                         if (nodep != NULL) {
3721                                 new_reference(search->rbtdb, node);
3722                                 *nodep = node;
3723                         }
3724                         bind_rdataset(search->rbtdb, node, found, search->now,
3725                                       rdataset);
3726                         if (foundsig != NULL)
3727                                 bind_rdataset(search->rbtdb, node, foundsig,
3728                                               search->now, sigrdataset);
3729                         if (need_headerupdate(found, search->now) ||
3730                             (foundsig != NULL &&
3731                              need_headerupdate(foundsig, search->now))) {
3732                                 if (locktype != isc_rwlocktype_write) {
3733                                         NODE_UNLOCK(lock, locktype);
3734                                         NODE_LOCK(lock, isc_rwlocktype_write);
3735                                         locktype = isc_rwlocktype_write;
3736                                 }
3737                                 if (need_headerupdate(found, search->now))
3738                                         update_header(search->rbtdb, found,
3739                                                       search->now);
3740                                 if (foundsig != NULL &&
3741                                     need_headerupdate(foundsig, search->now)) {
3742                                         update_header(search->rbtdb, foundsig,
3743                                                       search->now);
3744                                 }
3745                         }
3746                 }
3747
3748         node_exit:
3749                 NODE_UNLOCK(lock, locktype);
3750
3751                 if (found == NULL && i > 0) {
3752                         i--;
3753                         node = search->chain.levels[i];
3754                 } else
3755                         done = ISC_TRUE;
3756
3757         } while (!done);
3758
3759         return (result);
3760 }
3761
3762 static isc_result_t
3763 find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3764                   isc_stdtime_t now, dns_name_t *foundname,
3765                   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3766 {
3767         dns_rbtnode_t *node;
3768         rdatasetheader_t *header, *header_next, *header_prev;
3769         rdatasetheader_t *found, *foundsig;
3770         isc_boolean_t empty_node;
3771         isc_result_t result;
3772         dns_fixedname_t fname, forigin;
3773         dns_name_t *name, *origin;
3774         rbtdb_rdatatype_t matchtype, sigmatchtype;
3775         nodelock_t *lock;
3776         isc_rwlocktype_t locktype;
3777
3778         matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
3779         sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
3780                                              dns_rdatatype_nsec);
3781
3782         do {
3783                 node = NULL;
3784                 dns_fixedname_init(&fname);
3785                 name = dns_fixedname_name(&fname);
3786                 dns_fixedname_init(&forigin);
3787                 origin = dns_fixedname_name(&forigin);
3788                 result = dns_rbtnodechain_current(&search->chain, name,
3789                                                   origin, &node);
3790                 if (result != ISC_R_SUCCESS)
3791                         return (result);
3792                 locktype = isc_rwlocktype_read;
3793                 lock = &(search->rbtdb->node_locks[node->locknum].lock);
3794                 NODE_LOCK(lock, locktype);
3795                 found = NULL;
3796                 foundsig = NULL;
3797                 empty_node = ISC_TRUE;
3798                 header_prev = NULL;
3799                 for (header = node->data;
3800                      header != NULL;
3801                      header = header_next) {
3802                         header_next = header->next;
3803                         if (header->rdh_ttl <= now) {
3804                                 /*
3805                                  * This rdataset is stale.  If no one else is
3806                                  * using the node, we can clean it up right
3807                                  * now, otherwise we mark it as stale, and the
3808                                  * node as dirty, so it will get cleaned up
3809                                  * later.
3810                                  */
3811                                 if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3812                                     (locktype == isc_rwlocktype_write ||
3813                                      NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3814                                         /*
3815                                          * We update the node's status only
3816                                          * when we can get write access.
3817                                          */
3818                                         locktype = isc_rwlocktype_write;
3819
3820                                         if (dns_rbtnode_refcurrent(node)
3821                                             == 0) {
3822                                                 isc_mem_t *m;
3823
3824                                                 m = search->rbtdb->common.mctx;
3825                                                 clean_stale_headers(
3826                                                         search->rbtdb,
3827                                                         m, header);
3828                                                 if (header_prev != NULL)
3829                                                         header_prev->next =
3830                                                                 header->next;
3831                                                 else
3832                                                         node->data = header->next;
3833                                                 free_rdataset(search->rbtdb, m,
3834                                                               header);
3835                                         } else {
3836                                                 header->attributes |=
3837                                                         RDATASET_ATTR_STALE;
3838                                                 node->dirty = 1;
3839                                                 header_prev = header;
3840                                         }
3841                                 } else
3842                                         header_prev = header;
3843                                 continue;
3844                         }
3845                         if (NONEXISTENT(header) ||
3846                             RBTDB_RDATATYPE_BASE(header->type) == 0) {
3847                                 header_prev = header;
3848                                 continue;
3849                         }
3850                         empty_node = ISC_FALSE;
3851                         if (header->type == matchtype)
3852                                 found = header;
3853                         else if (header->type == sigmatchtype)
3854                                 foundsig = header;
3855                         header_prev = header;
3856                 }
3857                 if (found != NULL) {
3858                         result = dns_name_concatenate(name, origin,
3859                                                       foundname, NULL);
3860                         if (result != ISC_R_SUCCESS)
3861                                 goto unlock_node;
3862                         bind_rdataset(search->rbtdb, node, found,
3863                                       now, rdataset);
3864                         if (foundsig != NULL)
3865                                 bind_rdataset(search->rbtdb, node, foundsig,
3866                                               now, sigrdataset);
3867                         new_reference(search->rbtdb, node);
3868                         *nodep = node;
3869                         result = DNS_R_COVERINGNSEC;
3870                 } else if (!empty_node) {
3871                         result = ISC_R_NOTFOUND;
3872                 } else
3873                         result = dns_rbtnodechain_prev(&search->chain, NULL,
3874                                                        NULL);
3875  unlock_node:
3876                 NODE_UNLOCK(lock, locktype);
3877         } while (empty_node && result == ISC_R_SUCCESS);
3878         return (result);
3879 }
3880
3881 static isc_result_t
3882 cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
3883            dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
3884            dns_dbnode_t **nodep, dns_name_t *foundname,
3885            dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3886 {
3887         dns_rbtnode_t *node = NULL;
3888         isc_result_t result;
3889         rbtdb_search_t search;
3890         isc_boolean_t cname_ok = ISC_TRUE;
3891         isc_boolean_t empty_node;
3892         nodelock_t *lock;
3893         isc_rwlocktype_t locktype;
3894         rdatasetheader_t *header, *header_prev, *header_next;
3895         rdatasetheader_t *found, *nsheader;
3896         rdatasetheader_t *foundsig, *nssig, *cnamesig;
3897         rdatasetheader_t *update, *updatesig;
3898         rbtdb_rdatatype_t sigtype, negtype;
3899
3900         UNUSED(version);
3901
3902         search.rbtdb = (dns_rbtdb_t *)db;
3903
3904         REQUIRE(VALID_RBTDB(search.rbtdb));
3905         REQUIRE(version == NULL);
3906
3907         if (now == 0)
3908                 isc_stdtime_get(&now);
3909
3910         search.rbtversion = NULL;
3911         search.serial = 1;
3912         search.options = options;
3913         search.copy_name = ISC_FALSE;
3914         search.need_cleanup = ISC_FALSE;
3915         search.wild = ISC_FALSE;
3916         search.zonecut = NULL;
3917         dns_fixedname_init(&search.zonecut_name);
3918         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3919         search.now = now;
3920         update = NULL;
3921         updatesig = NULL;
3922
3923         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3924
3925         /*
3926          * Search down from the root of the tree.  If, while going down, we
3927          * encounter a callback node, cache_zonecut_callback() will search the
3928          * rdatasets at the zone cut for a DNAME rdataset.
3929          */
3930         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3931                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3932                                   cache_zonecut_callback, &search);
3933
3934         if (result == DNS_R_PARTIALMATCH) {
3935                 if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
3936                         result = find_coveringnsec(&search, nodep, now,
3937                                                    foundname, rdataset,
3938                                                    sigrdataset);
3939                         if (result == DNS_R_COVERINGNSEC)
3940                                 goto tree_exit;
3941                 }
3942                 if (search.zonecut != NULL) {
3943                     result = setup_delegation(&search, nodep, foundname,
3944                                               rdataset, sigrdataset);
3945                     goto tree_exit;
3946                 } else {
3947                 find_ns:
3948                         result = find_deepest_zonecut(&search, node, nodep,
3949                                                       foundname, rdataset,
3950                                                       sigrdataset);
3951                         goto tree_exit;
3952                 }
3953         } else if (result != ISC_R_SUCCESS)
3954                 goto tree_exit;
3955
3956         /*
3957          * Certain DNSSEC types are not subject to CNAME matching
3958          * (RFC4035, section 2.5 and RFC3007).
3959          *
3960          * We don't check for RRSIG, because we don't store RRSIG records
3961          * directly.
3962          */
3963         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3964                 cname_ok = ISC_FALSE;
3965
3966         /*
3967          * We now go looking for rdata...
3968          */
3969
3970         lock = &(search.rbtdb->node_locks[node->locknum].lock);
3971         locktype = isc_rwlocktype_read;
3972         NODE_LOCK(lock, locktype);
3973
3974         found = NULL;
3975         foundsig = NULL;
3976         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3977         negtype = RBTDB_RDATATYPE_VALUE(0, type);
3978         nsheader = NULL;
3979         nssig = NULL;
3980         cnamesig = NULL;
3981         empty_node = ISC_TRUE;
3982         header_prev = NULL;
3983         for (header = node->data; header != NULL; header = header_next) {
3984                 header_next = header->next;
3985                 if (header->rdh_ttl <= now) {
3986                         /*
3987                          * This rdataset is stale.  If no one else is using the
3988                          * node, we can clean it up right now, otherwise we
3989                          * mark it as stale, and the node as dirty, so it will
3990                          * get cleaned up later.
3991                          */
3992                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3993                             (locktype == isc_rwlocktype_write ||
3994                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3995                                 /*
3996                                  * We update the node's status only when we
3997                                  * can get write access.
3998                                  */
3999                                 locktype = isc_rwlocktype_write;
4000
4001                                 if (dns_rbtnode_refcurrent(node) == 0) {
4002                                         isc_mem_t *mctx;
4003
4004                                         mctx = search.rbtdb->common.mctx;
4005                                         clean_stale_headers(search.rbtdb, mctx,
4006                                                             header);
4007                                         if (header_prev != NULL)
4008                                                 header_prev->next =
4009                                                         header->next;
4010                                         else
4011                                                 node->data = header->next;
4012                                         free_rdataset(search.rbtdb, mctx,
4013                                                       header);
4014                                 } else {
4015                                         header->attributes |=
4016                                                 RDATASET_ATTR_STALE;
4017                                         node->dirty = 1;
4018                                         header_prev = header;
4019                                 }
4020                         } else
4021                                 header_prev = header;
4022                 } else if (EXISTS(header)) {
4023                         /*
4024                          * We now know that there is at least one active
4025                          * non-stale rdataset at this node.
4026                          */
4027                         empty_node = ISC_FALSE;
4028
4029                         /*
4030                          * If we found a type we were looking for, remember
4031                          * it.
4032                          */
4033                         if (header->type == type ||
4034                             (type == dns_rdatatype_any &&
4035                              RBTDB_RDATATYPE_BASE(header->type) != 0) ||
4036                             (cname_ok && header->type ==
4037                              dns_rdatatype_cname)) {
4038                                 /*
4039                                  * We've found the answer.
4040                                  */
4041                                 found = header;
4042                                 if (header->type == dns_rdatatype_cname &&
4043                                     cname_ok &&
4044                                     cnamesig != NULL) {
4045                                         /*
4046                                          * If we've already got the CNAME RRSIG,
4047                                          * use it, otherwise change sigtype
4048                                          * so that we find it.
4049                                          */
4050                                         if (cnamesig != NULL)
4051                                                 foundsig = cnamesig;
4052                                         else
4053                                                 sigtype =
4054                                                     RBTDB_RDATATYPE_SIGCNAME;
4055                                         foundsig = cnamesig;
4056                                 }
4057                         } else if (header->type == sigtype) {
4058                                 /*
4059                                  * We've found the RRSIG rdataset for our
4060                                  * target type.  Remember it.
4061                                  */
4062                                 foundsig = header;
4063                         } else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
4064                                    header->type == negtype) {
4065                                 /*
4066                                  * We've found a negative cache entry.
4067                                  */
4068                                 found = header;
4069                         } else if (header->type == dns_rdatatype_ns) {
4070                                 /*
4071                                  * Remember a NS rdataset even if we're
4072                                  * not specifically looking for it, because
4073                                  * we might need it later.
4074                                  */
4075                                 nsheader = header;
4076                         } else if (header->type == RBTDB_RDATATYPE_SIGNS) {
4077                                 /*
4078                                  * If we need the NS rdataset, we'll also
4079                                  * need its signature.
4080                                  */
4081                                 nssig = header;
4082                         } else if (cname_ok &&
4083                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
4084                                 /*
4085                                  * If we get a CNAME match, we'll also need
4086                                  * its signature.
4087                                  */
4088                                 cnamesig = header;
4089                         }
4090                         header_prev = header;
4091                 } else
4092                         header_prev = header;
4093         }
4094
4095         if (empty_node) {
4096                 /*
4097                  * We have an exact match for the name, but there are no
4098                  * extant rdatasets.  That means that this node doesn't
4099                  * meaningfully exist, and that we really have a partial match.
4100                  */
4101                 NODE_UNLOCK(lock, locktype);
4102                 goto find_ns;
4103         }
4104
4105         /*
4106          * If we didn't find what we were looking for...
4107          */
4108         if (found == NULL ||
4109             (DNS_TRUST_ADDITIONAL(found->trust) &&
4110              ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
4111             (found->trust == dns_trust_glue &&
4112              ((options & DNS_DBFIND_GLUEOK) == 0)) ||
4113             (DNS_TRUST_PENDING(found->trust) &&
4114              ((options & DNS_DBFIND_PENDINGOK) == 0))) {
4115                 /*
4116                  * If there is an NS rdataset at this node, then this is the
4117                  * deepest zone cut.
4118                  */
4119                 if (nsheader != NULL) {
4120                         if (nodep != NULL) {
4121                                 new_reference(search.rbtdb, node);
4122                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4123                                 *nodep = node;
4124                         }
4125                         bind_rdataset(search.rbtdb, node, nsheader, search.now,
4126                                       rdataset);
4127                         if (need_headerupdate(nsheader, search.now))
4128                                 update = nsheader;
4129                         if (nssig != NULL) {
4130                                 bind_rdataset(search.rbtdb, node, nssig,
4131                                               search.now, sigrdataset);
4132                                 if (need_headerupdate(nssig, search.now))
4133                                         updatesig = nssig;
4134                         }
4135                         result = DNS_R_DELEGATION;
4136                         goto node_exit;
4137                 }
4138
4139                 /*
4140                  * Go find the deepest zone cut.
4141                  */
4142                 NODE_UNLOCK(lock, locktype);
4143                 goto find_ns;
4144         }
4145
4146         /*
4147          * We found what we were looking for, or we found a CNAME.
4148          */
4149
4150         if (nodep != NULL) {
4151                 new_reference(search.rbtdb, node);
4152                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4153                 *nodep = node;
4154         }
4155
4156         if (RBTDB_RDATATYPE_BASE(found->type) == 0) {
4157                 /*
4158                  * We found a negative cache entry.
4159                  */
4160                 if (NXDOMAIN(found))
4161                         result = DNS_R_NCACHENXDOMAIN;
4162                 else
4163                         result = DNS_R_NCACHENXRRSET;
4164         } else if (type != found->type &&
4165                    type != dns_rdatatype_any &&
4166                    found->type == dns_rdatatype_cname) {
4167                 /*
4168                  * We weren't doing an ANY query and we found a CNAME instead
4169                  * of the type we were looking for, so we need to indicate
4170                  * that result to the caller.
4171                  */
4172                 result = DNS_R_CNAME;
4173         } else {
4174                 /*
4175                  * An ordinary successful query!
4176                  */
4177                 result = ISC_R_SUCCESS;
4178         }
4179
4180         if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
4181             result == DNS_R_NCACHENXRRSET) {
4182                 bind_rdataset(search.rbtdb, node, found, search.now,
4183                               rdataset);
4184                 if (need_headerupdate(found, search.now))
4185                         update = found;
4186                 if (foundsig != NULL) {
4187                         bind_rdataset(search.rbtdb, node, foundsig, search.now,
4188                                       sigrdataset);
4189                         if (need_headerupdate(foundsig, search.now))
4190                                 updatesig = foundsig;
4191                 }
4192         }
4193
4194  node_exit:
4195         if ((update != NULL || updatesig != NULL) &&
4196             locktype != isc_rwlocktype_write) {
4197                 NODE_UNLOCK(lock, locktype);
4198                 NODE_LOCK(lock, isc_rwlocktype_write);
4199                 locktype = isc_rwlocktype_write;
4200         }
4201         if (update != NULL && need_headerupdate(update, search.now))
4202                 update_header(search.rbtdb, update, search.now);
4203         if (updatesig != NULL && need_headerupdate(updatesig, search.now))
4204                 update_header(search.rbtdb, updatesig, search.now);
4205
4206         NODE_UNLOCK(lock, locktype);
4207
4208  tree_exit:
4209         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4210
4211         /*
4212          * If we found a zonecut but aren't going to use it, we have to
4213          * let go of it.
4214          */
4215         if (search.need_cleanup) {
4216                 node = search.zonecut;
4217                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
4218
4219                 NODE_LOCK(lock, isc_rwlocktype_read);
4220                 decrement_reference(search.rbtdb, node, 0,
4221                                     isc_rwlocktype_read, isc_rwlocktype_none,
4222                                     ISC_FALSE);
4223                 NODE_UNLOCK(lock, isc_rwlocktype_read);
4224         }
4225
4226         dns_rbtnodechain_reset(&search.chain);
4227
4228         return (result);
4229 }
4230
4231 static isc_result_t
4232 cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
4233                   isc_stdtime_t now, dns_dbnode_t **nodep,
4234                   dns_name_t *foundname,
4235                   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
4236 {
4237         dns_rbtnode_t *node = NULL;
4238         nodelock_t *lock;
4239         isc_result_t result;
4240         rbtdb_search_t search;
4241         rdatasetheader_t *header, *header_prev, *header_next;
4242         rdatasetheader_t *found, *foundsig;
4243         unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
4244         isc_rwlocktype_t locktype;
4245
4246         search.rbtdb = (dns_rbtdb_t *)db;
4247
4248         REQUIRE(VALID_RBTDB(search.rbtdb));
4249
4250         if (now == 0)
4251                 isc_stdtime_get(&now);
4252
4253         search.rbtversion = NULL;
4254         search.serial = 1;
4255         search.options = options;
4256         search.copy_name = ISC_FALSE;
4257         search.need_cleanup = ISC_FALSE;
4258         search.wild = ISC_FALSE;
4259         search.zonecut = NULL;
4260         dns_fixedname_init(&search.zonecut_name);
4261         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
4262         search.now = now;
4263
4264         if ((options & DNS_DBFIND_NOEXACT) != 0)
4265                 rbtoptions |= DNS_RBTFIND_NOEXACT;
4266
4267         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4268
4269         /*
4270          * Search down from the root of the tree.
4271          */
4272         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
4273                                   &search.chain, rbtoptions, NULL, &search);
4274
4275         if (result == DNS_R_PARTIALMATCH) {
4276         find_ns:
4277                 result = find_deepest_zonecut(&search, node, nodep, foundname,
4278                                               rdataset, sigrdataset);
4279                 goto tree_exit;
4280         } else if (result != ISC_R_SUCCESS)
4281                 goto tree_exit;
4282
4283         /*
4284          * We now go looking for an NS rdataset at the node.
4285          */
4286
4287         lock = &(search.rbtdb->node_locks[node->locknum].lock);
4288         locktype = isc_rwlocktype_read;
4289         NODE_LOCK(lock, locktype);
4290
4291         found = NULL;
4292         foundsig = NULL;
4293         header_prev = NULL;
4294         for (header = node->data; header != NULL; header = header_next) {
4295                 header_next = header->next;
4296                 if (header->rdh_ttl <= now) {
4297                         /*
4298                          * This rdataset is stale.  If no one else is using the
4299                          * node, we can clean it up right now, otherwise we
4300                          * mark it as stale, and the node as dirty, so it will
4301                          * get cleaned up later.
4302                          */
4303                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
4304                             (locktype == isc_rwlocktype_write ||
4305                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
4306                                 /*
4307                                  * We update the node's status only when we
4308                                  * can get write access.
4309                                  */
4310                                 locktype = isc_rwlocktype_write;
4311
4312                                 if (dns_rbtnode_refcurrent(node) == 0) {
4313                                         isc_mem_t *mctx;
4314
4315                                         mctx = search.rbtdb->common.mctx;
4316                                         clean_stale_headers(search.rbtdb, mctx,
4317                                                             header);
4318                                         if (header_prev != NULL)
4319                                                 header_prev->next =
4320                                                         header->next;
4321                                         else
4322                                                 node->data = header->next;
4323                                         free_rdataset(search.rbtdb, mctx,
4324                                                       header);
4325                                 } else {
4326                                         header->attributes |=
4327                                                 RDATASET_ATTR_STALE;
4328                                         node->dirty = 1;
4329                                         header_prev = header;
4330                                 }
4331                         } else
4332                                 header_prev = header;
4333                 } else if (EXISTS(header)) {
4334                         /*
4335                          * If we found a type we were looking for, remember
4336                          * it.
4337                          */
4338                         if (header->type == dns_rdatatype_ns) {
4339                                 /*
4340                                  * Remember a NS rdataset even if we're
4341                                  * not specifically looking for it, because
4342                                  * we might need it later.
4343                                  */
4344                                 found = header;
4345                         } else if (header->type == RBTDB_RDATATYPE_SIGNS) {
4346                                 /*
4347                                  * If we need the NS rdataset, we'll also
4348                                  * need its signature.
4349                                  */
4350                                 foundsig = header;
4351                         }
4352                         header_prev = header;
4353                 } else
4354                         header_prev = header;
4355         }
4356
4357         if (found == NULL) {
4358                 /*
4359                  * No NS records here.
4360                  */
4361                 NODE_UNLOCK(lock, locktype);
4362                 goto find_ns;
4363         }
4364
4365         if (nodep != NULL) {
4366                 new_reference(search.rbtdb, node);
4367                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4368                 *nodep = node;
4369         }
4370
4371         bind_rdataset(search.rbtdb, node, found, search.now, rdataset);
4372         if (foundsig != NULL)
4373                 bind_rdataset(search.rbtdb, node, foundsig, search.now,
4374                               sigrdataset);
4375
4376         if (need_headerupdate(found, search.now) ||
4377             (foundsig != NULL &&  need_headerupdate(foundsig, search.now))) {
4378                 if (locktype != isc_rwlocktype_write) {
4379                         NODE_UNLOCK(lock, locktype);
4380                         NODE_LOCK(lock, isc_rwlocktype_write);
4381                         locktype = isc_rwlocktype_write;
4382                 }
4383                 if (need_headerupdate(found, search.now))
4384                         update_header(search.rbtdb, found, search.now);
4385                 if (foundsig != NULL &&
4386                     need_headerupdate(foundsig, search.now)) {
4387                         update_header(search.rbtdb, foundsig, search.now);
4388                 }
4389         }
4390
4391         NODE_UNLOCK(lock, locktype);
4392
4393  tree_exit:
4394         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4395
4396         INSIST(!search.need_cleanup);
4397
4398         dns_rbtnodechain_reset(&search.chain);
4399
4400         if (result == DNS_R_DELEGATION)
4401                 result = ISC_R_SUCCESS;
4402
4403         return (result);
4404 }
4405
4406 static void
4407 attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) {
4408         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4409         dns_rbtnode_t *node = (dns_rbtnode_t *)source;
4410         unsigned int refs;
4411
4412         REQUIRE(VALID_RBTDB(rbtdb));
4413         REQUIRE(targetp != NULL && *targetp == NULL);
4414
4415         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
4416         dns_rbtnode_refincrement(node, &refs);
4417         INSIST(refs != 0);
4418         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
4419
4420         *targetp = source;
4421 }
4422
4423 static void
4424 detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
4425         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4426         dns_rbtnode_t *node;
4427         isc_boolean_t want_free = ISC_FALSE;
4428         isc_boolean_t inactive = ISC_FALSE;
4429         rbtdb_nodelock_t *nodelock;
4430
4431         REQUIRE(VALID_RBTDB(rbtdb));
4432         REQUIRE(targetp != NULL && *targetp != NULL);
4433
4434         node = (dns_rbtnode_t *)(*targetp);
4435         nodelock = &rbtdb->node_locks[node->locknum];
4436
4437         NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
4438
4439         if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
4440                                 isc_rwlocktype_none, ISC_FALSE)) {
4441                 if (isc_refcount_current(&nodelock->references) == 0 &&
4442                     nodelock->exiting) {
4443                         inactive = ISC_TRUE;
4444                 }
4445         }
4446
4447         NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
4448
4449         *targetp = NULL;
4450
4451         if (inactive) {
4452                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
4453                 rbtdb->active--;
4454                 if (rbtdb->active == 0)
4455                         want_free = ISC_TRUE;
4456                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
4457                 if (want_free) {
4458                         char buf[DNS_NAME_FORMATSIZE];
4459                         if (dns_name_dynamic(&rbtdb->common.origin))
4460                                 dns_name_format(&rbtdb->common.origin, buf,
4461                                                 sizeof(buf));
4462                         else
4463                                 strcpy(buf, "<UNKNOWN>");
4464                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
4465                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
4466                                       "calling free_rbtdb(%s)", buf);
4467                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
4468                 }
4469         }
4470 }
4471
4472 static isc_result_t
4473 expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
4474         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4475         dns_rbtnode_t *rbtnode = node;
4476         rdatasetheader_t *header;
4477         isc_boolean_t force_expire = ISC_FALSE;
4478         /*
4479          * These are the category and module used by the cache cleaner.
4480          */
4481         isc_boolean_t log = ISC_FALSE;
4482         isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
4483         isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
4484         int level = ISC_LOG_DEBUG(2);
4485         char printname[DNS_NAME_FORMATSIZE];
4486
4487         REQUIRE(VALID_RBTDB(rbtdb));
4488
4489         /*
4490          * Caller must hold a tree lock.
4491          */
4492
4493         if (now == 0)
4494                 isc_stdtime_get(&now);
4495
4496         if (rbtdb->overmem) {
4497                 isc_uint32_t val;
4498
4499                 isc_random_get(&val);
4500                 /*
4501                  * XXXDCL Could stand to have a better policy, like LRU.
4502                  */
4503                 force_expire = ISC_TF(rbtnode->down == NULL && val % 4 == 0);
4504
4505                 /*
4506                  * Note that 'log' can be true IFF rbtdb->overmem is also true.
4507                  * rbtdb->overmem can currently only be true for cache
4508                  * databases -- hence all of the "overmem cache" log strings.
4509                  */
4510                 log = ISC_TF(isc_log_wouldlog(dns_lctx, level));
4511                 if (log)
4512                         isc_log_write(dns_lctx, category, module, level,
4513                                       "overmem cache: %s %s",
4514                                       force_expire ? "FORCE" : "check",
4515                                       dns_rbt_formatnodename(rbtnode,
4516                                                            printname,
4517                                                            sizeof(printname)));
4518         }
4519
4520         /*
4521          * We may not need write access, but this code path is not performance
4522          * sensitive, so it should be okay to always lock as a writer.
4523          */
4524         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4525                   isc_rwlocktype_write);
4526
4527         for (header = rbtnode->data; header != NULL; header = header->next)
4528                 if (header->rdh_ttl <= now - RBTDB_VIRTUAL) {
4529                         /*
4530                          * We don't check if refcurrent(rbtnode) == 0 and try
4531                          * to free like we do in cache_find(), because
4532                          * refcurrent(rbtnode) must be non-zero.  This is so
4533                          * because 'node' is an argument to the function.
4534                          */
4535                         header->attributes |= RDATASET_ATTR_STALE;
4536                         rbtnode->dirty = 1;
4537                         if (log)
4538                                 isc_log_write(dns_lctx, category, module,
4539                                               level, "overmem cache: stale %s",
4540                                               printname);
4541                 } else if (force_expire) {
4542                         if (! RETAIN(header)) {
4543                                 set_ttl(rbtdb, header, 0);
4544                                 header->attributes |= RDATASET_ATTR_STALE;
4545                                 rbtnode->dirty = 1;
4546                         } else if (log) {
4547                                 isc_log_write(dns_lctx, category, module,
4548                                               level, "overmem cache: "
4549                                               "reprieve by RETAIN() %s",
4550                                               printname);
4551                         }
4552                 } else if (rbtdb->overmem && log)
4553                         isc_log_write(dns_lctx, category, module, level,
4554                                       "overmem cache: saved %s", printname);
4555
4556         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4557                     isc_rwlocktype_write);
4558
4559         return (ISC_R_SUCCESS);
4560 }
4561
4562 static void
4563 overmem(dns_db_t *db, isc_boolean_t overmem) {
4564         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4565
4566         if (IS_CACHE(rbtdb))
4567                 rbtdb->overmem = overmem;
4568 }
4569
4570 static void
4571 printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) {
4572         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4573         dns_rbtnode_t *rbtnode = node;
4574         isc_boolean_t first;
4575
4576         REQUIRE(VALID_RBTDB(rbtdb));
4577
4578         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4579                   isc_rwlocktype_read);
4580
4581         fprintf(out, "node %p, %u references, locknum = %u\n",
4582                 rbtnode, dns_rbtnode_refcurrent(rbtnode),
4583                 rbtnode->locknum);
4584         if (rbtnode->data != NULL) {
4585                 rdatasetheader_t *current, *top_next;
4586
4587                 for (current = rbtnode->data; current != NULL;
4588                      current = top_next) {
4589                         top_next = current->next;
4590                         first = ISC_TRUE;
4591                         fprintf(out, "\ttype %u", current->type);
4592                         do {
4593                                 if (!first)
4594                                         fprintf(out, "\t");
4595                                 first = ISC_FALSE;
4596                                 fprintf(out,
4597                                         "\tserial = %lu, ttl = %u, "
4598                                         "trust = %u, attributes = %u\n",
4599                                         (unsigned long)current->serial,
4600                                         current->rdh_ttl,
4601                                         current->trust,
4602                                         current->attributes);
4603                                 current = current->down;
4604                         } while (current != NULL);
4605                 }
4606         } else
4607                 fprintf(out, "(empty)\n");
4608
4609         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4610                     isc_rwlocktype_read);
4611 }
4612
4613 static isc_result_t
4614 createiterator(dns_db_t *db, isc_boolean_t relative_names,
4615                dns_dbiterator_t **iteratorp)
4616 {
4617         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4618         rbtdb_dbiterator_t *rbtdbiter;
4619
4620         REQUIRE(VALID_RBTDB(rbtdb));
4621
4622         rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
4623         if (rbtdbiter == NULL)
4624                 return (ISC_R_NOMEMORY);
4625
4626         rbtdbiter->common.methods = &dbiterator_methods;
4627         rbtdbiter->common.db = NULL;
4628         dns_db_attach(db, &rbtdbiter->common.db);
4629         rbtdbiter->common.relative_names = relative_names;
4630         rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
4631         rbtdbiter->common.cleaning = ISC_FALSE;
4632         rbtdbiter->paused = ISC_TRUE;
4633         rbtdbiter->tree_locked = isc_rwlocktype_none;
4634         rbtdbiter->result = ISC_R_SUCCESS;
4635         dns_fixedname_init(&rbtdbiter->name);
4636         dns_fixedname_init(&rbtdbiter->origin);
4637         rbtdbiter->node = NULL;
4638         rbtdbiter->delete = 0;
4639         memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions));
4640         dns_rbtnodechain_init(&rbtdbiter->chain, db->mctx);
4641
4642         *iteratorp = (dns_dbiterator_t *)rbtdbiter;
4643
4644         return (ISC_R_SUCCESS);
4645 }
4646
4647 static isc_result_t
4648 zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4649                   dns_rdatatype_t type, dns_rdatatype_t covers,
4650                   isc_stdtime_t now, dns_rdataset_t *rdataset,
4651                   dns_rdataset_t *sigrdataset)
4652 {
4653         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4654         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4655         rdatasetheader_t *header, *header_next, *found, *foundsig;
4656         rbtdb_serial_t serial;
4657         rbtdb_version_t *rbtversion = version;
4658         isc_boolean_t close_version = ISC_FALSE;
4659         rbtdb_rdatatype_t matchtype, sigmatchtype;
4660
4661         REQUIRE(VALID_RBTDB(rbtdb));
4662         REQUIRE(type != dns_rdatatype_any);
4663
4664         if (rbtversion == NULL) {
4665                 currentversion(db, (dns_dbversion_t **) (void *)(&rbtversion));
4666                 close_version = ISC_TRUE;
4667         }
4668         serial = rbtversion->serial;
4669         now = 0;
4670
4671         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4672                   isc_rwlocktype_read);
4673
4674         found = NULL;
4675         foundsig = NULL;
4676         matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
4677         if (covers == 0)
4678                 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4679         else
4680                 sigmatchtype = 0;
4681
4682         for (header = rbtnode->data; header != NULL; header = header_next) {
4683                 header_next = header->next;
4684                 do {
4685                         if (header->serial <= serial &&
4686                             !IGNORE(header)) {
4687                                 /*
4688                                  * Is this a "this rdataset doesn't
4689                                  * exist" record?
4690                                  */
4691                                 if (NONEXISTENT(header))
4692                                         header = NULL;
4693                                 break;
4694                         } else
4695                                 header = header->down;
4696                 } while (header != NULL);
4697                 if (header != NULL) {
4698                         /*
4699                          * We have an active, extant rdataset.  If it's a
4700                          * type we're looking for, remember it.
4701                          */
4702                         if (header->type == matchtype) {
4703                                 found = header;
4704                                 if (foundsig != NULL)
4705                                         break;
4706                         } else if (header->type == sigmatchtype) {
4707                                 foundsig = header;
4708                                 if (found != NULL)
4709                                         break;
4710                         }
4711                 }
4712         }
4713         if (found != NULL) {
4714                 bind_rdataset(rbtdb, rbtnode, found, now, rdataset);
4715                 if (foundsig != NULL)
4716                         bind_rdataset(rbtdb, rbtnode, foundsig, now,
4717                                       sigrdataset);
4718         }
4719
4720         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4721                     isc_rwlocktype_read);
4722
4723         if (close_version)
4724                 closeversion(db, (dns_dbversion_t **) (void *)(&rbtversion),
4725                              ISC_FALSE);
4726
4727         if (found == NULL)
4728                 return (ISC_R_NOTFOUND);
4729
4730         return (ISC_R_SUCCESS);
4731 }
4732
4733 static isc_result_t
4734 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4735                    dns_rdatatype_t type, dns_rdatatype_t covers,
4736                    isc_stdtime_t now, dns_rdataset_t *rdataset,
4737                    dns_rdataset_t *sigrdataset)
4738 {
4739         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4740         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4741         rdatasetheader_t *header, *header_next, *found, *foundsig;
4742         rbtdb_rdatatype_t matchtype, sigmatchtype, negtype;
4743         isc_result_t result;
4744         nodelock_t *lock;
4745         isc_rwlocktype_t locktype;
4746
4747         REQUIRE(VALID_RBTDB(rbtdb));
4748         REQUIRE(type != dns_rdatatype_any);
4749
4750         UNUSED(version);
4751
4752         result = ISC_R_SUCCESS;
4753
4754         if (now == 0)
4755                 isc_stdtime_get(&now);
4756
4757         lock = &rbtdb->node_locks[rbtnode->locknum].lock;
4758         locktype = isc_rwlocktype_read;
4759         NODE_LOCK(lock, locktype);
4760
4761         found = NULL;
4762         foundsig = NULL;
4763         matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
4764         negtype = RBTDB_RDATATYPE_VALUE(0, type);
4765         if (covers == 0)
4766                 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4767         else
4768                 sigmatchtype = 0;
4769
4770         for (header = rbtnode->data; header != NULL; header = header_next) {
4771                 header_next = header->next;
4772                 if (header->rdh_ttl <= now) {
4773                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
4774                             (locktype == isc_rwlocktype_write ||
4775                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
4776                                 /*
4777                                  * We update the node's status only when we
4778                                  * can get write access.
4779                                  */
4780                                 locktype = isc_rwlocktype_write;
4781
4782                                 /*
4783                                  * We don't check if refcurrent(rbtnode) == 0
4784                                  * and try to free like we do in cache_find(),
4785                                  * because refcurrent(rbtnode) must be
4786                                  * non-zero.  This is so because 'node' is an
4787                                  * argument to the function.
4788                                  */
4789                                 header->attributes |= RDATASET_ATTR_STALE;
4790                                 rbtnode->dirty = 1;
4791                         }
4792                 } else if (EXISTS(header)) {
4793                         if (header->type == matchtype)
4794                                 found = header;
4795                         else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
4796                                  header->type == negtype)
4797                                 found = header;
4798                         else if (header->type == sigmatchtype)
4799                                 foundsig = header;
4800                 }
4801         }
4802         if (found != NULL) {
4803                 bind_rdataset(rbtdb, rbtnode, found, now, rdataset);
4804                 if (foundsig != NULL)
4805                         bind_rdataset(rbtdb, rbtnode, foundsig, now,
4806                                       sigrdataset);
4807         }
4808
4809         NODE_UNLOCK(lock, locktype);
4810
4811         if (found == NULL)
4812                 return (ISC_R_NOTFOUND);
4813
4814         if (RBTDB_RDATATYPE_BASE(found->type) == 0) {
4815                 /*
4816                  * We found a negative cache entry.
4817                  */
4818                 if (NXDOMAIN(found))
4819                         result = DNS_R_NCACHENXDOMAIN;
4820                 else
4821                         result = DNS_R_NCACHENXRRSET;
4822         }
4823
4824         return (result);
4825 }
4826
4827 static isc_result_t
4828 allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4829              isc_stdtime_t now, dns_rdatasetiter_t **iteratorp)
4830 {
4831         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4832         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4833         rbtdb_version_t *rbtversion = version;
4834         rbtdb_rdatasetiter_t *iterator;
4835         unsigned int refs;
4836
4837         REQUIRE(VALID_RBTDB(rbtdb));
4838
4839         iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
4840         if (iterator == NULL)
4841                 return (ISC_R_NOMEMORY);
4842
4843         if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
4844                 now = 0;
4845                 if (rbtversion == NULL)
4846                         currentversion(db,
4847                                  (dns_dbversion_t **) (void *)(&rbtversion));
4848                 else {
4849                         unsigned int refs;
4850
4851                         isc_refcount_increment(&rbtversion->references,
4852                                                &refs);
4853                         INSIST(refs > 1);
4854                 }
4855         } else {
4856                 if (now == 0)
4857                         isc_stdtime_get(&now);
4858                 rbtversion = NULL;
4859         }
4860
4861         iterator->common.magic = DNS_RDATASETITER_MAGIC;
4862         iterator->common.methods = &rdatasetiter_methods;
4863         iterator->common.db = db;
4864         iterator->common.node = node;
4865         iterator->common.version = (dns_dbversion_t *)rbtversion;
4866         iterator->common.now = now;
4867
4868         NODE_STRONGLOCK(&rbtdb->node_locks[rbtnode->locknum].lock);
4869
4870         dns_rbtnode_refincrement(rbtnode, &refs);
4871         INSIST(refs != 0);
4872
4873         iterator->current = NULL;
4874
4875         NODE_STRONGUNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock);
4876
4877         *iteratorp = (dns_rdatasetiter_t *)iterator;
4878
4879         return (ISC_R_SUCCESS);
4880 }
4881
4882 static isc_boolean_t
4883 cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
4884         rdatasetheader_t *header, *header_next;
4885         isc_boolean_t cname, other_data;
4886         dns_rdatatype_t rdtype;
4887
4888         /*
4889          * The caller must hold the node lock.
4890          */
4891
4892         /*
4893          * Look for CNAME and "other data" rdatasets active in our version.
4894          */
4895         cname = ISC_FALSE;
4896         other_data = ISC_FALSE;
4897         for (header = node->data; header != NULL; header = header_next) {
4898                 header_next = header->next;
4899                 if (header->type == dns_rdatatype_cname) {
4900                         /*
4901                          * Look for an active extant CNAME.
4902                          */
4903                         do {
4904                                 if (header->serial <= serial &&
4905                                     !IGNORE(header)) {
4906                                         /*
4907                                          * Is this a "this rdataset doesn't
4908                                          * exist" record?
4909                                          */
4910                                         if (NONEXISTENT(header))
4911                                                 header = NULL;
4912                                         break;
4913                                 } else
4914                                         header = header->down;
4915                         } while (header != NULL);
4916                         if (header != NULL)
4917                                 cname = ISC_TRUE;
4918                 } else {
4919                         /*
4920                          * Look for active extant "other data".
4921                          *
4922                          * "Other data" is any rdataset whose type is not
4923                          * KEY, NSEC, SIG or RRSIG.
4924                          */
4925                         rdtype = RBTDB_RDATATYPE_BASE(header->type);
4926                         if (rdtype != dns_rdatatype_key &&
4927                             rdtype != dns_rdatatype_sig &&
4928                             rdtype != dns_rdatatype_nsec &&
4929                             rdtype != dns_rdatatype_rrsig) {
4930                                 /*
4931                                  * Is it active and extant?
4932                                  */
4933                                 do {
4934                                         if (header->serial <= serial &&
4935                                             !IGNORE(header)) {
4936                                                 /*
4937                                                  * Is this a "this rdataset
4938                                                  * doesn't exist" record?
4939                                                  */
4940                                                 if (NONEXISTENT(header))
4941                                                         header = NULL;
4942                                                 break;
4943                                         } else
4944                                                 header = header->down;
4945                                 } while (header != NULL);
4946                                 if (header != NULL)
4947                                         other_data = ISC_TRUE;
4948                         }
4949                 }
4950         }
4951
4952         if (cname && other_data)
4953                 return (ISC_TRUE);
4954
4955         return (ISC_FALSE);
4956 }
4957
4958 static isc_result_t
4959 add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion,
4960     rdatasetheader_t *newheader, unsigned int options, isc_boolean_t loading,
4961     dns_rdataset_t *addedrdataset, isc_stdtime_t now)
4962 {
4963         rbtdb_changed_t *changed = NULL;
4964         rdatasetheader_t *topheader, *topheader_prev, *header;
4965         unsigned char *merged;
4966         isc_result_t result;
4967         isc_boolean_t header_nx;
4968         isc_boolean_t newheader_nx;
4969         isc_boolean_t merge;
4970         dns_rdatatype_t rdtype, covers;
4971         rbtdb_rdatatype_t negtype;
4972         dns_trust_t trust;
4973
4974         /*
4975          * Add an rdatasetheader_t to a node.
4976          */
4977
4978         /*
4979          * Caller must be holding the node lock.
4980          */
4981
4982         if ((options & DNS_DBADD_MERGE) != 0) {
4983                 REQUIRE(rbtversion != NULL);
4984                 merge = ISC_TRUE;
4985         } else
4986                 merge = ISC_FALSE;
4987
4988         if ((options & DNS_DBADD_FORCE) != 0)
4989                 trust = dns_trust_ultimate;
4990         else
4991                 trust = newheader->trust;
4992
4993         if (rbtversion != NULL && !loading) {
4994                 /*
4995                  * We always add a changed record, even if no changes end up
4996                  * being made to this node, because it's harmless and
4997                  * simplifies the code.
4998                  */
4999                 changed = add_changed(rbtdb, rbtversion, rbtnode);
5000                 if (changed == NULL) {
5001                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5002                         return (ISC_R_NOMEMORY);
5003                 }
5004         }
5005
5006         newheader_nx = NONEXISTENT(newheader) ? ISC_TRUE : ISC_FALSE;
5007         topheader_prev = NULL;
5008
5009         negtype = 0;
5010         if (rbtversion == NULL && !newheader_nx) {
5011                 rdtype = RBTDB_RDATATYPE_BASE(newheader->type);
5012                 if (rdtype == 0) {
5013                         /*
5014                          * We're adding a negative cache entry.
5015                          */
5016                         covers = RBTDB_RDATATYPE_EXT(newheader->type);
5017                         if (covers == dns_rdatatype_any) {
5018                                 /*
5019                                  * We're adding an negative cache entry
5020                                  * which covers all types (NXDOMAIN,
5021                                  * NODATA(QTYPE=ANY)).
5022                                  *
5023                                  * We make all other data stale so that the
5024                                  * only rdataset that can be found at this
5025                                  * node is the negative cache entry.
5026                                  */
5027                                 for (topheader = rbtnode->data;
5028                                      topheader != NULL;
5029                                      topheader = topheader->next) {
5030                                         set_ttl(rbtdb, topheader, 0);
5031                                         topheader->attributes |=
5032                                                 RDATASET_ATTR_STALE;
5033                                 }
5034                                 rbtnode->dirty = 1;
5035                                 goto find_header;
5036                         }
5037                         negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
5038                 } else {
5039                         /*
5040                          * We're adding something that isn't a
5041                          * negative cache entry.  Look for an extant
5042                          * non-stale NXDOMAIN/NODATA(QTYPE=ANY) negative
5043                          * cache entry.
5044                          */
5045                         for (topheader = rbtnode->data;
5046                              topheader != NULL;
5047                              topheader = topheader->next) {
5048                                 if (topheader->type ==
5049                                     RBTDB_RDATATYPE_NCACHEANY)
5050                                         break;
5051                         }
5052                         if (topheader != NULL && EXISTS(topheader) &&
5053                             topheader->rdh_ttl > now) {
5054                                 /*
5055                                  * Found one.
5056                                  */
5057                                 if (trust < topheader->trust) {
5058                                         /*
5059                                          * The NXDOMAIN/NODATA(QTYPE=ANY)
5060                                          * is more trusted.
5061                                          */
5062                                         free_rdataset(rbtdb,
5063                                                       rbtdb->common.mctx,
5064                                                       newheader);
5065                                         if (addedrdataset != NULL)
5066                                                 bind_rdataset(rbtdb, rbtnode,
5067                                                               topheader, now,
5068                                                               addedrdataset);
5069                                         return (DNS_R_UNCHANGED);
5070                                 }
5071                                 /*
5072                                  * The new rdataset is better.  Expire the
5073                                  * NXDOMAIN/NODATA(QTYPE=ANY).
5074                                  */
5075                                 set_ttl(rbtdb, topheader, 0);
5076                                 topheader->attributes |= RDATASET_ATTR_STALE;
5077                                 rbtnode->dirty = 1;
5078                                 topheader = NULL;
5079                                 goto find_header;
5080                         }
5081                         negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
5082                 }
5083         }
5084
5085         for (topheader = rbtnode->data;
5086              topheader != NULL;
5087              topheader = topheader->next) {
5088                 if (topheader->type == newheader->type ||
5089                     topheader->type == negtype)
5090                         break;
5091                 topheader_prev = topheader;
5092         }
5093
5094  find_header:
5095         /*
5096          * If header isn't NULL, we've found the right type.  There may be
5097          * IGNORE rdatasets between the top of the chain and the first real
5098          * data.  We skip over them.
5099          */
5100         header = topheader;
5101         while (header != NULL && IGNORE(header))
5102                 header = header->down;
5103         if (header != NULL) {
5104                 header_nx = NONEXISTENT(header) ? ISC_TRUE : ISC_FALSE;
5105
5106                 /*
5107                  * Deleting an already non-existent rdataset has no effect.
5108                  */
5109                 if (header_nx && newheader_nx) {
5110                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5111                         return (DNS_R_UNCHANGED);
5112                 }
5113
5114                 /*
5115                  * Trying to add an rdataset with lower trust to a cache DB
5116                  * has no effect, provided that the cache data isn't stale.
5117                  */
5118                 if (rbtversion == NULL && trust < header->trust &&
5119                     (header->rdh_ttl > now || header_nx)) {
5120                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5121                         if (addedrdataset != NULL)
5122                                 bind_rdataset(rbtdb, rbtnode, header, now,
5123                                               addedrdataset);
5124                         return (DNS_R_UNCHANGED);
5125                 }
5126
5127                 /*
5128                  * Don't merge if a nonexistent rdataset is involved.
5129                  */
5130                 if (merge && (header_nx || newheader_nx))
5131                         merge = ISC_FALSE;
5132
5133                 /*
5134                  * If 'merge' is ISC_TRUE, we'll try to create a new rdataset
5135                  * that is the union of 'newheader' and 'header'.
5136                  */
5137                 if (merge) {
5138                         unsigned int flags = 0;
5139                         INSIST(rbtversion->serial >= header->serial);
5140                         merged = NULL;
5141                         result = ISC_R_SUCCESS;
5142
5143                         if ((options & DNS_DBADD_EXACT) != 0)
5144                                 flags |= DNS_RDATASLAB_EXACT;
5145                         if ((options & DNS_DBADD_EXACTTTL) != 0 &&
5146                              newheader->rdh_ttl != header->rdh_ttl)
5147                                         result = DNS_R_NOTEXACT;
5148                         else if (newheader->rdh_ttl != header->rdh_ttl)
5149                                 flags |= DNS_RDATASLAB_FORCE;
5150                         if (result == ISC_R_SUCCESS)
5151                                 result = dns_rdataslab_merge(
5152                                              (unsigned char *)header,
5153                                              (unsigned char *)newheader,
5154                                              (unsigned int)(sizeof(*newheader)),
5155                                              rbtdb->common.mctx,
5156                                              rbtdb->common.rdclass,
5157                                              (dns_rdatatype_t)header->type,
5158                                              flags, &merged);
5159                         if (result == ISC_R_SUCCESS) {
5160                                 /*
5161                                  * If 'header' has the same serial number as
5162                                  * we do, we could clean it up now if we knew
5163                                  * that our caller had no references to it.
5164                                  * We don't know this, however, so we leave it
5165                                  * alone.  It will get cleaned up when
5166                                  * clean_zone_node() runs.
5167                                  */
5168                                 free_rdataset(rbtdb, rbtdb->common.mctx,
5169                                               newheader);
5170                                 newheader = (rdatasetheader_t *)merged;
5171                         } else {
5172                                 free_rdataset(rbtdb, rbtdb->common.mctx,
5173                                               newheader);
5174                                 return (result);
5175                         }
5176                 }
5177                 /*
5178                  * Don't replace existing NS, A and AAAA RRsets
5179                  * in the cache if they are already exist.  This
5180                  * prevents named being locked to old servers.
5181                  * Don't lower trust of existing record if the
5182                  * update is forced.
5183                  */
5184                 if (IS_CACHE(rbtdb) && header->rdh_ttl > now &&
5185                     header->type == dns_rdatatype_ns &&
5186                     !header_nx && !newheader_nx &&
5187                     header->trust >= newheader->trust &&
5188                     dns_rdataslab_equalx((unsigned char *)header,
5189                                          (unsigned char *)newheader,
5190                                          (unsigned int)(sizeof(*newheader)),
5191                                          rbtdb->common.rdclass,
5192                                          (dns_rdatatype_t)header->type)) {
5193                         /*
5194                          * Honour the new ttl if it is less than the
5195                          * older one.
5196                          */
5197                         if (header->rdh_ttl > newheader->rdh_ttl)
5198                                 set_ttl(rbtdb, header, newheader->rdh_ttl);
5199                         if (header->noqname == NULL &&
5200                             newheader->noqname != NULL) {
5201                                 header->noqname = newheader->noqname;
5202                                 newheader->noqname = NULL;
5203                         }
5204                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5205                         if (addedrdataset != NULL)
5206                                 bind_rdataset(rbtdb, rbtnode, header, now,
5207                                               addedrdataset);
5208                         return (ISC_R_SUCCESS);
5209                 }
5210                 if (IS_CACHE(rbtdb) && header->rdh_ttl > now &&
5211                     (header->type == dns_rdatatype_a ||
5212                      header->type == dns_rdatatype_aaaa) &&
5213                     !header_nx && !newheader_nx &&
5214                     header->trust >= newheader->trust &&
5215                     dns_rdataslab_equal((unsigned char *)header,
5216                                         (unsigned char *)newheader,
5217                                         (unsigned int)(sizeof(*newheader)))) {
5218                         /*
5219                          * Honour the new ttl if it is less than the
5220                          * older one.
5221                          */
5222                         if (header->rdh_ttl > newheader->rdh_ttl)
5223                                 set_ttl(rbtdb, header, newheader->rdh_ttl);
5224                         if (header->noqname == NULL &&
5225                             newheader->noqname != NULL) {
5226                                 header->noqname = newheader->noqname;
5227                                 newheader->noqname = NULL;
5228                         }
5229                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5230                         if (addedrdataset != NULL)
5231                                 bind_rdataset(rbtdb, rbtnode, header, now,
5232                                               addedrdataset);
5233                         return (ISC_R_SUCCESS);
5234                 }
5235                 INSIST(rbtversion == NULL ||
5236                        rbtversion->serial >= topheader->serial);
5237                 if (topheader_prev != NULL)
5238                         topheader_prev->next = newheader;
5239                 else
5240                         rbtnode->data = newheader;
5241                 newheader->next = topheader->next;
5242                 if (loading) {
5243                         /*
5244                          * There are no other references to 'header' when
5245                          * loading, so we MAY clean up 'header' now.
5246                          * Since we don't generate changed records when
5247                          * loading, we MUST clean up 'header' now.
5248                          */
5249                         newheader->down = NULL;
5250                         free_rdataset(rbtdb, rbtdb->common.mctx, header);
5251                 } else {
5252                         newheader->down = topheader;
5253                         topheader->next = newheader;
5254                         rbtnode->dirty = 1;
5255                         if (changed != NULL)
5256                                 changed->dirty = ISC_TRUE;
5257                         if (rbtversion == NULL) {
5258                                 set_ttl(rbtdb, header, 0);
5259                                 header->attributes |= RDATASET_ATTR_STALE;
5260                         }
5261                         if (IS_CACHE(rbtdb)) {
5262                                 int idx = newheader->node->locknum;
5263
5264                                 ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
5265                                                  newheader, lru_link);
5266
5267                                 /*
5268                                  * XXXMLG We don't check the return value
5269                                  * here.  If it fails, we will not do TTL
5270                                  * based expiry on this node.  However, we
5271                                  * will do it on the LRU side, so memory
5272                                  * will not leak... for long.
5273                                  */
5274                                 isc_heap_insert(rbtdb->heaps[idx], newheader);
5275                         }
5276                 }
5277         } else {
5278                 /*
5279                  * No non-IGNORED rdatasets of the given type exist at
5280                  * this node.
5281                  */
5282
5283                 /*
5284                  * If we're trying to delete the type, don't bother.
5285                  */
5286                 if (newheader_nx) {
5287                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5288                         return (DNS_R_UNCHANGED);
5289                 }
5290
5291                 if (topheader != NULL) {
5292                         /*
5293                          * We have an list of rdatasets of the given type,
5294                          * but they're all marked IGNORE.  We simply insert
5295                          * the new rdataset at the head of the list.
5296                          *
5297                          * Ignored rdatasets cannot occur during loading, so
5298                          * we INSIST on it.
5299                          */
5300                         INSIST(!loading);
5301                         INSIST(rbtversion == NULL ||
5302                                rbtversion->serial >= topheader->serial);
5303                         if (topheader_prev != NULL)
5304                                 topheader_prev->next = newheader;
5305                         else
5306                                 rbtnode->data = newheader;
5307                         newheader->next = topheader->next;
5308                         newheader->down = topheader;
5309                         topheader->next = newheader;
5310                         rbtnode->dirty = 1;
5311                         if (changed != NULL)
5312                                 changed->dirty = ISC_TRUE;
5313                 } else {
5314                         /*
5315                          * No rdatasets of the given type exist at the node.
5316                          */
5317                         newheader->next = rbtnode->data;
5318                         newheader->down = NULL;
5319                         rbtnode->data = newheader;
5320                 }
5321                 if (IS_CACHE(rbtdb)) {
5322                         int idx = newheader->node->locknum;
5323                         ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
5324                                          newheader, lru_link);
5325                         isc_heap_insert(rbtdb->heaps[idx], newheader);
5326                 }
5327         }
5328
5329         /*
5330          * Check if the node now contains CNAME and other data.
5331          */
5332         if (rbtversion != NULL &&
5333             cname_and_other_data(rbtnode, rbtversion->serial))
5334                 return (DNS_R_CNAMEANDOTHER);
5335
5336         if (addedrdataset != NULL)
5337                 bind_rdataset(rbtdb, rbtnode, newheader, now, addedrdataset);
5338
5339         return (ISC_R_SUCCESS);
5340 }
5341
5342 static inline isc_boolean_t
5343 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
5344                 rbtdb_rdatatype_t type)
5345 {
5346         if (IS_CACHE(rbtdb)) {
5347                 if (type == dns_rdatatype_dname)
5348                         return (ISC_TRUE);
5349                 else
5350                         return (ISC_FALSE);
5351         } else if (type == dns_rdatatype_dname ||
5352                    (type == dns_rdatatype_ns &&
5353                     (node != rbtdb->origin_node || IS_STUB(rbtdb))))
5354                 return (ISC_TRUE);
5355         return (ISC_FALSE);
5356 }
5357
5358 static inline isc_result_t
5359 addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
5360            dns_rdataset_t *rdataset)
5361 {
5362         struct noqname *noqname;
5363         isc_mem_t *mctx = rbtdb->common.mctx;
5364         dns_name_t name;
5365         dns_rdataset_t nsec, nsecsig;
5366         isc_result_t result;
5367         isc_region_t r;
5368
5369         dns_name_init(&name, NULL);
5370         dns_rdataset_init(&nsec);
5371         dns_rdataset_init(&nsecsig);
5372
5373         result = dns_rdataset_getnoqname(rdataset, &name, &nsec, &nsecsig);
5374         RUNTIME_CHECK(result == ISC_R_SUCCESS);
5375
5376         noqname = isc_mem_get(mctx, sizeof(*noqname));
5377         if (noqname == NULL) {
5378                 result = ISC_R_NOMEMORY;
5379                 goto cleanup;
5380         }
5381         dns_name_init(&noqname->name, NULL);
5382         noqname->nsec = NULL;
5383         noqname->nsecsig = NULL;
5384         result = dns_name_dup(&name, mctx, &noqname->name);
5385         if (result != ISC_R_SUCCESS)
5386                 goto cleanup;
5387         result = dns_rdataslab_fromrdataset(&nsec, mctx, &r, 0);
5388         if (result != ISC_R_SUCCESS)
5389                 goto cleanup;
5390         noqname->nsec = r.base;
5391         result = dns_rdataslab_fromrdataset(&nsecsig, mctx, &r, 0);
5392         if (result != ISC_R_SUCCESS)
5393                 goto cleanup;
5394         noqname->nsecsig = r.base;
5395         dns_rdataset_disassociate(&nsec);
5396         dns_rdataset_disassociate(&nsecsig);
5397         newheader->noqname = noqname;
5398         return (ISC_R_SUCCESS);
5399
5400 cleanup:
5401         dns_rdataset_disassociate(&nsec);
5402         dns_rdataset_disassociate(&nsecsig);
5403         free_noqname(mctx, &noqname);
5404         return(result);
5405 }
5406
5407 static isc_result_t
5408 addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5409             isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
5410             dns_rdataset_t *addedrdataset)
5411 {
5412         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5413         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5414         rbtdb_version_t *rbtversion = version;
5415         isc_region_t region;
5416         rdatasetheader_t *newheader;
5417         rdatasetheader_t *header;
5418         isc_result_t result;
5419         isc_boolean_t delegating;
5420         isc_boolean_t tree_locked = ISC_FALSE;
5421
5422         REQUIRE(VALID_RBTDB(rbtdb));
5423
5424         if (rbtversion == NULL) {
5425                 if (now == 0)
5426                         isc_stdtime_get(&now);
5427         } else
5428                 now = 0;
5429
5430         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5431                                             &region,
5432                                             sizeof(rdatasetheader_t));
5433         if (result != ISC_R_SUCCESS)
5434                 return (result);
5435
5436         newheader = (rdatasetheader_t *)region.base;
5437         init_rdataset(rbtdb, newheader);
5438         set_ttl(rbtdb, newheader, rdataset->ttl + now);
5439         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5440                                                 rdataset->covers);
5441         newheader->attributes = 0;
5442         newheader->noqname = NULL;
5443         newheader->count = init_count++;
5444         newheader->trust = rdataset->trust;
5445         newheader->additional_auth = NULL;
5446         newheader->additional_glue = NULL;
5447         newheader->last_used = now;
5448         newheader->node = rbtnode;
5449         if (rbtversion != NULL) {
5450                 newheader->serial = rbtversion->serial;
5451                 now = 0;
5452         } else {
5453                 newheader->serial = 1;
5454                 if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0)
5455                         newheader->attributes |= RDATASET_ATTR_NXDOMAIN;
5456                 if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
5457                         result = addnoqname(rbtdb, newheader, rdataset);
5458                         if (result != ISC_R_SUCCESS) {
5459                                 free_rdataset(rbtdb, rbtdb->common.mctx,
5460                                               newheader);
5461                                 return (result);
5462                         }
5463                 }
5464         }
5465
5466         /*
5467          * If we're adding a delegation type (e.g. NS or DNAME for a zone,
5468          * just DNAME for the cache), then we need to set the callback bit
5469          * on the node.
5470          */
5471         if (delegating_type(rbtdb, rbtnode, rdataset->type))
5472                 delegating = ISC_TRUE;
5473         else
5474                 delegating = ISC_FALSE;
5475
5476         /*
5477          * If we're adding a delegation type or the DB is a cache in an overmem
5478          * state, hold an exclusive lock on the tree.  In the latter case
5479          * the lock does not necessarily have to be acquired but it will help
5480          * purge stale entries more effectively.
5481          */
5482         if (delegating || (IS_CACHE(rbtdb) && rbtdb->overmem)) {
5483                 tree_locked = ISC_TRUE;
5484                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
5485         }
5486
5487         if (IS_CACHE(rbtdb) && rbtdb->overmem)
5488                 overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked);
5489
5490         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5491                   isc_rwlocktype_write);
5492
5493         if (rbtdb->rrsetstats != NULL) {
5494                 newheader->attributes |= RDATASET_ATTR_STATCOUNT;
5495                 update_rrsetstats(rbtdb, newheader, ISC_TRUE);
5496         }
5497
5498         if (IS_CACHE(rbtdb)) {
5499                 if (tree_locked)
5500                         cleanup_dead_nodes(rbtdb, rbtnode->locknum);
5501
5502                 header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
5503                 if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL)
5504                         expire_header(rbtdb, header, tree_locked);
5505
5506                 /*
5507                  * If we've been holding a write lock on the tree just for
5508                  * cleaning, we can release it now.  However, we still need the
5509                  * node lock.
5510                  */
5511                 if (tree_locked && !delegating) {
5512                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
5513                         tree_locked = ISC_FALSE;
5514                 }
5515         }
5516
5517         result = add(rbtdb, rbtnode, rbtversion, newheader, options, ISC_FALSE,
5518                      addedrdataset, now);
5519         if (result == ISC_R_SUCCESS && delegating)
5520                 rbtnode->find_callback = 1;
5521
5522         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5523                     isc_rwlocktype_write);
5524
5525         if (tree_locked)
5526                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
5527
5528         /*
5529          * Update the zone's secure status.  If version is non-NULL
5530          * this is deferred until closeversion() is called.
5531          */
5532         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5533                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5534
5535         return (result);
5536 }
5537
5538 static isc_result_t
5539 subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5540                  dns_rdataset_t *rdataset, unsigned int options,
5541                  dns_rdataset_t *newrdataset)
5542 {
5543         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5544         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5545         rbtdb_version_t *rbtversion = version;
5546         rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
5547         unsigned char *subresult;
5548         isc_region_t region;
5549         isc_result_t result;
5550         rbtdb_changed_t *changed;
5551
5552         REQUIRE(VALID_RBTDB(rbtdb));
5553
5554         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5555                                             &region,
5556                                             sizeof(rdatasetheader_t));
5557         if (result != ISC_R_SUCCESS)
5558                 return (result);
5559         newheader = (rdatasetheader_t *)region.base;
5560         init_rdataset(rbtdb, newheader);
5561         set_ttl(rbtdb, newheader, rdataset->ttl);
5562         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5563                                                 rdataset->covers);
5564         newheader->attributes = 0;
5565         newheader->serial = rbtversion->serial;
5566         newheader->trust = 0;
5567         newheader->noqname = NULL;
5568         newheader->count = init_count++;
5569         newheader->additional_auth = NULL;
5570         newheader->additional_glue = NULL;
5571         newheader->last_used = 0;
5572         newheader->node = rbtnode;
5573
5574         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5575                   isc_rwlocktype_write);
5576
5577         changed = add_changed(rbtdb, rbtversion, rbtnode);
5578         if (changed == NULL) {
5579                 free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5580                 NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5581                             isc_rwlocktype_write);
5582                 return (ISC_R_NOMEMORY);
5583         }
5584
5585         topheader_prev = NULL;
5586         for (topheader = rbtnode->data;
5587              topheader != NULL;
5588              topheader = topheader->next) {
5589                 if (topheader->type == newheader->type)
5590                         break;
5591                 topheader_prev = topheader;
5592         }
5593         /*
5594          * If header isn't NULL, we've found the right type.  There may be
5595          * IGNORE rdatasets between the top of the chain and the first real
5596          * data.  We skip over them.
5597          */
5598         header = topheader;
5599         while (header != NULL && IGNORE(header))
5600                 header = header->down;
5601         if (header != NULL && EXISTS(header)) {
5602                 unsigned int flags = 0;
5603                 subresult = NULL;
5604                 result = ISC_R_SUCCESS;
5605                 if ((options & DNS_DBSUB_EXACT) != 0) {
5606                         flags |= DNS_RDATASLAB_EXACT;
5607                         if (newheader->rdh_ttl != header->rdh_ttl)
5608                                 result = DNS_R_NOTEXACT;
5609                 }
5610                 if (result == ISC_R_SUCCESS)
5611                         result = dns_rdataslab_subtract(
5612                                         (unsigned char *)header,
5613                                         (unsigned char *)newheader,
5614                                         (unsigned int)(sizeof(*newheader)),
5615                                         rbtdb->common.mctx,
5616                                         rbtdb->common.rdclass,
5617                                         (dns_rdatatype_t)header->type,
5618                                         flags, &subresult);
5619                 if (result == ISC_R_SUCCESS) {
5620                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5621                         newheader = (rdatasetheader_t *)subresult;
5622                         /*
5623                          * We have to set the serial since the rdataslab
5624                          * subtraction routine copies the reserved portion of
5625                          * header, not newheader.
5626                          */
5627                         newheader->serial = rbtversion->serial;
5628                         /*
5629                          * XXXJT: dns_rdataslab_subtract() copied the pointers
5630                          * to additional info.  We need to clear these fields
5631                          * to avoid having duplicated references.
5632                          */
5633                         newheader->additional_auth = NULL;
5634                         newheader->additional_glue = NULL;
5635                 } else if (result == DNS_R_NXRRSET) {
5636                         /*
5637                          * This subtraction would remove all of the rdata;
5638                          * add a nonexistent header instead.
5639                          */
5640                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5641                         newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
5642                         if (newheader == NULL) {
5643                                 result = ISC_R_NOMEMORY;
5644                                 goto unlock;
5645                         }
5646                         set_ttl(rbtdb, newheader, 0);
5647                         newheader->type = topheader->type;
5648                         newheader->attributes = RDATASET_ATTR_NONEXISTENT;
5649                         newheader->trust = 0;
5650                         newheader->serial = rbtversion->serial;
5651                         newheader->noqname = NULL;
5652                         newheader->count = 0;
5653                         newheader->additional_auth = NULL;
5654                         newheader->additional_glue = NULL;
5655                         newheader->node = rbtnode;
5656                         newheader->last_used = 0;
5657                 } else {
5658                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5659                         goto unlock;
5660                 }
5661
5662                 /*
5663                  * If we're here, we want to link newheader in front of
5664                  * topheader.
5665                  */
5666                 INSIST(rbtversion->serial >= topheader->serial);
5667                 if (topheader_prev != NULL)
5668                         topheader_prev->next = newheader;
5669                 else
5670                         rbtnode->data = newheader;
5671                 newheader->next = topheader->next;
5672                 newheader->down = topheader;
5673                 topheader->next = newheader;
5674                 rbtnode->dirty = 1;
5675                 changed->dirty = ISC_TRUE;
5676         } else {
5677                 /*
5678                  * The rdataset doesn't exist, so we don't need to do anything
5679                  * to satisfy the deletion request.
5680                  */
5681                 free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5682                 if ((options & DNS_DBSUB_EXACT) != 0)
5683                         result = DNS_R_NOTEXACT;
5684                 else
5685                         result = DNS_R_UNCHANGED;
5686         }
5687
5688         if (result == ISC_R_SUCCESS && newrdataset != NULL)
5689                 bind_rdataset(rbtdb, rbtnode, newheader, 0, newrdataset);
5690
5691  unlock:
5692         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5693                     isc_rwlocktype_write);
5694
5695         /*
5696          * Update the zone's secure status.  If version is non-NULL
5697          * this is deferred until closeversion() is called.
5698          */
5699         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5700                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5701
5702         return (result);
5703 }
5704
5705 static isc_result_t
5706 deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5707                dns_rdatatype_t type, dns_rdatatype_t covers)
5708 {
5709         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5710         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5711         rbtdb_version_t *rbtversion = version;
5712         isc_result_t result;
5713         rdatasetheader_t *newheader;
5714
5715         REQUIRE(VALID_RBTDB(rbtdb));
5716
5717         if (type == dns_rdatatype_any)
5718                 return (ISC_R_NOTIMPLEMENTED);
5719         if (type == dns_rdatatype_rrsig && covers == 0)
5720                 return (ISC_R_NOTIMPLEMENTED);
5721
5722         newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
5723         if (newheader == NULL)
5724                 return (ISC_R_NOMEMORY);
5725         set_ttl(rbtdb, newheader, 0);
5726         newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
5727         newheader->attributes = RDATASET_ATTR_NONEXISTENT;
5728         newheader->trust = 0;
5729         newheader->noqname = NULL;
5730         newheader->additional_auth = NULL;
5731         newheader->additional_glue = NULL;
5732         if (rbtversion != NULL)
5733                 newheader->serial = rbtversion->serial;
5734         else
5735                 newheader->serial = 0;
5736         newheader->count = 0;
5737         newheader->last_used = 0;
5738         newheader->node = rbtnode;
5739
5740         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5741                   isc_rwlocktype_write);
5742
5743         result = add(rbtdb, rbtnode, rbtversion, newheader, DNS_DBADD_FORCE,
5744                      ISC_FALSE, NULL, 0);
5745
5746         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5747                     isc_rwlocktype_write);
5748
5749         /*
5750          * Update the zone's secure status.  If version is non-NULL
5751          * this is deferred until closeversion() is called.
5752          */
5753         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5754                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5755
5756         return (result);
5757 }
5758
5759 static isc_result_t
5760 loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) {
5761         rbtdb_load_t *loadctx = arg;
5762         dns_rbtdb_t *rbtdb = loadctx->rbtdb;
5763         dns_rbtnode_t *node;
5764         isc_result_t result;
5765         isc_region_t region;
5766         rdatasetheader_t *newheader;
5767
5768         /*
5769          * This routine does no node locking.  See comments in
5770          * 'load' below for more information on loading and
5771          * locking.
5772          */
5773
5774
5775         /*
5776          * SOA records are only allowed at top of zone.
5777          */
5778         if (rdataset->type == dns_rdatatype_soa &&
5779             !IS_CACHE(rbtdb) && !dns_name_equal(name, &rbtdb->common.origin))
5780                 return (DNS_R_NOTZONETOP);
5781
5782         add_empty_wildcards(rbtdb, name);
5783
5784         if (dns_name_iswildcard(name)) {
5785                 /*
5786                  * NS record owners cannot legally be wild cards.
5787                  */
5788                 if (rdataset->type == dns_rdatatype_ns)
5789                         return (DNS_R_INVALIDNS);
5790                 result = add_wildcard_magic(rbtdb, name);
5791                 if (result != ISC_R_SUCCESS)
5792                         return (result);
5793         }
5794
5795         node = NULL;
5796         result = dns_rbt_addnode(rbtdb->tree, name, &node);
5797         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
5798                 return (result);
5799         if (result != ISC_R_EXISTS) {
5800                 dns_name_t foundname;
5801                 dns_name_init(&foundname, NULL);
5802                 dns_rbt_namefromnode(node, &foundname);
5803 #ifdef DNS_RBT_USEHASH
5804                 node->locknum = node->hashval % rbtdb->node_lock_count;
5805 #else
5806                 node->locknum = dns_name_hash(&foundname, ISC_TRUE) %
5807                         rbtdb->node_lock_count;
5808 #endif
5809         }
5810
5811         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5812                                             &region,
5813                                             sizeof(rdatasetheader_t));
5814         if (result != ISC_R_SUCCESS)
5815                 return (result);
5816         newheader = (rdatasetheader_t *)region.base;
5817         init_rdataset(rbtdb, newheader);
5818         set_ttl(rbtdb, newheader,
5819                 rdataset->ttl + loadctx->now); /* XXX overflow check */
5820         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5821                                                 rdataset->covers);
5822         newheader->attributes = 0;
5823         newheader->trust = rdataset->trust;
5824         newheader->serial = 1;
5825         newheader->noqname = NULL;
5826         newheader->count = init_count++;
5827         newheader->additional_auth = NULL;
5828         newheader->additional_glue = NULL;
5829         /* won't be used, but initialize anyway */
5830         newheader->last_used = 0;
5831         newheader->node = node;
5832
5833         result = add(rbtdb, node, rbtdb->current_version, newheader,
5834                      DNS_DBADD_MERGE, ISC_TRUE, NULL, 0);
5835         if (result == ISC_R_SUCCESS &&
5836             delegating_type(rbtdb, node, rdataset->type))
5837                 node->find_callback = 1;
5838         else if (result == DNS_R_UNCHANGED)
5839                 result = ISC_R_SUCCESS;
5840
5841         return (result);
5842 }
5843
5844 static isc_result_t
5845 beginload(dns_db_t *db, dns_addrdatasetfunc_t *addp, dns_dbload_t **dbloadp) {
5846         rbtdb_load_t *loadctx;
5847         dns_rbtdb_t *rbtdb;
5848
5849         rbtdb = (dns_rbtdb_t *)db;
5850
5851         REQUIRE(VALID_RBTDB(rbtdb));
5852
5853         loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));
5854         if (loadctx == NULL)
5855                 return (ISC_R_NOMEMORY);
5856
5857         loadctx->rbtdb = rbtdb;
5858         if (IS_CACHE(rbtdb))
5859                 isc_stdtime_get(&loadctx->now);
5860         else
5861                 loadctx->now = 0;
5862
5863         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5864
5865         REQUIRE((rbtdb->attributes & (RBTDB_ATTR_LOADED|RBTDB_ATTR_LOADING))
5866                 == 0);
5867         rbtdb->attributes |= RBTDB_ATTR_LOADING;
5868
5869         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5870
5871         *addp = loading_addrdataset;
5872         *dbloadp = loadctx;
5873
5874         return (ISC_R_SUCCESS);
5875 }
5876
5877 static isc_result_t
5878 endload(dns_db_t *db, dns_dbload_t **dbloadp) {
5879         rbtdb_load_t *loadctx;
5880         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5881
5882         REQUIRE(VALID_RBTDB(rbtdb));
5883         REQUIRE(dbloadp != NULL);
5884         loadctx = *dbloadp;
5885         REQUIRE(loadctx->rbtdb == rbtdb);
5886
5887         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5888
5889         REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
5890         REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);
5891
5892         rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
5893         rbtdb->attributes |= RBTDB_ATTR_LOADED;
5894
5895         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5896
5897         /*
5898          * If there's a KEY rdataset at the zone origin containing a
5899          * zone key, we consider the zone secure.
5900          */
5901         if (! IS_CACHE(rbtdb))
5902                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5903
5904         *dbloadp = NULL;
5905
5906         isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));
5907
5908         return (ISC_R_SUCCESS);
5909 }
5910
5911 static isc_result_t
5912 dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
5913      dns_masterformat_t masterformat) {
5914         dns_rbtdb_t *rbtdb;
5915
5916         rbtdb = (dns_rbtdb_t *)db;
5917
5918         REQUIRE(VALID_RBTDB(rbtdb));
5919
5920         return (dns_master_dump2(rbtdb->common.mctx, db, version,
5921                                  &dns_master_style_default,
5922                                  filename, masterformat));
5923 }
5924
5925 static void
5926 delete_callback(void *data, void *arg) {
5927         dns_rbtdb_t *rbtdb = arg;
5928         rdatasetheader_t *current, *next;
5929
5930         for (current = data; current != NULL; current = next) {
5931                 next = current->next;
5932                 free_rdataset(rbtdb, rbtdb->common.mctx, current);
5933         }
5934 }
5935
5936 static isc_boolean_t
5937 issecure(dns_db_t *db) {
5938         dns_rbtdb_t *rbtdb;
5939         isc_boolean_t secure;
5940
5941         rbtdb = (dns_rbtdb_t *)db;
5942
5943         REQUIRE(VALID_RBTDB(rbtdb));
5944
5945         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5946         secure = rbtdb->secure;
5947         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5948
5949         return (secure);
5950 }
5951
5952 static unsigned int
5953 nodecount(dns_db_t *db) {
5954         dns_rbtdb_t *rbtdb;
5955         unsigned int count;
5956
5957         rbtdb = (dns_rbtdb_t *)db;
5958
5959         REQUIRE(VALID_RBTDB(rbtdb));
5960
5961         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5962         count = dns_rbt_nodecount(rbtdb->tree);
5963         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5964
5965         return (count);
5966 }
5967
5968 static void
5969 settask(dns_db_t *db, isc_task_t *task) {
5970         dns_rbtdb_t *rbtdb;
5971
5972         rbtdb = (dns_rbtdb_t *)db;
5973
5974         REQUIRE(VALID_RBTDB(rbtdb));
5975
5976         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5977         if (rbtdb->task != NULL)
5978                 isc_task_detach(&rbtdb->task);
5979         if (task != NULL)
5980                 isc_task_attach(task, &rbtdb->task);
5981         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5982 }
5983
5984 static isc_boolean_t
5985 ispersistent(dns_db_t *db) {
5986         UNUSED(db);
5987         return (ISC_FALSE);
5988 }
5989
5990 static isc_result_t
5991 getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
5992         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5993         dns_rbtnode_t *onode;
5994         isc_result_t result = ISC_R_SUCCESS;
5995
5996         REQUIRE(VALID_RBTDB(rbtdb));
5997         REQUIRE(nodep != NULL && *nodep == NULL);
5998
5999         /* Note that the access to origin_node doesn't require a DB lock */
6000         onode = (dns_rbtnode_t *)rbtdb->origin_node;
6001         if (onode != NULL) {
6002                 NODE_STRONGLOCK(&rbtdb->node_locks[onode->locknum].lock);
6003                 new_reference(rbtdb, onode);
6004                 NODE_STRONGUNLOCK(&rbtdb->node_locks[onode->locknum].lock);
6005
6006                 *nodep = rbtdb->origin_node;
6007         } else {
6008                 INSIST(!IS_CACHE(rbtdb));
6009                 result = ISC_R_NOTFOUND;
6010         }
6011
6012         return (result);
6013 }
6014
6015 static dns_stats_t *
6016 getrrsetstats(dns_db_t *db) {
6017         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6018
6019         REQUIRE(VALID_RBTDB(rbtdb));
6020         REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
6021
6022         return (rbtdb->rrsetstats);
6023 }
6024
6025 static dns_dbmethods_t zone_methods = {
6026         attach,
6027         detach,
6028         beginload,
6029         endload,
6030         dump,
6031         currentversion,
6032         newversion,
6033         attachversion,
6034         closeversion,
6035         findnode,
6036         zone_find,
6037         zone_findzonecut,
6038         attachnode,
6039         detachnode,
6040         expirenode,
6041         printnode,
6042         createiterator,
6043         zone_findrdataset,
6044         allrdatasets,
6045         addrdataset,
6046         subtractrdataset,
6047         deleterdataset,
6048         issecure,
6049         nodecount,
6050         ispersistent,
6051         overmem,
6052         settask,
6053         getoriginnode,
6054         NULL,
6055         NULL
6056 };
6057
6058 static dns_dbmethods_t cache_methods = {
6059         attach,
6060         detach,
6061         beginload,
6062         endload,
6063         dump,
6064         currentversion,
6065         newversion,
6066         attachversion,
6067         closeversion,
6068         findnode,
6069         cache_find,
6070         cache_findzonecut,
6071         attachnode,
6072         detachnode,
6073         expirenode,
6074         printnode,
6075         createiterator,
6076         cache_findrdataset,
6077         allrdatasets,
6078         addrdataset,
6079         subtractrdataset,
6080         deleterdataset,
6081         issecure,
6082         nodecount,
6083         ispersistent,
6084         overmem,
6085         settask,
6086         getoriginnode,
6087         NULL,
6088         getrrsetstats
6089 };
6090
6091 isc_result_t
6092 #ifdef DNS_RBTDB_VERSION64
6093 dns_rbtdb64_create
6094 #else
6095 dns_rbtdb_create
6096 #endif
6097                 (isc_mem_t *mctx, dns_name_t *origin, dns_dbtype_t type,
6098                  dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
6099                  void *driverarg, dns_db_t **dbp)
6100 {
6101         dns_rbtdb_t *rbtdb;
6102         isc_result_t result;
6103         int i;
6104         dns_name_t name;
6105
6106         /* Keep the compiler happy. */
6107         UNUSED(argc);
6108         UNUSED(argv);
6109         UNUSED(driverarg);
6110
6111         rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
6112         if (rbtdb == NULL)
6113                 return (ISC_R_NOMEMORY);
6114
6115         memset(rbtdb, '\0', sizeof(*rbtdb));
6116         dns_name_init(&rbtdb->common.origin, NULL);
6117         rbtdb->common.attributes = 0;
6118         if (type == dns_dbtype_cache) {
6119                 rbtdb->common.methods = &cache_methods;
6120                 rbtdb->common.attributes |= DNS_DBATTR_CACHE;
6121         } else if (type == dns_dbtype_stub) {
6122                 rbtdb->common.methods = &zone_methods;
6123                 rbtdb->common.attributes |= DNS_DBATTR_STUB;
6124         } else
6125                 rbtdb->common.methods = &zone_methods;
6126         rbtdb->common.rdclass = rdclass;
6127         rbtdb->common.mctx = NULL;
6128
6129         result = RBTDB_INITLOCK(&rbtdb->lock);
6130         if (result != ISC_R_SUCCESS)
6131                 goto cleanup_rbtdb;
6132
6133         result = isc_rwlock_init(&rbtdb->tree_lock, 0, 0);
6134         if (result != ISC_R_SUCCESS)
6135                 goto cleanup_lock;
6136
6137         /*
6138          * Initialize node_lock_count in a generic way to support future
6139          * extension which allows the user to specify this value on creation.
6140          * Note that when specified for a cache DB it must be larger than 1
6141          * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
6142          */
6143         if (rbtdb->node_lock_count == 0) {
6144                 if (IS_CACHE(rbtdb))
6145                         rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
6146                 else
6147                         rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
6148         } else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
6149                 result = ISC_R_RANGE;
6150                 goto cleanup_tree_lock;
6151         }
6152         INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
6153         rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
6154                                         sizeof(rbtdb_nodelock_t));
6155         if (rbtdb->node_locks == NULL) {
6156                 result = ISC_R_NOMEMORY;
6157                 goto cleanup_tree_lock;
6158         }
6159
6160         rbtdb->rrsetstats = NULL;
6161         if (IS_CACHE(rbtdb)) {
6162                 result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
6163                 if (result != ISC_R_SUCCESS)
6164                         goto cleanup_node_locks;
6165                 rbtdb->rdatasets = isc_mem_get(mctx, rbtdb->node_lock_count *
6166                                                sizeof(rdatasetheaderlist_t));
6167                 if (rbtdb->rdatasets == NULL) {
6168                         result = ISC_R_NOMEMORY;
6169                         goto cleanup_rrsetstats;
6170                 }
6171                 for (i = 0; i < (int)rbtdb->node_lock_count; i++)
6172                         ISC_LIST_INIT(rbtdb->rdatasets[i]);
6173
6174                 /*
6175                  * Create the heaps.
6176                  */
6177                 rbtdb->heaps = isc_mem_get(mctx, rbtdb->node_lock_count *
6178                                            sizeof(isc_heap_t *));
6179                 if (rbtdb->heaps == NULL) {
6180                         result = ISC_R_NOMEMORY;
6181                         goto cleanup_rdatasets;
6182                 }
6183                 for (i = 0; i < (int)rbtdb->node_lock_count; i++)
6184                         rbtdb->heaps[i] = NULL;
6185                 for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
6186                         result = isc_heap_create(mctx, ttl_sooner,
6187                                                  ttl_set_index, 0,
6188                                                  &rbtdb->heaps[i]);
6189                         if (result != ISC_R_SUCCESS)
6190                                 goto cleanup_heaps;
6191                 }
6192         } else {
6193                 rbtdb->rdatasets = NULL;
6194                 rbtdb->heaps = NULL;
6195         }
6196
6197         rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
6198                                        sizeof(rbtnodelist_t));
6199         if (rbtdb->deadnodes == NULL) {
6200                 result = ISC_R_NOMEMORY;
6201                 goto cleanup_heaps;
6202         }
6203         for (i = 0; i < (int)rbtdb->node_lock_count; i++)
6204                 ISC_LIST_INIT(rbtdb->deadnodes[i]);
6205
6206         rbtdb->active = rbtdb->node_lock_count;
6207
6208         for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
6209                 result = NODE_INITLOCK(&rbtdb->node_locks[i].lock);
6210                 if (result == ISC_R_SUCCESS) {
6211                         result = isc_refcount_init(&rbtdb->node_locks[i].references, 0);
6212                         if (result != ISC_R_SUCCESS)
6213                                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
6214                 }
6215                 if (result != ISC_R_SUCCESS) {
6216                         while (i-- > 0) {
6217                                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
6218                                 isc_refcount_decrement(&rbtdb->node_locks[i].references, NULL);
6219                                 isc_refcount_destroy(&rbtdb->node_locks[i].references);
6220                         }
6221                         goto cleanup_deadnodes;
6222                 }
6223                 rbtdb->node_locks[i].exiting = ISC_FALSE;
6224         }
6225
6226         /*
6227          * Attach to the mctx.  The database will persist so long as there
6228          * are references to it, and attaching to the mctx ensures that our
6229          * mctx won't disappear out from under us.
6230          */
6231         isc_mem_attach(mctx, &rbtdb->common.mctx);
6232
6233         /*
6234          * Must be initialized before free_rbtdb() is called.
6235          */
6236         isc_ondestroy_init(&rbtdb->common.ondest);
6237
6238         /*
6239          * Make a copy of the origin name.
6240          */
6241         result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
6242         if (result != ISC_R_SUCCESS) {
6243                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6244                 return (result);
6245         }
6246
6247         /*
6248          * Make the Red-Black Tree.
6249          */
6250         result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
6251         if (result != ISC_R_SUCCESS) {
6252                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6253                 return (result);
6254         }
6255         /*
6256          * In order to set the node callback bit correctly in zone databases,
6257          * we need to know if the node has the origin name of the zone.
6258          * In loading_addrdataset() we could simply compare the new name
6259          * to the origin name, but this is expensive.  Also, we don't know the
6260          * node name in addrdataset(), so we need another way of knowing the
6261          * zone's top.
6262          *
6263          * We now explicitly create a node for the zone's origin, and then
6264          * we simply remember the node's address.  This is safe, because
6265          * the top-of-zone node can never be deleted, nor can its address
6266          * change.
6267          */
6268         if (!IS_CACHE(rbtdb)) {
6269                 rbtdb->origin_node = NULL;
6270                 result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
6271                                          &rbtdb->origin_node);
6272                 if (result != ISC_R_SUCCESS) {
6273                         INSIST(result != ISC_R_EXISTS);
6274                         free_rbtdb(rbtdb, ISC_FALSE, NULL);
6275                         return (result);
6276                 }
6277                 /*
6278                  * We need to give the origin node the right locknum.
6279                  */
6280                 dns_name_init(&name, NULL);
6281                 dns_rbt_namefromnode(rbtdb->origin_node, &name);
6282 #ifdef DNS_RBT_USEHASH
6283                 rbtdb->origin_node->locknum =
6284                         rbtdb->origin_node->hashval %
6285                         rbtdb->node_lock_count;
6286 #else
6287                 rbtdb->origin_node->locknum =
6288                         dns_name_hash(&name, ISC_TRUE) %
6289                         rbtdb->node_lock_count;
6290 #endif
6291         }
6292
6293         /*
6294          * Misc. Initialization.
6295          */
6296         result = isc_refcount_init(&rbtdb->references, 1);
6297         if (result != ISC_R_SUCCESS) {
6298                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6299                 return (result);
6300         }
6301         rbtdb->attributes = 0;
6302         rbtdb->secure = ISC_FALSE;
6303         rbtdb->overmem = ISC_FALSE;
6304         rbtdb->task = NULL;
6305
6306         /*
6307          * Version Initialization.
6308          */
6309         rbtdb->current_serial = 1;
6310         rbtdb->least_serial = 1;
6311         rbtdb->next_serial = 2;
6312         rbtdb->current_version = allocate_version(mctx, 1, 1, ISC_FALSE);
6313         if (rbtdb->current_version == NULL) {
6314                 isc_refcount_decrement(&rbtdb->references, NULL);
6315                 isc_refcount_destroy(&rbtdb->references);
6316                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6317                 return (ISC_R_NOMEMORY);
6318         }
6319         rbtdb->future_version = NULL;
6320         ISC_LIST_INIT(rbtdb->open_versions);
6321         /*
6322          * Keep the current version in the open list so that list operation
6323          * won't happen in normal lookup operations.
6324          */
6325         PREPEND(rbtdb->open_versions, rbtdb->current_version, link);
6326
6327         rbtdb->common.magic = DNS_DB_MAGIC;
6328         rbtdb->common.impmagic = RBTDB_MAGIC;
6329
6330         *dbp = (dns_db_t *)rbtdb;
6331
6332         return (ISC_R_SUCCESS);
6333
6334  cleanup_deadnodes:
6335         isc_mem_put(mctx, rbtdb->deadnodes,
6336                     rbtdb->node_lock_count * sizeof(rbtnodelist_t));
6337
6338  cleanup_heaps:
6339         if (rbtdb->heaps != NULL) {
6340                 for (i = 0 ; i < (int)rbtdb->node_lock_count ; i++)
6341                         if (rbtdb->heaps[i] != NULL)
6342                                 isc_heap_destroy(&rbtdb->heaps[i]);
6343                 isc_mem_put(mctx, rbtdb->heaps,
6344                             rbtdb->node_lock_count * sizeof(isc_heap_t *));
6345         }
6346
6347  cleanup_rdatasets:
6348         if (rbtdb->rdatasets != NULL)
6349                 isc_mem_put(mctx, rbtdb->rdatasets, rbtdb->node_lock_count *
6350                             sizeof(rdatasetheaderlist_t));
6351  cleanup_rrsetstats:
6352         if (rbtdb->rrsetstats != NULL)
6353                 dns_stats_detach(&rbtdb->rrsetstats);
6354
6355  cleanup_node_locks:
6356         isc_mem_put(mctx, rbtdb->node_locks,
6357                     rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
6358
6359  cleanup_tree_lock:
6360         isc_rwlock_destroy(&rbtdb->tree_lock);
6361
6362  cleanup_lock:
6363         RBTDB_DESTROYLOCK(&rbtdb->lock);
6364
6365  cleanup_rbtdb:
6366         isc_mem_put(mctx, rbtdb,  sizeof(*rbtdb));
6367         return (result);
6368 }
6369
6370
6371 /*
6372  * Slabbed Rdataset Methods
6373  */
6374
6375 static void
6376 rdataset_disassociate(dns_rdataset_t *rdataset) {
6377         dns_db_t *db = rdataset->private1;
6378         dns_dbnode_t *node = rdataset->private2;
6379
6380         detachnode(db, &node);
6381 }
6382
6383 static isc_result_t
6384 rdataset_first(dns_rdataset_t *rdataset) {
6385         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
6386         unsigned int count;
6387
6388         count = raw[0] * 256 + raw[1];
6389         if (count == 0) {
6390                 rdataset->private5 = NULL;
6391                 return (ISC_R_NOMORE);
6392         }
6393
6394 #if DNS_RDATASET_FIXED
6395         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0)
6396                 raw += 2 + (4 * count);
6397         else
6398 #endif
6399                 raw += 2;
6400
6401         /*
6402          * The privateuint4 field is the number of rdata beyond the
6403          * cursor position, so we decrement the total count by one
6404          * before storing it.
6405          *
6406          * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the
6407          * first record.  If DNS_RDATASETATTR_LOADORDER is set 'raw' points
6408          * to the first entry in the offset table.
6409          */
6410         count--;
6411         rdataset->privateuint4 = count;
6412         rdataset->private5 = raw;
6413
6414         return (ISC_R_SUCCESS);
6415 }
6416
6417 static isc_result_t
6418 rdataset_next(dns_rdataset_t *rdataset) {
6419         unsigned int count;
6420         unsigned int length;
6421         unsigned char *raw;     /* RDATASLAB */
6422
6423         count = rdataset->privateuint4;
6424         if (count == 0)
6425                 return (ISC_R_NOMORE);
6426         count--;
6427         rdataset->privateuint4 = count;
6428
6429         /*
6430          * Skip forward one record (length + 4) or one offset (4).
6431          */
6432         raw = rdataset->private5;
6433 #if DNS_RDATASET_FIXED
6434         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) {
6435 #endif
6436                 length = raw[0] * 256 + raw[1];
6437                 raw += length;
6438 #if DNS_RDATASET_FIXED
6439         }
6440         rdataset->private5 = raw + 4;           /* length(2) + order(2) */
6441 #else
6442         rdataset->private5 = raw + 2;           /* length(2) */
6443 #endif
6444
6445         return (ISC_R_SUCCESS);
6446 }
6447
6448 static void
6449 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) {
6450         unsigned char *raw = rdataset->private5;        /* RDATASLAB */
6451 #if DNS_RDATASET_FIXED
6452         unsigned int offset;
6453 #endif
6454         isc_region_t r;
6455
6456         REQUIRE(raw != NULL);
6457
6458         /*
6459          * Find the start of the record if not already in private5
6460          * then skip the length and order fields.
6461          */
6462 #if DNS_RDATASET_FIXED
6463         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) {
6464                 offset = (raw[0] << 24) + (raw[1] << 16) +
6465                          (raw[2] << 8) + raw[3];
6466                 raw = rdataset->private3;
6467                 raw += offset;
6468         }
6469 #endif
6470         r.length = raw[0] * 256 + raw[1];
6471
6472 #if DNS_RDATASET_FIXED
6473         raw += 4;
6474 #else
6475         raw += 2;
6476 #endif
6477         r.base = raw;
6478         dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r);
6479 }
6480
6481 static void
6482 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) {
6483         dns_db_t *db = source->private1;
6484         dns_dbnode_t *node = source->private2;
6485         dns_dbnode_t *cloned_node = NULL;
6486
6487         attachnode(db, node, &cloned_node);
6488         *target = *source;
6489
6490         /*
6491          * Reset iterator state.
6492          */
6493         target->privateuint4 = 0;
6494         target->private5 = NULL;
6495 }
6496
6497 static unsigned int
6498 rdataset_count(dns_rdataset_t *rdataset) {
6499         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
6500         unsigned int count;
6501
6502         count = raw[0] * 256 + raw[1];
6503
6504         return (count);
6505 }
6506
6507 static isc_result_t
6508 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
6509                     dns_rdataset_t *nsec, dns_rdataset_t *nsecsig)
6510 {
6511         dns_db_t *db = rdataset->private1;
6512         dns_dbnode_t *node = rdataset->private2;
6513         dns_dbnode_t *cloned_node;
6514         struct noqname *noqname = rdataset->private6;
6515
6516         cloned_node = NULL;
6517         attachnode(db, node, &cloned_node);
6518         nsec->methods = &rdataset_methods;
6519         nsec->rdclass = db->rdclass;
6520         nsec->type = dns_rdatatype_nsec;
6521         nsec->covers = 0;
6522         nsec->ttl = rdataset->ttl;
6523         nsec->trust = rdataset->trust;
6524         nsec->private1 = rdataset->private1;
6525         nsec->private2 = rdataset->private2;
6526         nsec->private3 = noqname->nsec;
6527         nsec->privateuint4 = 0;
6528         nsec->private5 = NULL;
6529         nsec->private6 = NULL;
6530
6531         cloned_node = NULL;
6532         attachnode(db, node, &cloned_node);
6533         nsecsig->methods = &rdataset_methods;
6534         nsecsig->rdclass = db->rdclass;
6535         nsecsig->type = dns_rdatatype_rrsig;
6536         nsecsig->covers = dns_rdatatype_nsec;
6537         nsecsig->ttl = rdataset->ttl;
6538         nsecsig->trust = rdataset->trust;
6539         nsecsig->private1 = rdataset->private1;
6540         nsecsig->private2 = rdataset->private2;
6541         nsecsig->private3 = noqname->nsecsig;
6542         nsecsig->privateuint4 = 0;
6543         nsecsig->private5 = NULL;
6544         nsec->private6 = NULL;
6545
6546         dns_name_clone(&noqname->name, name);
6547
6548         return (ISC_R_SUCCESS);
6549 }
6550
6551 /*
6552  * Rdataset Iterator Methods
6553  */
6554
6555 static void
6556 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) {
6557         rbtdb_rdatasetiter_t *rbtiterator;
6558
6559         rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
6560
6561         if (rbtiterator->common.version != NULL)
6562                 closeversion(rbtiterator->common.db,
6563                              &rbtiterator->common.version, ISC_FALSE);
6564         detachnode(rbtiterator->common.db, &rbtiterator->common.node);
6565         isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
6566                     sizeof(*rbtiterator));
6567
6568         *iteratorp = NULL;
6569 }
6570
6571 static isc_result_t
6572 rdatasetiter_first(dns_rdatasetiter_t *iterator) {
6573         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6574         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6575         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6576         rbtdb_version_t *rbtversion = rbtiterator->common.version;
6577         rdatasetheader_t *header, *top_next;
6578         rbtdb_serial_t serial;
6579         isc_stdtime_t now;
6580
6581         if (IS_CACHE(rbtdb)) {
6582                 serial = 1;
6583                 now = rbtiterator->common.now;
6584         } else {
6585                 serial = rbtversion->serial;
6586                 now = 0;
6587         }
6588
6589         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6590                   isc_rwlocktype_read);
6591
6592         for (header = rbtnode->data; header != NULL; header = top_next) {
6593                 top_next = header->next;
6594                 do {
6595                         if (header->serial <= serial && !IGNORE(header)) {
6596                                 /*
6597                                  * Is this a "this rdataset doesn't exist"
6598                                  * record?  Or is it too old in the cache?
6599                                  *
6600                                  * Note: unlike everywhere else, we
6601                                  * check for now > header->ttl instead
6602                                  * of now >= header->ttl.  This allows
6603                                  * ANY and RRSIG queries for 0 TTL
6604                                  * rdatasets to work.
6605                                  */
6606                                 if (NONEXISTENT(header) ||
6607                                     (now != 0 && now > header->rdh_ttl))
6608                                         header = NULL;
6609                                 break;
6610                         } else
6611                                 header = header->down;
6612                 } while (header != NULL);
6613                 if (header != NULL)
6614                         break;
6615         }
6616
6617         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6618                     isc_rwlocktype_read);
6619
6620         rbtiterator->current = header;
6621
6622         if (header == NULL)
6623                 return (ISC_R_NOMORE);
6624
6625         return (ISC_R_SUCCESS);
6626 }
6627
6628 static isc_result_t
6629 rdatasetiter_next(dns_rdatasetiter_t *iterator) {
6630         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6631         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6632         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6633         rbtdb_version_t *rbtversion = rbtiterator->common.version;
6634         rdatasetheader_t *header, *top_next;
6635         rbtdb_serial_t serial;
6636         isc_stdtime_t now;
6637         rbtdb_rdatatype_t type, negtype;
6638         dns_rdatatype_t rdtype, covers;
6639
6640         header = rbtiterator->current;
6641         if (header == NULL)
6642                 return (ISC_R_NOMORE);
6643
6644         if (IS_CACHE(rbtdb)) {
6645                 serial = 1;
6646                 now = rbtiterator->common.now;
6647         } else {
6648                 serial = rbtversion->serial;
6649                 now = 0;
6650         }
6651
6652         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6653                   isc_rwlocktype_read);
6654
6655         type = header->type;
6656         rdtype = RBTDB_RDATATYPE_BASE(header->type);
6657         if (rdtype == 0) {
6658                 covers = RBTDB_RDATATYPE_EXT(header->type);
6659                 negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
6660         } else
6661                 negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
6662         for (header = header->next; header != NULL; header = top_next) {
6663                 top_next = header->next;
6664                 /*
6665                  * If not walking back up the down list.
6666                  */
6667                 if (header->type != type && header->type != negtype) {
6668                         do {
6669                                 if (header->serial <= serial &&
6670                                     !IGNORE(header)) {
6671                                         /*
6672                                          * Is this a "this rdataset doesn't
6673                                          * exist" record?
6674                                          *
6675                                          * Note: unlike everywhere else, we
6676                                          * check for now > header->ttl instead
6677                                          * of now >= header->ttl.  This allows
6678                                          * ANY and RRSIG queries for 0 TTL
6679                                          * rdatasets to work.
6680                                          */
6681                                         if ((header->attributes &
6682                                              RDATASET_ATTR_NONEXISTENT) != 0 ||
6683                                             (now != 0 && now > header->rdh_ttl))
6684                                                 header = NULL;
6685                                         break;
6686                                 } else
6687                                         header = header->down;
6688                         } while (header != NULL);
6689                         if (header != NULL)
6690                                 break;
6691                 }
6692         }
6693
6694         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6695                     isc_rwlocktype_read);
6696
6697         rbtiterator->current = header;
6698
6699         if (header == NULL)
6700                 return (ISC_R_NOMORE);
6701
6702         return (ISC_R_SUCCESS);
6703 }
6704
6705 static void
6706 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) {
6707         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6708         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6709         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6710         rdatasetheader_t *header;
6711
6712         header = rbtiterator->current;
6713         REQUIRE(header != NULL);
6714
6715         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6716                   isc_rwlocktype_read);
6717
6718         bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
6719                       rdataset);
6720
6721         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6722                     isc_rwlocktype_read);
6723 }
6724
6725
6726 /*
6727  * Database Iterator Methods
6728  */
6729
6730 static inline void
6731 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
6732         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6733         dns_rbtnode_t *node = rbtdbiter->node;
6734
6735         if (node == NULL)
6736                 return;
6737
6738         INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
6739         reactivate_node(rbtdb, node, rbtdbiter->tree_locked);
6740 }
6741
6742 static inline void
6743 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
6744         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6745         dns_rbtnode_t *node = rbtdbiter->node;
6746         nodelock_t *lock;
6747
6748         if (node == NULL)
6749                 return;
6750
6751         lock = &rbtdb->node_locks[node->locknum].lock;
6752         NODE_LOCK(lock, isc_rwlocktype_read);
6753         decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
6754                             rbtdbiter->tree_locked, ISC_FALSE);
6755         NODE_UNLOCK(lock, isc_rwlocktype_read);
6756
6757         rbtdbiter->node = NULL;
6758 }
6759
6760 static void
6761 flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
6762         dns_rbtnode_t *node;
6763         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6764         isc_boolean_t was_read_locked = ISC_FALSE;
6765         nodelock_t *lock;
6766         int i;
6767
6768         if (rbtdbiter->delete != 0) {
6769                 /*
6770                  * Note that "%d node of %d in tree" can report things like
6771                  * "flush_deletions: 59 nodes of 41 in tree".  This means
6772                  * That some nodes appear on the deletions list more than
6773                  * once.  Only the last occurence will actually be deleted.
6774                  */
6775                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
6776                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
6777                               "flush_deletions: %d nodes of %d in tree",
6778                               rbtdbiter->delete,
6779                               dns_rbt_nodecount(rbtdb->tree));
6780
6781                 if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
6782                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6783                         was_read_locked = ISC_TRUE;
6784                 }
6785                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6786                 rbtdbiter->tree_locked = isc_rwlocktype_write;
6787
6788                 for (i = 0; i < rbtdbiter->delete; i++) {
6789                         node = rbtdbiter->deletions[i];
6790                         lock = &rbtdb->node_locks[node->locknum].lock;
6791
6792                         NODE_LOCK(lock, isc_rwlocktype_read);
6793                         decrement_reference(rbtdb, node, 0,
6794                                             isc_rwlocktype_read,
6795                                             rbtdbiter->tree_locked, ISC_FALSE);
6796                         NODE_UNLOCK(lock, isc_rwlocktype_read);
6797                 }
6798
6799                 rbtdbiter->delete = 0;
6800
6801                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6802                 if (was_read_locked) {
6803                         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6804                         rbtdbiter->tree_locked = isc_rwlocktype_read;
6805
6806                 } else {
6807                         rbtdbiter->tree_locked = isc_rwlocktype_none;
6808                 }
6809         }
6810 }
6811
6812 static inline void
6813 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
6814         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6815
6816         REQUIRE(rbtdbiter->paused);
6817         REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
6818
6819         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6820         rbtdbiter->tree_locked = isc_rwlocktype_read;
6821
6822         rbtdbiter->paused = ISC_FALSE;
6823 }
6824
6825 static void
6826 dbiterator_destroy(dns_dbiterator_t **iteratorp) {
6827         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
6828         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6829         dns_db_t *db = NULL;
6830
6831         if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
6832                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6833                 rbtdbiter->tree_locked = isc_rwlocktype_none;
6834         } else
6835                 INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
6836
6837         dereference_iter_node(rbtdbiter);
6838
6839         flush_deletions(rbtdbiter);
6840
6841         dns_db_attach(rbtdbiter->common.db, &db);
6842         dns_db_detach(&rbtdbiter->common.db);
6843
6844         dns_rbtnodechain_reset(&rbtdbiter->chain);
6845         isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
6846         dns_db_detach(&db);
6847
6848         *iteratorp = NULL;
6849 }
6850
6851 static isc_result_t
6852 dbiterator_first(dns_dbiterator_t *iterator) {
6853         isc_result_t result;
6854         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6855         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6856         dns_name_t *name, *origin;
6857
6858         if (rbtdbiter->result != ISC_R_SUCCESS &&
6859             rbtdbiter->result != ISC_R_NOMORE)
6860                 return (rbtdbiter->result);
6861
6862         if (rbtdbiter->paused)
6863                 resume_iteration(rbtdbiter);
6864
6865         dereference_iter_node(rbtdbiter);
6866
6867         name = dns_fixedname_name(&rbtdbiter->name);
6868         origin = dns_fixedname_name(&rbtdbiter->origin);
6869         dns_rbtnodechain_reset(&rbtdbiter->chain);
6870
6871         result = dns_rbtnodechain_first(&rbtdbiter->chain, rbtdb->tree, name,
6872                                         origin);
6873
6874         if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
6875                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6876                                                   NULL, &rbtdbiter->node);
6877                 if (result == ISC_R_SUCCESS) {
6878                         rbtdbiter->new_origin = ISC_TRUE;
6879                         reference_iter_node(rbtdbiter);
6880                 }
6881         } else {
6882                 INSIST(result == ISC_R_NOTFOUND);
6883                 result = ISC_R_NOMORE; /* The tree is empty. */
6884         }
6885
6886         rbtdbiter->result = result;
6887
6888         return (result);
6889 }
6890
6891 static isc_result_t
6892 dbiterator_last(dns_dbiterator_t *iterator) {
6893         isc_result_t result;
6894         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6895         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6896         dns_name_t *name, *origin;
6897
6898         if (rbtdbiter->result != ISC_R_SUCCESS &&
6899             rbtdbiter->result != ISC_R_NOMORE)
6900                 return (rbtdbiter->result);
6901
6902         if (rbtdbiter->paused)
6903                 resume_iteration(rbtdbiter);
6904
6905         dereference_iter_node(rbtdbiter);
6906
6907         name = dns_fixedname_name(&rbtdbiter->name);
6908         origin = dns_fixedname_name(&rbtdbiter->origin);
6909         dns_rbtnodechain_reset(&rbtdbiter->chain);
6910
6911         result = dns_rbtnodechain_last(&rbtdbiter->chain, rbtdb->tree, name,
6912                                        origin);
6913         if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
6914                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6915                                                   NULL, &rbtdbiter->node);
6916                 if (result == ISC_R_SUCCESS) {
6917                         rbtdbiter->new_origin = ISC_TRUE;
6918                         reference_iter_node(rbtdbiter);
6919                 }
6920         } else {
6921                 INSIST(result == ISC_R_NOTFOUND);
6922                 result = ISC_R_NOMORE; /* The tree is empty. */
6923         }
6924
6925         rbtdbiter->result = result;
6926
6927         return (result);
6928 }
6929
6930 static isc_result_t
6931 dbiterator_seek(dns_dbiterator_t *iterator, dns_name_t *name) {
6932         isc_result_t result;
6933         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6934         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6935         dns_name_t *iname, *origin;
6936
6937         if (rbtdbiter->result != ISC_R_SUCCESS &&
6938             rbtdbiter->result != ISC_R_NOMORE)
6939                 return (rbtdbiter->result);
6940
6941         if (rbtdbiter->paused)
6942                 resume_iteration(rbtdbiter);
6943
6944         dereference_iter_node(rbtdbiter);
6945
6946         iname = dns_fixedname_name(&rbtdbiter->name);
6947         origin = dns_fixedname_name(&rbtdbiter->origin);
6948         dns_rbtnodechain_reset(&rbtdbiter->chain);
6949
6950         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &rbtdbiter->node,
6951                                   &rbtdbiter->chain, DNS_RBTFIND_EMPTYDATA,
6952                                   NULL, NULL);
6953         if (result == ISC_R_SUCCESS) {
6954                 result = dns_rbtnodechain_current(&rbtdbiter->chain, iname,
6955                                                   origin, NULL);
6956                 if (result == ISC_R_SUCCESS) {
6957                         rbtdbiter->new_origin = ISC_TRUE;
6958                         reference_iter_node(rbtdbiter);
6959                 }
6960
6961         } else if (result == DNS_R_PARTIALMATCH)
6962                 result = ISC_R_NOTFOUND;
6963
6964         rbtdbiter->result = result;
6965
6966         return (result);
6967 }
6968
6969 static isc_result_t
6970 dbiterator_prev(dns_dbiterator_t *iterator) {
6971         isc_result_t result;
6972         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6973         dns_name_t *name, *origin;
6974
6975         REQUIRE(rbtdbiter->node != NULL);
6976
6977         if (rbtdbiter->result != ISC_R_SUCCESS)
6978                 return (rbtdbiter->result);
6979
6980         if (rbtdbiter->paused)
6981                 resume_iteration(rbtdbiter);
6982
6983         name = dns_fixedname_name(&rbtdbiter->name);
6984         origin = dns_fixedname_name(&rbtdbiter->origin);
6985         result = dns_rbtnodechain_prev(&rbtdbiter->chain, name, origin);
6986
6987         dereference_iter_node(rbtdbiter);
6988
6989         if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
6990                 rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN);
6991                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6992                                                   NULL, &rbtdbiter->node);
6993         }
6994
6995         if (result == ISC_R_SUCCESS)
6996                 reference_iter_node(rbtdbiter);
6997
6998         rbtdbiter->result = result;
6999
7000         return (result);
7001 }
7002
7003 static isc_result_t
7004 dbiterator_next(dns_dbiterator_t *iterator) {
7005         isc_result_t result;
7006         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7007         dns_name_t *name, *origin;
7008
7009         REQUIRE(rbtdbiter->node != NULL);
7010
7011         if (rbtdbiter->result != ISC_R_SUCCESS)
7012                 return (rbtdbiter->result);
7013
7014         if (rbtdbiter->paused)
7015                 resume_iteration(rbtdbiter);
7016
7017         name = dns_fixedname_name(&rbtdbiter->name);
7018         origin = dns_fixedname_name(&rbtdbiter->origin);
7019         result = dns_rbtnodechain_next(&rbtdbiter->chain, name, origin);
7020
7021         dereference_iter_node(rbtdbiter);
7022
7023         if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
7024                 rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN);
7025                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
7026                                                   NULL, &rbtdbiter->node);
7027         }
7028         if (result == ISC_R_SUCCESS)
7029                 reference_iter_node(rbtdbiter);
7030
7031         rbtdbiter->result = result;
7032
7033         return (result);
7034 }
7035
7036 static isc_result_t
7037 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
7038                    dns_name_t *name)
7039 {
7040         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
7041         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7042         dns_rbtnode_t *node = rbtdbiter->node;
7043         isc_result_t result;
7044         dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
7045         dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
7046
7047         REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
7048         REQUIRE(rbtdbiter->node != NULL);
7049
7050         if (rbtdbiter->paused)
7051                 resume_iteration(rbtdbiter);
7052
7053         if (name != NULL) {
7054                 if (rbtdbiter->common.relative_names)
7055                         origin = NULL;
7056                 result = dns_name_concatenate(nodename, origin, name, NULL);
7057                 if (result != ISC_R_SUCCESS)
7058                         return (result);
7059                 if (rbtdbiter->common.relative_names && rbtdbiter->new_origin)
7060                         result = DNS_R_NEWORIGIN;
7061         } else
7062                 result = ISC_R_SUCCESS;
7063
7064         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
7065         new_reference(rbtdb, node);
7066         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
7067
7068         *nodep = rbtdbiter->node;
7069
7070         if (iterator->cleaning && result == ISC_R_SUCCESS) {
7071                 isc_result_t expire_result;
7072
7073                 /*
7074                  * If the deletion array is full, flush it before trying
7075                  * to expire the current node.  The current node can't
7076                  * fully deleted while the iteration cursor is still on it.
7077                  */
7078                 if (rbtdbiter->delete == DELETION_BATCH_MAX)
7079                         flush_deletions(rbtdbiter);
7080
7081                 expire_result = expirenode(iterator->db, *nodep, 0);
7082
7083                 /*
7084                  * expirenode() currently always returns success.
7085                  */
7086                 if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
7087                         unsigned int refs;
7088
7089                         rbtdbiter->deletions[rbtdbiter->delete++] = node;
7090                         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
7091                         dns_rbtnode_refincrement(node, &refs);
7092                         INSIST(refs != 0);
7093                         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
7094                 }
7095         }
7096
7097         return (result);
7098 }
7099
7100 static isc_result_t
7101 dbiterator_pause(dns_dbiterator_t *iterator) {
7102         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
7103         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7104
7105         if (rbtdbiter->result != ISC_R_SUCCESS &&
7106             rbtdbiter->result != ISC_R_NOMORE)
7107                 return (rbtdbiter->result);
7108
7109         if (rbtdbiter->paused)
7110                 return (ISC_R_SUCCESS);
7111
7112         rbtdbiter->paused = ISC_TRUE;
7113
7114         if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
7115                 INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
7116                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7117                 rbtdbiter->tree_locked = isc_rwlocktype_none;
7118         }
7119
7120         flush_deletions(rbtdbiter);
7121
7122         return (ISC_R_SUCCESS);
7123 }
7124
7125 static isc_result_t
7126 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
7127         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7128         dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
7129
7130         if (rbtdbiter->result != ISC_R_SUCCESS)
7131                 return (rbtdbiter->result);
7132
7133         return (dns_name_copy(origin, name, NULL));
7134 }
7135
7136 /*%
7137  * Additional cache routines.
7138  */
7139 static isc_result_t
7140 rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type,
7141                        dns_rdatatype_t qtype, dns_acache_t *acache,
7142                        dns_zone_t **zonep, dns_db_t **dbp,
7143                        dns_dbversion_t **versionp, dns_dbnode_t **nodep,
7144                        dns_name_t *fname, dns_message_t *msg,
7145                        isc_stdtime_t now)
7146 {
7147         dns_rbtdb_t *rbtdb = rdataset->private1;
7148         dns_rbtnode_t *rbtnode = rdataset->private2;
7149         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7150         unsigned int current_count = rdataset->privateuint4;
7151         unsigned int count;
7152         rdatasetheader_t *header;
7153         nodelock_t *nodelock;
7154         unsigned int total_count;
7155         acachectl_t *acarray;
7156         dns_acacheentry_t *entry;
7157         isc_result_t result;
7158
7159         UNUSED(qtype); /* we do not use this value at least for now */
7160         UNUSED(acache);
7161
7162         header = (struct rdatasetheader *)(raw - sizeof(*header));
7163
7164         total_count = raw[0] * 256 + raw[1];
7165         INSIST(total_count > current_count);
7166         count = total_count - current_count - 1;
7167
7168         acarray = NULL;
7169
7170         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7171         NODE_LOCK(nodelock, isc_rwlocktype_read);
7172
7173         switch (type) {
7174         case dns_rdatasetadditional_fromauth:
7175                 acarray = header->additional_auth;
7176                 break;
7177         case dns_rdatasetadditional_fromcache:
7178                 acarray = NULL;
7179                 break;
7180         case dns_rdatasetadditional_fromglue:
7181                 acarray = header->additional_glue;
7182                 break;
7183         default:
7184                 INSIST(0);
7185         }
7186
7187         if (acarray == NULL) {
7188                 if (type != dns_rdatasetadditional_fromcache)
7189                         dns_acache_countquerymiss(acache);
7190                 NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7191                 return (ISC_R_NOTFOUND);
7192         }
7193
7194         if (acarray[count].entry == NULL) {
7195                 dns_acache_countquerymiss(acache);
7196                 NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7197                 return (ISC_R_NOTFOUND);
7198         }
7199
7200         entry = NULL;
7201         dns_acache_attachentry(acarray[count].entry, &entry);
7202
7203         NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7204
7205         result = dns_acache_getentry(entry, zonep, dbp, versionp,
7206                                      nodep, fname, msg, now);
7207
7208         dns_acache_detachentry(&entry);
7209
7210         return (result);
7211 }
7212
7213 static void
7214 acache_callback(dns_acacheentry_t *entry, void **arg) {
7215         dns_rbtdb_t *rbtdb;
7216         dns_rbtnode_t *rbtnode;
7217         nodelock_t *nodelock;
7218         acachectl_t *acarray = NULL;
7219         acache_cbarg_t *cbarg;
7220         unsigned int count;
7221
7222         REQUIRE(arg != NULL);
7223         cbarg = *arg;
7224
7225         /*
7226          * The caller must hold the entry lock.
7227          */
7228
7229         rbtdb = (dns_rbtdb_t *)cbarg->db;
7230         rbtnode = (dns_rbtnode_t *)cbarg->node;
7231
7232         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7233         NODE_LOCK(nodelock, isc_rwlocktype_write);
7234
7235         switch (cbarg->type) {
7236         case dns_rdatasetadditional_fromauth:
7237                 acarray = cbarg->header->additional_auth;
7238                 break;
7239         case dns_rdatasetadditional_fromglue:
7240                 acarray = cbarg->header->additional_glue;
7241                 break;
7242         default:
7243                 INSIST(0);
7244         }
7245
7246         count = cbarg->count;
7247         if (acarray != NULL && acarray[count].entry == entry) {
7248                 acarray[count].entry = NULL;
7249                 INSIST(acarray[count].cbarg == cbarg);
7250                 isc_mem_put(rbtdb->common.mctx, cbarg, sizeof(acache_cbarg_t));
7251                 acarray[count].cbarg = NULL;
7252         } else
7253                 isc_mem_put(rbtdb->common.mctx, cbarg, sizeof(acache_cbarg_t));
7254
7255         dns_acache_detachentry(&entry);
7256
7257         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7258
7259         dns_db_detachnode((dns_db_t *)rbtdb, (dns_dbnode_t **)(void*)&rbtnode);
7260         dns_db_detach((dns_db_t **)(void*)&rbtdb);
7261
7262         *arg = NULL;
7263 }
7264
7265 static void
7266 acache_cancelentry(isc_mem_t *mctx, dns_acacheentry_t *entry,
7267                       acache_cbarg_t **cbargp)
7268 {
7269         acache_cbarg_t *cbarg;
7270
7271         REQUIRE(mctx != NULL);
7272         REQUIRE(entry != NULL);
7273         REQUIRE(cbargp != NULL && *cbargp != NULL);
7274
7275         cbarg = *cbargp;
7276
7277         dns_acache_cancelentry(entry);
7278         dns_db_detachnode(cbarg->db, &cbarg->node);
7279         dns_db_detach(&cbarg->db);
7280
7281         isc_mem_put(mctx, cbarg, sizeof(acache_cbarg_t));
7282
7283         *cbargp = NULL;
7284 }
7285
7286 static isc_result_t
7287 rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type,
7288                        dns_rdatatype_t qtype, dns_acache_t *acache,
7289                        dns_zone_t *zone, dns_db_t *db,
7290                        dns_dbversion_t *version, dns_dbnode_t *node,
7291                        dns_name_t *fname)
7292 {
7293         dns_rbtdb_t *rbtdb = rdataset->private1;
7294         dns_rbtnode_t *rbtnode = rdataset->private2;
7295         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7296         unsigned int current_count = rdataset->privateuint4;
7297         rdatasetheader_t *header;
7298         unsigned int total_count, count;
7299         nodelock_t *nodelock;
7300         isc_result_t result;
7301         acachectl_t *acarray;
7302         dns_acacheentry_t *newentry, *oldentry = NULL;
7303         acache_cbarg_t *newcbarg, *oldcbarg = NULL;
7304
7305         UNUSED(qtype);
7306
7307         if (type == dns_rdatasetadditional_fromcache)
7308                 return (ISC_R_SUCCESS);
7309
7310         header = (struct rdatasetheader *)(raw - sizeof(*header));
7311
7312         total_count = raw[0] * 256 + raw[1];
7313         INSIST(total_count > current_count);
7314         count = total_count - current_count - 1; /* should be private data */
7315
7316         newcbarg = isc_mem_get(rbtdb->common.mctx, sizeof(*newcbarg));
7317         if (newcbarg == NULL)
7318                 return (ISC_R_NOMEMORY);
7319         newcbarg->type = type;
7320         newcbarg->count = count;
7321         newcbarg->header = header;
7322         newcbarg->db = NULL;
7323         dns_db_attach((dns_db_t *)rbtdb, &newcbarg->db);
7324         newcbarg->node = NULL;
7325         dns_db_attachnode((dns_db_t *)rbtdb, (dns_dbnode_t *)rbtnode,
7326                           &newcbarg->node);
7327         newentry = NULL;
7328         result = dns_acache_createentry(acache, (dns_db_t *)rbtdb,
7329                                         acache_callback, newcbarg, &newentry);
7330         if (result != ISC_R_SUCCESS)
7331                 goto fail;
7332         /* Set cache data in the new entry. */
7333         result = dns_acache_setentry(acache, newentry, zone, db,
7334                                      version, node, fname);
7335         if (result != ISC_R_SUCCESS)
7336                 goto fail;
7337
7338         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7339         NODE_LOCK(nodelock, isc_rwlocktype_write);
7340
7341         acarray = NULL;
7342         switch (type) {
7343         case dns_rdatasetadditional_fromauth:
7344                 acarray = header->additional_auth;
7345                 break;
7346         case dns_rdatasetadditional_fromglue:
7347                 acarray = header->additional_glue;
7348                 break;
7349         default:
7350                 INSIST(0);
7351         }
7352
7353         if (acarray == NULL) {
7354                 unsigned int i;
7355
7356                 acarray = isc_mem_get(rbtdb->common.mctx, total_count *
7357                                       sizeof(acachectl_t));
7358
7359                 if (acarray == NULL) {
7360                         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7361                         goto fail;
7362                 }
7363
7364                 for (i = 0; i < total_count; i++) {
7365                         acarray[i].entry = NULL;
7366                         acarray[i].cbarg = NULL;
7367                 }
7368         }
7369         switch (type) {
7370         case dns_rdatasetadditional_fromauth:
7371                 header->additional_auth = acarray;
7372                 break;
7373         case dns_rdatasetadditional_fromglue:
7374                 header->additional_glue = acarray;
7375                 break;
7376         default:
7377                 INSIST(0);
7378         }
7379
7380         if (acarray[count].entry != NULL) {
7381                 /*
7382                  * Swap the entry.  Delay cleaning-up the old entry since
7383                  * it would require a node lock.
7384                  */
7385                 oldentry = acarray[count].entry;
7386                 INSIST(acarray[count].cbarg != NULL);
7387                 oldcbarg = acarray[count].cbarg;
7388         }
7389         acarray[count].entry = newentry;
7390         acarray[count].cbarg = newcbarg;
7391
7392         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7393
7394         if (oldentry != NULL) {
7395                 acache_cancelentry(rbtdb->common.mctx, oldentry, &oldcbarg);
7396                 dns_acache_detachentry(&oldentry);
7397         }
7398
7399         return (ISC_R_SUCCESS);
7400
7401   fail:
7402         if (newcbarg != NULL) {
7403                 if (newentry != NULL) {
7404                         acache_cancelentry(rbtdb->common.mctx, newentry,
7405                                            &newcbarg);
7406                         dns_acache_detachentry(&newentry);
7407                 } else {
7408                         dns_db_detachnode((dns_db_t *)rbtdb, &newcbarg->node);
7409                         dns_db_detach(&newcbarg->db);
7410                         isc_mem_put(rbtdb->common.mctx, newcbarg,
7411                             sizeof(*newcbarg));
7412                 }
7413         }
7414
7415         return (result);
7416 }
7417
7418 static isc_result_t
7419 rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset,
7420                        dns_rdatasetadditional_t type, dns_rdatatype_t qtype)
7421 {
7422         dns_rbtdb_t *rbtdb = rdataset->private1;
7423         dns_rbtnode_t *rbtnode = rdataset->private2;
7424         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7425         unsigned int current_count = rdataset->privateuint4;
7426         rdatasetheader_t *header;
7427         nodelock_t *nodelock;
7428         unsigned int total_count, count;
7429         acachectl_t *acarray;
7430         dns_acacheentry_t *entry;
7431         acache_cbarg_t *cbarg;
7432
7433         UNUSED(qtype);          /* we do not use this value at least for now */
7434         UNUSED(acache);
7435
7436         if (type == dns_rdatasetadditional_fromcache)
7437                 return (ISC_R_SUCCESS);
7438
7439         header = (struct rdatasetheader *)(raw - sizeof(*header));
7440
7441         total_count = raw[0] * 256 + raw[1];
7442         INSIST(total_count > current_count);
7443         count = total_count - current_count - 1;
7444
7445         acarray = NULL;
7446         entry = NULL;
7447
7448         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7449         NODE_LOCK(nodelock, isc_rwlocktype_write);
7450
7451         switch (type) {
7452         case dns_rdatasetadditional_fromauth:
7453                 acarray = header->additional_auth;
7454                 break;
7455         case dns_rdatasetadditional_fromglue:
7456                 acarray = header->additional_glue;
7457                 break;
7458         default:
7459                 INSIST(0);
7460         }
7461
7462         if (acarray == NULL) {
7463                 NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7464                 return (ISC_R_NOTFOUND);
7465         }
7466
7467         entry = acarray[count].entry;
7468         if (entry == NULL) {
7469                 NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7470                 return (ISC_R_NOTFOUND);
7471         }
7472
7473         acarray[count].entry = NULL;
7474         cbarg = acarray[count].cbarg;
7475         acarray[count].cbarg = NULL;
7476
7477         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7478
7479         if (entry != NULL) {
7480                 if (cbarg != NULL)
7481                         acache_cancelentry(rbtdb->common.mctx, entry, &cbarg);
7482                 dns_acache_detachentry(&entry);
7483         }
7484
7485         return (ISC_R_SUCCESS);
7486 }
7487
7488 /*%
7489  * Routines for LRU-based cache management.
7490  */
7491
7492 /*%
7493  * See if a given cache entry that is being reused needs to be updated
7494  * in the LRU-list.  From the LRU management point of view, this function is
7495  * expected to return true for almost all cases.  When used with threads,
7496  * however, this may cause a non-negligible performance penalty because a
7497  * writer lock will have to be acquired before updating the list.
7498  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
7499  * function returns true if the entry has not been updated for some period of
7500  * time.  We differentiate the NS or glue address case and the others since
7501  * experiments have shown that the former tends to be accessed relatively
7502  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
7503  * may cause external queries at a higher level zone, involving more
7504  * transactions).
7505  *
7506  * Caller must hold the node (read or write) lock.
7507  */
7508 static inline isc_boolean_t
7509 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
7510         if ((header->attributes &
7511              (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0)
7512                 return (ISC_FALSE);
7513
7514 #if DNS_RBTDB_LIMITLRUUPDATE
7515         if (header->type == dns_rdatatype_ns ||
7516             (header->trust == dns_trust_glue &&
7517              (header->type == dns_rdatatype_a ||
7518               header->type == dns_rdatatype_aaaa))) {
7519                 /*
7520                  * Glue records are updated if at least 60 seconds have passed
7521                  * since the previous update time.
7522                  */
7523                 return (header->last_used + 60 <= now);
7524         }
7525
7526         /* Other records are updated if 5 minutes have passed. */
7527         return (header->last_used + 300 <= now);
7528 #else
7529         UNUSED(now);
7530
7531         return (ISC_TRUE);
7532 #endif
7533 }
7534
7535 /*%
7536  * Update the timestamp of a given cache entry and move it to the head
7537  * of the corresponding LRU list.
7538  *
7539  * Caller must hold the node (write) lock.
7540  *
7541  * Note that the we do NOT touch the heap here, as the TTL has not changed.
7542  */
7543 static void
7544 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
7545               isc_stdtime_t now)
7546 {
7547         /* To be checked: can we really assume this? XXXMLG */
7548         INSIST(ISC_LINK_LINKED(header, lru_link));
7549
7550         ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum],
7551                         header, lru_link);
7552         header->last_used = now;
7553         ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum],
7554                          header, lru_link);
7555 }
7556
7557 /*%
7558  * Purge some expired and/or stale (i.e. unused for some period) cache entries
7559  * under an overmem condition.  To recover from this condition quickly, up to
7560  * 2 entries will be purged.  This process is triggered while adding a new
7561  * entry, and we specifically avoid purging entries in the same LRU bucket as
7562  * the one to which the new entry will belong.  Otherwise, we might purge
7563  * entries of the same name of different RR types while adding RRsets from a
7564  * single response (consider the case where we're adding A and AAAA glue records
7565  * of the same NS name).
7566  */
7567 static void
7568 overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
7569               isc_stdtime_t now, isc_boolean_t tree_locked)
7570 {
7571         rdatasetheader_t *header, *header_prev;
7572         unsigned int locknum;
7573         int purgecount = 2;
7574
7575         for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
7576              locknum != locknum_start && purgecount > 0;
7577              locknum = (locknum + 1) % rbtdb->node_lock_count) {
7578                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
7579                           isc_rwlocktype_write);
7580
7581                 header = isc_heap_element(rbtdb->heaps[locknum], 1);
7582                 if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) {
7583                         expire_header(rbtdb, header, tree_locked);
7584                         purgecount--;
7585                 }
7586
7587                 for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
7588                      header != NULL && purgecount > 0;
7589                      header = header_prev) {
7590                         header_prev = ISC_LIST_PREV(header, lru_link);
7591                         /*
7592                          * Unlink the entry at this point to avoid checking it
7593                          * again even if it's currently used someone else and
7594                          * cannot be purged at this moment.  This entry won't be
7595                          * referenced any more (so unlinking is safe) since the
7596                          * TTL was reset to 0.
7597                          */
7598                         ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header,
7599                                         lru_link);
7600                         expire_header(rbtdb, header, tree_locked);
7601                         purgecount--;
7602                 }
7603
7604                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
7605                                     isc_rwlocktype_write);
7606         }
7607 }
7608
7609 static void
7610 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
7611               isc_boolean_t tree_locked)
7612 {
7613         set_ttl(rbtdb, header, 0);
7614         header->attributes |= RDATASET_ATTR_STALE;
7615         header->node->dirty = 1;
7616
7617         /*
7618          * Caller must hold the node (write) lock.
7619          */
7620
7621         if (dns_rbtnode_refcurrent(header->node) == 0) {
7622                 /*
7623                  * If no one else is using the node, we can clean it up now.
7624                  * We first need to gain a new reference to the node to meet a
7625                  * requirement of decrement_reference().
7626                  */
7627                 new_reference(rbtdb, header->node);
7628                 decrement_reference(rbtdb, header->node, 0,
7629                                     isc_rwlocktype_write,
7630                                     tree_locked ? isc_rwlocktype_write :
7631                                     isc_rwlocktype_none, ISC_FALSE);
7632         }
7633 }