vendor/BIND: Update to 9.5.2-P3
[dragonfly.git] / contrib / bind / lib / dns / rbtdb.c
1 /*
2  * Copyright (C) 2004-2010  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: rbtdb.c,v 1.248.12.18.2.5 2010/02/26 00:26:54 marka Exp $ */
19
20 /*! \file */
21
22 /*
23  * Principal Author: Bob Halley
24  */
25
26 #include <config.h>
27
28 #include <isc/heap.h>
29 #include <isc/event.h>
30 #include <isc/mem.h>
31 #include <isc/platform.h>
32 #include <isc/print.h>
33 #include <isc/mutex.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/rwlock.h>
37 #include <isc/string.h>
38 #include <isc/task.h>
39 #include <isc/time.h>
40 #include <isc/util.h>
41
42 #include <dns/acache.h>
43 #include <dns/db.h>
44 #include <dns/dbiterator.h>
45 #include <dns/events.h>
46 #include <dns/fixedname.h>
47 #include <dns/lib.h>
48 #include <dns/log.h>
49 #include <dns/masterdump.h>
50 #include <dns/rbt.h>
51 #include <dns/rdata.h>
52 #include <dns/rdataset.h>
53 #include <dns/rdatasetiter.h>
54 #include <dns/rdataslab.h>
55 #include <dns/result.h>
56 #include <dns/stats.h>
57 #include <dns/view.h>
58 #include <dns/zone.h>
59 #include <dns/zonekey.h>
60
61 #ifdef DNS_RBTDB_VERSION64
62 #include "rbtdb64.h"
63 #else
64 #include "rbtdb.h"
65 #endif
66
67 #ifdef DNS_RBTDB_VERSION64
68 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
69 #else
70 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
71 #endif
72
73 /*%
74  * Note that "impmagic" is not the first four bytes of the struct, so
75  * ISC_MAGIC_VALID cannot be used.
76  */
77 #define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
78                                  (rbtdb)->common.impmagic == RBTDB_MAGIC)
79
80 #ifdef DNS_RBTDB_VERSION64
81 typedef isc_uint64_t                    rbtdb_serial_t;
82 /*%
83  * Make casting easier in symbolic debuggers by using different names
84  * for the 64 bit version.
85  */
86 #define dns_rbtdb_t dns_rbtdb64_t
87 #define rdatasetheader_t rdatasetheader64_t
88 #define rbtdb_version_t rbtdb_version64_t
89 #else
90 typedef isc_uint32_t                    rbtdb_serial_t;
91 #endif
92
93 typedef isc_uint32_t                    rbtdb_rdatatype_t;
94
/*
 * An rbtdb_rdatatype_t packs two 16-bit rdata types into one 32-bit value:
 * the "base" type occupies the low 16 bits and the "ext" type the high
 * 16 bits.  BASE and EXT extract the two halves again.
 */
#define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
/*
 * Both operands are widened to the unsigned 32-bit result type before
 * combining.  A 16-bit 'e' would otherwise be promoted to (signed) int,
 * and shifting a value >= 0x8000 left by 16 overflows int, which is
 * undefined behavior.
 */
#define RBTDB_RDATATYPE_VALUE(b, e) \
        ((((rbtdb_rdatatype_t)(e)) << 16) | ((rbtdb_rdatatype_t)(b)))
98
99 #define RBTDB_RDATATYPE_SIGNSEC \
100                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
101 #define RBTDB_RDATATYPE_SIGNS \
102                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
103 #define RBTDB_RDATATYPE_SIGCNAME \
104                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
105 #define RBTDB_RDATATYPE_SIGDNAME \
106                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
107 #define RBTDB_RDATATYPE_NCACHEANY \
108                 RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
109
110 /*
111  * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
112  * Using rwlock is effective with regard to lookup performance only when
113  * it is implemented in an efficient way.
114  * Otherwise, it is generally wise to stick to the simple locking since rwlock
115  * would require more memory or can even make lookups slower due to its own
116  * overhead (when it internally calls mutex locks).
117  */
118 #ifdef ISC_RWLOCK_USEATOMIC
119 #define DNS_RBTDB_USERWLOCK 1
120 #else
121 #define DNS_RBTDB_USERWLOCK 0
122 #endif
123
124 #if DNS_RBTDB_USERWLOCK
125 #define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
126 #define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
127 #define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
128 #define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
129 #else
130 #define RBTDB_INITLOCK(l)       isc_mutex_init(l)
131 #define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
132 #define RBTDB_LOCK(l, t)        LOCK(l)
133 #define RBTDB_UNLOCK(l, t)      UNLOCK(l)
134 #endif
135
136 /*
137  * Since node locking is sensitive to both performance and memory footprint,
138  * we need some trick here.  If we have both high-performance rwlock and
139  * high performance and small-memory reference counters, we use rwlock for
140  * node lock and isc_refcount for node references.  In this case, we don't have
141  * to protect the access to the counters by locks.
142  * Otherwise, we simply use ordinary mutex lock for node locking, and use
143  * simple integers as reference counters which is protected by the lock.
144  * In most cases, we can simply use wrapper macros such as NODE_LOCK and
145  * NODE_UNLOCK.  In some other cases, however, we need to protect reference
146  * counters first and then protect other parts of a node as read-only data.
147  * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
148  * provided for these special cases.  When we can use the efficient backend
149  * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
150  * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
151  * section including the access to the reference counter.
152  * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
153  * section is also protected by NODE_STRONGLOCK().
154  */
155 #if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
156 typedef isc_rwlock_t nodelock_t;
157
158 #define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
159 #define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
160 #define NODE_LOCK(l, t)         RWLOCK((l), (t))
161 #define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
162 #define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)
163
164 #define NODE_STRONGLOCK(l)      ((void)0)
165 #define NODE_STRONGUNLOCK(l)    ((void)0)
166 #define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
167 #define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
168 #define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
169 #else
170 typedef isc_mutex_t nodelock_t;
171
172 #define NODE_INITLOCK(l)        isc_mutex_init(l)
173 #define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
174 #define NODE_LOCK(l, t)         LOCK(l)
175 #define NODE_UNLOCK(l, t)       UNLOCK(l)
176 #define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS
177
178 #define NODE_STRONGLOCK(l)      LOCK(l)
179 #define NODE_STRONGUNLOCK(l)    UNLOCK(l)
180 #define NODE_WEAKLOCK(l, t)     ((void)0)
181 #define NODE_WEAKUNLOCK(l, t)   ((void)0)
182 #define NODE_WEAKDOWNGRADE(l)   ((void)0)
183 #endif
184
185 /*%
186  * Whether to rate-limit updating the LRU to avoid possible thread contention.
187  * Our performance measurement has shown the cost is marginal, so it's defined
188  * to be 0 by default either with or without threads.
189  */
190 #ifndef DNS_RBTDB_LIMITLRUUPDATE
191 #define DNS_RBTDB_LIMITLRUUPDATE 0
192 #endif
193
194 /*
195  * Allow clients with a virtual time of up to 5 minutes in the past to see
196  * records that would otherwise have expired.
197  */
198 #define RBTDB_VIRTUAL 300
199
200 struct noqname {
201         dns_name_t name;
202         void *     nsec;
203         void *     nsecsig;
204 };
205
206 typedef struct acachectl acachectl_t;
207
208 typedef struct rdatasetheader {
209         /*%
210          * Locked by the owning node's lock.
211          */
212         rbtdb_serial_t                  serial;
213         dns_ttl_t                       rdh_ttl;
214         rbtdb_rdatatype_t               type;
215         isc_uint16_t                    attributes;
216         dns_trust_t                     trust;
217         struct noqname                  *noqname;
218         /*%<
219          * We don't use the LIST macros, because the LIST structure has
220          * both head and tail pointers, and is doubly linked.
221          */
222
223         struct rdatasetheader           *next;
224         /*%<
225          * If this is the top header for an rdataset, 'next' points
226          * to the top header for the next rdataset (i.e., the next type).
227          * Otherwise, it points up to the header whose down pointer points
228          * at this header.
229          */
230
231         struct rdatasetheader           *down;
232         /*%<
233          * Points to the header for the next older version of
234          * this rdataset.
235          */
236
237         isc_uint32_t                    count;
238         /*%<
239          * Monotonically increased every time this rdataset is bound so that
240          * it is used as the base of the starting point in DNS responses
241          * when the "cyclic" rrset-order is required.  Since the ordering
242          * should not be so crucial, no lock is set for the counter for
243          * performance reasons.
244          */
245
246         acachectl_t                     *additional_auth;
247         acachectl_t                     *additional_glue;
248
249         dns_rbtnode_t                   *node;
250         isc_stdtime_t                   last_used;
251         ISC_LINK(struct rdatasetheader) lru_link;
252         /*%<
253          * Used for LRU-based cache management.  We should probably make
254          * these cache-DB specific.  We might also make it a pointer and
255          * ensure only the top header has a valid link to save memory.
256          * The linked-list is locked by the rbtdb->lrulock.
257          */
258
259         /*
260          * It's possible this should not be here anymore, but instead
261          * referenced from the bucket's heap directly.
262          */
263 #if 0
264         isc_heap_t                      *heap;
265 #endif
266         unsigned int                    heap_index;
267         /*%<
268          * Used for TTL-based cache cleaning.
269          */
270 } rdatasetheader_t;
271
272 typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
273 typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;
274
275 #define RDATASET_ATTR_NONEXISTENT       0x0001
276 #define RDATASET_ATTR_STALE             0x0002
277 #define RDATASET_ATTR_IGNORE            0x0004
278 #define RDATASET_ATTR_RETAIN            0x0008
279 #define RDATASET_ATTR_NXDOMAIN          0x0010
280 #define RDATASET_ATTR_RESIGN            0x0020
281 #define RDATASET_ATTR_STATCOUNT         0x0040
282
283 typedef struct acache_cbarg {
284         dns_rdatasetadditional_t        type;
285         unsigned int                    count;
286         dns_db_t                        *db;
287         dns_dbnode_t                    *node;
288         rdatasetheader_t                *header;
289 } acache_cbarg_t;
290
291 struct acachectl {
292         dns_acacheentry_t               *entry;
293         acache_cbarg_t                  *cbarg;
294 };
295
296 /*
297  * XXX
298  * When the cache will pre-expire data (due to memory low or other
299  * situations) before the rdataset's TTL has expired, it MUST
300  * respect the RETAIN bit and not expire the data until its TTL is
301  * expired.
302  */
303
304 #undef IGNORE                   /* WIN32 winbase.h defines this. */
305
306 #define EXISTS(header) \
307         (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
308 #define NONEXISTENT(header) \
309         (((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
310 #define IGNORE(header) \
311         (((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
312 #define RETAIN(header) \
313         (((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
314 #define NXDOMAIN(header) \
315         (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
316
317 #define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
318
319 /*%
320  * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
321  * There is a tradeoff issue about configuring this value: if this is too
322  * small, it may cause heavier contention between threads; if this is too large,
323  * LRU purge algorithm won't work well (entries tend to be purged prematurely).
324  * The default value should work well for most environments, but this can
325  * also be configurable at compilation time via the
326  * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
327  * 1 due to the assumption of overmem_purge().
328  */
329 #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
330 #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
331 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
332 #else
333 #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
334 #endif
335 #else
336 #define DEFAULT_CACHE_NODE_LOCK_COUNT   16
337 #endif  /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
338
339 typedef struct {
340         nodelock_t                      lock;
341         /* Protected in the refcount routines. */
342         isc_refcount_t                  references;
343         /* Locked by lock. */
344         isc_boolean_t                   exiting;
345 } rbtdb_nodelock_t;
346
347 typedef struct rbtdb_changed {
348         dns_rbtnode_t *                 node;
349         isc_boolean_t                   dirty;
350         ISC_LINK(struct rbtdb_changed)  link;
351 } rbtdb_changed_t;
352
353 typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
354
355 typedef struct rbtdb_version {
356         /* Not locked */
357         rbtdb_serial_t                  serial;
358         /*
359          * Protected in the refcount routines.
360          * XXXJT: should we change the lock policy based on the refcount
361          * performance?
362          */
363         isc_refcount_t                  references;
364         /* Locked by database lock. */
365         isc_boolean_t                   writer;
366         isc_boolean_t                   commit_ok;
367         rbtdb_changedlist_t             changed_list;
368         ISC_LINK(struct rbtdb_version)  link;
369 } rbtdb_version_t;
370
371 typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;
372
373 typedef struct {
374         /* Unlocked. */
375         dns_db_t                        common;
376 #if DNS_RBTDB_USERWLOCK
377         isc_rwlock_t                    lock;
378 #else
379         isc_mutex_t                     lock;
380 #endif
381         isc_rwlock_t                    tree_lock;
382         unsigned int                    node_lock_count;
383         rbtdb_nodelock_t *              node_locks;
384         dns_rbtnode_t *                 origin_node;
385         dns_stats_t *                   rrsetstats; /* cache DB only */
386         /* Locked by lock. */
387         unsigned int                    active;
388         isc_refcount_t                  references;
389         unsigned int                    attributes;
390         rbtdb_serial_t                  current_serial;
391         rbtdb_serial_t                  least_serial;
392         rbtdb_serial_t                  next_serial;
393         rbtdb_version_t *               current_version;
394         rbtdb_version_t *               future_version;
395         rbtdb_versionlist_t             open_versions;
396         isc_boolean_t                   overmem;
397         isc_task_t *                    task;
398         dns_dbnode_t                    *soanode;
399         dns_dbnode_t                    *nsnode;
400
401         /*
402          * This is a linked list used to implement the LRU cache.  There will
403          * be node_lock_count linked lists here.  Nodes in bucket 1 will be
404          * placed on the linked list rdatasets[1].
405          */
406         rdatasetheaderlist_t            *rdatasets;
407
408         /*%
409          * Temporary storage for stale cache nodes and dynamically deleted
410          * nodes that await being cleaned up.
411          */
412         rbtnodelist_t                   *deadnodes;
413
414         /*
415          * Heaps.  Each of these is used for TTL based expiry.
416          */
417         isc_heap_t                      **heaps;
418
419         /* Locked by tree_lock. */
420         dns_rbt_t *                     tree;
421         isc_boolean_t                   secure;
422
423         /* Unlocked */
424         unsigned int                    quantum;
425 } dns_rbtdb_t;
426
427 #define RBTDB_ATTR_LOADED               0x01
428 #define RBTDB_ATTR_LOADING              0x02
429
430 /*%
431  * Search Context
432  */
433 typedef struct {
434         dns_rbtdb_t *           rbtdb;
435         rbtdb_version_t *       rbtversion;
436         rbtdb_serial_t          serial;
437         unsigned int            options;
438         dns_rbtnodechain_t      chain;
439         isc_boolean_t           copy_name;
440         isc_boolean_t           need_cleanup;
441         isc_boolean_t           wild;
442         dns_rbtnode_t *         zonecut;
443         rdatasetheader_t *      zonecut_rdataset;
444         rdatasetheader_t *      zonecut_sigrdataset;
445         dns_fixedname_t         zonecut_name;
446         isc_stdtime_t           now;
447 } rbtdb_search_t;
448
449 /*%
450  * Load Context
451  */
452 typedef struct {
453         dns_rbtdb_t *           rbtdb;
454         isc_stdtime_t           now;
455 } rbtdb_load_t;
456
457 static void rdataset_disassociate(dns_rdataset_t *rdataset);
458 static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
459 static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
460 static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
461 static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
462 static unsigned int rdataset_count(dns_rdataset_t *rdataset);
463 static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
464                                         dns_name_t *name,
465                                         dns_rdataset_t *nsec,
466                                         dns_rdataset_t *nsecsig);
467 static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
468                                            dns_rdatasetadditional_t type,
469                                            dns_rdatatype_t qtype,
470                                            dns_acache_t *acache,
471                                            dns_zone_t **zonep,
472                                            dns_db_t **dbp,
473                                            dns_dbversion_t **versionp,
474                                            dns_dbnode_t **nodep,
475                                            dns_name_t *fname,
476                                            dns_message_t *msg,
477                                            isc_stdtime_t now);
478 static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
479                                            dns_rdatasetadditional_t type,
480                                            dns_rdatatype_t qtype,
481                                            dns_acache_t *acache,
482                                            dns_zone_t *zone,
483                                            dns_db_t *db,
484                                            dns_dbversion_t *version,
485                                            dns_dbnode_t *node,
486                                            dns_name_t *fname);
487 static isc_result_t rdataset_putadditional(dns_acache_t *acache,
488                                            dns_rdataset_t *rdataset,
489                                            dns_rdatasetadditional_t type,
490                                            dns_rdatatype_t qtype);
491 static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
492                                               isc_stdtime_t now);
493 static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
494                           isc_stdtime_t now);
495 static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
496                           isc_boolean_t tree_locked);
497 static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
498                           isc_stdtime_t now, isc_boolean_t tree_locked);
499 static void prune_tree(isc_task_t *task, isc_event_t *event);
500 static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
501 static void rdataset_expire(dns_rdataset_t *rdataset);
502
503 static dns_rdatasetmethods_t rdataset_methods = {
504         rdataset_disassociate,
505         rdataset_first,
506         rdataset_next,
507         rdataset_current,
508         rdataset_clone,
509         rdataset_count,
510         NULL,
511         rdataset_getnoqname,
512         rdataset_getadditional,
513         rdataset_setadditional,
514         rdataset_putadditional,
515         rdataset_settrust,
516         rdataset_expire
517 };
518
519 static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
520 static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
521 static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
522 static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
523                                  dns_rdataset_t *rdataset);
524
525 static dns_rdatasetitermethods_t rdatasetiter_methods = {
526         rdatasetiter_destroy,
527         rdatasetiter_first,
528         rdatasetiter_next,
529         rdatasetiter_current
530 };
531
532 typedef struct rbtdb_rdatasetiter {
533         dns_rdatasetiter_t              common;
534         rdatasetheader_t *              current;
535 } rbtdb_rdatasetiter_t;
536
537 static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
538 static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
539 static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
540 static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
541                                         dns_name_t *name);
542 static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
543 static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
544 static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
545                                            dns_dbnode_t **nodep,
546                                            dns_name_t *name);
547 static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
548 static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
549                                           dns_name_t *name);
550
551 static dns_dbiteratormethods_t dbiterator_methods = {
552         dbiterator_destroy,
553         dbiterator_first,
554         dbiterator_last,
555         dbiterator_seek,
556         dbiterator_prev,
557         dbiterator_next,
558         dbiterator_current,
559         dbiterator_pause,
560         dbiterator_origin
561 };
562
563 #define DELETION_BATCH_MAX 64
564
565 /*
566  * If 'paused' is ISC_TRUE, then the tree lock is not being held.
567  */
568 typedef struct rbtdb_dbiterator {
569         dns_dbiterator_t                common;
570         isc_boolean_t                   paused;
571         isc_boolean_t                   new_origin;
572         isc_rwlocktype_t                tree_locked;
573         isc_result_t                    result;
574         dns_fixedname_t                 name;
575         dns_fixedname_t                 origin;
576         dns_rbtnodechain_t              chain;
577         dns_rbtnode_t                   *node;
578         dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
579         int                             delete;
580 } rbtdb_dbiterator_t;
581
582
583 #define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
584 #define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
585
586 static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
587                        isc_event_t *event);
588 static void overmem(dns_db_t *db, isc_boolean_t overmem);
589
590 /*%
591  * 'init_count' is used to initialize 'newheader->count' which in turn
592  * is used to determine where in the cycle rrset-order cyclic starts.
593  * We don't lock this as we don't care about simultaneous updates.
594  *
595  * Note:
596  *      Both init_count and header->count can be ISC_UINT32_MAX.
597  *      The count on the returned rdataset however can't be as
598  *      that indicates that the database does not implement cyclic
599  *      processing.
600  */
601 static unsigned int init_count;
602
603 /*
604  * Locking
605  *
606  * If a routine is going to lock more than one lock in this module, then
607  * the locking must be done in the following order:
608  *
609  *      Tree Lock
610  *
611  *      Node Lock       (Only one from the set may be locked at one time by
612  *                       any caller)
613  *
614  *      Database Lock
615  *
616  * Failure to follow this hierarchy can result in deadlock.
617  */
618
619 /*
620  * Deleting Nodes
621  *
622  * For zone databases the node for the origin of the zone MUST NOT be deleted.
623  */
624
625
626 /*
627  * DB Routines
628  */
629
630 static void
631 attach(dns_db_t *source, dns_db_t **targetp) {
632         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
633
634         REQUIRE(VALID_RBTDB(rbtdb));
635
636         isc_refcount_increment(&rbtdb->references, NULL);
637
638         *targetp = source;
639 }
640
641 static void
642 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
643         dns_rbtdb_t *rbtdb = event->ev_arg;
644
645         UNUSED(task);
646
647         free_rbtdb(rbtdb, ISC_TRUE, event);
648 }
649
650 static void
651 update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
652                   isc_boolean_t increment)
653 {
654         dns_rdatastatstype_t statattributes = 0;
655         dns_rdatastatstype_t base = 0;
656         dns_rdatastatstype_t type;
657
658         /* At the moment we count statistics only for cache DB */
659         INSIST(IS_CACHE(rbtdb));
660
661         if (NXDOMAIN(header))
662                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
663         else if (RBTDB_RDATATYPE_BASE(header->type) == 0) {
664                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
665                 base = RBTDB_RDATATYPE_EXT(header->type);
666         } else
667                 base = RBTDB_RDATATYPE_BASE(header->type);
668
669         type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
670         if (increment)
671                 dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
672         else
673                 dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
674 }
675
676 static void
677 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
678         int idx;
679         isc_heap_t *heap;
680         dns_ttl_t oldttl;
681
682         oldttl = header->rdh_ttl;
683         header->rdh_ttl = newttl;
684
685         /*
686          * It's possible the rbtdb is not a cache.  If this is the case,
687          * we will not have a heap, and we move on.  If we do, though,
688          * we might need to adjust things.
689          */
690         if (header->heap_index == 0 || newttl == oldttl)
691                 return;
692         idx = header->node->locknum;
693         if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
694             return;
695         heap = rbtdb->heaps[idx];
696
697         if (newttl < oldttl)
698                 isc_heap_increased(heap, header->heap_index);
699         else
700                 isc_heap_decreased(heap, header->heap_index);
701 }
702
703 /*%
704  * This function allows the heap code to rank the priority of each
705  * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
706  */
707 static isc_boolean_t
708 ttl_sooner(void *v1, void *v2) {
709         rdatasetheader_t *h1 = v1;
710         rdatasetheader_t *h2 = v2;
711
712         if (h1->rdh_ttl < h2->rdh_ttl)
713                 return (ISC_TRUE);
714         return (ISC_FALSE);
715 }
716
717 /*%
718  * This function sets the heap index into the header.
719  */
720 static void
721 ttl_set_index(void *what, unsigned int index) {
722         rdatasetheader_t *h = what;
723
724         h->heap_index = index;
725 }
726
727 /*%
728  * Work out how many nodes can be deleted in the time between two
729  * requests to the nameserver.  Smooth the resulting number and use it
730  * as a estimate for the number of nodes to be deleted in the next
731  * iteration.
732  */
733 static unsigned int
734 adjust_quantum(unsigned int old, isc_time_t *start) {
735         unsigned int pps = dns_pps;     /* packets per second */
736         unsigned int interval;
737         isc_uint64_t usecs;
738         isc_time_t end;
739         unsigned int new;
740
741         if (pps < 100)
742                 pps = 100;
743         isc_time_now(&end);
744
745         interval = 1000000 / pps;       /* interval in usec */
746         if (interval == 0)
747                 interval = 1;
748         usecs = isc_time_microdiff(&end, start);
749         if (usecs == 0) {
750                 /*
751                  * We were unable to measure the amount of time taken.
752                  * Double the nodes deleted next time.
753                  */
754                 old *= 2;
755                 if (old > 1000)
756                         old = 1000;
757                 return (old);
758         }
759         new = old * interval;
760         new /= (unsigned int)usecs;
761         if (new == 0)
762                 new = 1;
763         else if (new > 1000)
764                 new = 1000;
765
766         /* Smooth */
767         new = (new + old * 3) / 4;
768
769         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
770                       ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
771
772         return (new);
773 }
774
775 static void
776 free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
777         unsigned int i;
778         isc_ondestroy_t ondest;
779         isc_result_t result;
780         char buf[DNS_NAME_FORMATSIZE];
781         isc_time_t start;
782
783         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
784                 overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);
785
786         REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
787         REQUIRE(rbtdb->future_version == NULL);
788
789         if (rbtdb->current_version != NULL) {
790                 unsigned int refs;
791
792                 isc_refcount_decrement(&rbtdb->current_version->references,
793                                        &refs);
794                 INSIST(refs == 0);
795                 UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
796                 isc_refcount_destroy(&rbtdb->current_version->references);
797                 isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
798                             sizeof(rbtdb_version_t));
799         }
800
801         /*
802          * We assume the number of remaining dead nodes is reasonably small;
803          * the overhead of unlinking all nodes here should be negligible.
804          */
805         for (i = 0; i < rbtdb->node_lock_count; i++) {
806                 dns_rbtnode_t *node;
807
808                 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
809                 while (node != NULL) {
810                         ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
811                         node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
812                 }
813         }
814
815         if (event == NULL)
816                 rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
817  again:
818         if (rbtdb->tree != NULL) {
819                 isc_time_now(&start);
820                 result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
821                 if (result == ISC_R_QUOTA) {
822                         INSIST(rbtdb->task != NULL);
823                         if (rbtdb->quantum != 0)
824                                 rbtdb->quantum = adjust_quantum(rbtdb->quantum,
825                                                                 &start);
826                         if (event == NULL)
827                                 event = isc_event_allocate(rbtdb->common.mctx,
828                                                            NULL,
829                                                          DNS_EVENT_FREESTORAGE,
830                                                            free_rbtdb_callback,
831                                                            rbtdb,
832                                                            sizeof(isc_event_t));
833                         if (event == NULL)
834                                 goto again;
835                         isc_task_send(rbtdb->task, &event);
836                         return;
837                 }
838                 INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
839         }
840         if (event != NULL)
841                 isc_event_free(&event);
842         if (log) {
843                 if (dns_name_dynamic(&rbtdb->common.origin))
844                         dns_name_format(&rbtdb->common.origin, buf,
845                                         sizeof(buf));
846                 else
847                         strcpy(buf, "<UNKNOWN>");
848                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
849                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
850                               "done free_rbtdb(%s)", buf);
851         }
852         if (dns_name_dynamic(&rbtdb->common.origin))
853                 dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
854         for (i = 0; i < rbtdb->node_lock_count; i++) {
855                 isc_refcount_destroy(&rbtdb->node_locks[i].references);
856                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
857         }
858
859         /*
860          * Clean up LRU cache objects.
861          */
862         if (rbtdb->rdatasets != NULL) {
863                 for (i = 0; i < rbtdb->node_lock_count; i++)
864                         INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
865                 isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
866                             rbtdb->node_lock_count *
867                             sizeof(rdatasetheaderlist_t));
868         }
869         /*
870          * Clean up dead node buckets.
871          */
872         if (rbtdb->deadnodes != NULL) {
873                 for (i = 0; i < rbtdb->node_lock_count; i++)
874                         INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
875                 isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
876                     rbtdb->node_lock_count * sizeof(rbtnodelist_t));
877         }
878         /*
879          * Clean up TTL heap cache objects.
880          */
881         if (rbtdb->heaps != NULL) {
882                 for (i = 0; i < rbtdb->node_lock_count; i++)
883                         isc_heap_destroy(&rbtdb->heaps[i]);
884                 isc_mem_put(rbtdb->common.mctx, rbtdb->heaps,
885                             rbtdb->node_lock_count *
886                             sizeof(isc_heap_t *));
887         }
888
889         if (rbtdb->rrsetstats != NULL)
890                 dns_stats_detach(&rbtdb->rrsetstats);
891
892         isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
893                     rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
894         isc_rwlock_destroy(&rbtdb->tree_lock);
895         isc_refcount_destroy(&rbtdb->references);
896         if (rbtdb->task != NULL)
897                 isc_task_detach(&rbtdb->task);
898
899         RBTDB_DESTROYLOCK(&rbtdb->lock);
900         rbtdb->common.magic = 0;
901         rbtdb->common.impmagic = 0;
902         ondest = rbtdb->common.ondest;
903         isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
904         isc_ondestroy_notify(&ondest, rbtdb);
905 }
906
907 static inline void
908 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
909         isc_boolean_t want_free = ISC_FALSE;
910         unsigned int i;
911         unsigned int inactive = 0;
912
913         /* XXX check for open versions here */
914
915         if (rbtdb->soanode != NULL)
916                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
917         if (rbtdb->nsnode != NULL)
918                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
919
920         /*
921          * Even though there are no external direct references, there still
922          * may be nodes in use.
923          */
924         for (i = 0; i < rbtdb->node_lock_count; i++) {
925                 NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
926                 rbtdb->node_locks[i].exiting = ISC_TRUE;
927                 NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
928                 if (isc_refcount_current(&rbtdb->node_locks[i].references)
929                     == 0) {
930                         inactive++;
931                 }
932         }
933
934         if (inactive != 0) {
935                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
936                 rbtdb->active -= inactive;
937                 if (rbtdb->active == 0)
938                         want_free = ISC_TRUE;
939                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
940                 if (want_free) {
941                         char buf[DNS_NAME_FORMATSIZE];
942                         if (dns_name_dynamic(&rbtdb->common.origin))
943                                 dns_name_format(&rbtdb->common.origin, buf,
944                                                 sizeof(buf));
945                         else
946                                 strcpy(buf, "<UNKNOWN>");
947                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
948                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
949                                       "calling free_rbtdb(%s)", buf);
950                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
951                 }
952         }
953 }
954
955 static void
956 detach(dns_db_t **dbp) {
957         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
958         unsigned int refs;
959
960         REQUIRE(VALID_RBTDB(rbtdb));
961
962         isc_refcount_decrement(&rbtdb->references, &refs);
963
964         if (refs == 0)
965                 maybe_free_rbtdb(rbtdb);
966
967         *dbp = NULL;
968 }
969
970 static void
971 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
972         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
973         rbtdb_version_t *version;
974         unsigned int refs;
975
976         REQUIRE(VALID_RBTDB(rbtdb));
977
978         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
979         version = rbtdb->current_version;
980         isc_refcount_increment(&version->references, &refs);
981         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
982
983         *versionp = (dns_dbversion_t *)version;
984 }
985
986 static inline rbtdb_version_t *
987 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
988                  unsigned int references, isc_boolean_t writer)
989 {
990         isc_result_t result;
991         rbtdb_version_t *version;
992
993         version = isc_mem_get(mctx, sizeof(*version));
994         if (version == NULL)
995                 return (NULL);
996         version->serial = serial;
997         result = isc_refcount_init(&version->references, references);
998         if (result != ISC_R_SUCCESS) {
999                 isc_mem_put(mctx, version, sizeof(*version));
1000                 return (NULL);
1001         }
1002         version->writer = writer;
1003         version->commit_ok = ISC_FALSE;
1004         ISC_LIST_INIT(version->changed_list);
1005         ISC_LINK_INIT(version, link);
1006
1007         return (version);
1008 }
1009
1010 static isc_result_t
1011 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1012         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1013         rbtdb_version_t *version;
1014
1015         REQUIRE(VALID_RBTDB(rbtdb));
1016         REQUIRE(versionp != NULL && *versionp == NULL);
1017         REQUIRE(rbtdb->future_version == NULL);
1018
1019         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1020         RUNTIME_CHECK(rbtdb->next_serial != 0);         /* XXX Error? */
1021         version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1022                                    ISC_TRUE);
1023         if (version != NULL) {
1024                 version->commit_ok = ISC_TRUE;
1025                 rbtdb->next_serial++;
1026                 rbtdb->future_version = version;
1027         }
1028         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1029
1030         if (version == NULL)
1031                 return (ISC_R_NOMEMORY);
1032
1033         *versionp = version;
1034
1035         return (ISC_R_SUCCESS);
1036 }
1037
1038 static void
1039 attachversion(dns_db_t *db, dns_dbversion_t *source,
1040               dns_dbversion_t **targetp)
1041 {
1042         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1043         rbtdb_version_t *rbtversion = source;
1044         unsigned int refs;
1045
1046         REQUIRE(VALID_RBTDB(rbtdb));
1047
1048         isc_refcount_increment(&rbtversion->references, &refs);
1049         INSIST(refs > 1);
1050
1051         *targetp = rbtversion;
1052 }
1053
1054 static rbtdb_changed_t *
1055 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1056             dns_rbtnode_t *node)
1057 {
1058         rbtdb_changed_t *changed;
1059         unsigned int refs;
1060
1061         /*
1062          * Caller must be holding the node lock if its reference must be
1063          * protected by the lock.
1064          */
1065
1066         changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1067
1068         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1069
1070         REQUIRE(version->writer);
1071
1072         if (changed != NULL) {
1073                 dns_rbtnode_refincrement(node, &refs);
1074                 INSIST(refs != 0);
1075                 changed->node = node;
1076                 changed->dirty = ISC_FALSE;
1077                 ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1078         } else
1079                 version->commit_ok = ISC_FALSE;
1080
1081         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1082
1083         return (changed);
1084 }
1085
1086 static void
1087 free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
1088                  acachectl_t *array)
1089 {
1090         unsigned int count;
1091         unsigned int i;
1092         unsigned char *raw;     /* RDATASLAB */
1093
1094         /*
1095          * The caller must be holding the corresponding node lock.
1096          */
1097
1098         if (array == NULL)
1099                 return;
1100
1101         raw = (unsigned char *)header + sizeof(*header);
1102         count = raw[0] * 256 + raw[1];
1103
1104         /*
1105          * Sanity check: since an additional cache entry has a reference to
1106          * the original DB node (in the callback arg), there should be no
1107          * acache entries when the node can be freed.
1108          */
1109         for (i = 0; i < count; i++)
1110                 INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1111
1112         isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1113 }
1114
1115 static inline void
1116 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1117
1118         if (dns_name_dynamic(&(*noqname)->name))
1119                 dns_name_free(&(*noqname)->name, mctx);
1120         if ((*noqname)->nsec != NULL)
1121                 isc_mem_put(mctx, (*noqname)->nsec,
1122                             dns_rdataslab_size((*noqname)->nsec, 0));
1123         if ((*noqname)->nsecsig != NULL)
1124                 isc_mem_put(mctx, (*noqname)->nsecsig,
1125                             dns_rdataslab_size((*noqname)->nsecsig, 0));
1126         isc_mem_put(mctx, *noqname, sizeof(**noqname));
1127         *noqname = NULL;
1128 }
1129
1130 static inline void
1131 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
1132 {
1133         ISC_LINK_INIT(h, lru_link);
1134         h->heap_index = 0;
1135
1136 #if TRACE_HEADER
1137         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1138                 fprintf(stderr, "initialized header: %p\n", h);
1139 #else
1140         UNUSED(rbtdb);
1141 #endif
1142 }
1143
1144 static inline rdatasetheader_t *
1145 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
1146 {
1147         rdatasetheader_t *h;
1148
1149         h = isc_mem_get(mctx, sizeof(*h));
1150         if (h == NULL)
1151                 return (NULL);
1152
1153 #if TRACE_HEADER
1154         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1155                 fprintf(stderr, "allocated header: %p\n", h);
1156 #endif
1157         init_rdataset(rbtdb, h);
1158         return (h);
1159 }
1160
1161 static inline void
1162 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
1163 {
1164         unsigned int size;
1165
1166         if (EXISTS(rdataset) &&
1167             (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
1168                 update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
1169         }
1170
1171         if (IS_CACHE(rbtdb) && ISC_LINK_LINKED(rdataset, lru_link)) {
1172                 int idx = rdataset->node->locknum;
1173                 ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link);
1174                 if (rdataset->heap_index != 0) {
1175                         isc_heap_delete(rbtdb->heaps[idx],
1176                                         rdataset->heap_index);
1177                 }
1178                 rdataset->heap_index = 0;
1179         }
1180
1181         if (rdataset->noqname != NULL)
1182                 free_noqname(mctx, &rdataset->noqname);
1183
1184         free_acachearray(mctx, rdataset, rdataset->additional_auth);
1185         free_acachearray(mctx, rdataset, rdataset->additional_glue);
1186
1187         if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
1188                 size = sizeof(*rdataset);
1189         else
1190                 size = dns_rdataslab_size((unsigned char *)rdataset,
1191                                           sizeof(*rdataset));
1192         isc_mem_put(mctx, rdataset, size);
1193 }
1194
1195 static inline void
1196 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1197         rdatasetheader_t *header, *dcurrent;
1198         isc_boolean_t make_dirty = ISC_FALSE;
1199
1200         /*
1201          * Caller must hold the node lock.
1202          */
1203
1204         /*
1205          * We set the IGNORE attribute on rdatasets with serial number
1206          * 'serial'.  When the reference count goes to zero, these rdatasets
1207          * will be cleaned up; until that time, they will be ignored.
1208          */
1209         for (header = node->data; header != NULL; header = header->next) {
1210                 if (header->serial == serial) {
1211                         header->attributes |= RDATASET_ATTR_IGNORE;
1212                         make_dirty = ISC_TRUE;
1213                 }
1214                 for (dcurrent = header->down;
1215                      dcurrent != NULL;
1216                      dcurrent = dcurrent->down) {
1217                         if (dcurrent->serial == serial) {
1218                                 dcurrent->attributes |= RDATASET_ATTR_IGNORE;
1219                                 make_dirty = ISC_TRUE;
1220                         }
1221                 }
1222         }
1223         if (make_dirty)
1224                 node->dirty = 1;
1225 }
1226
1227 static inline void
1228 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
1229 {
1230         rdatasetheader_t *d, *down_next;
1231
1232         for (d = top->down; d != NULL; d = down_next) {
1233                 down_next = d->down;
1234                 free_rdataset(rbtdb, mctx, d);
1235         }
1236         top->down = NULL;
1237 }
1238
1239 static inline void
1240 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1241         rdatasetheader_t *current, *top_prev, *top_next;
1242         isc_mem_t *mctx = rbtdb->common.mctx;
1243
1244         /*
1245          * Caller must be holding the node lock.
1246          */
1247
1248         top_prev = NULL;
1249         for (current = node->data; current != NULL; current = top_next) {
1250                 top_next = current->next;
1251                 clean_stale_headers(rbtdb, mctx, current);
1252                 /*
1253                  * If current is nonexistent or stale, we can clean it up.
1254                  */
1255                 if ((current->attributes &
1256                      (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
1257                         if (top_prev != NULL)
1258                                 top_prev->next = current->next;
1259                         else
1260                                 node->data = current->next;
1261                         free_rdataset(rbtdb, mctx, current);
1262                 } else
1263                         top_prev = current;
1264         }
1265         node->dirty = 0;
1266 }
1267
1268 static inline void
1269 clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1270                 rbtdb_serial_t least_serial)
1271 {
1272         rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1273         rdatasetheader_t *top_prev, *top_next;
1274         isc_mem_t *mctx = rbtdb->common.mctx;
1275         isc_boolean_t still_dirty = ISC_FALSE;
1276
1277         /*
1278          * Caller must be holding the node lock.
1279          */
1280         REQUIRE(least_serial != 0);
1281
1282         top_prev = NULL;
1283         for (current = node->data; current != NULL; current = top_next) {
1284                 top_next = current->next;
1285
1286                 /*
1287                  * First, we clean up any instances of multiple rdatasets
1288                  * with the same serial number, or that have the IGNORE
1289                  * attribute.
1290                  */
1291                 dparent = current;
1292                 for (dcurrent = current->down;
1293                      dcurrent != NULL;
1294                      dcurrent = down_next) {
1295                         down_next = dcurrent->down;
1296                         INSIST(dcurrent->serial <= dparent->serial);
1297                         if (dcurrent->serial == dparent->serial ||
1298                             IGNORE(dcurrent)) {
1299                                 if (down_next != NULL)
1300                                         down_next->next = dparent;
1301                                 dparent->down = down_next;
1302                                 free_rdataset(rbtdb, mctx, dcurrent);
1303                         } else
1304                                 dparent = dcurrent;
1305                 }
1306
1307                 /*
1308                  * We've now eliminated all IGNORE datasets with the possible
1309                  * exception of current, which we now check.
1310                  */
1311                 if (IGNORE(current)) {
1312                         down_next = current->down;
1313                         if (down_next == NULL) {
1314                                 if (top_prev != NULL)
1315                                         top_prev->next = current->next;
1316                                 else
1317                                         node->data = current->next;
1318                                 free_rdataset(rbtdb, mctx, current);
1319                                 /*
1320                                  * current no longer exists, so we can
1321                                  * just continue with the loop.
1322                                  */
1323                                 continue;
1324                         } else {
1325                                 /*
1326                                  * Pull up current->down, making it the new
1327                                  * current.
1328                                  */
1329                                 if (top_prev != NULL)
1330                                         top_prev->next = down_next;
1331                                 else
1332                                         node->data = down_next;
1333                                 down_next->next = top_next;
1334                                 free_rdataset(rbtdb, mctx, current);
1335                                 current = down_next;
1336                         }
1337                 }
1338
1339                 /*
1340                  * We now try to find the first down node less than the
1341                  * least serial.
1342                  */
1343                 dparent = current;
1344                 for (dcurrent = current->down;
1345                      dcurrent != NULL;
1346                      dcurrent = down_next) {
1347                         down_next = dcurrent->down;
1348                         if (dcurrent->serial < least_serial)
1349                                 break;
1350                         dparent = dcurrent;
1351                 }
1352
1353                 /*
1354                  * If there is a such an rdataset, delete it and any older
1355                  * versions.
1356                  */
1357                 if (dcurrent != NULL) {
1358                         do {
1359                                 down_next = dcurrent->down;
1360                                 INSIST(dcurrent->serial <= least_serial);
1361                                 free_rdataset(rbtdb, mctx, dcurrent);
1362                                 dcurrent = down_next;
1363                         } while (dcurrent != NULL);
1364                         dparent->down = NULL;
1365                 }
1366
1367                 /*
1368                  * Note.  The serial number of 'current' might be less than
1369                  * least_serial too, but we cannot delete it because it is
1370                  * the most recent version, unless it is a NONEXISTENT
1371                  * rdataset.
1372                  */
1373                 if (current->down != NULL) {
1374                         still_dirty = ISC_TRUE;
1375                         top_prev = current;
1376                 } else {
1377                         /*
1378                          * If this is a NONEXISTENT rdataset, we can delete it.
1379                          */
1380                         if (NONEXISTENT(current)) {
1381                                 if (top_prev != NULL)
1382                                         top_prev->next = current->next;
1383                                 else
1384                                         node->data = current->next;
1385                                 free_rdataset(rbtdb, mctx, current);
1386                         } else
1387                                 top_prev = current;
1388                 }
1389         }
1390         if (!still_dirty)
1391                 node->dirty = 0;
1392 }
1393
1394 /*%
1395  * Clean up dead nodes.  These are nodes which have no references, and
1396  * have no data.  They are dead but we could not or chose not to delete
1397  * them when we deleted all the data at that node because we did not want
1398  * to wait for the tree write lock.
1399  *
1400  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1401  */
1402 static void
1403 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1404         dns_rbtnode_t *node;
1405         isc_result_t result;
1406         int count = 10;         /* XXXJT: should be adjustable */
1407
1408         node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1409         while (node != NULL && count > 0) {
1410                 ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1411
1412                 /*
1413                  * Since we're holding a tree write lock, it should be
1414                  * impossible for this node to be referenced by others.
1415                  */
1416                 INSIST(dns_rbtnode_refcurrent(node) == 0 &&
1417                        node->data == NULL);
1418
1419                 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1420                 if (result != ISC_R_SUCCESS)
1421                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1422                                       DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1423                                       "cleanup_dead_nodes: "
1424                                       "dns_rbt_deletenode: %s",
1425                                       isc_result_totext(result));
1426                 node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1427                 count--;
1428         }
1429 }
1430
1431 /*
1432  * Caller must be holding the node lock if its reference must be protected
1433  * by the lock.
1434  */
1435 static inline void
1436 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1437         unsigned int lockrefs, noderefs;
1438         isc_refcount_t *lockref;
1439
1440         dns_rbtnode_refincrement0(node, &noderefs);
1441         if (noderefs == 1) {    /* this is the first reference to the node */
1442                 lockref = &rbtdb->node_locks[node->locknum].references;
1443                 isc_refcount_increment0(lockref, &lockrefs);
1444                 INSIST(lockrefs != 0);
1445         }
1446         INSIST(noderefs != 0);
1447 }
1448
1449 /*
1450  * This function is assumed to be called when a node is newly referenced
1451  * and can be in the deadnode list.  In that case the node must be retrieved
1452  * from the list because it is going to be used.  In addition, if the caller
1453  * happens to hold a write lock on the tree, it's a good chance to purge dead
1454  * nodes.
1455  * Note: while a new reference is gained in multiple places, there are only very
1456  * few cases where the node can be in the deadnode list (only empty nodes can
1457  * have been added to the list).
1458  */
1459 static inline void
1460 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1461                 isc_rwlocktype_t treelocktype)
1462 {
1463         isc_boolean_t need_relock = ISC_FALSE;
1464
1465         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
1466         new_reference(rbtdb, node);
1467
1468         NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1469                       isc_rwlocktype_read);
1470         if (ISC_LINK_LINKED(node, deadlink))
1471                 need_relock = ISC_TRUE;
1472         else if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1473                  treelocktype == isc_rwlocktype_write)
1474                 need_relock = ISC_TRUE;
1475         NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1476                         isc_rwlocktype_read);
1477         if (need_relock) {
1478                 NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1479                               isc_rwlocktype_write);
1480                 if (ISC_LINK_LINKED(node, deadlink))
1481                         ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
1482                                         node, deadlink);
1483                 if (treelocktype == isc_rwlocktype_write)
1484                         cleanup_dead_nodes(rbtdb, node->locknum);
1485                 NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1486                                 isc_rwlocktype_write);
1487         }
1488
1489         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
1490 }
1491
1492 /*
1493  * Caller must be holding the node lock; either the "strong", read or write
1494  * lock.  Note that the lock must be held even when node references are
1495  * atomically modified; in that case the decrement operation itself does not
1496  * have to be protected, but we must avoid a race condition where multiple
1497  * threads are decreasing the reference to zero simultaneously and at least
1498  * one of them is going to free the node.
1499  * This function returns ISC_TRUE if and only if the node reference decreases
1500  * to zero.
1501  */
1502 static isc_boolean_t
1503 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1504                     rbtdb_serial_t least_serial,
1505                     isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
1506                     isc_boolean_t pruning)
1507 {
1508         isc_result_t result;
1509         isc_boolean_t write_locked;
1510         rbtdb_nodelock_t *nodelock;
1511         unsigned int refs, nrefs;
1512         int bucket = node->locknum;
1513         isc_boolean_t no_reference;
1514
1515         nodelock = &rbtdb->node_locks[bucket];
1516
1517         /* Handle easy and typical case first. */
1518         if (!node->dirty && (node->data != NULL || node->down != NULL)) {
1519                 dns_rbtnode_refdecrement(node, &nrefs);
1520                 INSIST((int)nrefs >= 0);
1521                 if (nrefs == 0) {
1522                         isc_refcount_decrement(&nodelock->references, &refs);
1523                         INSIST((int)refs >= 0);
1524                 }
1525                 return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
1526         }
1527
1528         /* Upgrade the lock? */
1529         if (nlock == isc_rwlocktype_read) {
1530                 NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
1531                 NODE_WEAKLOCK(&nodelock->lock, isc_rwlocktype_write);
1532         }
1533         dns_rbtnode_refdecrement(node, &nrefs);
1534         INSIST((int)nrefs >= 0);
1535         if (nrefs > 0) {
1536                 /* Restore the lock? */
1537                 if (nlock == isc_rwlocktype_read)
1538                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1539                 return (ISC_FALSE);
1540         }
1541
1542         if (node->dirty && dns_rbtnode_refcurrent(node) == 0) {
1543                 if (IS_CACHE(rbtdb))
1544                         clean_cache_node(rbtdb, node);
1545                 else {
1546                         if (least_serial == 0) {
1547                                 /*
1548                                  * Caller doesn't know the least serial.
1549                                  * Get it.
1550                                  */
1551                                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1552                                 least_serial = rbtdb->least_serial;
1553                                 RBTDB_UNLOCK(&rbtdb->lock,
1554                                              isc_rwlocktype_read);
1555                         }
1556                         clean_zone_node(rbtdb, node, least_serial);
1557                 }
1558         }
1559
1560         isc_refcount_decrement(&nodelock->references, &refs);
1561         INSIST((int)refs >= 0);
1562
1563         /*
1564          * XXXDCL should this only be done for cache zones?
1565          */
1566         if (node->data != NULL || node->down != NULL) {
1567                 /* Restore the lock? */
1568                 if (nlock == isc_rwlocktype_read)
1569                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1570                 return (ISC_TRUE);
1571         }
1572
1573         /*
1574          * Attempt to switch to a write lock on the tree.  If this fails,
1575          * we will add this node to a linked list of nodes in this locking
1576          * bucket which we will free later.
1577          */
1578         if (tlock != isc_rwlocktype_write) {
1579                 /*
1580                  * Locking hierarchy notwithstanding, we don't need to free
1581                  * the node lock before acquiring the tree write lock because
1582                  * we only do a trylock.
1583                  */
1584                 if (tlock == isc_rwlocktype_read)
1585                         result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
1586                 else
1587                         result = isc_rwlock_trylock(&rbtdb->tree_lock,
1588                                                     isc_rwlocktype_write);
1589                 RUNTIME_CHECK(result == ISC_R_SUCCESS ||
1590                               result == ISC_R_LOCKBUSY);
1591
1592                 write_locked = ISC_TF(result == ISC_R_SUCCESS);
1593         } else
1594                 write_locked = ISC_TRUE;
1595
1596         no_reference = ISC_TRUE;
1597         if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
1598                 /*
1599                  * We can now delete the node if the reference counter is
1600                  * zero.  This should be typically the case, but a different
1601                  * thread may still gain a (new) reference just before the
1602                  * current thread locks the tree (e.g., in findnode()).
1603                  */
1604
1605                 /*
1606                  * If this node is the only one in the level it's in, deleting
1607                  * this node may recursively make its parent the only node in
1608                  * the parent level; if so, and if no one is currently using
1609                  * the parent node, this is almost the only opportunity to
1610                  * clean it up.  But the recursive cleanup is not that trivial
1611                  * since the child and parent may be in different lock buckets,
1612                  * which would cause a lock order reversal problem.  To avoid
1613                  * the trouble, we'll dispatch a separate event for batch
1614                  * cleaning.  We need to check whether we're deleting the node
1615                  * as a result of pruning to avoid infinite dispatching.
1616                  * Note: pruning happens only when a task has been set for the
1617                  * rbtdb.  If the user of the rbtdb chooses not to set a task,
1618                  * it's their responsibility to purge stale leaves (e.g. by
1619                  * periodic walk-through).
1620                  */
1621                 if (!pruning && node->parent != NULL &&
1622                     node->parent->down == node && node->left == NULL &&
1623                     node->right == NULL && rbtdb->task != NULL) {
1624                         isc_event_t *ev;
1625                         dns_db_t *db;
1626
1627                         ev = isc_event_allocate(rbtdb->common.mctx, NULL,
1628                                                 DNS_EVENT_RBTPRUNE,
1629                                                 prune_tree, node,
1630                                                 sizeof(isc_event_t));
1631                         if (ev != NULL) {
1632                                 new_reference(rbtdb, node);
1633                                 db = NULL;
1634                                 attach((dns_db_t *)rbtdb, &db);
1635                                 ev->ev_sender = db;
1636                                 isc_task_send(rbtdb->task, &ev);
1637                                 no_reference = ISC_FALSE;
1638                         } else {
1639                                 /*
1640                                  * XXX: this is a weird situation.  We could
1641                                  * ignore this error case, but then the stale
1642                                  * node will unlikely be purged except via a
1643                                  * rare condition such as manual cleanup.  So
1644                                  * we queue it in the deadnodes list, hoping
1645                                  * the memory shortage is temporary and the node
1646                                  * will be deleted later.
1647                                  */
1648                                 isc_log_write(dns_lctx,
1649                                               DNS_LOGCATEGORY_DATABASE,
1650                                               DNS_LOGMODULE_CACHE,
1651                                               ISC_LOG_INFO,
1652                                               "decrement_reference: failed to "
1653                                               "allocate pruning event");
1654                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1655                                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
1656                                                 deadlink);
1657                         }
1658                 } else {
1659                         if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1660                                 char printname[DNS_NAME_FORMATSIZE];
1661
1662                                 isc_log_write(dns_lctx,
1663                                               DNS_LOGCATEGORY_DATABASE,
1664                                               DNS_LOGMODULE_CACHE,
1665                                               ISC_LOG_DEBUG(1),
1666                                               "decrement_reference: "
1667                                               "delete from rbt: %p %s",
1668                                               node,
1669                                               dns_rbt_formatnodename(node,
1670                                                         printname,
1671                                                         sizeof(printname)));
1672                         }
1673
1674                         INSIST(!ISC_LINK_LINKED(node, deadlink));
1675                         result = dns_rbt_deletenode(rbtdb->tree, node,
1676                                                     ISC_FALSE);
1677                         if (result != ISC_R_SUCCESS) {
1678                                 isc_log_write(dns_lctx,
1679                                               DNS_LOGCATEGORY_DATABASE,
1680                                               DNS_LOGMODULE_CACHE,
1681                                               ISC_LOG_WARNING,
1682                                               "decrement_reference: "
1683                                               "dns_rbt_deletenode: %s",
1684                                               isc_result_totext(result));
1685                         }
1686                 }
1687         } else if (dns_rbtnode_refcurrent(node) == 0) {
1688                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1689                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
1690         }
1691
1692         /* Restore the lock? */
1693         if (nlock == isc_rwlocktype_read)
1694                 NODE_WEAKDOWNGRADE(&nodelock->lock);
1695
1696         /*
1697          * Relock a read lock, or unlock the write lock if no lock was held.
1698          */
1699         if (tlock == isc_rwlocktype_none)
1700                 if (write_locked)
1701                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1702
1703         if (tlock == isc_rwlocktype_read)
1704                 if (write_locked)
1705                         isc_rwlock_downgrade(&rbtdb->tree_lock);
1706
1707         return (no_reference);
1708 }
1709
1710 /*
1711  * Prune the tree by recursively cleaning-up single leaves.  In the worst
1712  * case, the number of iteration is the number of tree levels, which is at
1713  * most the maximum number of domain name labels, i.e, 127.  In practice, this
1714  * should be much smaller (only a few times), and even the worst case would be
1715  * acceptable for a single event.
1716  */
1717 static void
1718 prune_tree(isc_task_t *task, isc_event_t *event) {
1719         dns_rbtdb_t *rbtdb = event->ev_sender;
1720         dns_rbtnode_t *node = event->ev_arg;
1721         dns_rbtnode_t *parent;
1722         unsigned int locknum;
1723
1724         UNUSED(task);
1725
1726         isc_event_free(&event);
1727
1728         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1729         locknum = node->locknum;
1730         NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1731         do {
1732                 parent = node->parent;
1733                 decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
1734                                     isc_rwlocktype_write, ISC_TRUE);
1735
1736                 if (parent != NULL && parent->down == NULL) {
1737                         /*
1738                          * node was the only down child of the parent and has
1739                          * just been removed.  We'll then need to examine the
1740                          * parent.  Keep the lock if possible; otherwise,
1741                          * release the old lock and acquire one for the parent.
1742                          */
1743                         if (parent->locknum != locknum) {
1744                                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
1745                                             isc_rwlocktype_write);
1746                                 locknum = parent->locknum;
1747                                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
1748                                           isc_rwlocktype_write);
1749                         }
1750
1751                         /*
1752                          * We need to gain a reference to the node before
1753                          * decrementing it in the next iteration.  In addition,
1754                          * if the node is in the dead-nodes list, extract it
1755                          * from the list beforehand as we do in
1756                          * reactivate_node().
1757                          */
1758                         new_reference(rbtdb, parent);
1759                         if (ISC_LINK_LINKED(parent, deadlink)) {
1760                                 ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
1761                                                 parent, deadlink);
1762                         }
1763                 } else
1764                         parent = NULL;
1765
1766                 node = parent;
1767         } while (node != NULL);
1768         NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1769         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1770
1771         detach((dns_db_t **)&rbtdb);
1772 }
1773
1774 static inline void
1775 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1776                    rbtdb_changedlist_t *cleanup_list)
1777 {
1778         /*
1779          * Caller must be holding the database lock.
1780          */
1781
1782         rbtdb->least_serial = version->serial;
1783         *cleanup_list = version->changed_list;
1784         ISC_LIST_INIT(version->changed_list);
1785 }
1786
1787 static inline void
1788 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
1789         rbtdb_changed_t *changed, *next_changed;
1790
1791         /*
1792          * If the changed record is dirty, then
1793          * an update created multiple versions of
1794          * a given rdataset.  We keep this list
1795          * until we're the least open version, at
1796          * which point it's safe to get rid of any
1797          * older versions.
1798          *
1799          * If the changed record isn't dirty, then
1800          * we don't need it anymore since we're
1801          * committing and not rolling back.
1802          *
1803          * The caller must be holding the database lock.
1804          */
1805         for (changed = HEAD(version->changed_list);
1806              changed != NULL;
1807              changed = next_changed) {
1808                 next_changed = NEXT(changed, link);
1809                 if (!changed->dirty) {
1810                         UNLINK(version->changed_list,
1811                                changed, link);
1812                         APPEND(*cleanup_list,
1813                                changed, link);
1814                 }
1815         }
1816 }
1817
1818 static isc_boolean_t
1819 iszonesecure(dns_db_t *db, dns_dbnode_t *origin) {
1820         dns_rdataset_t keyset;
1821         dns_rdataset_t nsecset, signsecset;
1822         isc_boolean_t haszonekey = ISC_FALSE;
1823         isc_boolean_t hasnsec = ISC_FALSE;
1824         isc_result_t result;
1825
1826         dns_rdataset_init(&keyset);
1827         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_dnskey, 0,
1828                                      0, &keyset, NULL);
1829         if (result == ISC_R_SUCCESS) {
1830                 dns_rdata_t keyrdata = DNS_RDATA_INIT;
1831                 result = dns_rdataset_first(&keyset);
1832                 while (result == ISC_R_SUCCESS) {
1833                         dns_rdataset_current(&keyset, &keyrdata);
1834                         if (dns_zonekey_iszonekey(&keyrdata)) {
1835                                 haszonekey = ISC_TRUE;
1836                                 break;
1837                         }
1838                         result = dns_rdataset_next(&keyset);
1839                 }
1840                 dns_rdataset_disassociate(&keyset);
1841         }
1842         if (!haszonekey)
1843                 return (ISC_FALSE);
1844
1845         dns_rdataset_init(&nsecset);
1846         dns_rdataset_init(&signsecset);
1847         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_nsec, 0,
1848                                      0, &nsecset, &signsecset);
1849         if (result == ISC_R_SUCCESS) {
1850                 if (dns_rdataset_isassociated(&signsecset)) {
1851                         hasnsec = ISC_TRUE;
1852                         dns_rdataset_disassociate(&signsecset);
1853                 }
1854                 dns_rdataset_disassociate(&nsecset);
1855         }
1856         return (hasnsec);
1857 }
1858
1859 static void
1860 closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
1861         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1862         rbtdb_version_t *version, *cleanup_version, *least_greater;
1863         isc_boolean_t rollback = ISC_FALSE;
1864         rbtdb_changedlist_t cleanup_list;
1865         rbtdb_changed_t *changed, *next_changed;
1866         rbtdb_serial_t serial, least_serial;
1867         dns_rbtnode_t *rbtnode;
1868         unsigned int refs;
1869         isc_boolean_t writer;
1870
1871         REQUIRE(VALID_RBTDB(rbtdb));
1872         version = (rbtdb_version_t *)*versionp;
1873
1874         cleanup_version = NULL;
1875         ISC_LIST_INIT(cleanup_list);
1876
1877         isc_refcount_decrement(&version->references, &refs);
1878         if (refs > 0) {         /* typical and easy case first */
1879                 if (commit) {
1880                         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1881                         INSIST(!version->writer);
1882                         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1883                 }
1884                 goto end;
1885         }
1886
1887         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1888         serial = version->serial;
1889         writer = version->writer;
1890         if (version->writer) {
1891                 if (commit) {
1892                         unsigned cur_ref;
1893                         rbtdb_version_t *cur_version;
1894
1895                         INSIST(version->commit_ok);
1896                         INSIST(version == rbtdb->future_version);
1897                         /*
1898                          * The current version is going to be replaced.
1899                          * Release the (likely last) reference to it from the
1900                          * DB itself and unlink it from the open list.
1901                          */
1902                         cur_version = rbtdb->current_version;
1903                         isc_refcount_decrement(&cur_version->references,
1904                                                &cur_ref);
1905                         if (cur_ref == 0) {
1906                                 if (cur_version->serial == rbtdb->least_serial)
1907                                         INSIST(EMPTY(cur_version->changed_list));
1908                                 UNLINK(rbtdb->open_versions,
1909                                        cur_version, link);
1910                         }
1911                         if (EMPTY(rbtdb->open_versions)) {
1912                                 /*
1913                                  * We're going to become the least open
1914                                  * version.
1915                                  */
1916                                 make_least_version(rbtdb, version,
1917                                                    &cleanup_list);
1918                         } else {
1919                                 /*
1920                                  * Some other open version is the
1921                                  * least version.  We can't cleanup
1922                                  * records that were changed in this
1923                                  * version because the older versions
1924                                  * may still be in use by an open
1925                                  * version.
1926                                  *
1927                                  * We can, however, discard the
1928                                  * changed records for things that
1929                                  * we've added that didn't exist in
1930                                  * prior versions.
1931                                  */
1932                                 cleanup_nondirty(version, &cleanup_list);
1933                         }
1934                         /*
1935                          * If the (soon to be former) current version
1936                          * isn't being used by anyone, we can clean
1937                          * it up.
1938                          */
1939                         if (cur_ref == 0) {
1940                                 cleanup_version = cur_version;
1941                                 APPENDLIST(version->changed_list,
1942                                            cleanup_version->changed_list,
1943                                            link);
1944                         }
1945                         /*
1946                          * Become the current version.
1947                          */
1948                         version->writer = ISC_FALSE;
1949                         rbtdb->current_version = version;
1950                         rbtdb->current_serial = version->serial;
1951                         rbtdb->future_version = NULL;
1952
1953                         /*
1954                          * Keep the current version in the open list, and
1955                          * gain a reference for the DB itself (see the DB
1956                          * creation function below).  This must be the only
1957                          * case where we need to increment the counter from
1958                          * zero and need to use isc_refcount_increment0().
1959                          */
1960                         isc_refcount_increment0(&version->references,
1961                                                 &cur_ref);
1962                         INSIST(cur_ref == 1);
1963                         PREPEND(rbtdb->open_versions,
1964                                 rbtdb->current_version, link);
1965                 } else {
1966                         /*
1967                          * We're rolling back this transaction.
1968                          */
1969                         cleanup_list = version->changed_list;
1970                         ISC_LIST_INIT(version->changed_list);
1971                         rollback = ISC_TRUE;
1972                         cleanup_version = version;
1973                         rbtdb->future_version = NULL;
1974                 }
1975         } else {
1976                 if (version != rbtdb->current_version) {
1977                         /*
1978                          * There are no external or internal references
1979                          * to this version and it can be cleaned up.
1980                          */
1981                         cleanup_version = version;
1982
1983                         /*
1984                          * Find the version with the least serial
1985                          * number greater than ours.
1986                          */
1987                         least_greater = PREV(version, link);
1988                         if (least_greater == NULL)
1989                                 least_greater = rbtdb->current_version;
1990
1991                         INSIST(version->serial < least_greater->serial);
1992                         /*
1993                          * Is this the least open version?
1994                          */
1995                         if (version->serial == rbtdb->least_serial) {
1996                                 /*
1997                                  * Yes.  Install the new least open
1998                                  * version.
1999                                  */
2000                                 make_least_version(rbtdb,
2001                                                    least_greater,
2002                                                    &cleanup_list);
2003                         } else {
2004                                 /*
2005                                  * Add any unexecuted cleanups to
2006                                  * those of the least greater version.
2007                                  */
2008                                 APPENDLIST(least_greater->changed_list,
2009                                            version->changed_list,
2010                                            link);
2011                         }
2012                 } else if (version->serial == rbtdb->least_serial)
2013                         INSIST(EMPTY(version->changed_list));
2014                 UNLINK(rbtdb->open_versions, version, link);
2015         }
2016         least_serial = rbtdb->least_serial;
2017         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2018
2019         /*
2020          * Update the zone's secure status.
2021          */
2022         if (writer && commit && !IS_CACHE(rbtdb))
2023                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
2024
2025         if (cleanup_version != NULL) {
2026                 INSIST(EMPTY(cleanup_version->changed_list));
2027                 isc_mem_put(rbtdb->common.mctx, cleanup_version,
2028                             sizeof(*cleanup_version));
2029         }
2030
2031         if (!EMPTY(cleanup_list)) {
2032                 /*
2033                  * We acquire a tree write lock here in order to make sure
2034                  * that stale nodes will be removed in decrement_reference().
2035                  * If we didn't have the lock, those nodes could miss the
2036                  * chance to be removed until the server stops.  The write lock
2037                  * is expensive, but this event should be rare enough to justify
2038                  * the cost.
2039                  */
2040                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2041                 for (changed = HEAD(cleanup_list);
2042                      changed != NULL;
2043                      changed = next_changed) {
2044                         nodelock_t *lock;
2045
2046                         next_changed = NEXT(changed, link);
2047                         rbtnode = changed->node;
2048                         lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2049
2050                         NODE_LOCK(lock, isc_rwlocktype_write);
2051                         /*
2052                          * This is a good opportunity to purge any dead nodes,
2053                          * so use it.
2054                          */
2055                         cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2056
2057                         if (rollback)
2058                                 rollback_node(rbtnode, serial);
2059                         decrement_reference(rbtdb, rbtnode, least_serial,
2060                                             isc_rwlocktype_write,
2061                                             isc_rwlocktype_write, ISC_FALSE);
2062
2063                         NODE_UNLOCK(lock, isc_rwlocktype_write);
2064
2065                         isc_mem_put(rbtdb->common.mctx, changed,
2066                                     sizeof(*changed));
2067                 }
2068                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2069         }
2070
2071   end:
2072         *versionp = NULL;
2073 }
2074
2075 /*
2076  * Add the necessary magic for the wildcard name 'name'
2077  * to be found in 'rbtdb'.
2078  *
2079  * In order for wildcard matching to work correctly in
2080  * zone_find(), we must ensure that a node for the wildcarding
2081  * level exists in the database, and has its 'find_callback'
2082  * and 'wild' bits set.
2083  *
2084  * E.g. if the wildcard name is "*.sub.example." then we
2085  * must ensure that "sub.example." exists and is marked as
2086  * a wildcard level.
2087  */
2088 static isc_result_t
2089 add_wildcard_magic(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2090         isc_result_t result;
2091         dns_name_t foundname;
2092         dns_offsets_t offsets;
2093         unsigned int n;
2094         dns_rbtnode_t *node = NULL;
2095
2096         dns_name_init(&foundname, offsets);
2097         n = dns_name_countlabels(name);
2098         INSIST(n >= 2);
2099         n--;
2100         dns_name_getlabelsequence(name, 1, n, &foundname);
2101         result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2102         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2103                 return (result);
2104         node->find_callback = 1;
2105         node->wild = 1;
2106         return (ISC_R_SUCCESS);
2107 }
2108
2109 static isc_result_t
2110 add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2111         isc_result_t result;
2112         dns_name_t foundname;
2113         dns_offsets_t offsets;
2114         unsigned int n, l, i;
2115
2116         dns_name_init(&foundname, offsets);
2117         n = dns_name_countlabels(name);
2118         l = dns_name_countlabels(&rbtdb->common.origin);
2119         i = l + 1;
2120         while (i < n) {
2121                 dns_rbtnode_t *node = NULL;     /* dummy */
2122                 dns_name_getlabelsequence(name, n - i, i, &foundname);
2123                 if (dns_name_iswildcard(&foundname)) {
2124                         result = add_wildcard_magic(rbtdb, &foundname);
2125                         if (result != ISC_R_SUCCESS)
2126                                 return (result);
2127                         result = dns_rbt_addnode(rbtdb->tree, &foundname,
2128                                                  &node);
2129                         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2130                                 return (result);
2131                 }
2132                 i++;
2133         }
2134         return (ISC_R_SUCCESS);
2135 }
2136
2137 static isc_result_t
2138 findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create,
2139          dns_dbnode_t **nodep)
2140 {
2141         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2142         dns_rbtnode_t *node = NULL;
2143         dns_name_t nodename;
2144         isc_result_t result;
2145         isc_rwlocktype_t locktype = isc_rwlocktype_read;
2146
2147         REQUIRE(VALID_RBTDB(rbtdb));
2148
2149         dns_name_init(&nodename, NULL);
2150         RWLOCK(&rbtdb->tree_lock, locktype);
2151         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &node, NULL,
2152                                   DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2153         if (result != ISC_R_SUCCESS) {
2154                 RWUNLOCK(&rbtdb->tree_lock, locktype);
2155                 if (!create) {
2156                         if (result == DNS_R_PARTIALMATCH)
2157                                 result = ISC_R_NOTFOUND;
2158                         return (result);
2159                 }
2160                 /*
2161                  * It would be nice to try to upgrade the lock instead of
2162                  * unlocking then relocking.
2163                  */
2164                 locktype = isc_rwlocktype_write;
2165                 RWLOCK(&rbtdb->tree_lock, locktype);
2166                 node = NULL;
2167                 result = dns_rbt_addnode(rbtdb->tree, name, &node);
2168                 if (result == ISC_R_SUCCESS) {
2169                         dns_rbt_namefromnode(node, &nodename);
2170 #ifdef DNS_RBT_USEHASH
2171                         node->locknum = node->hashval % rbtdb->node_lock_count;
2172 #else
2173                         node->locknum = dns_name_hash(&nodename, ISC_TRUE) %
2174                                 rbtdb->node_lock_count;
2175 #endif
2176                         add_empty_wildcards(rbtdb, name);
2177
2178                         if (dns_name_iswildcard(name)) {
2179                                 result = add_wildcard_magic(rbtdb, name);
2180                                 if (result != ISC_R_SUCCESS) {
2181                                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2182                                         return (result);
2183                                 }
2184                         }
2185                 } else if (result != ISC_R_EXISTS) {
2186                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2187                         return (result);
2188                 }
2189         }
2190         reactivate_node(rbtdb, node, locktype);
2191         RWUNLOCK(&rbtdb->tree_lock, locktype);
2192
2193         *nodep = (dns_dbnode_t *)node;
2194
2195         return (ISC_R_SUCCESS);
2196 }
2197
2198 static isc_result_t
2199 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2200         rbtdb_search_t *search = arg;
2201         rdatasetheader_t *header, *header_next;
2202         rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2203         rdatasetheader_t *found;
2204         isc_result_t result;
2205         dns_rbtnode_t *onode;
2206
2207         /*
2208          * We only want to remember the topmost zone cut, since it's the one
2209          * that counts, so we'll just continue if we've already found a
2210          * zonecut.
2211          */
2212         if (search->zonecut != NULL)
2213                 return (DNS_R_CONTINUE);
2214
2215         found = NULL;
2216         result = DNS_R_CONTINUE;
2217         onode = search->rbtdb->origin_node;
2218
2219         NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2220                   isc_rwlocktype_read);
2221
2222         /*
2223          * Look for an NS or DNAME rdataset active in our version.
2224          */
2225         ns_header = NULL;
2226         dname_header = NULL;
2227         sigdname_header = NULL;
2228         for (header = node->data; header != NULL; header = header_next) {
2229                 header_next = header->next;
2230                 if (header->type == dns_rdatatype_ns ||
2231                     header->type == dns_rdatatype_dname ||
2232                     header->type == RBTDB_RDATATYPE_SIGDNAME) {
2233                         do {
2234                                 if (header->serial <= search->serial &&
2235                                     !IGNORE(header)) {
2236                                         /*
2237                                          * Is this a "this rdataset doesn't
2238                                          * exist" record?
2239                                          */
2240                                         if (NONEXISTENT(header))
2241                                                 header = NULL;
2242                                         break;
2243                                 } else
2244                                         header = header->down;
2245                         } while (header != NULL);
2246                         if (header != NULL) {
2247                                 if (header->type == dns_rdatatype_dname)
2248                                         dname_header = header;
2249                                 else if (header->type ==
2250                                            RBTDB_RDATATYPE_SIGDNAME)
2251                                         sigdname_header = header;
2252                                 else if (node != onode ||
2253                                          IS_STUB(search->rbtdb)) {
2254                                         /*
2255                                          * We've found an NS rdataset that
2256                                          * isn't at the origin node.  We check
2257                                          * that they're not at the origin node,
2258                                          * because otherwise we'd erroneously
2259                                          * treat the zone top as if it were
2260                                          * a delegation.
2261                                          */
2262                                         ns_header = header;
2263                                 }
2264                         }
2265                 }
2266         }
2267
2268         /*
2269          * Did we find anything?
2270          */
2271         if (dname_header != NULL) {
2272                 /*
2273                  * Note that DNAME has precedence over NS if both exist.
2274                  */
2275                 found = dname_header;
2276                 search->zonecut_sigrdataset = sigdname_header;
2277         } else if (ns_header != NULL) {
2278                 found = ns_header;
2279                 search->zonecut_sigrdataset = NULL;
2280         }
2281
2282         if (found != NULL) {
2283                 /*
2284                  * We increment the reference count on node to ensure that
2285                  * search->zonecut_rdataset will still be valid later.
2286                  */
2287                 new_reference(search->rbtdb, node);
2288                 search->zonecut = node;
2289                 search->zonecut_rdataset = found;
2290                 search->need_cleanup = ISC_TRUE;
2291                 /*
2292                  * Since we've found a zonecut, anything beneath it is
2293                  * glue and is not subject to wildcard matching, so we
2294                  * may clear search->wild.
2295                  */
2296                 search->wild = ISC_FALSE;
2297                 if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
2298                         /*
2299                          * If the caller does not want to find glue, then
2300                          * this is the best answer and the search should
2301                          * stop now.
2302                          */
2303                         result = DNS_R_PARTIALMATCH;
2304                 } else {
2305                         dns_name_t *zcname;
2306
2307                         /*
2308                          * The search will continue beneath the zone cut.
2309                          * This may or may not be the best match.  In case it
2310                          * is, we need to remember the node name.
2311                          */
2312                         zcname = dns_fixedname_name(&search->zonecut_name);
2313                         RUNTIME_CHECK(dns_name_copy(name, zcname, NULL) ==
2314                                       ISC_R_SUCCESS);
2315                         search->copy_name = ISC_TRUE;
2316                 }
2317         } else {
2318                 /*
2319                  * There is no zonecut at this node which is active in this
2320                  * version.
2321                  *
2322                  * If this is a "wild" node and the caller hasn't disabled
2323                  * wildcard matching, remember that we've seen a wild node
2324                  * in case we need to go searching for wildcard matches
2325                  * later on.
2326                  */
2327                 if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0)
2328                         search->wild = ISC_TRUE;
2329         }
2330
2331         NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2332                     isc_rwlocktype_read);
2333
2334         return (result);
2335 }
2336
2337 static inline void
2338 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2339               rdatasetheader_t *header, isc_stdtime_t now,
2340               dns_rdataset_t *rdataset)
2341 {
2342         unsigned char *raw;     /* RDATASLAB */
2343
2344         /*
2345          * Caller must be holding the node reader lock.
2346          * XXXJT: technically, we need a writer lock, since we'll increment
2347          * the header count below.  However, since the actual counter value
2348          * doesn't matter, we prioritize performance here.  (We may want to
2349          * use atomic increment when available).
2350          */
2351
2352         if (rdataset == NULL)
2353                 return;
2354
2355         new_reference(rbtdb, node);
2356
2357         INSIST(rdataset->methods == NULL);      /* We must be disassociated. */
2358
2359         rdataset->methods = &rdataset_methods;
2360         rdataset->rdclass = rbtdb->common.rdclass;
2361         rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
2362         rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
2363         rdataset->ttl = header->rdh_ttl - now;
2364         rdataset->trust = header->trust;
2365         if (NXDOMAIN(header))
2366                 rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
2367         rdataset->private1 = rbtdb;
2368         rdataset->private2 = node;
2369         raw = (unsigned char *)header + sizeof(*header);
2370         rdataset->private3 = raw;
2371         rdataset->count = header->count++;
2372         if (rdataset->count == ISC_UINT32_MAX)
2373                 rdataset->count = 0;
2374
2375         /*
2376          * Reset iterator state.
2377          */
2378         rdataset->privateuint4 = 0;
2379         rdataset->private5 = NULL;
2380
2381         /*
2382          * Add noqname proof.
2383          */
2384         rdataset->private6 = header->noqname;
2385         if (rdataset->private6 != NULL)
2386                 rdataset->attributes |=  DNS_RDATASETATTR_NOQNAME;
2387 }
2388
2389 static inline isc_result_t
2390 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
2391                  dns_name_t *foundname, dns_rdataset_t *rdataset,
2392                  dns_rdataset_t *sigrdataset)
2393 {
2394         isc_result_t result;
2395         dns_name_t *zcname;
2396         rbtdb_rdatatype_t type;
2397         dns_rbtnode_t *node;
2398
2399         /*
2400          * The caller MUST NOT be holding any node locks.
2401          */
2402
2403         node = search->zonecut;
2404         type = search->zonecut_rdataset->type;
2405
2406         /*
2407          * If we have to set foundname, we do it before anything else.
2408          * If we were to set foundname after we had set nodep or bound the
2409          * rdataset, then we'd have to undo that work if dns_name_copy()
2410          * failed.  By setting foundname first, there's nothing to undo if
2411          * we have trouble.
2412          */
2413         if (foundname != NULL && search->copy_name) {
2414                 zcname = dns_fixedname_name(&search->zonecut_name);
2415                 result = dns_name_copy(zcname, foundname, NULL);
2416                 if (result != ISC_R_SUCCESS)
2417                         return (result);
2418         }
2419         if (nodep != NULL) {
2420                 /*
2421                  * Note that we don't have to increment the node's reference
2422                  * count here because we're going to use the reference we
2423                  * already have in the search block.
2424                  */
2425                 *nodep = node;
2426                 search->need_cleanup = ISC_FALSE;
2427         }
2428         if (rdataset != NULL) {
2429                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2430                           isc_rwlocktype_read);
2431                 bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
2432                               search->now, rdataset);
2433                 if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
2434                         bind_rdataset(search->rbtdb, node,
2435                                       search->zonecut_sigrdataset,
2436                                       search->now, sigrdataset);
2437                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2438                             isc_rwlocktype_read);
2439         }
2440
2441         if (type == dns_rdatatype_dname)
2442                 return (DNS_R_DNAME);
2443         return (DNS_R_DELEGATION);
2444 }
2445
2446 static inline isc_boolean_t
2447 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
2448            dns_rbtnode_t *node)
2449 {
2450         unsigned char *raw;     /* RDATASLAB */
2451         unsigned int count, size;
2452         dns_name_t ns_name;
2453         isc_boolean_t valid = ISC_FALSE;
2454         dns_offsets_t offsets;
2455         isc_region_t region;
2456         rdatasetheader_t *header;
2457
2458         /*
2459          * No additional locking is required.
2460          */
2461
2462         /*
2463          * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
2464          * if it occurs at a zone cut, but is not valid below it.
2465          */
2466         if (type == dns_rdatatype_ns) {
2467                 if (node != search->zonecut) {
2468                         return (ISC_FALSE);
2469                 }
2470         } else if (type != dns_rdatatype_a &&
2471                    type != dns_rdatatype_aaaa &&
2472                    type != dns_rdatatype_a6) {
2473                 return (ISC_FALSE);
2474         }
2475
2476         header = search->zonecut_rdataset;
2477         raw = (unsigned char *)header + sizeof(*header);
2478         count = raw[0] * 256 + raw[1];
2479 #if DNS_RDATASET_FIXED
2480         raw += 2 + (4 * count);
2481 #else
2482         raw += 2;
2483 #endif
2484
2485         while (count > 0) {
2486                 count--;
2487                 size = raw[0] * 256 + raw[1];
2488 #if DNS_RDATASET_FIXED
2489                 raw += 4;
2490 #else
2491                 raw += 2;
2492 #endif
2493                 region.base = raw;
2494                 region.length = size;
2495                 raw += size;
2496                 /*
2497                  * XXX Until we have rdata structures, we have no choice but
2498                  * to directly access the rdata format.
2499                  */
2500                 dns_name_init(&ns_name, offsets);
2501                 dns_name_fromregion(&ns_name, &region);
2502                 if (dns_name_compare(&ns_name, name) == 0) {
2503                         valid = ISC_TRUE;
2504                         break;
2505                 }
2506         }
2507
2508         return (valid);
2509 }
2510
2511 static inline isc_boolean_t
2512 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
2513             dns_name_t *name)
2514 {
2515         dns_fixedname_t fnext;
2516         dns_fixedname_t forigin;
2517         dns_name_t *next;
2518         dns_name_t *origin;
2519         dns_name_t prefix;
2520         dns_rbtdb_t *rbtdb;
2521         dns_rbtnode_t *node;
2522         isc_result_t result;
2523         isc_boolean_t answer = ISC_FALSE;
2524         rdatasetheader_t *header;
2525
2526         rbtdb = search->rbtdb;
2527
2528         dns_name_init(&prefix, NULL);
2529         dns_fixedname_init(&fnext);
2530         next = dns_fixedname_name(&fnext);
2531         dns_fixedname_init(&forigin);
2532         origin = dns_fixedname_name(&forigin);
2533
2534         result = dns_rbtnodechain_next(chain, NULL, NULL);
2535         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2536                 node = NULL;
2537                 result = dns_rbtnodechain_current(chain, &prefix,
2538                                                   origin, &node);
2539                 if (result != ISC_R_SUCCESS)
2540                         break;
2541                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2542                           isc_rwlocktype_read);
2543                 for (header = node->data;
2544                      header != NULL;
2545                      header = header->next) {
2546                         if (header->serial <= search->serial &&
2547                             !IGNORE(header) && EXISTS(header))
2548                                 break;
2549                 }
2550                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2551                             isc_rwlocktype_read);
2552                 if (header != NULL)
2553                         break;
2554                 result = dns_rbtnodechain_next(chain, NULL, NULL);
2555         }
2556         if (result == ISC_R_SUCCESS)
2557                 result = dns_name_concatenate(&prefix, origin, next, NULL);
2558         if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name))
2559                 answer = ISC_TRUE;
2560         return (answer);
2561 }
2562
2563 static inline isc_boolean_t
2564 activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) {
2565         dns_fixedname_t fnext;
2566         dns_fixedname_t forigin;
2567         dns_fixedname_t fprev;
2568         dns_name_t *next;
2569         dns_name_t *origin;
2570         dns_name_t *prev;
2571         dns_name_t name;
2572         dns_name_t rname;
2573         dns_name_t tname;
2574         dns_rbtdb_t *rbtdb;
2575         dns_rbtnode_t *node;
2576         dns_rbtnodechain_t chain;
2577         isc_boolean_t check_next = ISC_TRUE;
2578         isc_boolean_t check_prev = ISC_TRUE;
2579         isc_boolean_t answer = ISC_FALSE;
2580         isc_result_t result;
2581         rdatasetheader_t *header;
2582         unsigned int n;
2583
2584         rbtdb = search->rbtdb;
2585
2586         dns_name_init(&name, NULL);
2587         dns_name_init(&tname, NULL);
2588         dns_name_init(&rname, NULL);
2589         dns_fixedname_init(&fnext);
2590         next = dns_fixedname_name(&fnext);
2591         dns_fixedname_init(&fprev);
2592         prev = dns_fixedname_name(&fprev);
2593         dns_fixedname_init(&forigin);
2594         origin = dns_fixedname_name(&forigin);
2595
2596         /*
2597          * Find if qname is at or below a empty node.
2598          * Use our own copy of the chain.
2599          */
2600
2601         chain = search->chain;
2602         do {
2603                 node = NULL;
2604                 result = dns_rbtnodechain_current(&chain, &name,
2605                                                   origin, &node);
2606                 if (result != ISC_R_SUCCESS)
2607                         break;
2608                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2609                           isc_rwlocktype_read);
2610                 for (header = node->data;
2611                      header != NULL;
2612                      header = header->next) {
2613                         if (header->serial <= search->serial &&
2614                             !IGNORE(header) && EXISTS(header))
2615                                 break;
2616                 }
2617                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2618                             isc_rwlocktype_read);
2619                 if (header != NULL)
2620                         break;
2621                 result = dns_rbtnodechain_prev(&chain, NULL, NULL);
2622         } while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
2623         if (result == ISC_R_SUCCESS)
2624                 result = dns_name_concatenate(&name, origin, prev, NULL);
2625         if (result != ISC_R_SUCCESS)
2626                 check_prev = ISC_FALSE;
2627
2628         result = dns_rbtnodechain_next(&chain, NULL, NULL);
2629         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2630                 node = NULL;
2631                 result = dns_rbtnodechain_current(&chain, &name,
2632                                                   origin, &node);
2633                 if (result != ISC_R_SUCCESS)
2634                         break;
2635                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2636                           isc_rwlocktype_read);
2637                 for (header = node->data;
2638                      header != NULL;
2639                      header = header->next) {
2640                         if (header->serial <= search->serial &&
2641                             !IGNORE(header) && EXISTS(header))
2642                                 break;
2643                 }
2644                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2645                             isc_rwlocktype_read);
2646                 if (header != NULL)
2647                         break;
2648                 result = dns_rbtnodechain_next(&chain, NULL, NULL);
2649         }
2650         if (result == ISC_R_SUCCESS)
2651                 result = dns_name_concatenate(&name, origin, next, NULL);
2652         if (result != ISC_R_SUCCESS)
2653                 check_next = ISC_FALSE;
2654
2655         dns_name_clone(qname, &rname);
2656
2657         /*
2658          * Remove the wildcard label to find the terminal name.
2659          */
2660         n = dns_name_countlabels(wname);
2661         dns_name_getlabelsequence(wname, 1, n - 1, &tname);
2662
2663         do {
2664                 if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
2665                     (check_next && dns_name_issubdomain(next, &rname))) {
2666                         answer = ISC_TRUE;
2667                         break;
2668                 }
2669                 /*
2670                  * Remove the left hand label.
2671                  */
2672                 n = dns_name_countlabels(&rname);
2673                 dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
2674         } while (!dns_name_equal(&rname, &tname));
2675         return (answer);
2676 }
2677
/*
 * Look for a wildcard node that can answer 'qname'.
 *
 * Starting at '*nodep' and then moving up through search->chain.levels[],
 * each level whose 'wild' bit is set has the name "*.<level name>" built
 * and looked up in rbtdb->tree.  A wildcard node is accepted when it has
 * an rdataset that exists and is visible at search->serial, or when
 * activeempty() reports ISC_TRUE for it, and activeemtpynode() does not
 * veto the match.
 *
 * On a match '*nodep' is set to the wildcard node and ISC_R_SUCCESS is
 * returned.  ISC_R_NOTFOUND is returned when activeemtpynode() vetoes a
 * match or when an active level node is reached first; a failure from
 * dns_name_concatenate() or dns_rbt_findnode() (other than
 * ISC_R_NOTFOUND / DNS_R_PARTIALMATCH) is returned as is.
 *
 * NOTE(review): if the walk runs off the top of the chain ('done' set)
 * without hitting an active level, 'result' still holds whatever the last
 * wildcard lookup left in it -- confirm callers cannot reach that case
 * (presumably the topmost level is always active).
 */
2678 static inline isc_result_t
2679 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
2680               dns_name_t *qname)
2681 {
2682         unsigned int i, j;
2683         dns_rbtnode_t *node, *level_node, *wnode;
2684         rdatasetheader_t *header;
2685         isc_result_t result = ISC_R_NOTFOUND;
2686         dns_name_t name;
2687         dns_name_t *wname;
2688         dns_fixedname_t fwname;
2689         dns_rbtdb_t *rbtdb;
2690         isc_boolean_t done, wild, active;
2691         dns_rbtnodechain_t wchain;
2692
2693         /*
2694          * Caller must be holding the tree lock and MUST NOT be holding
2695          * any node locks.
2696          */
2697
2698         /*
2699          * Examine each ancestor level.  If the level's wild bit
2700          * is set, then construct the corresponding wildcard name and
2701          * search for it.  If the wildcard node exists, and is active in
2702          * this version, we're done.  If not, then we next check to see
2703          * if the ancestor is active in this version.  If so, then there
2704          * can be no possible wildcard match and again we're done.  If not,
2705          * continue the search.
2706          */
2707
2708         rbtdb = search->rbtdb;
2709         i = search->chain.level_matches;
2710         done = ISC_FALSE;
2711         node = *nodep;
2712         do {
2713                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2714                           isc_rwlocktype_read);
2715
2716                 /*
2717                  * First we try to figure out if this node is active in
2718                  * the search's version.  We do this now, even though we
2719                  * may not need the information, because it simplifies the
2720                  * locking and code flow.
2721                  */
2722                 for (header = node->data;
2723                      header != NULL;
2724                      header = header->next) {
2725                         if (header->serial <= search->serial &&
2726                             !IGNORE(header) && EXISTS(header))
2727                                 break;
2728                 }
2729                 if (header != NULL)
2730                         active = ISC_TRUE;
2731                 else
2732                         active = ISC_FALSE;
2733
2734                 if (node->wild)
2735                         wild = ISC_TRUE;
2736                 else
2737                         wild = ISC_FALSE;
2738
2739                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2740                             isc_rwlocktype_read);
2741
2742                 if (wild) {
2743                         /*
2744                          * Construct the wildcard name for this level.
2745                          */
2746                         dns_name_init(&name, NULL);
2747                         dns_rbt_namefromnode(node, &name);
2748                         dns_fixedname_init(&fwname);
2749                         wname = dns_fixedname_name(&fwname);
2750                         result = dns_name_concatenate(dns_wildcardname, &name,
2751                                                       wname, NULL);
                              /*
                               * Append the names of the enclosing levels
                               * (i - 1 down to 0) to complete the name.
                               */
2752                         j = i;
2753                         while (result == ISC_R_SUCCESS && j != 0) {
2754                                 j--;
2755                                 level_node = search->chain.levels[j];
2756                                 dns_name_init(&name, NULL);
2757                                 dns_rbt_namefromnode(level_node, &name);
2758                                 result = dns_name_concatenate(wname,
2759                                                               &name,
2760                                                               wname,
2761                                                               NULL);
2762                         }
2763                         if (result != ISC_R_SUCCESS)
2764                                 break;
2765
                              /*
                               * Look the wildcard name up with a fresh chain
                               * (wchain); search->chain is left untouched.
                               */
2766                         wnode = NULL;
2767                         dns_rbtnodechain_init(&wchain, NULL);
2768                         result = dns_rbt_findnode(rbtdb->tree, wname,
2769                                                   NULL, &wnode, &wchain,
2770                                                   DNS_RBTFIND_EMPTYDATA,
2771                                                   NULL, NULL);
2772                         if (result == ISC_R_SUCCESS) {
2773                                 nodelock_t *lock;
2774
2775                                 /*
2776                                  * We have found the wildcard node.  If it
2777                                  * is active in the search's version, we're
2778                                  * done.
2779                                  */
2780                                 lock = &rbtdb->node_locks[wnode->locknum].lock;
2781                                 NODE_LOCK(lock, isc_rwlocktype_read);
2782                                 for (header = wnode->data;
2783                                      header != NULL;
2784                                      header = header->next) {
2785                                         if (header->serial <= search->serial &&
2786                                             !IGNORE(header) && EXISTS(header))
2787                                                 break;
2788                                 }
2789                                 NODE_UNLOCK(lock, isc_rwlocktype_read);
2790                                 if (header != NULL ||
2791                                     activeempty(search, &wchain, wname)) {
                                              /*
                                               * No node lock is held at this
                                               * point, so returning directly
                                               * is safe.
                                               */
2792                                         if (activeemtpynode(search, qname,
2793                                                             wname)) {
2794                                                 return (ISC_R_NOTFOUND);
2795                                         }
2796                                         /*
2797                                          * The wildcard node is active!
2798                                          *
2799                                          * Note: result is still ISC_R_SUCCESS
2800                                          * so we don't have to set it.
2801                                          */
2802                                         *nodep = wnode;
2803                                         break;
2804                                 }
2805                         } else if (result != ISC_R_NOTFOUND &&
2806                                    result != DNS_R_PARTIALMATCH) {
2807                                 /*
2808                                  * An error has occurred.  Bail out.
2809                                  */
2810                                 break;
2811                         }
2812                 }
2813
2814                 if (active) {
2815                         /*
2816                          * The level node is active.  Any wildcarding
2817                          * present at higher levels has no
2818                          * effect and we're done.
2819                          */
2820                         result = ISC_R_NOTFOUND;
2821                         break;
2822                 }
2823
                      /*
                       * Move up one level, or finish once the top of the
                       * chain has been examined.
                       */
2824                 if (i > 0) {
2825                         i--;
2826                         node = search->chain.levels[i];
2827                 } else
2828                         done = ISC_TRUE;
2829         } while (!done);
2830
2831         return (result);
2832 }
2833
2834 static inline isc_result_t
2835 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
2836                   dns_name_t *foundname, dns_rdataset_t *rdataset,
2837                   dns_rdataset_t *sigrdataset, isc_boolean_t need_sig)
2838 {
2839         dns_rbtnode_t *node;
2840         rdatasetheader_t *header, *header_next, *found, *foundsig;
2841         isc_boolean_t empty_node;
2842         isc_result_t result;
2843         dns_fixedname_t fname, forigin;
2844         dns_name_t *name, *origin;
2845
2846         do {
2847                 node = NULL;
2848                 dns_fixedname_init(&fname);
2849                 name = dns_fixedname_name(&fname);
2850                 dns_fixedname_init(&forigin);
2851                 origin = dns_fixedname_name(&forigin);
2852                 result = dns_rbtnodechain_current(&search->chain, name,
2853                                                   origin, &node);
2854                 if (result != ISC_R_SUCCESS)
2855                         return (result);
2856                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2857                           isc_rwlocktype_read);
2858                 found = NULL;
2859                 foundsig = NULL;
2860                 empty_node = ISC_TRUE;
2861                 for (header = node->data;
2862                      header != NULL;
2863                      header = header_next) {
2864                         header_next = header->next;
2865                         /*
2866                          * Look for an active, extant NSEC or RRSIG NSEC.
2867                          */
2868                         do {
2869                                 if (header->serial <= search->serial &&
2870                                     !IGNORE(header)) {
2871                                         /*
2872                                          * Is this a "this rdataset doesn't
2873                                          * exist" record?
2874                                          */
2875                                         if (NONEXISTENT(header))
2876                                                 header = NULL;
2877                                         break;
2878                                 } else
2879                                         header = header->down;
2880                         } while (header != NULL);
2881                         if (header != NULL) {
2882                                 /*
2883                                  * We now know that there is at least one
2884                                  * active rdataset at this node.
2885                                  */
2886                                 empty_node = ISC_FALSE;
2887                                 if (header->type == dns_rdatatype_nsec) {
2888                                         found = header;
2889                                         if (foundsig != NULL)
2890                                                 break;
2891                                 } else if (header->type ==
2892                                            RBTDB_RDATATYPE_SIGNSEC) {
2893                                         foundsig = header;
2894                                         if (found != NULL)
2895                                                 break;
2896                                 }
2897                         }
2898                 }
2899                 if (!empty_node) {
2900                         if (found != NULL &&
2901                             (foundsig != NULL || !need_sig))
2902                         {
2903                                 /*
2904                                  * We've found the right NSEC record.
2905                                  *
2906                                  * Note: for this to really be the right
2907                                  * NSEC record, it's essential that the NSEC
2908                                  * records of any nodes obscured by a zone
2909                                  * cut have been removed; we assume this is
2910                                  * the case.
2911                                  */
2912                                 result = dns_name_concatenate(name, origin,
2913                                                               foundname, NULL);
2914                                 if (result == ISC_R_SUCCESS) {
2915                                         if (nodep != NULL) {
2916                                                 new_reference(search->rbtdb,
2917                                                               node);
2918                                                 *nodep = node;
2919                                         }
2920                                         bind_rdataset(search->rbtdb, node,
2921                                                       found, search->now,
2922                                                       rdataset);
2923                                         if (foundsig != NULL)
2924                                                 bind_rdataset(search->rbtdb,
2925                                                               node,
2926                                                               foundsig,
2927                                                               search->now,
2928                                                               sigrdataset);
2929                                 }
2930                         } else if (found == NULL && foundsig == NULL) {
2931                                 /*
2932                                  * This node is active, but has no NSEC or
2933                                  * RRSIG NSEC.  That means it's glue or
2934                                  * other obscured zone data that isn't
2935                                  * relevant for our search.  Treat the
2936                                  * node as if it were empty and keep looking.
2937                                  */
2938                                 empty_node = ISC_TRUE;
2939                                 result = dns_rbtnodechain_prev(&search->chain,
2940                                                                NULL, NULL);
2941                         } else {
2942                                 /*
2943                                  * We found an active node, but either the
2944                                  * NSEC or the RRSIG NSEC is missing.  This
2945                                  * shouldn't happen.
2946                                  */
2947                                 result = DNS_R_BADDB;
2948                         }
2949                 } else {
2950                         /*
2951                          * This node isn't active.  We've got to keep
2952                          * looking.
2953                          */
2954                         result = dns_rbtnodechain_prev(&search->chain, NULL,
2955                                                        NULL);
2956                 }
2957                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2958                             isc_rwlocktype_read);
2959         } while (empty_node && result == ISC_R_SUCCESS);
2960
2961         /*
2962          * If the result is ISC_R_NOMORE, then we got to the beginning of
2963          * the database and didn't find a NSEC record.  This shouldn't
2964          * happen.
2965          */
2966         if (result == ISC_R_NOMORE)
2967                 result = DNS_R_BADDB;
2968
2969         return (result);
2970 }
2971
/*
 * Look up 'name' / 'type' in the tree of the rbtdb 'db', as seen by
 * 'version'.  When 'version' is NULL the current version is attached
 * here and closed again before returning.
 *
 * Arguments:
 *      db, name, version, type - what to look for.
 *      options   - tested here for DNS_DBFIND_GLUEOK,
 *                  DNS_DBFIND_FORCENSEC and DNS_DBFIND_VALIDATEGLUE.
 *      now       - ignored.
 *      nodep     - if non-NULL, receives the node that was found, with
 *                  a reference held on it.
 *      foundname - filled in by dns_rbt_findnode() / dns_name_copy();
 *                  DNS_NAMEATTR_WILDCARD is set on it when a wildcard
 *                  was matched.
 *      rdataset, sigrdataset - bound to the answer and its RRSIG, or,
 *                  on the DNS_R_NXRRSET path of a secure database or a
 *                  DNS_DBFIND_FORCENSEC lookup, to the node's NSEC and
 *                  its RRSIG.  Nothing is bound by this function when
 *                  a dns_rdatatype_any lookup finds data.
 *
 * Results assigned directly in this function:
 *      ISC_R_SUCCESS, DNS_R_CNAME, DNS_R_GLUE, DNS_R_ZONECUT,
 *      DNS_R_NXRRSET, DNS_R_NXDOMAIN, DNS_R_EMPTYNAME, DNS_R_EMPTYWILD
 *      and DNS_R_BADDB.
 * Results of dns_rbt_findnode(), find_wildcard(), dns_name_copy(),
 * find_closest_nsec() and setup_delegation() are passed through too.
 *
 * Locking: the tree lock is read-locked from just before the tree
 * search until the 'tree_exit' label.  The node lock of the matched
 * node is read-locked while its rdatasets are examined: code that jumps
 * to 'node_exit' still holds it, code that jumps to 'tree_exit' has
 * already released it.
 */
2972 static isc_result_t
2973 zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
2974           dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
2975           dns_dbnode_t **nodep, dns_name_t *foundname,
2976           dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
2977 {
2978         dns_rbtnode_t *node = NULL;
2979         isc_result_t result;
2980         rbtdb_search_t search;
2981         isc_boolean_t cname_ok = ISC_TRUE;
2982         isc_boolean_t close_version = ISC_FALSE;
2983         isc_boolean_t maybe_zonecut = ISC_FALSE;
2984         isc_boolean_t at_zonecut = ISC_FALSE;
2985         isc_boolean_t wild;
2986         isc_boolean_t empty_node;
2987         rdatasetheader_t *header, *header_next, *found, *nsecheader;
2988         rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
2989         rbtdb_rdatatype_t sigtype;
2990         isc_boolean_t active;
2991         dns_rbtnodechain_t chain;
2992         nodelock_t *lock;
2993
2994
2995         search.rbtdb = (dns_rbtdb_t *)db;
2996
2997         REQUIRE(VALID_RBTDB(search.rbtdb));
2998
2999         /*
3000          * We don't care about 'now'.
3001          */
3002         UNUSED(now);
3003
3004         /*
3005          * If the caller didn't supply a version, attach to the current
3006          * version.
3007          */
3008         if (version == NULL) {
3009                 currentversion(db, &version);
3010                 close_version = ISC_TRUE;
3011         }
3012
3013         search.rbtversion = version;
3014         search.serial = search.rbtversion->serial;
3015         search.options = options;
3016         search.copy_name = ISC_FALSE;
3017         search.need_cleanup = ISC_FALSE;
3018         search.wild = ISC_FALSE;
3019         search.zonecut = NULL;
3020         dns_fixedname_init(&search.zonecut_name);
3021         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3022         search.now = 0;
3023
3024         /*
3025          * 'wild' will be true iff. we've matched a wildcard.
3026          */
3027         wild = ISC_FALSE;
3028
3029         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3030
3031         /*
3032          * Search down from the root of the tree.  If, while going down, we
3033          * encounter a callback node, zone_zonecut_callback() will search the
3034          * rdatasets at the zone cut for active DNAME or NS rdatasets.
3035          */
3036         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3037                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3038                                   zone_zonecut_callback, &search);
3039
3040         if (result == DNS_R_PARTIALMATCH) {
                     /*
                      * This label is also the target of a jump from further
                      * down, taken when an exact match turns out to have no
                      * active rdatasets in this version.
                      */
3041         partial_match:
3042                 if (search.zonecut != NULL) {
3043                     result = setup_delegation(&search, nodep, foundname,
3044                                               rdataset, sigrdataset);
3045                     goto tree_exit;
3046                 }
3047
3048                 if (search.wild) {
3049                         /*
3050                          * At least one of the levels in the search chain
3051                          * potentially has a wildcard.  For each such level,
3052                          * we must see if there's a matching wildcard active
3053                          * in the current version.
3054                          */
3055                         result = find_wildcard(&search, &node, name);
3056                         if (result == ISC_R_SUCCESS) {
3057                                 result = dns_name_copy(name, foundname, NULL);
3058                                 if (result != ISC_R_SUCCESS)
3059                                         goto tree_exit;
3060                                 wild = ISC_TRUE;
3061                                 goto found;
3062                         }
3063                         else if (result != ISC_R_NOTFOUND)
3064                                 goto tree_exit;
3065                 }
3066
                     /*
                      * activeempty() is handed a copy of the chain rather
                      * than search.chain itself.  NOTE(review): presumably
                      * so that search.chain is left as it is for
                      * find_closest_nsec() below -- confirm against
                      * activeempty().
                      */
3067                 chain = search.chain;
3068                 active = activeempty(&search, &chain, name);
3069
3070                 /*
3071                  * If we're here, then the name does not exist, is not
3072                  * beneath a zonecut, and there's no matching wildcard.
3073                  */
3074                 if (search.rbtdb->secure ||
3075                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3076                 {
3077                         result = find_closest_nsec(&search, nodep, foundname,
3078                                                    rdataset, sigrdataset,
3079                                                    search.rbtdb->secure);
3080                         if (result == ISC_R_SUCCESS)
3081                                 result = active ? DNS_R_EMPTYNAME :
3082                                                   DNS_R_NXDOMAIN;
3083                 } else
3084                         result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
3085                 goto tree_exit;
3086         } else if (result != ISC_R_SUCCESS)
3087                 goto tree_exit;
3088
3089  found:
3090         /*
3091          * We have found a node whose name is the desired name, or we
3092          * have matched a wildcard.
3093          */
3094
3095         if (search.zonecut != NULL) {
3096                 /*
3097                  * If we're beneath a zone cut, we don't want to look for
3098                  * CNAMEs because they're not legitimate zone glue.
3099                  */
3100                 cname_ok = ISC_FALSE;
3101         } else {
3102                 /*
3103                  * The node may be a zone cut itself.  If it might be one,
3104                  * make sure we check for it later.
3105                  */
3106                 if (node->find_callback &&
3107                     (node != search.rbtdb->origin_node ||
3108                      IS_STUB(search.rbtdb)) &&
3109                     !dns_rdatatype_atparent(type))
3110                         maybe_zonecut = ISC_TRUE;
3111         }
3112
3113         /*
3114          * Certain DNSSEC types are not subject to CNAME matching
3115          * (RFC4035, section 2.5 and RFC3007).
3116          *
3117          * We don't check for RRSIG, because we don't store RRSIG records
3118          * directly.
3119          */
3120         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3121                 cname_ok = ISC_FALSE;
3122
3123         /*
3124          * We now go looking for rdata...
3125          */
3126
3127         NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3128                   isc_rwlocktype_read);
3129
3130         found = NULL;
3131         foundsig = NULL;
             /*
              * 'sigtype' starts out as the RRSIG type covering 'type'; it is
              * switched to RRSIG CNAME inside the loop if a CNAME is matched
              * before its signature has been seen.
              */
3132         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3133         nsecheader = NULL;
3134         nsecsig = NULL;
3135         cnamesig = NULL;
3136         empty_node = ISC_TRUE;
             /*
              * Each entry on the node->data list is followed through its
              * 'down' pointers until a header is reached whose serial is not
              * newer than search.serial and which is not marked IGNORE.
              * 'header_next' is saved first because 'header' is overwritten
              * by that inner walk.
              */
3137         for (header = node->data; header != NULL; header = header_next) {
3138                 header_next = header->next;
3139                 /*
3140                  * Look for an active, extant rdataset.
3141                  */
3142                 do {
3143                         if (header->serial <= search.serial &&
3144                             !IGNORE(header)) {
3145                                 /*
3146                                  * Is this a "this rdataset doesn't
3147                                  * exist" record?
3148                                  */
3149                                 if (NONEXISTENT(header))
3150                                         header = NULL;
3151                                 break;
3152                         } else
3153                                 header = header->down;
3154                 } while (header != NULL);
3155                 if (header != NULL) {
3156                         /*
3157                          * We now know that there is at least one active
3158                          * rdataset at this node.
3159                          */
3160                         empty_node = ISC_FALSE;
3161
3162                         /*
3163                          * Do special zone cut handling, if requested.
3164                          */
3165                         if (maybe_zonecut &&
3166                             header->type == dns_rdatatype_ns) {
3167                                 /*
3168                                  * We increment the reference count on node to
3169                                  * ensure that search->zonecut_rdataset will
3170                                  * still be valid later.
3171                                  */
3172                                 new_reference(search.rbtdb, node);
3173                                 search.zonecut = node;
3174                                 search.zonecut_rdataset = header;
3175                                 search.zonecut_sigrdataset = NULL;
3176                                 search.need_cleanup = ISC_TRUE;
3177                                 maybe_zonecut = ISC_FALSE;
3178                                 at_zonecut = ISC_TRUE;
3179                                 /*
3180                                  * It is not clear if KEY should still be
3181                                  * allowed at the parent side of the zone
3182                                  * cut or not.  It is needed for RFC3007
3183                                  * validated updates.
3184                                  */
3185                                 if ((search.options & DNS_DBFIND_GLUEOK) == 0
3186                                     && type != dns_rdatatype_nsec
3187                                     && type != dns_rdatatype_key) {
3188                                         /*
3189                                          * Glue is not OK, but any answer we
3190                                          * could return would be glue.  Return
3191                                          * the delegation.
3192                                          */
3193                                         found = NULL;
3194                                         break;
3195                                 }
3196                                 if (found != NULL && foundsig != NULL)
3197                                         break;
3198                         }
3199
3200                         /*
3201                          * If we found a type we were looking for,
3202                          * remember it.
3203                          */
3204                         if (header->type == type ||
3205                             type == dns_rdatatype_any ||
3206                             (header->type == dns_rdatatype_cname &&
3207                              cname_ok)) {
3208                                 /*
3209                                  * We've found the answer!
3210                                  */
3211                                 found = header;
3212                                 if (header->type == dns_rdatatype_cname &&
3213                                     cname_ok) {
3214                                         /*
3215                                          * We may be finding a CNAME instead
3216                                          * of the desired type.
3217                                          *
3218                                          * If we've already got the CNAME RRSIG,
3219                                          * use it, otherwise change sigtype
3220                                          * so that we find it.
3221                                          */
3222                                         if (cnamesig != NULL)
3223                                                 foundsig = cnamesig;
3224                                         else
3225                                                 sigtype =
3226                                                     RBTDB_RDATATYPE_SIGCNAME;
3227                                 }
3228                                 /*
3229                                  * If we've got all we need, end the search.
3230                                  */
3231                                 if (!maybe_zonecut && foundsig != NULL)
3232                                         break;
3233                         } else if (header->type == sigtype) {
3234                                 /*
3235                                  * We've found the RRSIG rdataset for our
3236                                  * target type.  Remember it.
3237                                  */
3238                                 foundsig = header;
3239                                 /*
3240                                  * If we've got all we need, end the search.
3241                                  */
3242                                 if (!maybe_zonecut && found != NULL)
3243                                         break;
3244                         } else if (header->type == dns_rdatatype_nsec) {
3245                                 /*
3246                                  * Remember a NSEC rdataset even if we're
3247                                  * not specifically looking for it, because
3248                                  * we might need it later.
3249                                  */
3250                                 nsecheader = header;
3251                         } else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
3252                                 /*
3253                                  * If we need the NSEC rdataset, we'll also
3254                                  * need its signature.
3255                                  */
3256                                 nsecsig = header;
3257                         } else if (cname_ok &&
3258                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
3259                                 /*
3260                                  * If we get a CNAME match, we'll also need
3261                                  * its signature.
3262                                  */
3263                                 cnamesig = header;
3264                         }
3265                 }
3266         }
3267
3268         if (empty_node) {
3269                 /*
3270                  * We have an exact match for the name, but there are no
3271                  * active rdatasets in the desired version.  That means that
3272                  * this node doesn't exist in the desired version, and that
3273                  * we really have a partial match.
3274                  */
                     /*
                      * The node lock is dropped before jumping back, since
                      * the partial-match code leaves through 'tree_exit'.
                      * An empty node reached through a wildcard is not
                      * retried; it falls through with 'found' still NULL.
                      */
3275                 if (!wild) {
3276                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3277                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3278                         goto partial_match;
3279                 }
3280         }
3281
3282         /*
3283          * If we didn't find what we were looking for...
3284          */
3285         if (found == NULL) {
3286                 if (search.zonecut != NULL) {
3287                         /*
3288                          * We were trying to find glue at a node beneath a
3289                          * zone cut, but didn't.
3290                          *
3291                          * Return the delegation.
3292                          */
3293                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3294                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3295                         result = setup_delegation(&search, nodep, foundname,
3296                                                   rdataset, sigrdataset);
3297                         goto tree_exit;
3298                 }
3299                 /*
3300                  * The desired type doesn't exist.
3301                  */
3302                 result = DNS_R_NXRRSET;
3303                 if (search.rbtdb->secure &&
3304                     (nsecheader == NULL || nsecsig == NULL)) {
3305                         /*
3306                          * The zone is secure but there's no NSEC,
3307                          * or the NSEC has no signature!
3308                          */
3309                         if (!wild) {
3310                                 result = DNS_R_BADDB;
3311                                 goto node_exit;
3312                         }
3313
                             /*
                              * For a wildcard match the missing NSEC is not
                              * treated as an error; the closest NSEC is looked
                              * up instead, after releasing the node lock.
                              */
3314                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3315                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3316                         result = find_closest_nsec(&search, nodep, foundname,
3317                                                    rdataset, sigrdataset,
3318                                                    search.rbtdb->secure);
3319                         if (result == ISC_R_SUCCESS)
3320                                 result = DNS_R_EMPTYWILD;
3321                         goto tree_exit;
3322                 }
3323                 if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
3324                     nsecheader == NULL)
3325                 {
3326                         /*
3327                          * There's no NSEC record, and we were told
3328                          * to find one.
3329                          */
3330                         result = DNS_R_BADDB;
3331                         goto node_exit;
3332                 }
3333                 if (nodep != NULL) {
3334                         new_reference(search.rbtdb, node);
3335                         *nodep = node;
3336                 }
                     /*
                      * 'nsecheader' cannot be NULL here: both checks above
                      * leave the function when it is missing in the secure
                      * or DNS_DBFIND_FORCENSEC case.
                      */
3337                 if (search.rbtdb->secure ||
3338                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3339                 {
3340                         bind_rdataset(search.rbtdb, node, nsecheader,
3341                                       0, rdataset);
3342                         if (nsecsig != NULL)
3343                                 bind_rdataset(search.rbtdb, node,
3344                                               nsecsig, 0, sigrdataset);
3345                 }
3346                 if (wild)
3347                         foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3348                 goto node_exit;
3349         }
3350
3351         /*
3352          * We found what we were looking for, or we found a CNAME.
3353          */
3354
3355         if (type != found->type &&
3356             type != dns_rdatatype_any &&
3357             found->type == dns_rdatatype_cname) {
3358                 /*
3359                  * We weren't doing an ANY query and we found a CNAME instead
3360                  * of the type we were looking for, so we need to indicate
3361                  * that result to the caller.
3362                  */
3363                 result = DNS_R_CNAME;
3364         } else if (search.zonecut != NULL) {
3365                 /*
3366                  * If we're beneath a zone cut, we must indicate that the
3367                  * result is glue, unless we're actually at the zone cut
3368                  * and the type is NSEC or KEY.
3369                  */
3370                 if (search.zonecut == node) {
3371                         /*
3372                          * It is not clear if KEY should still be
3373                          * allowed at the parent side of the zone
3374                          * cut or not.  It is needed for RFC3007
3375                          * validated updates.
3376                          */
3377                         if (type == dns_rdatatype_nsec ||
3378                             type == dns_rdatatype_key)
3379                                 result = ISC_R_SUCCESS;
3380                         else if (type == dns_rdatatype_any)
3381                                 result = DNS_R_ZONECUT;
3382                         else
3383                                 result = DNS_R_GLUE;
3384                 } else
3385                         result = DNS_R_GLUE;
3386                 /*
3387                  * We might have found data that isn't glue, but was occluded
3388                  * by a dynamic update.  If the caller cares about this, they
3389                  * will have told us to validate glue.
3390                  *
3391                  * XXX We should cache the glue validity state!
3392                  */
3393                 if (result == DNS_R_GLUE &&
3394                     (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
3395                     !valid_glue(&search, foundname, type, node)) {
3396                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3397                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3398                         result = setup_delegation(&search, nodep, foundname,
3399                                                   rdataset, sigrdataset);
3400                     goto tree_exit;
3401                 }
3402         } else {
3403                 /*
3404                  * An ordinary successful query!
3405                  */
3406                 result = ISC_R_SUCCESS;
3407         }
3408
3409         if (nodep != NULL) {
                     /*
                      * If this node was recorded as the zone cut in the loop
                      * above, a reference was already taken on it there.
                      * That reference is handed to the caller, and the
                      * cleanup that would have released it is cancelled.
                      */
3410                 if (!at_zonecut)
3411                         new_reference(search.rbtdb, node);
3412                 else
3413                         search.need_cleanup = ISC_FALSE;
3414                 *nodep = node;
3415         }
3416
             /*
              * For a dns_rdatatype_any lookup no rdataset is bound here; the
              * caller only gets the node back through 'nodep'.
              */
3417         if (type != dns_rdatatype_any) {
3418                 bind_rdataset(search.rbtdb, node, found, 0, rdataset);
3419                 if (foundsig != NULL)
3420                         bind_rdataset(search.rbtdb, node, foundsig, 0,
3421                                       sigrdataset);
3422         }
3423
3424         if (wild)
3425                 foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3426
             /*
              * Reached with the node lock of 'node' still read-locked.
              */
3427  node_exit:
3428         NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3429                     isc_rwlocktype_read);
3430
             /*
              * Reached with only the tree lock held.
              */
3431  tree_exit:
3432         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3433
3434         /*
3435          * If we found a zonecut but aren't going to use it, we have to
3436          * let go of it.
3437          */
3438         if (search.need_cleanup) {
3439                 node = search.zonecut;
3440                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
3441
3442                 NODE_LOCK(lock, isc_rwlocktype_read);
3443                 decrement_reference(search.rbtdb, node, 0,
3444                                     isc_rwlocktype_read, isc_rwlocktype_none,
3445                                     ISC_FALSE);
3446                 NODE_UNLOCK(lock, isc_rwlocktype_read);
3447         }
3448
3449         if (close_version)
3450                 closeversion(db, &version, ISC_FALSE);
3451
3452         dns_rbtnodechain_reset(&search.chain);
3453
3454         return (result);
3455 }
3456
3457 static isc_result_t
3458 zone_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
3459                  isc_stdtime_t now, dns_dbnode_t **nodep,
3460                  dns_name_t *foundname,
3461                  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3462 {
3463         UNUSED(db);
3464         UNUSED(name);
3465         UNUSED(options);
3466         UNUSED(now);
3467         UNUSED(nodep);
3468         UNUSED(foundname);
3469         UNUSED(rdataset);
3470         UNUSED(sigrdataset);
3471
3472         FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
3473
3474         return (ISC_R_NOTIMPLEMENTED);
3475 }
3476
3477 static isc_result_t
3478 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
3479         rbtdb_search_t *search = arg;
3480         rdatasetheader_t *header, *header_prev, *header_next;
3481         rdatasetheader_t *dname_header, *sigdname_header;
3482         isc_result_t result;
3483         nodelock_t *lock;
3484         isc_rwlocktype_t locktype;
3485
3486         /* XXX comment */
3487
3488         REQUIRE(search->zonecut == NULL);
3489
3490         /*
3491          * Keep compiler silent.
3492          */
3493         UNUSED(name);
3494
3495         lock = &(search->rbtdb->node_locks[node->locknum].lock);
3496         locktype = isc_rwlocktype_read;
3497         NODE_LOCK(lock, locktype);
3498
3499         /*
3500          * Look for a DNAME or RRSIG DNAME rdataset.
3501          */
3502         dname_header = NULL;
3503         sigdname_header = NULL;
3504         header_prev = NULL;
3505         for (header = node->data; header != NULL; header = header_next) {
3506                 header_next = header->next;
3507                 if (header->rdh_ttl <= search->now) {
3508                         /*
3509                          * This rdataset is stale.  If no one else is
3510                          * using the node, we can clean it up right
3511                          * now, otherwise we mark it as stale, and
3512                          * the node as dirty, so it will get cleaned
3513                          * up later.
3514                          */
3515                         if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) &&
3516                             (locktype == isc_rwlocktype_write ||
3517                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3518                                 /*
3519                                  * We update the node's status only when we
3520                                  * can get write access; otherwise, we leave
3521                                  * others to this work.  Periodical cleaning
3522                                  * will eventually take the job as the last
3523                                  * resort.
3524                                  * We won't downgrade the lock, since other
3525                                  * rdatasets are probably stale, too.
3526                                  */
3527                                 locktype = isc_rwlocktype_write;
3528
3529                                 if (dns_rbtnode_refcurrent(node) == 0) {
3530                                         isc_mem_t *mctx;
3531
3532                                         /*
3533                                          * header->down can be non-NULL if the
3534                                          * refcount has just decremented to 0
3535                                          * but decrement_reference() has not
3536                                          * performed clean_cache_node(), in
3537                                          * which case we need to purge the
3538                                          * stale headers first.
3539                                          */
3540                                         mctx = search->rbtdb->common.mctx;
3541                                         clean_stale_headers(search->rbtdb,
3542                                                             mctx,
3543                                                             header);
3544                                         if (header_prev != NULL)
3545                                                 header_prev->next =
3546                                                         header->next;
3547                                         else
3548                                                 node->data = header->next;
3549                                         free_rdataset(search->rbtdb, mctx,
3550                                                       header);
3551                                 } else {
3552                                         header->attributes |=
3553                                                 RDATASET_ATTR_STALE;
3554                                         node->dirty = 1;
3555                                         header_prev = header;
3556                                 }
3557                         } else
3558                                 header_prev = header;
3559                 } else if (header->type == dns_rdatatype_dname &&
3560                            EXISTS(header)) {
3561                         dname_header = header;
3562                         header_prev = header;
3563                 } else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
3564                          EXISTS(header)) {
3565                         sigdname_header = header;
3566                         header_prev = header;
3567                 } else
3568                         header_prev = header;
3569         }
3570
3571         if (dname_header != NULL &&
3572             (!DNS_TRUST_PENDING(dname_header->trust) ||
3573              (search->options & DNS_DBFIND_PENDINGOK) != 0)) {
3574                 /*
3575                  * We increment the reference count on node to ensure that
3576                  * search->zonecut_rdataset will still be valid later.
3577                  */
3578                 new_reference(search->rbtdb, node);
3579                 INSIST(!ISC_LINK_LINKED(node, deadlink));
3580                 search->zonecut = node;
3581                 search->zonecut_rdataset = dname_header;
3582                 search->zonecut_sigrdataset = sigdname_header;
3583                 search->need_cleanup = ISC_TRUE;
3584                 result = DNS_R_PARTIALMATCH;
3585         } else
3586                 result = DNS_R_CONTINUE;
3587
3588         NODE_UNLOCK(lock, locktype);
3589
3590         return (result);
3591 }
3592
3593 static inline isc_result_t
3594 find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
3595                      dns_dbnode_t **nodep, dns_name_t *foundname,
3596                      dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3597 {
3598         unsigned int i;
3599         dns_rbtnode_t *level_node;
3600         rdatasetheader_t *header, *header_prev, *header_next;
3601         rdatasetheader_t *found, *foundsig;
3602         isc_result_t result = ISC_R_NOTFOUND;
3603         dns_name_t name;
3604         dns_rbtdb_t *rbtdb;
3605         isc_boolean_t done;
3606         nodelock_t *lock;
3607         isc_rwlocktype_t locktype;
3608
3609         /*
3610          * Caller must be holding the tree lock.
3611          */
3612
3613         rbtdb = search->rbtdb;
3614         i = search->chain.level_matches;
3615         done = ISC_FALSE;
3616         do {
3617                 locktype = isc_rwlocktype_read;
3618                 lock = &rbtdb->node_locks[node->locknum].lock;
3619                 NODE_LOCK(lock, locktype);
3620
3621                 /*
3622                  * Look for NS and RRSIG NS rdatasets.
3623                  */
3624                 found = NULL;
3625                 foundsig = NULL;
3626                 header_prev = NULL;
3627                 for (header = node->data;
3628                      header != NULL;
3629                      header = header_next) {
3630                         header_next = header->next;
3631                         if (header->rdh_ttl <= search->now) {
3632                                 /*
3633                                  * This rdataset is stale.  If no one else is
3634                                  * using the node, we can clean it up right
3635                                  * now, otherwise we mark it as stale, and
3636                                  * the node as dirty, so it will get cleaned
3637                                  * up later.
3638                                  */
3639                                 if ((header->rdh_ttl <= search->now -
3640                                                     RBTDB_VIRTUAL) &&
3641                                     (locktype == isc_rwlocktype_write ||
3642                                      NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3643                                         /*
3644                                          * We update the node's status only
3645                                          * when we can get write access.
3646                                          */
3647                                         locktype = isc_rwlocktype_write;
3648
3649                                         if (dns_rbtnode_refcurrent(node)
3650                                             == 0) {
3651                                                 isc_mem_t *m;
3652
3653                                                 m = search->rbtdb->common.mctx;
3654                                                 clean_stale_headers(
3655                                                         search->rbtdb,
3656                                                         m, header);
3657                                                 if (header_prev != NULL)
3658                                                         header_prev->next =
3659                                                                 header->next;
3660                                                 else
3661                                                         node->data =
3662                                                                 header->next;
3663                                                 free_rdataset(rbtdb, m,
3664                                                               header);
3665                                         } else {
3666                                                 header->attributes |=
3667                                                         RDATASET_ATTR_STALE;
3668                                                 node->dirty = 1;
3669                                                 header_prev = header;
3670                                         }
3671                                 } else
3672                                         header_prev = header;
3673                         } else if (EXISTS(header)) {
3674                                 /*
3675                                  * We've found an extant rdataset.  See if
3676                                  * we're interested in it.
3677                                  */
3678                                 if (header->type == dns_rdatatype_ns) {
3679                                         found = header;
3680                                         if (foundsig != NULL)
3681                                                 break;
3682                                 } else if (header->type ==
3683                                            RBTDB_RDATATYPE_SIGNS) {
3684                                         foundsig = header;
3685                                         if (found != NULL)
3686                                                 break;
3687                                 }
3688                                 header_prev = header;
3689                         } else
3690                                 header_prev = header;
3691                 }
3692
3693                 if (found != NULL) {
3694                         /*
3695                          * If we have to set foundname, we do it before
3696                          * anything else.  If we were to set foundname after
3697                          * we had set nodep or bound the rdataset, then we'd
3698                          * have to undo that work if dns_name_concatenate()
3699                          * failed.  By setting foundname first, there's
3700                          * nothing to undo if we have trouble.
3701                          */
3702                         if (foundname != NULL) {
3703                                 dns_name_init(&name, NULL);
3704                                 dns_rbt_namefromnode(node, &name);
3705                                 result = dns_name_copy(&name, foundname, NULL);
3706                                 while (result == ISC_R_SUCCESS && i > 0) {
3707                                         i--;
3708                                         level_node = search->chain.levels[i];
3709                                         dns_name_init(&name, NULL);
3710                                         dns_rbt_namefromnode(level_node,
3711                                                              &name);
3712                                         result =
3713                                                 dns_name_concatenate(foundname,
3714                                                                      &name,
3715                                                                      foundname,
3716                                                                      NULL);
3717                                 }
3718                                 if (result != ISC_R_SUCCESS) {
3719                                         *nodep = NULL;
3720                                         goto node_exit;
3721                                 }
3722                         }
3723                         result = DNS_R_DELEGATION;
3724                         if (nodep != NULL) {
3725                                 new_reference(search->rbtdb, node);
3726                                 *nodep = node;
3727                         }
3728                         bind_rdataset(search->rbtdb, node, found, search->now,
3729                                       rdataset);
3730                         if (foundsig != NULL)
3731                                 bind_rdataset(search->rbtdb, node, foundsig,
3732                                               search->now, sigrdataset);
3733                         if (need_headerupdate(found, search->now) ||
3734                             (foundsig != NULL &&
3735                              need_headerupdate(foundsig, search->now))) {
3736                                 if (locktype != isc_rwlocktype_write) {
3737                                         NODE_UNLOCK(lock, locktype);
3738                                         NODE_LOCK(lock, isc_rwlocktype_write);
3739                                         locktype = isc_rwlocktype_write;
3740                                 }
3741                                 if (need_headerupdate(found, search->now))
3742                                         update_header(search->rbtdb, found,
3743                                                       search->now);
3744                                 if (foundsig != NULL &&
3745                                     need_headerupdate(foundsig, search->now)) {
3746                                         update_header(search->rbtdb, foundsig,
3747                                                       search->now);
3748                                 }
3749                         }
3750                 }
3751
3752         node_exit:
3753                 NODE_UNLOCK(lock, locktype);
3754
3755                 if (found == NULL && i > 0) {
3756                         i--;
3757                         node = search->chain.levels[i];
3758                 } else
3759                         done = ISC_TRUE;
3760
3761         } while (!done);
3762
3763         return (result);
3764 }
3765
3766 static isc_result_t
3767 find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3768                   isc_stdtime_t now, dns_name_t *foundname,
3769                   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3770 {
3771         dns_rbtnode_t *node;
3772         rdatasetheader_t *header, *header_next, *header_prev;
3773         rdatasetheader_t *found, *foundsig;
3774         isc_boolean_t empty_node;
3775         isc_result_t result;
3776         dns_fixedname_t fname, forigin;
3777         dns_name_t *name, *origin;
3778         rbtdb_rdatatype_t matchtype, sigmatchtype;
3779         nodelock_t *lock;
3780         isc_rwlocktype_t locktype;
3781
3782         matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
3783         sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
3784                                              dns_rdatatype_nsec);
3785
3786         do {
3787                 node = NULL;
3788                 dns_fixedname_init(&fname);
3789                 name = dns_fixedname_name(&fname);
3790                 dns_fixedname_init(&forigin);
3791                 origin = dns_fixedname_name(&forigin);
3792                 result = dns_rbtnodechain_current(&search->chain, name,
3793                                                   origin, &node);
3794                 if (result != ISC_R_SUCCESS)
3795                         return (result);
3796                 locktype = isc_rwlocktype_read;
3797                 lock = &(search->rbtdb->node_locks[node->locknum].lock);
3798                 NODE_LOCK(lock, locktype);
3799                 found = NULL;
3800                 foundsig = NULL;
3801                 empty_node = ISC_TRUE;
3802                 header_prev = NULL;
3803                 for (header = node->data;
3804                      header != NULL;
3805                      header = header_next) {
3806                         header_next = header->next;
3807                         if (header->rdh_ttl <= now) {
3808                                 /*
3809                                  * This rdataset is stale.  If no one else is
3810                                  * using the node, we can clean it up right
3811                                  * now, otherwise we mark it as stale, and the
3812                                  * node as dirty, so it will get cleaned up
3813                                  * later.
3814                                  */
3815                                 if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3816                                     (locktype == isc_rwlocktype_write ||
3817                                      NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3818                                         /*
3819                                          * We update the node's status only
3820                                          * when we can get write access.
3821                                          */
3822                                         locktype = isc_rwlocktype_write;
3823
3824                                         if (dns_rbtnode_refcurrent(node)
3825                                             == 0) {
3826                                                 isc_mem_t *m;
3827
3828                                                 m = search->rbtdb->common.mctx;
3829                                                 clean_stale_headers(
3830                                                         search->rbtdb,
3831                                                         m, header);
3832                                                 if (header_prev != NULL)
3833                                                         header_prev->next =
3834                                                                 header->next;
3835                                                 else
3836                                                         node->data = header->next;
3837                                                 free_rdataset(search->rbtdb, m,
3838                                                               header);
3839                                         } else {
3840                                                 header->attributes |=
3841                                                         RDATASET_ATTR_STALE;
3842                                                 node->dirty = 1;
3843                                                 header_prev = header;
3844                                         }
3845                                 } else
3846                                         header_prev = header;
3847                                 continue;
3848                         }
3849                         if (NONEXISTENT(header) ||
3850                             RBTDB_RDATATYPE_BASE(header->type) == 0) {
3851                                 header_prev = header;
3852                                 continue;
3853                         }
3854                         empty_node = ISC_FALSE;
3855                         if (header->type == matchtype)
3856                                 found = header;
3857                         else if (header->type == sigmatchtype)
3858                                 foundsig = header;
3859                         header_prev = header;
3860                 }
3861                 if (found != NULL) {
3862                         result = dns_name_concatenate(name, origin,
3863                                                       foundname, NULL);
3864                         if (result != ISC_R_SUCCESS)
3865                                 goto unlock_node;
3866                         bind_rdataset(search->rbtdb, node, found,
3867                                       now, rdataset);
3868                         if (foundsig != NULL)
3869                                 bind_rdataset(search->rbtdb, node, foundsig,
3870                                               now, sigrdataset);
3871                         new_reference(search->rbtdb, node);
3872                         *nodep = node;
3873                         result = DNS_R_COVERINGNSEC;
3874                 } else if (!empty_node) {
3875                         result = ISC_R_NOTFOUND;
3876                 } else
3877                         result = dns_rbtnodechain_prev(&search->chain, NULL,
3878                                                        NULL);
3879  unlock_node:
3880                 NODE_UNLOCK(lock, locktype);
3881         } while (empty_node && result == ISC_R_SUCCESS);
3882         return (result);
3883 }
3884
3885 static isc_result_t
3886 cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
3887            dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
3888            dns_dbnode_t **nodep, dns_name_t *foundname,
3889            dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3890 {
3891         dns_rbtnode_t *node = NULL;
3892         isc_result_t result;
3893         rbtdb_search_t search;
3894         isc_boolean_t cname_ok = ISC_TRUE;
3895         isc_boolean_t empty_node;
3896         nodelock_t *lock;
3897         isc_rwlocktype_t locktype;
3898         rdatasetheader_t *header, *header_prev, *header_next;
3899         rdatasetheader_t *found, *nsheader;
3900         rdatasetheader_t *foundsig, *nssig, *cnamesig;
3901         rdatasetheader_t *update, *updatesig;
3902         rbtdb_rdatatype_t sigtype, negtype;
3903
3904         UNUSED(version);
3905
3906         search.rbtdb = (dns_rbtdb_t *)db;
3907
3908         REQUIRE(VALID_RBTDB(search.rbtdb));
3909         REQUIRE(version == NULL);
3910
3911         if (now == 0)
3912                 isc_stdtime_get(&now);
3913
3914         search.rbtversion = NULL;
3915         search.serial = 1;
3916         search.options = options;
3917         search.copy_name = ISC_FALSE;
3918         search.need_cleanup = ISC_FALSE;
3919         search.wild = ISC_FALSE;
3920         search.zonecut = NULL;
3921         dns_fixedname_init(&search.zonecut_name);
3922         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3923         search.now = now;
3924         update = NULL;
3925         updatesig = NULL;
3926
3927         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3928
3929         /*
3930          * Search down from the root of the tree.  If, while going down, we
3931          * encounter a callback node, cache_zonecut_callback() will search the
3932          * rdatasets at the zone cut for a DNAME rdataset.
3933          */
3934         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3935                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3936                                   cache_zonecut_callback, &search);
3937
3938         if (result == DNS_R_PARTIALMATCH) {
3939                 if ((search.options & DNS_DBFIND_COVERINGNSEC) != 0) {
3940                         result = find_coveringnsec(&search, nodep, now,
3941                                                    foundname, rdataset,
3942                                                    sigrdataset);
3943                         if (result == DNS_R_COVERINGNSEC)
3944                                 goto tree_exit;
3945                 }
3946                 if (search.zonecut != NULL) {
3947                     result = setup_delegation(&search, nodep, foundname,
3948                                               rdataset, sigrdataset);
3949                     goto tree_exit;
3950                 } else {
3951                 find_ns:
3952                         result = find_deepest_zonecut(&search, node, nodep,
3953                                                       foundname, rdataset,
3954                                                       sigrdataset);
3955                         goto tree_exit;
3956                 }
3957         } else if (result != ISC_R_SUCCESS)
3958                 goto tree_exit;
3959
3960         /*
3961          * Certain DNSSEC types are not subject to CNAME matching
3962          * (RFC4035, section 2.5 and RFC3007).
3963          *
3964          * We don't check for RRSIG, because we don't store RRSIG records
3965          * directly.
3966          */
3967         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3968                 cname_ok = ISC_FALSE;
3969
3970         /*
3971          * We now go looking for rdata...
3972          */
3973
3974         lock = &(search.rbtdb->node_locks[node->locknum].lock);
3975         locktype = isc_rwlocktype_read;
3976         NODE_LOCK(lock, locktype);
3977
3978         found = NULL;
3979         foundsig = NULL;
3980         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3981         negtype = RBTDB_RDATATYPE_VALUE(0, type);
3982         nsheader = NULL;
3983         nssig = NULL;
3984         cnamesig = NULL;
3985         empty_node = ISC_TRUE;
3986         header_prev = NULL;
3987         for (header = node->data; header != NULL; header = header_next) {
3988                 header_next = header->next;
3989                 if (header->rdh_ttl <= now) {
3990                         /*
3991                          * This rdataset is stale.  If no one else is using the
3992                          * node, we can clean it up right now, otherwise we
3993                          * mark it as stale, and the node as dirty, so it will
3994                          * get cleaned up later.
3995                          */
3996                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3997                             (locktype == isc_rwlocktype_write ||
3998                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3999                                 /*
4000                                  * We update the node's status only when we
4001                                  * can get write access.
4002                                  */
4003                                 locktype = isc_rwlocktype_write;
4004
4005                                 if (dns_rbtnode_refcurrent(node) == 0) {
4006                                         isc_mem_t *mctx;
4007
4008                                         mctx = search.rbtdb->common.mctx;
4009                                         clean_stale_headers(search.rbtdb, mctx,
4010                                                             header);
4011                                         if (header_prev != NULL)
4012                                                 header_prev->next =
4013                                                         header->next;
4014                                         else
4015                                                 node->data = header->next;
4016                                         free_rdataset(search.rbtdb, mctx,
4017                                                       header);
4018                                 } else {
4019                                         header->attributes |=
4020                                                 RDATASET_ATTR_STALE;
4021                                         node->dirty = 1;
4022                                         header_prev = header;
4023                                 }
4024                         } else
4025                                 header_prev = header;
4026                 } else if (EXISTS(header)) {
4027                         /*
4028                          * We now know that there is at least one active
4029                          * non-stale rdataset at this node.
4030                          */
4031                         empty_node = ISC_FALSE;
4032
4033                         /*
4034                          * If we found a type we were looking for, remember
4035                          * it.
4036                          */
4037                         if (header->type == type ||
4038                             (type == dns_rdatatype_any &&
4039                              RBTDB_RDATATYPE_BASE(header->type) != 0) ||
4040                             (cname_ok && header->type ==
4041                              dns_rdatatype_cname)) {
4042                                 /*
4043                                  * We've found the answer.
4044                                  */
4045                                 found = header;
4046                                 if (header->type == dns_rdatatype_cname &&
4047                                     cname_ok &&
4048                                     cnamesig != NULL) {
4049                                         /*
4050                                          * If we've already got the CNAME RRSIG,
4051                                          * use it, otherwise change sigtype
4052                                          * so that we find it.
4053                                          */
4054                                         if (cnamesig != NULL)
4055                                                 foundsig = cnamesig;
4056                                         else
4057                                                 sigtype =
4058                                                     RBTDB_RDATATYPE_SIGCNAME;
4059                                         foundsig = cnamesig;
4060                                 }
4061                         } else if (header->type == sigtype) {
4062                                 /*
4063                                  * We've found the RRSIG rdataset for our
4064                                  * target type.  Remember it.
4065                                  */
4066                                 foundsig = header;
4067                         } else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
4068                                    header->type == negtype) {
4069                                 /*
4070                                  * We've found a negative cache entry.
4071                                  */
4072                                 found = header;
4073                         } else if (header->type == dns_rdatatype_ns) {
4074                                 /*
4075                                  * Remember a NS rdataset even if we're
4076                                  * not specifically looking for it, because
4077                                  * we might need it later.
4078                                  */
4079                                 nsheader = header;
4080                         } else if (header->type == RBTDB_RDATATYPE_SIGNS) {
4081                                 /*
4082                                  * If we need the NS rdataset, we'll also
4083                                  * need its signature.
4084                                  */
4085                                 nssig = header;
4086                         } else if (cname_ok &&
4087                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
4088                                 /*
4089                                  * If we get a CNAME match, we'll also need
4090                                  * its signature.
4091                                  */
4092                                 cnamesig = header;
4093                         }
4094                         header_prev = header;
4095                 } else
4096                         header_prev = header;
4097         }
4098
4099         if (empty_node) {
4100                 /*
4101                  * We have an exact match for the name, but there are no
4102                  * extant rdatasets.  That means that this node doesn't
4103                  * meaningfully exist, and that we really have a partial match.
4104                  */
4105                 NODE_UNLOCK(lock, locktype);
4106                 goto find_ns;
4107         }
4108
4109         /*
4110          * If we didn't find what we were looking for...
4111          */
4112         if (found == NULL ||
4113             (DNS_TRUST_ADDITIONAL(found->trust) &&
4114              ((options & DNS_DBFIND_ADDITIONALOK) == 0)) ||
4115             (found->trust == dns_trust_glue &&
4116              ((options & DNS_DBFIND_GLUEOK) == 0)) ||
4117             (DNS_TRUST_PENDING(found->trust) &&
4118              ((options & DNS_DBFIND_PENDINGOK) == 0))) {
4119                 /*
4120                  * If there is an NS rdataset at this node, then this is the
4121                  * deepest zone cut.
4122                  */
4123                 if (nsheader != NULL) {
4124                         if (nodep != NULL) {
4125                                 new_reference(search.rbtdb, node);
4126                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4127                                 *nodep = node;
4128                         }
4129                         bind_rdataset(search.rbtdb, node, nsheader, search.now,
4130                                       rdataset);
4131                         if (need_headerupdate(nsheader, search.now))
4132                                 update = nsheader;
4133                         if (nssig != NULL) {
4134                                 bind_rdataset(search.rbtdb, node, nssig,
4135                                               search.now, sigrdataset);
4136                                 if (need_headerupdate(nssig, search.now))
4137                                         updatesig = nssig;
4138                         }
4139                         result = DNS_R_DELEGATION;
4140                         goto node_exit;
4141                 }
4142
4143                 /*
4144                  * Go find the deepest zone cut.
4145                  */
4146                 NODE_UNLOCK(lock, locktype);
4147                 goto find_ns;
4148         }
4149
4150         /*
4151          * We found what we were looking for, or we found a CNAME.
4152          */
4153
4154         if (nodep != NULL) {
4155                 new_reference(search.rbtdb, node);
4156                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4157                 *nodep = node;
4158         }
4159
4160         if (RBTDB_RDATATYPE_BASE(found->type) == 0) {
4161                 /*
4162                  * We found a negative cache entry.
4163                  */
4164                 if (NXDOMAIN(found))
4165                         result = DNS_R_NCACHENXDOMAIN;
4166                 else
4167                         result = DNS_R_NCACHENXRRSET;
4168         } else if (type != found->type &&
4169                    type != dns_rdatatype_any &&
4170                    found->type == dns_rdatatype_cname) {
4171                 /*
4172                  * We weren't doing an ANY query and we found a CNAME instead
4173                  * of the type we were looking for, so we need to indicate
4174                  * that result to the caller.
4175                  */
4176                 result = DNS_R_CNAME;
4177         } else {
4178                 /*
4179                  * An ordinary successful query!
4180                  */
4181                 result = ISC_R_SUCCESS;
4182         }
4183
4184         if (type != dns_rdatatype_any || result == DNS_R_NCACHENXDOMAIN ||
4185             result == DNS_R_NCACHENXRRSET) {
4186                 bind_rdataset(search.rbtdb, node, found, search.now,
4187                               rdataset);
4188                 if (need_headerupdate(found, search.now))
4189                         update = found;
4190                 if (foundsig != NULL) {
4191                         bind_rdataset(search.rbtdb, node, foundsig, search.now,
4192                                       sigrdataset);
4193                         if (need_headerupdate(foundsig, search.now))
4194                                 updatesig = foundsig;
4195                 }
4196         }
4197
4198  node_exit:
4199         if ((update != NULL || updatesig != NULL) &&
4200             locktype != isc_rwlocktype_write) {
4201                 NODE_UNLOCK(lock, locktype);
4202                 NODE_LOCK(lock, isc_rwlocktype_write);
4203                 locktype = isc_rwlocktype_write;
4204         }
4205         if (update != NULL && need_headerupdate(update, search.now))
4206                 update_header(search.rbtdb, update, search.now);
4207         if (updatesig != NULL && need_headerupdate(updatesig, search.now))
4208                 update_header(search.rbtdb, updatesig, search.now);
4209
4210         NODE_UNLOCK(lock, locktype);
4211
4212  tree_exit:
4213         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4214
4215         /*
4216          * If we found a zonecut but aren't going to use it, we have to
4217          * let go of it.
4218          */
4219         if (search.need_cleanup) {
4220                 node = search.zonecut;
4221                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
4222
4223                 NODE_LOCK(lock, isc_rwlocktype_read);
4224                 decrement_reference(search.rbtdb, node, 0,
4225                                     isc_rwlocktype_read, isc_rwlocktype_none,
4226                                     ISC_FALSE);
4227                 NODE_UNLOCK(lock, isc_rwlocktype_read);
4228         }
4229
4230         dns_rbtnodechain_reset(&search.chain);
4231
4232         return (result);
4233 }
4234
4235 static isc_result_t
4236 cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
4237                   isc_stdtime_t now, dns_dbnode_t **nodep,
4238                   dns_name_t *foundname,
4239                   dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
4240 {
4241         dns_rbtnode_t *node = NULL;
4242         nodelock_t *lock;
4243         isc_result_t result;
4244         rbtdb_search_t search;
4245         rdatasetheader_t *header, *header_prev, *header_next;
4246         rdatasetheader_t *found, *foundsig;
4247         unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA;
4248         isc_rwlocktype_t locktype;
4249
4250         search.rbtdb = (dns_rbtdb_t *)db;
4251
4252         REQUIRE(VALID_RBTDB(search.rbtdb));
4253
4254         if (now == 0)
4255                 isc_stdtime_get(&now);
4256
4257         search.rbtversion = NULL;
4258         search.serial = 1;
4259         search.options = options;
4260         search.copy_name = ISC_FALSE;
4261         search.need_cleanup = ISC_FALSE;
4262         search.wild = ISC_FALSE;
4263         search.zonecut = NULL;
4264         dns_fixedname_init(&search.zonecut_name);
4265         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
4266         search.now = now;
4267
4268         if ((options & DNS_DBFIND_NOEXACT) != 0)
4269                 rbtoptions |= DNS_RBTFIND_NOEXACT;
4270
4271         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4272
4273         /*
4274          * Search down from the root of the tree.
4275          */
4276         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
4277                                   &search.chain, rbtoptions, NULL, &search);
4278
4279         if (result == DNS_R_PARTIALMATCH) {
4280         find_ns:
4281                 result = find_deepest_zonecut(&search, node, nodep, foundname,
4282                                               rdataset, sigrdataset);
4283                 goto tree_exit;
4284         } else if (result != ISC_R_SUCCESS)
4285                 goto tree_exit;
4286
4287         /*
4288          * We now go looking for an NS rdataset at the node.
4289          */
4290
4291         lock = &(search.rbtdb->node_locks[node->locknum].lock);
4292         locktype = isc_rwlocktype_read;
4293         NODE_LOCK(lock, locktype);
4294
4295         found = NULL;
4296         foundsig = NULL;
4297         header_prev = NULL;
4298         for (header = node->data; header != NULL; header = header_next) {
4299                 header_next = header->next;
4300                 if (header->rdh_ttl <= now) {
4301                         /*
4302                          * This rdataset is stale.  If no one else is using the
4303                          * node, we can clean it up right now, otherwise we
4304                          * mark it as stale, and the node as dirty, so it will
4305                          * get cleaned up later.
4306                          */
4307                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
4308                             (locktype == isc_rwlocktype_write ||
4309                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
4310                                 /*
4311                                  * We update the node's status only when we
4312                                  * can get write access.
4313                                  */
4314                                 locktype = isc_rwlocktype_write;
4315
4316                                 if (dns_rbtnode_refcurrent(node) == 0) {
4317                                         isc_mem_t *mctx;
4318
4319                                         mctx = search.rbtdb->common.mctx;
4320                                         clean_stale_headers(search.rbtdb, mctx,
4321                                                             header);
4322                                         if (header_prev != NULL)
4323                                                 header_prev->next =
4324                                                         header->next;
4325                                         else
4326                                                 node->data = header->next;
4327                                         free_rdataset(search.rbtdb, mctx,
4328                                                       header);
4329                                 } else {
4330                                         header->attributes |=
4331                                                 RDATASET_ATTR_STALE;
4332                                         node->dirty = 1;
4333                                         header_prev = header;
4334                                 }
4335                         } else
4336                                 header_prev = header;
4337                 } else if (EXISTS(header)) {
4338                         /*
4339                          * If we found a type we were looking for, remember
4340                          * it.
4341                          */
4342                         if (header->type == dns_rdatatype_ns) {
4343                                 /*
4344                                  * Remember a NS rdataset even if we're
4345                                  * not specifically looking for it, because
4346                                  * we might need it later.
4347                                  */
4348                                 found = header;
4349                         } else if (header->type == RBTDB_RDATATYPE_SIGNS) {
4350                                 /*
4351                                  * If we need the NS rdataset, we'll also
4352                                  * need its signature.
4353                                  */
4354                                 foundsig = header;
4355                         }
4356                         header_prev = header;
4357                 } else
4358                         header_prev = header;
4359         }
4360
4361         if (found == NULL) {
4362                 /*
4363                  * No NS records here.
4364                  */
4365                 NODE_UNLOCK(lock, locktype);
4366                 goto find_ns;
4367         }
4368
4369         if (nodep != NULL) {
4370                 new_reference(search.rbtdb, node);
4371                 INSIST(!ISC_LINK_LINKED(node, deadlink));
4372                 *nodep = node;
4373         }
4374
4375         bind_rdataset(search.rbtdb, node, found, search.now, rdataset);
4376         if (foundsig != NULL)
4377                 bind_rdataset(search.rbtdb, node, foundsig, search.now,
4378                               sigrdataset);
4379
4380         if (need_headerupdate(found, search.now) ||
4381             (foundsig != NULL &&  need_headerupdate(foundsig, search.now))) {
4382                 if (locktype != isc_rwlocktype_write) {
4383                         NODE_UNLOCK(lock, locktype);
4384                         NODE_LOCK(lock, isc_rwlocktype_write);
4385                         locktype = isc_rwlocktype_write;
4386                 }
4387                 if (need_headerupdate(found, search.now))
4388                         update_header(search.rbtdb, found, search.now);
4389                 if (foundsig != NULL &&
4390                     need_headerupdate(foundsig, search.now)) {
4391                         update_header(search.rbtdb, foundsig, search.now);
4392                 }
4393         }
4394
4395         NODE_UNLOCK(lock, locktype);
4396
4397  tree_exit:
4398         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
4399
4400         INSIST(!search.need_cleanup);
4401
4402         dns_rbtnodechain_reset(&search.chain);
4403
4404         if (result == DNS_R_DELEGATION)
4405                 result = ISC_R_SUCCESS;
4406
4407         return (result);
4408 }
4409
4410 static void
4411 attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) {
4412         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4413         dns_rbtnode_t *node = (dns_rbtnode_t *)source;
4414         unsigned int refs;
4415
4416         REQUIRE(VALID_RBTDB(rbtdb));
4417         REQUIRE(targetp != NULL && *targetp == NULL);
4418
4419         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
4420         dns_rbtnode_refincrement(node, &refs);
4421         INSIST(refs != 0);
4422         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
4423
4424         *targetp = source;
4425 }
4426
4427 static void
4428 detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
4429         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4430         dns_rbtnode_t *node;
4431         isc_boolean_t want_free = ISC_FALSE;
4432         isc_boolean_t inactive = ISC_FALSE;
4433         rbtdb_nodelock_t *nodelock;
4434
4435         REQUIRE(VALID_RBTDB(rbtdb));
4436         REQUIRE(targetp != NULL && *targetp != NULL);
4437
4438         node = (dns_rbtnode_t *)(*targetp);
4439         nodelock = &rbtdb->node_locks[node->locknum];
4440
4441         NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
4442
4443         if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
4444                                 isc_rwlocktype_none, ISC_FALSE)) {
4445                 if (isc_refcount_current(&nodelock->references) == 0 &&
4446                     nodelock->exiting) {
4447                         inactive = ISC_TRUE;
4448                 }
4449         }
4450
4451         NODE_UNLOCK(&nodelock->lock, isc_rwlocktype_read);
4452
4453         *targetp = NULL;
4454
4455         if (inactive) {
4456                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
4457                 rbtdb->active--;
4458                 if (rbtdb->active == 0)
4459                         want_free = ISC_TRUE;
4460                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
4461                 if (want_free) {
4462                         char buf[DNS_NAME_FORMATSIZE];
4463                         if (dns_name_dynamic(&rbtdb->common.origin))
4464                                 dns_name_format(&rbtdb->common.origin, buf,
4465                                                 sizeof(buf));
4466                         else
4467                                 strcpy(buf, "<UNKNOWN>");
4468                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
4469                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
4470                                       "calling free_rbtdb(%s)", buf);
4471                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
4472                 }
4473         }
4474 }
4475
4476 static isc_result_t
4477 expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) {
4478         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4479         dns_rbtnode_t *rbtnode = node;
4480         rdatasetheader_t *header;
4481         isc_boolean_t force_expire = ISC_FALSE;
4482         /*
4483          * These are the category and module used by the cache cleaner.
4484          */
4485         isc_boolean_t log = ISC_FALSE;
4486         isc_logcategory_t *category = DNS_LOGCATEGORY_DATABASE;
4487         isc_logmodule_t *module = DNS_LOGMODULE_CACHE;
4488         int level = ISC_LOG_DEBUG(2);
4489         char printname[DNS_NAME_FORMATSIZE];
4490
4491         REQUIRE(VALID_RBTDB(rbtdb));
4492
4493         /*
4494          * Caller must hold a tree lock.
4495          */
4496
4497         if (now == 0)
4498                 isc_stdtime_get(&now);
4499
4500         if (rbtdb->overmem) {
4501                 isc_uint32_t val;
4502
4503                 isc_random_get(&val);
4504                 /*
4505                  * XXXDCL Could stand to have a better policy, like LRU.
4506                  */
4507                 force_expire = ISC_TF(rbtnode->down == NULL && val % 4 == 0);
4508
4509                 /*
4510                  * Note that 'log' can be true IFF rbtdb->overmem is also true.
4511                  * rbtdb->overmem can currently only be true for cache
4512                  * databases -- hence all of the "overmem cache" log strings.
4513                  */
4514                 log = ISC_TF(isc_log_wouldlog(dns_lctx, level));
4515                 if (log)
4516                         isc_log_write(dns_lctx, category, module, level,
4517                                       "overmem cache: %s %s",
4518                                       force_expire ? "FORCE" : "check",
4519                                       dns_rbt_formatnodename(rbtnode,
4520                                                            printname,
4521                                                            sizeof(printname)));
4522         }
4523
4524         /*
4525          * We may not need write access, but this code path is not performance
4526          * sensitive, so it should be okay to always lock as a writer.
4527          */
4528         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4529                   isc_rwlocktype_write);
4530
4531         for (header = rbtnode->data; header != NULL; header = header->next)
4532                 if (header->rdh_ttl <= now - RBTDB_VIRTUAL) {
4533                         /*
4534                          * We don't check if refcurrent(rbtnode) == 0 and try
4535                          * to free like we do in cache_find(), because
4536                          * refcurrent(rbtnode) must be non-zero.  This is so
4537                          * because 'node' is an argument to the function.
4538                          */
4539                         header->attributes |= RDATASET_ATTR_STALE;
4540                         rbtnode->dirty = 1;
4541                         if (log)
4542                                 isc_log_write(dns_lctx, category, module,
4543                                               level, "overmem cache: stale %s",
4544                                               printname);
4545                 } else if (force_expire) {
4546                         if (! RETAIN(header)) {
4547                                 set_ttl(rbtdb, header, 0);
4548                                 header->attributes |= RDATASET_ATTR_STALE;
4549                                 rbtnode->dirty = 1;
4550                         } else if (log) {
4551                                 isc_log_write(dns_lctx, category, module,
4552                                               level, "overmem cache: "
4553                                               "reprieve by RETAIN() %s",
4554                                               printname);
4555                         }
4556                 } else if (rbtdb->overmem && log)
4557                         isc_log_write(dns_lctx, category, module, level,
4558                                       "overmem cache: saved %s", printname);
4559
4560         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4561                     isc_rwlocktype_write);
4562
4563         return (ISC_R_SUCCESS);
4564 }
4565
4566 static void
4567 overmem(dns_db_t *db, isc_boolean_t overmem) {
4568         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4569
4570         if (IS_CACHE(rbtdb))
4571                 rbtdb->overmem = overmem;
4572 }
4573
4574 static void
4575 printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) {
4576         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4577         dns_rbtnode_t *rbtnode = node;
4578         isc_boolean_t first;
4579
4580         REQUIRE(VALID_RBTDB(rbtdb));
4581
4582         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4583                   isc_rwlocktype_read);
4584
4585         fprintf(out, "node %p, %u references, locknum = %u\n",
4586                 rbtnode, dns_rbtnode_refcurrent(rbtnode),
4587                 rbtnode->locknum);
4588         if (rbtnode->data != NULL) {
4589                 rdatasetheader_t *current, *top_next;
4590
4591                 for (current = rbtnode->data; current != NULL;
4592                      current = top_next) {
4593                         top_next = current->next;
4594                         first = ISC_TRUE;
4595                         fprintf(out, "\ttype %u", current->type);
4596                         do {
4597                                 if (!first)
4598                                         fprintf(out, "\t");
4599                                 first = ISC_FALSE;
4600                                 fprintf(out,
4601                                         "\tserial = %lu, ttl = %u, "
4602                                         "trust = %u, attributes = %u\n",
4603                                         (unsigned long)current->serial,
4604                                         current->rdh_ttl,
4605                                         current->trust,
4606                                         current->attributes);
4607                                 current = current->down;
4608                         } while (current != NULL);
4609                 }
4610         } else
4611                 fprintf(out, "(empty)\n");
4612
4613         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4614                     isc_rwlocktype_read);
4615 }
4616
4617 static isc_result_t
4618 createiterator(dns_db_t *db, isc_boolean_t relative_names,
4619                dns_dbiterator_t **iteratorp)
4620 {
4621         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4622         rbtdb_dbiterator_t *rbtdbiter;
4623
4624         REQUIRE(VALID_RBTDB(rbtdb));
4625
4626         rbtdbiter = isc_mem_get(rbtdb->common.mctx, sizeof(*rbtdbiter));
4627         if (rbtdbiter == NULL)
4628                 return (ISC_R_NOMEMORY);
4629
4630         rbtdbiter->common.methods = &dbiterator_methods;
4631         rbtdbiter->common.db = NULL;
4632         dns_db_attach(db, &rbtdbiter->common.db);
4633         rbtdbiter->common.relative_names = relative_names;
4634         rbtdbiter->common.magic = DNS_DBITERATOR_MAGIC;
4635         rbtdbiter->common.cleaning = ISC_FALSE;
4636         rbtdbiter->paused = ISC_TRUE;
4637         rbtdbiter->tree_locked = isc_rwlocktype_none;
4638         rbtdbiter->result = ISC_R_SUCCESS;
4639         dns_fixedname_init(&rbtdbiter->name);
4640         dns_fixedname_init(&rbtdbiter->origin);
4641         rbtdbiter->node = NULL;
4642         rbtdbiter->delete = 0;
4643         memset(rbtdbiter->deletions, 0, sizeof(rbtdbiter->deletions));
4644         dns_rbtnodechain_init(&rbtdbiter->chain, db->mctx);
4645
4646         *iteratorp = (dns_dbiterator_t *)rbtdbiter;
4647
4648         return (ISC_R_SUCCESS);
4649 }
4650
4651 static isc_result_t
4652 zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4653                   dns_rdatatype_t type, dns_rdatatype_t covers,
4654                   isc_stdtime_t now, dns_rdataset_t *rdataset,
4655                   dns_rdataset_t *sigrdataset)
4656 {
4657         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4658         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4659         rdatasetheader_t *header, *header_next, *found, *foundsig;
4660         rbtdb_serial_t serial;
4661         rbtdb_version_t *rbtversion = version;
4662         isc_boolean_t close_version = ISC_FALSE;
4663         rbtdb_rdatatype_t matchtype, sigmatchtype;
4664
4665         REQUIRE(VALID_RBTDB(rbtdb));
4666         REQUIRE(type != dns_rdatatype_any);
4667
4668         if (rbtversion == NULL) {
4669                 currentversion(db, (dns_dbversion_t **) (void *)(&rbtversion));
4670                 close_version = ISC_TRUE;
4671         }
4672         serial = rbtversion->serial;
4673         now = 0;
4674
4675         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4676                   isc_rwlocktype_read);
4677
4678         found = NULL;
4679         foundsig = NULL;
4680         matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
4681         if (covers == 0)
4682                 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4683         else
4684                 sigmatchtype = 0;
4685
4686         for (header = rbtnode->data; header != NULL; header = header_next) {
4687                 header_next = header->next;
4688                 do {
4689                         if (header->serial <= serial &&
4690                             !IGNORE(header)) {
4691                                 /*
4692                                  * Is this a "this rdataset doesn't
4693                                  * exist" record?
4694                                  */
4695                                 if (NONEXISTENT(header))
4696                                         header = NULL;
4697                                 break;
4698                         } else
4699                                 header = header->down;
4700                 } while (header != NULL);
4701                 if (header != NULL) {
4702                         /*
4703                          * We have an active, extant rdataset.  If it's a
4704                          * type we're looking for, remember it.
4705                          */
4706                         if (header->type == matchtype) {
4707                                 found = header;
4708                                 if (foundsig != NULL)
4709                                         break;
4710                         } else if (header->type == sigmatchtype) {
4711                                 foundsig = header;
4712                                 if (found != NULL)
4713                                         break;
4714                         }
4715                 }
4716         }
4717         if (found != NULL) {
4718                 bind_rdataset(rbtdb, rbtnode, found, now, rdataset);
4719                 if (foundsig != NULL)
4720                         bind_rdataset(rbtdb, rbtnode, foundsig, now,
4721                                       sigrdataset);
4722         }
4723
4724         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
4725                     isc_rwlocktype_read);
4726
4727         if (close_version)
4728                 closeversion(db, (dns_dbversion_t **) (void *)(&rbtversion),
4729                              ISC_FALSE);
4730
4731         if (found == NULL)
4732                 return (ISC_R_NOTFOUND);
4733
4734         return (ISC_R_SUCCESS);
4735 }
4736
4737 static isc_result_t
4738 cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4739                    dns_rdatatype_t type, dns_rdatatype_t covers,
4740                    isc_stdtime_t now, dns_rdataset_t *rdataset,
4741                    dns_rdataset_t *sigrdataset)
4742 {
4743         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4744         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4745         rdatasetheader_t *header, *header_next, *found, *foundsig;
4746         rbtdb_rdatatype_t matchtype, sigmatchtype, negtype;
4747         isc_result_t result;
4748         nodelock_t *lock;
4749         isc_rwlocktype_t locktype;
4750
4751         REQUIRE(VALID_RBTDB(rbtdb));
4752         REQUIRE(type != dns_rdatatype_any);
4753
4754         UNUSED(version);
4755
4756         result = ISC_R_SUCCESS;
4757
4758         if (now == 0)
4759                 isc_stdtime_get(&now);
4760
4761         lock = &rbtdb->node_locks[rbtnode->locknum].lock;
4762         locktype = isc_rwlocktype_read;
4763         NODE_LOCK(lock, locktype);
4764
4765         found = NULL;
4766         foundsig = NULL;
4767         matchtype = RBTDB_RDATATYPE_VALUE(type, covers);
4768         negtype = RBTDB_RDATATYPE_VALUE(0, type);
4769         if (covers == 0)
4770                 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
4771         else
4772                 sigmatchtype = 0;
4773
4774         for (header = rbtnode->data; header != NULL; header = header_next) {
4775                 header_next = header->next;
4776                 if (header->rdh_ttl <= now) {
4777                         if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
4778                             (locktype == isc_rwlocktype_write ||
4779                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
4780                                 /*
4781                                  * We update the node's status only when we
4782                                  * can get write access.
4783                                  */
4784                                 locktype = isc_rwlocktype_write;
4785
4786                                 /*
4787                                  * We don't check if refcurrent(rbtnode) == 0
4788                                  * and try to free like we do in cache_find(),
4789                                  * because refcurrent(rbtnode) must be
4790                                  * non-zero.  This is so because 'node' is an
4791                                  * argument to the function.
4792                                  */
4793                                 header->attributes |= RDATASET_ATTR_STALE;
4794                                 rbtnode->dirty = 1;
4795                         }
4796                 } else if (EXISTS(header)) {
4797                         if (header->type == matchtype)
4798                                 found = header;
4799                         else if (header->type == RBTDB_RDATATYPE_NCACHEANY ||
4800                                  header->type == negtype)
4801                                 found = header;
4802                         else if (header->type == sigmatchtype)
4803                                 foundsig = header;
4804                 }
4805         }
4806         if (found != NULL) {
4807                 bind_rdataset(rbtdb, rbtnode, found, now, rdataset);
4808                 if (foundsig != NULL)
4809                         bind_rdataset(rbtdb, rbtnode, foundsig, now,
4810                                       sigrdataset);
4811         }
4812
4813         NODE_UNLOCK(lock, locktype);
4814
4815         if (found == NULL)
4816                 return (ISC_R_NOTFOUND);
4817
4818         if (RBTDB_RDATATYPE_BASE(found->type) == 0) {
4819                 /*
4820                  * We found a negative cache entry.
4821                  */
4822                 if (NXDOMAIN(found))
4823                         result = DNS_R_NCACHENXDOMAIN;
4824                 else
4825                         result = DNS_R_NCACHENXRRSET;
4826         }
4827
4828         return (result);
4829 }
4830
4831 static isc_result_t
4832 allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
4833              isc_stdtime_t now, dns_rdatasetiter_t **iteratorp)
4834 {
4835         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
4836         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
4837         rbtdb_version_t *rbtversion = version;
4838         rbtdb_rdatasetiter_t *iterator;
4839         unsigned int refs;
4840
4841         REQUIRE(VALID_RBTDB(rbtdb));
4842
4843         iterator = isc_mem_get(rbtdb->common.mctx, sizeof(*iterator));
4844         if (iterator == NULL)
4845                 return (ISC_R_NOMEMORY);
4846
4847         if ((db->attributes & DNS_DBATTR_CACHE) == 0) {
4848                 now = 0;
4849                 if (rbtversion == NULL)
4850                         currentversion(db,
4851                                  (dns_dbversion_t **) (void *)(&rbtversion));
4852                 else {
4853                         unsigned int refs;
4854
4855                         isc_refcount_increment(&rbtversion->references,
4856                                                &refs);
4857                         INSIST(refs > 1);
4858                 }
4859         } else {
4860                 if (now == 0)
4861                         isc_stdtime_get(&now);
4862                 rbtversion = NULL;
4863         }
4864
4865         iterator->common.magic = DNS_RDATASETITER_MAGIC;
4866         iterator->common.methods = &rdatasetiter_methods;
4867         iterator->common.db = db;
4868         iterator->common.node = node;
4869         iterator->common.version = (dns_dbversion_t *)rbtversion;
4870         iterator->common.now = now;
4871
4872         NODE_STRONGLOCK(&rbtdb->node_locks[rbtnode->locknum].lock);
4873
4874         dns_rbtnode_refincrement(rbtnode, &refs);
4875         INSIST(refs != 0);
4876
4877         iterator->current = NULL;
4878
4879         NODE_STRONGUNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock);
4880
4881         *iteratorp = (dns_rdatasetiter_t *)iterator;
4882
4883         return (ISC_R_SUCCESS);
4884 }
4885
4886 static isc_boolean_t
4887 cname_and_other_data(dns_rbtnode_t *node, rbtdb_serial_t serial) {
4888         rdatasetheader_t *header, *header_next;
4889         isc_boolean_t cname, other_data;
4890         dns_rdatatype_t rdtype;
4891
4892         /*
4893          * The caller must hold the node lock.
4894          */
4895
4896         /*
4897          * Look for CNAME and "other data" rdatasets active in our version.
4898          */
4899         cname = ISC_FALSE;
4900         other_data = ISC_FALSE;
4901         for (header = node->data; header != NULL; header = header_next) {
4902                 header_next = header->next;
4903                 if (header->type == dns_rdatatype_cname) {
4904                         /*
4905                          * Look for an active extant CNAME.
4906                          */
4907                         do {
4908                                 if (header->serial <= serial &&
4909                                     !IGNORE(header)) {
4910                                         /*
4911                                          * Is this a "this rdataset doesn't
4912                                          * exist" record?
4913                                          */
4914                                         if (NONEXISTENT(header))
4915                                                 header = NULL;
4916                                         break;
4917                                 } else
4918                                         header = header->down;
4919                         } while (header != NULL);
4920                         if (header != NULL)
4921                                 cname = ISC_TRUE;
4922                 } else {
4923                         /*
4924                          * Look for active extant "other data".
4925                          *
4926                          * "Other data" is any rdataset whose type is not
4927                          * KEY, NSEC, SIG or RRSIG.
4928                          */
4929                         rdtype = RBTDB_RDATATYPE_BASE(header->type);
4930                         if (rdtype != dns_rdatatype_key &&
4931                             rdtype != dns_rdatatype_sig &&
4932                             rdtype != dns_rdatatype_nsec &&
4933                             rdtype != dns_rdatatype_rrsig) {
4934                                 /*
4935                                  * Is it active and extant?
4936                                  */
4937                                 do {
4938                                         if (header->serial <= serial &&
4939                                             !IGNORE(header)) {
4940                                                 /*
4941                                                  * Is this a "this rdataset
4942                                                  * doesn't exist" record?
4943                                                  */
4944                                                 if (NONEXISTENT(header))
4945                                                         header = NULL;
4946                                                 break;
4947                                         } else
4948                                                 header = header->down;
4949                                 } while (header != NULL);
4950                                 if (header != NULL)
4951                                         other_data = ISC_TRUE;
4952                         }
4953                 }
4954         }
4955
4956         if (cname && other_data)
4957                 return (ISC_TRUE);
4958
4959         return (ISC_FALSE);
4960 }
4961
4962 static isc_result_t
4963 add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion,
4964     rdatasetheader_t *newheader, unsigned int options, isc_boolean_t loading,
4965     dns_rdataset_t *addedrdataset, isc_stdtime_t now)
4966 {
4967         rbtdb_changed_t *changed = NULL;
4968         rdatasetheader_t *topheader, *topheader_prev, *header;
4969         unsigned char *merged;
4970         isc_result_t result;
4971         isc_boolean_t header_nx;
4972         isc_boolean_t newheader_nx;
4973         isc_boolean_t merge;
4974         dns_rdatatype_t rdtype, covers;
4975         rbtdb_rdatatype_t negtype;
4976         dns_trust_t trust;
4977
4978         /*
4979          * Add an rdatasetheader_t to a node.
              *
              * 'newheader' is either linked into the rdataset chain of
              * 'rbtnode' or released with free_rdataset(); every early
              * return below that does not link it frees it first.  When a
              * DNS_DBADD_MERGE merge succeeds, the original 'newheader' is
              * freed and the merged slab is linked in its place.
              *
              * The negative-cache and trust/TTL checks only run when
              * 'rbtversion' is NULL.  A changed record is only created
              * when 'rbtversion' is non-NULL and 'loading' is false.
              *
              * If 'addedrdataset' is non-NULL it is bound to the header that
              * represents the data after the call: 'newheader' on the normal
              * path, or the pre-existing header on the "more trusted data
              * already present" and "identical NS/A/AAAA already present"
              * paths.  It is left untouched on the other early returns,
              * including DNS_R_CNAMEANDOTHER.
              *
              * Results visible in this function: ISC_R_SUCCESS,
              * ISC_R_NOMEMORY, DNS_R_UNCHANGED, DNS_R_NOTEXACT,
              * DNS_R_CNAMEANDOTHER, or a failure result passed through from
              * dns_rdataslab_merge().
4980          */
4981
4982         /*
4983          * Caller must be holding the node lock.
4984          */
4985
4986         if ((options & DNS_DBADD_MERGE) != 0) {
4987                 REQUIRE(rbtversion != NULL);
4988                 merge = ISC_TRUE;
4989         } else
4990                 merge = ISC_FALSE;
4991
             /*
              * With DNS_DBADD_FORCE the new data is treated as
              * dns_trust_ultimate in the "is the existing data more
              * trusted?" comparisons below.  newheader->trust itself is not
              * modified by this function.
              */
4992         if ((options & DNS_DBADD_FORCE) != 0)
4993                 trust = dns_trust_ultimate;
4994         else
4995                 trust = newheader->trust;
4996
4997         if (rbtversion != NULL && !loading) {
4998                 /*
4999                  * We always add a changed record, even if no changes end up
5000                  * being made to this node, because it's harmless and
5001                  * simplifies the code.
5002                  */
5003                 changed = add_changed(rbtdb, rbtversion, rbtnode);
5004                 if (changed == NULL) {
5005                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5006                         return (ISC_R_NOMEMORY);
5007                 }
5008         }
5009
5010         newheader_nx = NONEXISTENT(newheader) ? ISC_TRUE : ISC_FALSE;
5011         topheader_prev = NULL;
5012
             /*
              * 'negtype' is a second header type that the search loop
              * further down accepts as a match for 'newheader'.  It is only
              * changed from 0 on the rbtversion == NULL path below.
              * NOTE(review): this reads as if RBTDB_RDATATYPE_VALUE takes
              * (base, ext), giving the positive/negative counterpart of
              * newheader->type -- confirm against the macro definition.
              */
5013         negtype = 0;
5014         if (rbtversion == NULL && !newheader_nx) {
5015                 rdtype = RBTDB_RDATATYPE_BASE(newheader->type);
5016                 if (rdtype == 0) {
5017                         /*
5018                          * We're adding a negative cache entry.
5019                          */
5020                         covers = RBTDB_RDATATYPE_EXT(newheader->type);
5021                         if (covers == dns_rdatatype_any) {
5022                                 /*
5023                                  * We're adding a negative cache entry
5024                                  * which covers all types (NXDOMAIN,
5025                                  * NODATA(QTYPE=ANY)).
5026                                  *
5027                                  * We make all other data stale so that the
5028                                  * only rdataset that can be found at this
5029                                  * node is the negative cache entry.
5030                                  */
5031                                 for (topheader = rbtnode->data;
5032                                      topheader != NULL;
5033                                      topheader = topheader->next) {
5034                                         set_ttl(rbtdb, topheader, 0);
5035                                         topheader->attributes |=
5036                                                 RDATASET_ATTR_STALE;
5037                                 }
5038                                 rbtnode->dirty = 1;
                                     /*
                                      * The loop above ran to completion, so
                                      * topheader is NULL here: find_header
                                      * takes the "no rdatasets of the given
                                      * type" path and newheader is put at
                                      * the front of the node's list.
                                      */
5039                                 goto find_header;
5040                         }
5041                         negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
5042                 } else {
5043                         /*
5044                          * We're adding something that isn't a
5045                          * negative cache entry.  Look for an extant
5046                          * non-stale NXDOMAIN/NODATA(QTYPE=ANY) negative
5047                          * cache entry.
5048                          */
5049                         for (topheader = rbtnode->data;
5050                              topheader != NULL;
5051                              topheader = topheader->next) {
5052                                 if (topheader->type ==
5053                                     RBTDB_RDATATYPE_NCACHEANY)
5054                                         break;
5055                         }
5056                         if (topheader != NULL && EXISTS(topheader) &&
5057                             topheader->rdh_ttl > now) {
5058                                 /*
5059                                  * Found one.
5060                                  */
5061                                 if (trust < topheader->trust) {
5062                                         /*
5063                                          * The NXDOMAIN/NODATA(QTYPE=ANY)
5064                                          * is more trusted.
5065                                          */
5066                                         free_rdataset(rbtdb,
5067                                                       rbtdb->common.mctx,
5068                                                       newheader);
5069                                         if (addedrdataset != NULL)
5070                                                 bind_rdataset(rbtdb, rbtnode,
5071                                                               topheader, now,
5072                                                               addedrdataset);
5073                                         return (DNS_R_UNCHANGED);
5074                                 }
5075                                 /*
5076                                  * The new rdataset is better.  Expire the
5077                                  * NXDOMAIN/NODATA(QTYPE=ANY).
5078                                  */
5079                                 set_ttl(rbtdb, topheader, 0);
5080                                 topheader->attributes |= RDATASET_ATTR_STALE;
5081                                 rbtnode->dirty = 1;
                                     /*
                                      * Clear topheader so that find_header
                                      * does not treat the expired negative
                                      * entry as the header to replace; the
                                      * new data is inserted at the front of
                                      * the node's list instead.
                                      */
5082                                 topheader = NULL;
5083                                 goto find_header;
5084                         }
5085                         negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
5086                 }
5087         }
5088
             /*
              * Find the top-level header of the same type (or of 'negtype'),
              * remembering its predecessor so it can be relinked below.
              */
5089         for (topheader = rbtnode->data;
5090              topheader != NULL;
5091              topheader = topheader->next) {
5092                 if (topheader->type == newheader->type ||
5093                     topheader->type == negtype)
5094                         break;
5095                 topheader_prev = topheader;
5096         }
5097
5098  find_header:
5099         /*
5100          * If header isn't NULL, we've found the right type.  There may be
5101          * IGNORE rdatasets between the top of the chain and the first real
5102          * data.  We skip over them.
5103          */
5104         header = topheader;
5105         while (header != NULL && IGNORE(header))
5106                 header = header->down;
5107         if (header != NULL) {
5108                 header_nx = NONEXISTENT(header) ? ISC_TRUE : ISC_FALSE;
5109
5110                 /*
5111                  * Deleting an already non-existent rdataset has no effect.
5112                  */
5113                 if (header_nx && newheader_nx) {
5114                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5115                         return (DNS_R_UNCHANGED);
5116                 }
5117
5118                 /*
5119                  * Trying to add an rdataset with lower trust to a cache DB
5120                  * has no effect, provided that the cache data isn't stale.
5121                  */
5122                 if (rbtversion == NULL && trust < header->trust &&
5123                     (header->rdh_ttl > now || header_nx)) {
5124                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5125                         if (addedrdataset != NULL)
5126                                 bind_rdataset(rbtdb, rbtnode, header, now,
5127                                               addedrdataset);
5128                         return (DNS_R_UNCHANGED);
5129                 }
5130
5131                 /*
5132                  * Don't merge if a nonexistent rdataset is involved.
5133                  */
5134                 if (merge && (header_nx || newheader_nx))
5135                         merge = ISC_FALSE;
5136
5137                 /*
5138                  * If 'merge' is ISC_TRUE, we'll try to create a new rdataset
5139                  * that is the union of 'newheader' and 'header'.
5140                  */
5141                 if (merge) {
5142                         unsigned int flags = 0;
5143                         INSIST(rbtversion->serial >= header->serial);
5144                         merged = NULL;
5145                         result = ISC_R_SUCCESS;
5146
                             /*
                              * A TTL mismatch is an error under
                              * DNS_DBADD_EXACTTTL; otherwise the merge is
                              * asked to proceed with DNS_RDATASLAB_FORCE.
                              */
5147                         if ((options & DNS_DBADD_EXACT) != 0)
5148                                 flags |= DNS_RDATASLAB_EXACT;
5149                         if ((options & DNS_DBADD_EXACTTTL) != 0 &&
5150                              newheader->rdh_ttl != header->rdh_ttl)
5151                                         result = DNS_R_NOTEXACT;
5152                         else if (newheader->rdh_ttl != header->rdh_ttl)
5153                                 flags |= DNS_RDATASLAB_FORCE;
5154                         if (result == ISC_R_SUCCESS)
5155                                 result = dns_rdataslab_merge(
5156                                              (unsigned char *)header,
5157                                              (unsigned char *)newheader,
5158                                              (unsigned int)(sizeof(*newheader)),
5159                                              rbtdb->common.mctx,
5160                                              rbtdb->common.rdclass,
5161                                              (dns_rdatatype_t)header->type,
5162                                              flags, &merged);
5163                         if (result == ISC_R_SUCCESS) {
5164                                 /*
5165                                  * If 'header' has the same serial number as
5166                                  * we do, we could clean it up now if we knew
5167                                  * that our caller had no references to it.
5168                                  * We don't know this, however, so we leave it
5169                                  * alone.  It will get cleaned up when
5170                                  * clean_zone_node() runs.
5171                                  */
5172                                 free_rdataset(rbtdb, rbtdb->common.mctx,
5173                                               newheader);
5174                                 newheader = (rdatasetheader_t *)merged;
5175                         } else {
5176                                 free_rdataset(rbtdb, rbtdb->common.mctx,
5177                                               newheader);
5178                                 return (result);
5179                         }
5180                 }
5181                 /*
5182                  * Don't replace existing NS, A and AAAA RRsets
5183                  * in the cache if they already exist.  This
5184                  * prevents named being locked to old servers.
5185                  * Don't lower trust of existing record if the
5186                  * update is forced.
5187                  */
5188                 if (IS_CACHE(rbtdb) && header->rdh_ttl > now &&
5189                     header->type == dns_rdatatype_ns &&
5190                     !header_nx && !newheader_nx &&
5191                     header->trust >= newheader->trust &&
5192                     dns_rdataslab_equalx((unsigned char *)header,
5193                                          (unsigned char *)newheader,
5194                                          (unsigned int)(sizeof(*newheader)),
5195                                          rbtdb->common.rdclass,
5196                                          (dns_rdatatype_t)header->type)) {
5197                         /*
5198                          * Honour the new ttl if it is less than the
5199                          * older one.
5200                          */
5201                         if (header->rdh_ttl > newheader->rdh_ttl)
5202                                 set_ttl(rbtdb, header, newheader->rdh_ttl);
                             /*
                              * Hand the new noqname data over to the header
                              * that is kept, so that freeing newheader
                              * below does not release it.
                              */
5203                         if (header->noqname == NULL &&
5204                             newheader->noqname != NULL) {
5205                                 header->noqname = newheader->noqname;
5206                                 newheader->noqname = NULL;
5207                         }
5208                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5209                         if (addedrdataset != NULL)
5210                                 bind_rdataset(rbtdb, rbtnode, header, now,
5211                                               addedrdataset);
5212                         return (ISC_R_SUCCESS);
5213                 }
                     /*
                      * Same treatment for A and AAAA, compared with
                      * dns_rdataslab_equal() rather than
                      * dns_rdataslab_equalx().
                      */
5214                 if (IS_CACHE(rbtdb) && header->rdh_ttl > now &&
5215                     (header->type == dns_rdatatype_a ||
5216                      header->type == dns_rdatatype_aaaa) &&
5217                     !header_nx && !newheader_nx &&
5218                     header->trust >= newheader->trust &&
5219                     dns_rdataslab_equal((unsigned char *)header,
5220                                         (unsigned char *)newheader,
5221                                         (unsigned int)(sizeof(*newheader)))) {
5222                         /*
5223                          * Honour the new ttl if it is less than the
5224                          * older one.
5225                          */
5226                         if (header->rdh_ttl > newheader->rdh_ttl)
5227                                 set_ttl(rbtdb, header, newheader->rdh_ttl);
5228                         if (header->noqname == NULL &&
5229                             newheader->noqname != NULL) {
5230                                 header->noqname = newheader->noqname;
5231                                 newheader->noqname = NULL;
5232                         }
5233                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5234                         if (addedrdataset != NULL)
5235                                 bind_rdataset(rbtdb, rbtnode, header, now,
5236                                               addedrdataset);
5237                         return (ISC_R_SUCCESS);
5238                 }
5239                 INSIST(rbtversion == NULL ||
5240                        rbtversion->serial >= topheader->serial);
                     /*
                      * Put newheader where topheader was in the node's
                      * top-level list.
                      */
5241                 if (topheader_prev != NULL)
5242                         topheader_prev->next = newheader;
5243                 else
5244                         rbtnode->data = newheader;
5245                 newheader->next = topheader->next;
5246                 if (loading) {
5247                         /*
5248                          * There are no other references to 'header' when
5249                          * loading, so we MAY clean up 'header' now.
5250                          * Since we don't generate changed records when
5251                          * loading, we MUST clean up 'header' now.
                              *
                              * NOTE(review): this frees 'header', not
                              * 'topheader'; presumably the two are the same
                              * here because ignored rdatasets cannot occur
                              * during loading -- confirm.
5252                          */
5253                         newheader->down = NULL;
5254                         free_rdataset(rbtdb, rbtdb->common.mctx, header);
5255                 } else {
                             /*
                              * The displaced chain stays reachable through
                              * newheader->down, and its top header's 'next'
                              * is pointed back at newheader.
                              */
5256                         newheader->down = topheader;
5257                         topheader->next = newheader;
5258                         rbtnode->dirty = 1;
5259                         if (changed != NULL)
5260                                 changed->dirty = ISC_TRUE;
5261                         if (rbtversion == NULL) {
5262                                 set_ttl(rbtdb, header, 0);
5263                                 header->attributes |= RDATASET_ATTR_STALE;
5264                         }
5265                         if (IS_CACHE(rbtdb)) {
5266                                 int idx = newheader->node->locknum;
5267
5268                                 ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
5269                                                  newheader, lru_link);
5270
5271                                 /*
5272                                  * XXXMLG We don't check the return value
5273                                  * here.  If it fails, we will not do TTL
5274                                  * based expiry on this node.  However, we
5275                                  * will do it on the LRU side, so memory
5276                                  * will not leak... for long.
5277                                  */
5278                                 isc_heap_insert(rbtdb->heaps[idx], newheader);
5279                         }
5280                 }
5281         } else {
5282                 /*
5283                  * No non-IGNORED rdatasets of the given type exist at
5284                  * this node.
5285                  */
5286
5287                 /*
5288                  * If we're trying to delete the type, don't bother.
5289                  */
5290                 if (newheader_nx) {
5291                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5292                         return (DNS_R_UNCHANGED);
5293                 }
5294
5295                 if (topheader != NULL) {
5296                         /*
5297                          * We have a list of rdatasets of the given type,
5298                          * but they're all marked IGNORE.  We simply insert
5299                          * the new rdataset at the head of the list.
5300                          *
5301                          * Ignored rdatasets cannot occur during loading, so
5302                          * we INSIST on it.
5303                          */
5304                         INSIST(!loading);
5305                         INSIST(rbtversion == NULL ||
5306                                rbtversion->serial >= topheader->serial);
5307                         if (topheader_prev != NULL)
5308                                 topheader_prev->next = newheader;
5309                         else
5310                                 rbtnode->data = newheader;
5311                         newheader->next = topheader->next;
5312                         newheader->down = topheader;
5313                         topheader->next = newheader;
5314                         rbtnode->dirty = 1;
5315                         if (changed != NULL)
5316                                 changed->dirty = ISC_TRUE;
5317                 } else {
5318                         /*
5319                          * No rdatasets of the given type exist at the node.
5320                          */
5321                         newheader->next = rbtnode->data;
5322                         newheader->down = NULL;
5323                         rbtnode->data = newheader;
5324                 }
                     /*
                      * As in the replacement path above, the result of
                      * isc_heap_insert() is not checked here.
                      */
5325                 if (IS_CACHE(rbtdb)) {
5326                         int idx = newheader->node->locknum;
5327                         ISC_LIST_PREPEND(rbtdb->rdatasets[idx],
5328                                          newheader, lru_link);
5329                         isc_heap_insert(rbtdb->heaps[idx], newheader);
5330                 }
5331         }
5332
5333         /*
5334          * Check if the node now contains CNAME and other data.
              *
              * Note that newheader has already been linked into the node
              * when this check runs, and that 'addedrdataset' is not bound
              * on this return.
5335          */
5336         if (rbtversion != NULL &&
5337             cname_and_other_data(rbtnode, rbtversion->serial))
5338                 return (DNS_R_CNAMEANDOTHER);
5339
5340         if (addedrdataset != NULL)
5341                 bind_rdataset(rbtdb, rbtnode, newheader, now, addedrdataset);
5342
5343         return (ISC_R_SUCCESS);
5344 }
5345
5346 static inline isc_boolean_t
5347 delegating_type(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
5348                 rbtdb_rdatatype_t type)
5349 {
5350         if (IS_CACHE(rbtdb)) {
5351                 if (type == dns_rdatatype_dname)
5352                         return (ISC_TRUE);
5353                 else
5354                         return (ISC_FALSE);
5355         } else if (type == dns_rdatatype_dname ||
5356                    (type == dns_rdatatype_ns &&
5357                     (node != rbtdb->origin_node || IS_STUB(rbtdb))))
5358                 return (ISC_TRUE);
5359         return (ISC_FALSE);
5360 }
5361
5362 static inline isc_result_t
5363 addnoqname(dns_rbtdb_t *rbtdb, rdatasetheader_t *newheader,
5364            dns_rdataset_t *rdataset)
5365 {
5366         struct noqname *noqname;
5367         isc_mem_t *mctx = rbtdb->common.mctx;
5368         dns_name_t name;
5369         dns_rdataset_t nsec, nsecsig;
5370         isc_result_t result;
5371         isc_region_t r;
5372
5373         dns_name_init(&name, NULL);
5374         dns_rdataset_init(&nsec);
5375         dns_rdataset_init(&nsecsig);
5376
5377         result = dns_rdataset_getnoqname(rdataset, &name, &nsec, &nsecsig);
5378         RUNTIME_CHECK(result == ISC_R_SUCCESS);
5379
5380         noqname = isc_mem_get(mctx, sizeof(*noqname));
5381         if (noqname == NULL) {
5382                 result = ISC_R_NOMEMORY;
5383                 goto cleanup;
5384         }
5385         dns_name_init(&noqname->name, NULL);
5386         noqname->nsec = NULL;
5387         noqname->nsecsig = NULL;
5388         result = dns_name_dup(&name, mctx, &noqname->name);
5389         if (result != ISC_R_SUCCESS)
5390                 goto cleanup;
5391         result = dns_rdataslab_fromrdataset(&nsec, mctx, &r, 0);
5392         if (result != ISC_R_SUCCESS)
5393                 goto cleanup;
5394         noqname->nsec = r.base;
5395         result = dns_rdataslab_fromrdataset(&nsecsig, mctx, &r, 0);
5396         if (result != ISC_R_SUCCESS)
5397                 goto cleanup;
5398         noqname->nsecsig = r.base;
5399         dns_rdataset_disassociate(&nsec);
5400         dns_rdataset_disassociate(&nsecsig);
5401         newheader->noqname = noqname;
5402         return (ISC_R_SUCCESS);
5403
5404 cleanup:
5405         dns_rdataset_disassociate(&nsec);
5406         dns_rdataset_disassociate(&nsecsig);
5407         free_noqname(mctx, &noqname);
5408         return(result);
5409 }
5410
5411 static isc_result_t
5412 addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5413             isc_stdtime_t now, dns_rdataset_t *rdataset, unsigned int options,
5414             dns_rdataset_t *addedrdataset)
5415 {
5416         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5417         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5418         rbtdb_version_t *rbtversion = version;
5419         isc_region_t region;
5420         rdatasetheader_t *newheader;
5421         rdatasetheader_t *header;
5422         isc_result_t result;
5423         isc_boolean_t delegating;
5424         isc_boolean_t tree_locked = ISC_FALSE;
5425
5426         REQUIRE(VALID_RBTDB(rbtdb));
5427
5428         if (rbtversion == NULL) {
5429                 if (now == 0)
5430                         isc_stdtime_get(&now);
5431         } else
5432                 now = 0;
5433
5434         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5435                                             &region,
5436                                             sizeof(rdatasetheader_t));
5437         if (result != ISC_R_SUCCESS)
5438                 return (result);
5439
5440         newheader = (rdatasetheader_t *)region.base;
5441         init_rdataset(rbtdb, newheader);
5442         set_ttl(rbtdb, newheader, rdataset->ttl + now);
5443         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5444                                                 rdataset->covers);
5445         newheader->attributes = 0;
5446         newheader->noqname = NULL;
5447         newheader->count = init_count++;
5448         newheader->trust = rdataset->trust;
5449         newheader->additional_auth = NULL;
5450         newheader->additional_glue = NULL;
5451         newheader->last_used = now;
5452         newheader->node = rbtnode;
5453         if (rbtversion != NULL) {
5454                 newheader->serial = rbtversion->serial;
5455                 now = 0;
5456         } else {
5457                 newheader->serial = 1;
5458                 if ((rdataset->attributes & DNS_RDATASETATTR_NXDOMAIN) != 0)
5459                         newheader->attributes |= RDATASET_ATTR_NXDOMAIN;
5460                 if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) {
5461                         result = addnoqname(rbtdb, newheader, rdataset);
5462                         if (result != ISC_R_SUCCESS) {
5463                                 free_rdataset(rbtdb, rbtdb->common.mctx,
5464                                               newheader);
5465                                 return (result);
5466                         }
5467                 }
5468         }
5469
5470         /*
5471          * If we're adding a delegation type (e.g. NS or DNAME for a zone,
5472          * just DNAME for the cache), then we need to set the callback bit
5473          * on the node.
5474          */
5475         if (delegating_type(rbtdb, rbtnode, rdataset->type))
5476                 delegating = ISC_TRUE;
5477         else
5478                 delegating = ISC_FALSE;
5479
5480         /*
5481          * If we're adding a delegation type or the DB is a cache in an overmem
5482          * state, hold an exclusive lock on the tree.  In the latter case
5483          * the lock does not necessarily have to be acquired but it will help
5484          * purge stale entries more effectively.
5485          */
5486         if (delegating || (IS_CACHE(rbtdb) && rbtdb->overmem)) {
5487                 tree_locked = ISC_TRUE;
5488                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
5489         }
5490
5491         if (IS_CACHE(rbtdb) && rbtdb->overmem)
5492                 overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked);
5493
5494         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5495                   isc_rwlocktype_write);
5496
5497         if (rbtdb->rrsetstats != NULL) {
5498                 newheader->attributes |= RDATASET_ATTR_STATCOUNT;
5499                 update_rrsetstats(rbtdb, newheader, ISC_TRUE);
5500         }
5501
5502         if (IS_CACHE(rbtdb)) {
5503                 if (tree_locked)
5504                         cleanup_dead_nodes(rbtdb, rbtnode->locknum);
5505
5506                 header = isc_heap_element(rbtdb->heaps[rbtnode->locknum], 1);
5507                 if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL)
5508                         expire_header(rbtdb, header, tree_locked);
5509
5510                 /*
5511                  * If we've been holding a write lock on the tree just for
5512                  * cleaning, we can release it now.  However, we still need the
5513                  * node lock.
5514                  */
5515                 if (tree_locked && !delegating) {
5516                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
5517                         tree_locked = ISC_FALSE;
5518                 }
5519         }
5520
5521         result = add(rbtdb, rbtnode, rbtversion, newheader, options, ISC_FALSE,
5522                      addedrdataset, now);
5523         if (result == ISC_R_SUCCESS && delegating)
5524                 rbtnode->find_callback = 1;
5525
5526         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5527                     isc_rwlocktype_write);
5528
5529         if (tree_locked)
5530                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
5531
5532         /*
5533          * Update the zone's secure status.  If version is non-NULL
5534          * this is deferred until closeversion() is called.
5535          */
5536         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5537                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5538
5539         return (result);
5540 }
5541
5542 static isc_result_t
5543 subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5544                  dns_rdataset_t *rdataset, unsigned int options,
5545                  dns_rdataset_t *newrdataset)
5546 {
5547         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5548         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5549         rbtdb_version_t *rbtversion = version;
5550         rdatasetheader_t *topheader, *topheader_prev, *header, *newheader;
5551         unsigned char *subresult;
5552         isc_region_t region;
5553         isc_result_t result;
5554         rbtdb_changed_t *changed;
5555
5556         REQUIRE(VALID_RBTDB(rbtdb));
5557
5558         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5559                                             &region,
5560                                             sizeof(rdatasetheader_t));
5561         if (result != ISC_R_SUCCESS)
5562                 return (result);
5563         newheader = (rdatasetheader_t *)region.base;
5564         init_rdataset(rbtdb, newheader);
5565         set_ttl(rbtdb, newheader, rdataset->ttl);
5566         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5567                                                 rdataset->covers);
5568         newheader->attributes = 0;
5569         newheader->serial = rbtversion->serial;
5570         newheader->trust = 0;
5571         newheader->noqname = NULL;
5572         newheader->count = init_count++;
5573         newheader->additional_auth = NULL;
5574         newheader->additional_glue = NULL;
5575         newheader->last_used = 0;
5576         newheader->node = rbtnode;
5577
5578         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5579                   isc_rwlocktype_write);
5580
5581         changed = add_changed(rbtdb, rbtversion, rbtnode);
5582         if (changed == NULL) {
5583                 free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5584                 NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5585                             isc_rwlocktype_write);
5586                 return (ISC_R_NOMEMORY);
5587         }
5588
5589         topheader_prev = NULL;
5590         for (topheader = rbtnode->data;
5591              topheader != NULL;
5592              topheader = topheader->next) {
5593                 if (topheader->type == newheader->type)
5594                         break;
5595                 topheader_prev = topheader;
5596         }
5597         /*
5598          * If header isn't NULL, we've found the right type.  There may be
5599          * IGNORE rdatasets between the top of the chain and the first real
5600          * data.  We skip over them.
5601          */
5602         header = topheader;
5603         while (header != NULL && IGNORE(header))
5604                 header = header->down;
5605         if (header != NULL && EXISTS(header)) {
5606                 unsigned int flags = 0;
5607                 subresult = NULL;
5608                 result = ISC_R_SUCCESS;
5609                 if ((options & DNS_DBSUB_EXACT) != 0) {
5610                         flags |= DNS_RDATASLAB_EXACT;
5611                         if (newheader->rdh_ttl != header->rdh_ttl)
5612                                 result = DNS_R_NOTEXACT;
5613                 }
5614                 if (result == ISC_R_SUCCESS)
5615                         result = dns_rdataslab_subtract(
5616                                         (unsigned char *)header,
5617                                         (unsigned char *)newheader,
5618                                         (unsigned int)(sizeof(*newheader)),
5619                                         rbtdb->common.mctx,
5620                                         rbtdb->common.rdclass,
5621                                         (dns_rdatatype_t)header->type,
5622                                         flags, &subresult);
5623                 if (result == ISC_R_SUCCESS) {
5624                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5625                         newheader = (rdatasetheader_t *)subresult;
5626                         /*
5627                          * We have to set the serial since the rdataslab
5628                          * subtraction routine copies the reserved portion of
5629                          * header, not newheader.
5630                          */
5631                         newheader->serial = rbtversion->serial;
5632                         /*
5633                          * XXXJT: dns_rdataslab_subtract() copied the pointers
5634                          * to additional info.  We need to clear these fields
5635                          * to avoid having duplicated references.
5636                          */
5637                         newheader->additional_auth = NULL;
5638                         newheader->additional_glue = NULL;
5639                 } else if (result == DNS_R_NXRRSET) {
5640                         /*
5641                          * This subtraction would remove all of the rdata;
5642                          * add a nonexistent header instead.
5643                          */
5644                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5645                         newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
5646                         if (newheader == NULL) {
5647                                 result = ISC_R_NOMEMORY;
5648                                 goto unlock;
5649                         }
5650                         set_ttl(rbtdb, newheader, 0);
5651                         newheader->type = topheader->type;
5652                         newheader->attributes = RDATASET_ATTR_NONEXISTENT;
5653                         newheader->trust = 0;
5654                         newheader->serial = rbtversion->serial;
5655                         newheader->noqname = NULL;
5656                         newheader->count = 0;
5657                         newheader->additional_auth = NULL;
5658                         newheader->additional_glue = NULL;
5659                         newheader->node = rbtnode;
5660                         newheader->last_used = 0;
5661                 } else {
5662                         free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5663                         goto unlock;
5664                 }
5665
5666                 /*
5667                  * If we're here, we want to link newheader in front of
5668                  * topheader.
5669                  */
5670                 INSIST(rbtversion->serial >= topheader->serial);
5671                 if (topheader_prev != NULL)
5672                         topheader_prev->next = newheader;
5673                 else
5674                         rbtnode->data = newheader;
5675                 newheader->next = topheader->next;
5676                 newheader->down = topheader;
5677                 topheader->next = newheader;
5678                 rbtnode->dirty = 1;
5679                 changed->dirty = ISC_TRUE;
5680         } else {
5681                 /*
5682                  * The rdataset doesn't exist, so we don't need to do anything
5683                  * to satisfy the deletion request.
5684                  */
5685                 free_rdataset(rbtdb, rbtdb->common.mctx, newheader);
5686                 if ((options & DNS_DBSUB_EXACT) != 0)
5687                         result = DNS_R_NOTEXACT;
5688                 else
5689                         result = DNS_R_UNCHANGED;
5690         }
5691
5692         if (result == ISC_R_SUCCESS && newrdataset != NULL)
5693                 bind_rdataset(rbtdb, rbtnode, newheader, 0, newrdataset);
5694
5695  unlock:
5696         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5697                     isc_rwlocktype_write);
5698
5699         /*
5700          * Update the zone's secure status.  If version is non-NULL
5701          * this is deferred until closeversion() is called.
5702          */
5703         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5704                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5705
5706         return (result);
5707 }
5708
5709 static isc_result_t
5710 deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version,
5711                dns_rdatatype_t type, dns_rdatatype_t covers)
5712 {
5713         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5714         dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node;
5715         rbtdb_version_t *rbtversion = version;
5716         isc_result_t result;
5717         rdatasetheader_t *newheader;
5718
5719         REQUIRE(VALID_RBTDB(rbtdb));
5720
5721         if (type == dns_rdatatype_any)
5722                 return (ISC_R_NOTIMPLEMENTED);
5723         if (type == dns_rdatatype_rrsig && covers == 0)
5724                 return (ISC_R_NOTIMPLEMENTED);
5725
5726         newheader = new_rdataset(rbtdb, rbtdb->common.mctx);
5727         if (newheader == NULL)
5728                 return (ISC_R_NOMEMORY);
5729         set_ttl(rbtdb, newheader, 0);
5730         newheader->type = RBTDB_RDATATYPE_VALUE(type, covers);
5731         newheader->attributes = RDATASET_ATTR_NONEXISTENT;
5732         newheader->trust = 0;
5733         newheader->noqname = NULL;
5734         newheader->additional_auth = NULL;
5735         newheader->additional_glue = NULL;
5736         if (rbtversion != NULL)
5737                 newheader->serial = rbtversion->serial;
5738         else
5739                 newheader->serial = 0;
5740         newheader->count = 0;
5741         newheader->last_used = 0;
5742         newheader->node = rbtnode;
5743
5744         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5745                   isc_rwlocktype_write);
5746
5747         result = add(rbtdb, rbtnode, rbtversion, newheader, DNS_DBADD_FORCE,
5748                      ISC_FALSE, NULL, 0);
5749
5750         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
5751                     isc_rwlocktype_write);
5752
5753         /*
5754          * Update the zone's secure status.  If version is non-NULL
5755          * this is deferred until closeversion() is called.
5756          */
5757         if (result == ISC_R_SUCCESS && version == NULL && !IS_CACHE(rbtdb))
5758                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5759
5760         return (result);
5761 }
5762
5763 static isc_result_t
5764 loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) {
5765         rbtdb_load_t *loadctx = arg;
5766         dns_rbtdb_t *rbtdb = loadctx->rbtdb;
5767         dns_rbtnode_t *node;
5768         isc_result_t result;
5769         isc_region_t region;
5770         rdatasetheader_t *newheader;
5771
5772         /*
5773          * This routine does no node locking.  See comments in
5774          * 'load' below for more information on loading and
5775          * locking.
5776          */
5777
5778
5779         /*
5780          * SOA records are only allowed at top of zone.
5781          */
5782         if (rdataset->type == dns_rdatatype_soa &&
5783             !IS_CACHE(rbtdb) && !dns_name_equal(name, &rbtdb->common.origin))
5784                 return (DNS_R_NOTZONETOP);
5785
5786         add_empty_wildcards(rbtdb, name);
5787
5788         if (dns_name_iswildcard(name)) {
5789                 /*
5790                  * NS record owners cannot legally be wild cards.
5791                  */
5792                 if (rdataset->type == dns_rdatatype_ns)
5793                         return (DNS_R_INVALIDNS);
5794                 result = add_wildcard_magic(rbtdb, name);
5795                 if (result != ISC_R_SUCCESS)
5796                         return (result);
5797         }
5798
5799         node = NULL;
5800         result = dns_rbt_addnode(rbtdb->tree, name, &node);
5801         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
5802                 return (result);
5803         if (result != ISC_R_EXISTS) {
5804                 dns_name_t foundname;
5805                 dns_name_init(&foundname, NULL);
5806                 dns_rbt_namefromnode(node, &foundname);
5807 #ifdef DNS_RBT_USEHASH
5808                 node->locknum = node->hashval % rbtdb->node_lock_count;
5809 #else
5810                 node->locknum = dns_name_hash(&foundname, ISC_TRUE) %
5811                         rbtdb->node_lock_count;
5812 #endif
5813         }
5814
5815         result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx,
5816                                             &region,
5817                                             sizeof(rdatasetheader_t));
5818         if (result != ISC_R_SUCCESS)
5819                 return (result);
5820         newheader = (rdatasetheader_t *)region.base;
5821         init_rdataset(rbtdb, newheader);
5822         set_ttl(rbtdb, newheader,
5823                 rdataset->ttl + loadctx->now); /* XXX overflow check */
5824         newheader->type = RBTDB_RDATATYPE_VALUE(rdataset->type,
5825                                                 rdataset->covers);
5826         newheader->attributes = 0;
5827         newheader->trust = rdataset->trust;
5828         newheader->serial = 1;
5829         newheader->noqname = NULL;
5830         newheader->count = init_count++;
5831         newheader->additional_auth = NULL;
5832         newheader->additional_glue = NULL;
5833         /* won't be used, but initialize anyway */
5834         newheader->last_used = 0;
5835         newheader->node = node;
5836
5837         result = add(rbtdb, node, rbtdb->current_version, newheader,
5838                      DNS_DBADD_MERGE, ISC_TRUE, NULL, 0);
5839         if (result == ISC_R_SUCCESS &&
5840             delegating_type(rbtdb, node, rdataset->type))
5841                 node->find_callback = 1;
5842         else if (result == DNS_R_UNCHANGED)
5843                 result = ISC_R_SUCCESS;
5844
5845         return (result);
5846 }
5847
5848 static isc_result_t
5849 beginload(dns_db_t *db, dns_addrdatasetfunc_t *addp, dns_dbload_t **dbloadp) {
5850         rbtdb_load_t *loadctx;
5851         dns_rbtdb_t *rbtdb;
5852
5853         rbtdb = (dns_rbtdb_t *)db;
5854
5855         REQUIRE(VALID_RBTDB(rbtdb));
5856
5857         loadctx = isc_mem_get(rbtdb->common.mctx, sizeof(*loadctx));
5858         if (loadctx == NULL)
5859                 return (ISC_R_NOMEMORY);
5860
5861         loadctx->rbtdb = rbtdb;
5862         if (IS_CACHE(rbtdb))
5863                 isc_stdtime_get(&loadctx->now);
5864         else
5865                 loadctx->now = 0;
5866
5867         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5868
5869         REQUIRE((rbtdb->attributes & (RBTDB_ATTR_LOADED|RBTDB_ATTR_LOADING))
5870                 == 0);
5871         rbtdb->attributes |= RBTDB_ATTR_LOADING;
5872
5873         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5874
5875         *addp = loading_addrdataset;
5876         *dbloadp = loadctx;
5877
5878         return (ISC_R_SUCCESS);
5879 }
5880
5881 static isc_result_t
5882 endload(dns_db_t *db, dns_dbload_t **dbloadp) {
5883         rbtdb_load_t *loadctx;
5884         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5885
5886         REQUIRE(VALID_RBTDB(rbtdb));
5887         REQUIRE(dbloadp != NULL);
5888         loadctx = *dbloadp;
5889         REQUIRE(loadctx->rbtdb == rbtdb);
5890
5891         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5892
5893         REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0);
5894         REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0);
5895
5896         rbtdb->attributes &= ~RBTDB_ATTR_LOADING;
5897         rbtdb->attributes |= RBTDB_ATTR_LOADED;
5898
5899         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5900
5901         /*
5902          * If there's a KEY rdataset at the zone origin containing a
5903          * zone key, we consider the zone secure.
5904          */
5905         if (! IS_CACHE(rbtdb))
5906                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
5907
5908         *dbloadp = NULL;
5909
5910         isc_mem_put(rbtdb->common.mctx, loadctx, sizeof(*loadctx));
5911
5912         return (ISC_R_SUCCESS);
5913 }
5914
5915 static isc_result_t
5916 dump(dns_db_t *db, dns_dbversion_t *version, const char *filename,
5917      dns_masterformat_t masterformat) {
5918         dns_rbtdb_t *rbtdb;
5919
5920         rbtdb = (dns_rbtdb_t *)db;
5921
5922         REQUIRE(VALID_RBTDB(rbtdb));
5923
5924         return (dns_master_dump2(rbtdb->common.mctx, db, version,
5925                                  &dns_master_style_default,
5926                                  filename, masterformat));
5927 }
5928
5929 static void
5930 delete_callback(void *data, void *arg) {
5931         dns_rbtdb_t *rbtdb = arg;
5932         rdatasetheader_t *current, *next;
5933
5934         for (current = data; current != NULL; current = next) {
5935                 next = current->next;
5936                 free_rdataset(rbtdb, rbtdb->common.mctx, current);
5937         }
5938 }
5939
5940 static isc_boolean_t
5941 issecure(dns_db_t *db) {
5942         dns_rbtdb_t *rbtdb;
5943         isc_boolean_t secure;
5944
5945         rbtdb = (dns_rbtdb_t *)db;
5946
5947         REQUIRE(VALID_RBTDB(rbtdb));
5948
5949         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5950         secure = rbtdb->secure;
5951         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5952
5953         return (secure);
5954 }
5955
5956 static unsigned int
5957 nodecount(dns_db_t *db) {
5958         dns_rbtdb_t *rbtdb;
5959         unsigned int count;
5960
5961         rbtdb = (dns_rbtdb_t *)db;
5962
5963         REQUIRE(VALID_RBTDB(rbtdb));
5964
5965         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5966         count = dns_rbt_nodecount(rbtdb->tree);
5967         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
5968
5969         return (count);
5970 }
5971
5972 static void
5973 settask(dns_db_t *db, isc_task_t *task) {
5974         dns_rbtdb_t *rbtdb;
5975
5976         rbtdb = (dns_rbtdb_t *)db;
5977
5978         REQUIRE(VALID_RBTDB(rbtdb));
5979
5980         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
5981         if (rbtdb->task != NULL)
5982                 isc_task_detach(&rbtdb->task);
5983         if (task != NULL)
5984                 isc_task_attach(task, &rbtdb->task);
5985         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
5986 }
5987
5988 static isc_boolean_t
5989 ispersistent(dns_db_t *db) {
5990         UNUSED(db);
5991         return (ISC_FALSE);
5992 }
5993
5994 static isc_result_t
5995 getoriginnode(dns_db_t *db, dns_dbnode_t **nodep) {
5996         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
5997         dns_rbtnode_t *onode;
5998         isc_result_t result = ISC_R_SUCCESS;
5999
6000         REQUIRE(VALID_RBTDB(rbtdb));
6001         REQUIRE(nodep != NULL && *nodep == NULL);
6002
6003         /* Note that the access to origin_node doesn't require a DB lock */
6004         onode = (dns_rbtnode_t *)rbtdb->origin_node;
6005         if (onode != NULL) {
6006                 NODE_STRONGLOCK(&rbtdb->node_locks[onode->locknum].lock);
6007                 new_reference(rbtdb, onode);
6008                 NODE_STRONGUNLOCK(&rbtdb->node_locks[onode->locknum].lock);
6009
6010                 *nodep = rbtdb->origin_node;
6011         } else {
6012                 INSIST(!IS_CACHE(rbtdb));
6013                 result = ISC_R_NOTFOUND;
6014         }
6015
6016         return (result);
6017 }
6018
6019 static dns_stats_t *
6020 getrrsetstats(dns_db_t *db) {
6021         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
6022
6023         REQUIRE(VALID_RBTDB(rbtdb));
6024         REQUIRE(IS_CACHE(rbtdb)); /* current restriction */
6025
6026         return (rbtdb->rrsetstats);
6027 }
6028
6029 static dns_dbmethods_t zone_methods = {
6030         attach,
6031         detach,
6032         beginload,
6033         endload,
6034         dump,
6035         currentversion,
6036         newversion,
6037         attachversion,
6038         closeversion,
6039         findnode,
6040         zone_find,
6041         zone_findzonecut,
6042         attachnode,
6043         detachnode,
6044         expirenode,
6045         printnode,
6046         createiterator,
6047         zone_findrdataset,
6048         allrdatasets,
6049         addrdataset,
6050         subtractrdataset,
6051         deleterdataset,
6052         issecure,
6053         nodecount,
6054         ispersistent,
6055         overmem,
6056         settask,
6057         getoriginnode,
6058         NULL,
6059         NULL
6060 };
6061
6062 static dns_dbmethods_t cache_methods = {
6063         attach,
6064         detach,
6065         beginload,
6066         endload,
6067         dump,
6068         currentversion,
6069         newversion,
6070         attachversion,
6071         closeversion,
6072         findnode,
6073         cache_find,
6074         cache_findzonecut,
6075         attachnode,
6076         detachnode,
6077         expirenode,
6078         printnode,
6079         createiterator,
6080         cache_findrdataset,
6081         allrdatasets,
6082         addrdataset,
6083         subtractrdataset,
6084         deleterdataset,
6085         issecure,
6086         nodecount,
6087         ispersistent,
6088         overmem,
6089         settask,
6090         getoriginnode,
6091         NULL,
6092         getrrsetstats
6093 };
6094
6095 isc_result_t
6096 #ifdef DNS_RBTDB_VERSION64
6097 dns_rbtdb64_create
6098 #else
6099 dns_rbtdb_create
6100 #endif
6101                 (isc_mem_t *mctx, dns_name_t *origin, dns_dbtype_t type,
6102                  dns_rdataclass_t rdclass, unsigned int argc, char *argv[],
6103                  void *driverarg, dns_db_t **dbp)
6104 {
6105         dns_rbtdb_t *rbtdb;
6106         isc_result_t result;
6107         int i;
6108         dns_name_t name;
6109
6110         /* Keep the compiler happy. */
6111         UNUSED(argc);
6112         UNUSED(argv);
6113         UNUSED(driverarg);
6114
6115         rbtdb = isc_mem_get(mctx, sizeof(*rbtdb));
6116         if (rbtdb == NULL)
6117                 return (ISC_R_NOMEMORY);
6118
6119         memset(rbtdb, '\0', sizeof(*rbtdb));
6120         dns_name_init(&rbtdb->common.origin, NULL);
6121         rbtdb->common.attributes = 0;
6122         if (type == dns_dbtype_cache) {
6123                 rbtdb->common.methods = &cache_methods;
6124                 rbtdb->common.attributes |= DNS_DBATTR_CACHE;
6125         } else if (type == dns_dbtype_stub) {
6126                 rbtdb->common.methods = &zone_methods;
6127                 rbtdb->common.attributes |= DNS_DBATTR_STUB;
6128         } else
6129                 rbtdb->common.methods = &zone_methods;
6130         rbtdb->common.rdclass = rdclass;
6131         rbtdb->common.mctx = NULL;
6132
6133         result = RBTDB_INITLOCK(&rbtdb->lock);
6134         if (result != ISC_R_SUCCESS)
6135                 goto cleanup_rbtdb;
6136
6137         result = isc_rwlock_init(&rbtdb->tree_lock, 0, 0);
6138         if (result != ISC_R_SUCCESS)
6139                 goto cleanup_lock;
6140
6141         /*
6142          * Initialize node_lock_count in a generic way to support future
6143          * extension which allows the user to specify this value on creation.
6144          * Note that when specified for a cache DB it must be larger than 1
6145          * as commented with the definition of DEFAULT_CACHE_NODE_LOCK_COUNT.
6146          */
6147         if (rbtdb->node_lock_count == 0) {
6148                 if (IS_CACHE(rbtdb))
6149                         rbtdb->node_lock_count = DEFAULT_CACHE_NODE_LOCK_COUNT;
6150                 else
6151                         rbtdb->node_lock_count = DEFAULT_NODE_LOCK_COUNT;
6152         } else if (rbtdb->node_lock_count < 2 && IS_CACHE(rbtdb)) {
6153                 result = ISC_R_RANGE;
6154                 goto cleanup_tree_lock;
6155         }
6156         INSIST(rbtdb->node_lock_count < (1 << DNS_RBT_LOCKLENGTH));
6157         rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count *
6158                                         sizeof(rbtdb_nodelock_t));
6159         if (rbtdb->node_locks == NULL) {
6160                 result = ISC_R_NOMEMORY;
6161                 goto cleanup_tree_lock;
6162         }
6163
6164         rbtdb->rrsetstats = NULL;
6165         if (IS_CACHE(rbtdb)) {
6166                 result = dns_rdatasetstats_create(mctx, &rbtdb->rrsetstats);
6167                 if (result != ISC_R_SUCCESS)
6168                         goto cleanup_node_locks;
6169                 rbtdb->rdatasets = isc_mem_get(mctx, rbtdb->node_lock_count *
6170                                                sizeof(rdatasetheaderlist_t));
6171                 if (rbtdb->rdatasets == NULL) {
6172                         result = ISC_R_NOMEMORY;
6173                         goto cleanup_rrsetstats;
6174                 }
6175                 for (i = 0; i < (int)rbtdb->node_lock_count; i++)
6176                         ISC_LIST_INIT(rbtdb->rdatasets[i]);
6177
6178                 /*
6179                  * Create the heaps.
6180                  */
6181                 rbtdb->heaps = isc_mem_get(mctx, rbtdb->node_lock_count *
6182                                            sizeof(isc_heap_t *));
6183                 if (rbtdb->heaps == NULL) {
6184                         result = ISC_R_NOMEMORY;
6185                         goto cleanup_rdatasets;
6186                 }
6187                 for (i = 0; i < (int)rbtdb->node_lock_count; i++)
6188                         rbtdb->heaps[i] = NULL;
6189                 for (i = 0; i < (int)rbtdb->node_lock_count; i++) {
6190                         result = isc_heap_create(mctx, ttl_sooner,
6191                                                  ttl_set_index, 0,
6192                                                  &rbtdb->heaps[i]);
6193                         if (result != ISC_R_SUCCESS)
6194                                 goto cleanup_heaps;
6195                 }
6196         } else {
6197                 rbtdb->rdatasets = NULL;
6198                 rbtdb->heaps = NULL;
6199         }
6200
6201         rbtdb->deadnodes = isc_mem_get(mctx, rbtdb->node_lock_count *
6202                                        sizeof(rbtnodelist_t));
6203         if (rbtdb->deadnodes == NULL) {
6204                 result = ISC_R_NOMEMORY;
6205                 goto cleanup_heaps;
6206         }
6207         for (i = 0; i < (int)rbtdb->node_lock_count; i++)
6208                 ISC_LIST_INIT(rbtdb->deadnodes[i]);
6209
6210         rbtdb->active = rbtdb->node_lock_count;
6211
6212         for (i = 0; i < (int)(rbtdb->node_lock_count); i++) {
6213                 result = NODE_INITLOCK(&rbtdb->node_locks[i].lock);
6214                 if (result == ISC_R_SUCCESS) {
6215                         result = isc_refcount_init(&rbtdb->node_locks[i].references, 0);
6216                         if (result != ISC_R_SUCCESS)
6217                                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
6218                 }
6219                 if (result != ISC_R_SUCCESS) {
6220                         while (i-- > 0) {
6221                                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
6222                                 isc_refcount_decrement(&rbtdb->node_locks[i].references, NULL);
6223                                 isc_refcount_destroy(&rbtdb->node_locks[i].references);
6224                         }
6225                         goto cleanup_deadnodes;
6226                 }
6227                 rbtdb->node_locks[i].exiting = ISC_FALSE;
6228         }
6229
6230         /*
6231          * Attach to the mctx.  The database will persist so long as there
6232          * are references to it, and attaching to the mctx ensures that our
6233          * mctx won't disappear out from under us.
6234          */
6235         isc_mem_attach(mctx, &rbtdb->common.mctx);
6236
6237         /*
6238          * Must be initialized before free_rbtdb() is called.
6239          */
6240         isc_ondestroy_init(&rbtdb->common.ondest);
6241
6242         /*
6243          * Make a copy of the origin name.
6244          */
6245         result = dns_name_dupwithoffsets(origin, mctx, &rbtdb->common.origin);
6246         if (result != ISC_R_SUCCESS) {
6247                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6248                 return (result);
6249         }
6250
6251         /*
6252          * Make the Red-Black Tree.
6253          */
6254         result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree);
6255         if (result != ISC_R_SUCCESS) {
6256                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6257                 return (result);
6258         }
6259         /*
6260          * In order to set the node callback bit correctly in zone databases,
6261          * we need to know if the node has the origin name of the zone.
6262          * In loading_addrdataset() we could simply compare the new name
6263          * to the origin name, but this is expensive.  Also, we don't know the
6264          * node name in addrdataset(), so we need another way of knowing the
6265          * zone's top.
6266          *
6267          * We now explicitly create a node for the zone's origin, and then
6268          * we simply remember the node's address.  This is safe, because
6269          * the top-of-zone node can never be deleted, nor can its address
6270          * change.
6271          */
6272         if (!IS_CACHE(rbtdb)) {
6273                 rbtdb->origin_node = NULL;
6274                 result = dns_rbt_addnode(rbtdb->tree, &rbtdb->common.origin,
6275                                          &rbtdb->origin_node);
6276                 if (result != ISC_R_SUCCESS) {
6277                         INSIST(result != ISC_R_EXISTS);
6278                         free_rbtdb(rbtdb, ISC_FALSE, NULL);
6279                         return (result);
6280                 }
6281                 /*
6282                  * We need to give the origin node the right locknum.
6283                  */
6284                 dns_name_init(&name, NULL);
6285                 dns_rbt_namefromnode(rbtdb->origin_node, &name);
6286 #ifdef DNS_RBT_USEHASH
6287                 rbtdb->origin_node->locknum =
6288                         rbtdb->origin_node->hashval %
6289                         rbtdb->node_lock_count;
6290 #else
6291                 rbtdb->origin_node->locknum =
6292                         dns_name_hash(&name, ISC_TRUE) %
6293                         rbtdb->node_lock_count;
6294 #endif
6295         }
6296
6297         /*
6298          * Misc. Initialization.
6299          */
6300         result = isc_refcount_init(&rbtdb->references, 1);
6301         if (result != ISC_R_SUCCESS) {
6302                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6303                 return (result);
6304         }
6305         rbtdb->attributes = 0;
6306         rbtdb->secure = ISC_FALSE;
6307         rbtdb->overmem = ISC_FALSE;
6308         rbtdb->task = NULL;
6309
6310         /*
6311          * Version Initialization.
6312          */
6313         rbtdb->current_serial = 1;
6314         rbtdb->least_serial = 1;
6315         rbtdb->next_serial = 2;
6316         rbtdb->current_version = allocate_version(mctx, 1, 1, ISC_FALSE);
6317         if (rbtdb->current_version == NULL) {
6318                 isc_refcount_decrement(&rbtdb->references, NULL);
6319                 isc_refcount_destroy(&rbtdb->references);
6320                 free_rbtdb(rbtdb, ISC_FALSE, NULL);
6321                 return (ISC_R_NOMEMORY);
6322         }
6323         rbtdb->future_version = NULL;
6324         ISC_LIST_INIT(rbtdb->open_versions);
6325         /*
6326          * Keep the current version in the open list so that list operation
6327          * won't happen in normal lookup operations.
6328          */
6329         PREPEND(rbtdb->open_versions, rbtdb->current_version, link);
6330
6331         rbtdb->common.magic = DNS_DB_MAGIC;
6332         rbtdb->common.impmagic = RBTDB_MAGIC;
6333
6334         *dbp = (dns_db_t *)rbtdb;
6335
6336         return (ISC_R_SUCCESS);
6337
6338  cleanup_deadnodes:
6339         isc_mem_put(mctx, rbtdb->deadnodes,
6340                     rbtdb->node_lock_count * sizeof(rbtnodelist_t));
6341
6342  cleanup_heaps:
6343         if (rbtdb->heaps != NULL) {
6344                 for (i = 0 ; i < (int)rbtdb->node_lock_count ; i++)
6345                         if (rbtdb->heaps[i] != NULL)
6346                                 isc_heap_destroy(&rbtdb->heaps[i]);
6347                 isc_mem_put(mctx, rbtdb->heaps,
6348                             rbtdb->node_lock_count * sizeof(isc_heap_t *));
6349         }
6350
6351  cleanup_rdatasets:
6352         if (rbtdb->rdatasets != NULL)
6353                 isc_mem_put(mctx, rbtdb->rdatasets, rbtdb->node_lock_count *
6354                             sizeof(rdatasetheaderlist_t));
6355  cleanup_rrsetstats:
6356         if (rbtdb->rrsetstats != NULL)
6357                 dns_stats_detach(&rbtdb->rrsetstats);
6358
6359  cleanup_node_locks:
6360         isc_mem_put(mctx, rbtdb->node_locks,
6361                     rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
6362
6363  cleanup_tree_lock:
6364         isc_rwlock_destroy(&rbtdb->tree_lock);
6365
6366  cleanup_lock:
6367         RBTDB_DESTROYLOCK(&rbtdb->lock);
6368
6369  cleanup_rbtdb:
6370         isc_mem_put(mctx, rbtdb,  sizeof(*rbtdb));
6371         return (result);
6372 }
6373
6374
6375 /*
6376  * Slabbed Rdataset Methods
6377  */
6378
6379 static void
6380 rdataset_disassociate(dns_rdataset_t *rdataset) {
6381         dns_db_t *db = rdataset->private1;
6382         dns_dbnode_t *node = rdataset->private2;
6383
6384         detachnode(db, &node);
6385 }
6386
6387 static isc_result_t
6388 rdataset_first(dns_rdataset_t *rdataset) {
6389         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
6390         unsigned int count;
6391
6392         count = raw[0] * 256 + raw[1];
6393         if (count == 0) {
6394                 rdataset->private5 = NULL;
6395                 return (ISC_R_NOMORE);
6396         }
6397
6398 #if DNS_RDATASET_FIXED
6399         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0)
6400                 raw += 2 + (4 * count);
6401         else
6402 #endif
6403                 raw += 2;
6404
6405         /*
6406          * The privateuint4 field is the number of rdata beyond the
6407          * cursor position, so we decrement the total count by one
6408          * before storing it.
6409          *
6410          * If DNS_RDATASETATTR_LOADORDER is not set 'raw' points to the
6411          * first record.  If DNS_RDATASETATTR_LOADORDER is set 'raw' points
6412          * to the first entry in the offset table.
6413          */
6414         count--;
6415         rdataset->privateuint4 = count;
6416         rdataset->private5 = raw;
6417
6418         return (ISC_R_SUCCESS);
6419 }
6420
6421 static isc_result_t
6422 rdataset_next(dns_rdataset_t *rdataset) {
6423         unsigned int count;
6424         unsigned int length;
6425         unsigned char *raw;     /* RDATASLAB */
6426
6427         count = rdataset->privateuint4;
6428         if (count == 0)
6429                 return (ISC_R_NOMORE);
6430         count--;
6431         rdataset->privateuint4 = count;
6432
6433         /*
6434          * Skip forward one record (length + 4) or one offset (4).
6435          */
6436         raw = rdataset->private5;
6437 #if DNS_RDATASET_FIXED
6438         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) == 0) {
6439 #endif
6440                 length = raw[0] * 256 + raw[1];
6441                 raw += length;
6442 #if DNS_RDATASET_FIXED
6443         }
6444         rdataset->private5 = raw + 4;           /* length(2) + order(2) */
6445 #else
6446         rdataset->private5 = raw + 2;           /* length(2) */
6447 #endif
6448
6449         return (ISC_R_SUCCESS);
6450 }
6451
6452 static void
6453 rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata) {
6454         unsigned char *raw = rdataset->private5;        /* RDATASLAB */
6455 #if DNS_RDATASET_FIXED
6456         unsigned int offset;
6457 #endif
6458         isc_region_t r;
6459
6460         REQUIRE(raw != NULL);
6461
6462         /*
6463          * Find the start of the record if not already in private5
6464          * then skip the length and order fields.
6465          */
6466 #if DNS_RDATASET_FIXED
6467         if ((rdataset->attributes & DNS_RDATASETATTR_LOADORDER) != 0) {
6468                 offset = (raw[0] << 24) + (raw[1] << 16) +
6469                          (raw[2] << 8) + raw[3];
6470                 raw = rdataset->private3;
6471                 raw += offset;
6472         }
6473 #endif
6474         r.length = raw[0] * 256 + raw[1];
6475
6476 #if DNS_RDATASET_FIXED
6477         raw += 4;
6478 #else
6479         raw += 2;
6480 #endif
6481         r.base = raw;
6482         dns_rdata_fromregion(rdata, rdataset->rdclass, rdataset->type, &r);
6483 }
6484
6485 static void
6486 rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target) {
6487         dns_db_t *db = source->private1;
6488         dns_dbnode_t *node = source->private2;
6489         dns_dbnode_t *cloned_node = NULL;
6490
6491         attachnode(db, node, &cloned_node);
6492         *target = *source;
6493
6494         /*
6495          * Reset iterator state.
6496          */
6497         target->privateuint4 = 0;
6498         target->private5 = NULL;
6499 }
6500
6501 static unsigned int
6502 rdataset_count(dns_rdataset_t *rdataset) {
6503         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
6504         unsigned int count;
6505
6506         count = raw[0] * 256 + raw[1];
6507
6508         return (count);
6509 }
6510
6511 static isc_result_t
6512 rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
6513                     dns_rdataset_t *nsec, dns_rdataset_t *nsecsig)
6514 {
6515         dns_db_t *db = rdataset->private1;
6516         dns_dbnode_t *node = rdataset->private2;
6517         dns_dbnode_t *cloned_node;
6518         struct noqname *noqname = rdataset->private6;
6519
6520         cloned_node = NULL;
6521         attachnode(db, node, &cloned_node);
6522         nsec->methods = &rdataset_methods;
6523         nsec->rdclass = db->rdclass;
6524         nsec->type = dns_rdatatype_nsec;
6525         nsec->covers = 0;
6526         nsec->ttl = rdataset->ttl;
6527         nsec->trust = rdataset->trust;
6528         nsec->private1 = rdataset->private1;
6529         nsec->private2 = rdataset->private2;
6530         nsec->private3 = noqname->nsec;
6531         nsec->privateuint4 = 0;
6532         nsec->private5 = NULL;
6533         nsec->private6 = NULL;
6534
6535         cloned_node = NULL;
6536         attachnode(db, node, &cloned_node);
6537         nsecsig->methods = &rdataset_methods;
6538         nsecsig->rdclass = db->rdclass;
6539         nsecsig->type = dns_rdatatype_rrsig;
6540         nsecsig->covers = dns_rdatatype_nsec;
6541         nsecsig->ttl = rdataset->ttl;
6542         nsecsig->trust = rdataset->trust;
6543         nsecsig->private1 = rdataset->private1;
6544         nsecsig->private2 = rdataset->private2;
6545         nsecsig->private3 = noqname->nsecsig;
6546         nsecsig->privateuint4 = 0;
6547         nsecsig->private5 = NULL;
6548         nsec->private6 = NULL;
6549
6550         dns_name_clone(&noqname->name, name);
6551
6552         return (ISC_R_SUCCESS);
6553 }
6554
6555 static void
6556 rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust) {
6557         dns_rbtdb_t *rbtdb = rdataset->private1;
6558         dns_rbtnode_t *rbtnode = rdataset->private2;
6559         rdatasetheader_t *header = rdataset->private3;
6560
6561         header--;
6562         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6563                   isc_rwlocktype_write);
6564         header->trust = rdataset->trust = trust;
6565         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6566                   isc_rwlocktype_write);
6567 }
6568
6569 static void
6570 rdataset_expire(dns_rdataset_t *rdataset) {
6571         dns_rbtdb_t *rbtdb = rdataset->private1;
6572         dns_rbtnode_t *rbtnode = rdataset->private2;
6573         rdatasetheader_t *header = rdataset->private3;
6574
6575         header--;
6576         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6577                   isc_rwlocktype_write);
6578         expire_header(rbtdb, header, ISC_FALSE);
6579         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6580                   isc_rwlocktype_write);
6581 }
6582
6583 /*
6584  * Rdataset Iterator Methods
6585  */
6586
6587 static void
6588 rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp) {
6589         rbtdb_rdatasetiter_t *rbtiterator;
6590
6591         rbtiterator = (rbtdb_rdatasetiter_t *)(*iteratorp);
6592
6593         if (rbtiterator->common.version != NULL)
6594                 closeversion(rbtiterator->common.db,
6595                              &rbtiterator->common.version, ISC_FALSE);
6596         detachnode(rbtiterator->common.db, &rbtiterator->common.node);
6597         isc_mem_put(rbtiterator->common.db->mctx, rbtiterator,
6598                     sizeof(*rbtiterator));
6599
6600         *iteratorp = NULL;
6601 }
6602
6603 static isc_result_t
6604 rdatasetiter_first(dns_rdatasetiter_t *iterator) {
6605         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6606         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6607         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6608         rbtdb_version_t *rbtversion = rbtiterator->common.version;
6609         rdatasetheader_t *header, *top_next;
6610         rbtdb_serial_t serial;
6611         isc_stdtime_t now;
6612
6613         if (IS_CACHE(rbtdb)) {
6614                 serial = 1;
6615                 now = rbtiterator->common.now;
6616         } else {
6617                 serial = rbtversion->serial;
6618                 now = 0;
6619         }
6620
6621         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6622                   isc_rwlocktype_read);
6623
6624         for (header = rbtnode->data; header != NULL; header = top_next) {
6625                 top_next = header->next;
6626                 do {
6627                         if (header->serial <= serial && !IGNORE(header)) {
6628                                 /*
6629                                  * Is this a "this rdataset doesn't exist"
6630                                  * record?  Or is it too old in the cache?
6631                                  *
6632                                  * Note: unlike everywhere else, we
6633                                  * check for now > header->ttl instead
6634                                  * of now >= header->ttl.  This allows
6635                                  * ANY and RRSIG queries for 0 TTL
6636                                  * rdatasets to work.
6637                                  */
6638                                 if (NONEXISTENT(header) ||
6639                                     (now != 0 && now > header->rdh_ttl))
6640                                         header = NULL;
6641                                 break;
6642                         } else
6643                                 header = header->down;
6644                 } while (header != NULL);
6645                 if (header != NULL)
6646                         break;
6647         }
6648
6649         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6650                     isc_rwlocktype_read);
6651
6652         rbtiterator->current = header;
6653
6654         if (header == NULL)
6655                 return (ISC_R_NOMORE);
6656
6657         return (ISC_R_SUCCESS);
6658 }
6659
6660 static isc_result_t
6661 rdatasetiter_next(dns_rdatasetiter_t *iterator) {
6662         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6663         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6664         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6665         rbtdb_version_t *rbtversion = rbtiterator->common.version;
6666         rdatasetheader_t *header, *top_next;
6667         rbtdb_serial_t serial;
6668         isc_stdtime_t now;
6669         rbtdb_rdatatype_t type, negtype;
6670         dns_rdatatype_t rdtype, covers;
6671
6672         header = rbtiterator->current;
6673         if (header == NULL)
6674                 return (ISC_R_NOMORE);
6675
6676         if (IS_CACHE(rbtdb)) {
6677                 serial = 1;
6678                 now = rbtiterator->common.now;
6679         } else {
6680                 serial = rbtversion->serial;
6681                 now = 0;
6682         }
6683
6684         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6685                   isc_rwlocktype_read);
6686
6687         type = header->type;
6688         rdtype = RBTDB_RDATATYPE_BASE(header->type);
6689         if (rdtype == 0) {
6690                 covers = RBTDB_RDATATYPE_EXT(header->type);
6691                 negtype = RBTDB_RDATATYPE_VALUE(covers, 0);
6692         } else
6693                 negtype = RBTDB_RDATATYPE_VALUE(0, rdtype);
6694         for (header = header->next; header != NULL; header = top_next) {
6695                 top_next = header->next;
6696                 /*
6697                  * If not walking back up the down list.
6698                  */
6699                 if (header->type != type && header->type != negtype) {
6700                         do {
6701                                 if (header->serial <= serial &&
6702                                     !IGNORE(header)) {
6703                                         /*
6704                                          * Is this a "this rdataset doesn't
6705                                          * exist" record?
6706                                          *
6707                                          * Note: unlike everywhere else, we
6708                                          * check for now > header->ttl instead
6709                                          * of now >= header->ttl.  This allows
6710                                          * ANY and RRSIG queries for 0 TTL
6711                                          * rdatasets to work.
6712                                          */
6713                                         if ((header->attributes &
6714                                              RDATASET_ATTR_NONEXISTENT) != 0 ||
6715                                             (now != 0 && now > header->rdh_ttl))
6716                                                 header = NULL;
6717                                         break;
6718                                 } else
6719                                         header = header->down;
6720                         } while (header != NULL);
6721                         if (header != NULL)
6722                                 break;
6723                 }
6724         }
6725
6726         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6727                     isc_rwlocktype_read);
6728
6729         rbtiterator->current = header;
6730
6731         if (header == NULL)
6732                 return (ISC_R_NOMORE);
6733
6734         return (ISC_R_SUCCESS);
6735 }
6736
6737 static void
6738 rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) {
6739         rbtdb_rdatasetiter_t *rbtiterator = (rbtdb_rdatasetiter_t *)iterator;
6740         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(rbtiterator->common.db);
6741         dns_rbtnode_t *rbtnode = rbtiterator->common.node;
6742         rdatasetheader_t *header;
6743
6744         header = rbtiterator->current;
6745         REQUIRE(header != NULL);
6746
6747         NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6748                   isc_rwlocktype_read);
6749
6750         bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now,
6751                       rdataset);
6752
6753         NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock,
6754                     isc_rwlocktype_read);
6755 }
6756
6757
6758 /*
6759  * Database Iterator Methods
6760  */
6761
6762 static inline void
6763 reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
6764         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6765         dns_rbtnode_t *node = rbtdbiter->node;
6766
6767         if (node == NULL)
6768                 return;
6769
6770         INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none);
6771         reactivate_node(rbtdb, node, rbtdbiter->tree_locked);
6772 }
6773
6774 static inline void
6775 dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
6776         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6777         dns_rbtnode_t *node = rbtdbiter->node;
6778         nodelock_t *lock;
6779
6780         if (node == NULL)
6781                 return;
6782
6783         lock = &rbtdb->node_locks[node->locknum].lock;
6784         NODE_LOCK(lock, isc_rwlocktype_read);
6785         decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
6786                             rbtdbiter->tree_locked, ISC_FALSE);
6787         NODE_UNLOCK(lock, isc_rwlocktype_read);
6788
6789         rbtdbiter->node = NULL;
6790 }
6791
6792 static void
6793 flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
6794         dns_rbtnode_t *node;
6795         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6796         isc_boolean_t was_read_locked = ISC_FALSE;
6797         nodelock_t *lock;
6798         int i;
6799
6800         if (rbtdbiter->delete != 0) {
6801                 /*
6802                  * Note that "%d node of %d in tree" can report things like
6803                  * "flush_deletions: 59 nodes of 41 in tree".  This means
6804                  * That some nodes appear on the deletions list more than
6805                  * once.  Only the last occurence will actually be deleted.
6806                  */
6807                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
6808                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
6809                               "flush_deletions: %d nodes of %d in tree",
6810                               rbtdbiter->delete,
6811                               dns_rbt_nodecount(rbtdb->tree));
6812
6813                 if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
6814                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6815                         was_read_locked = ISC_TRUE;
6816                 }
6817                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6818                 rbtdbiter->tree_locked = isc_rwlocktype_write;
6819
6820                 for (i = 0; i < rbtdbiter->delete; i++) {
6821                         node = rbtdbiter->deletions[i];
6822                         lock = &rbtdb->node_locks[node->locknum].lock;
6823
6824                         NODE_LOCK(lock, isc_rwlocktype_read);
6825                         decrement_reference(rbtdb, node, 0,
6826                                             isc_rwlocktype_read,
6827                                             rbtdbiter->tree_locked, ISC_FALSE);
6828                         NODE_UNLOCK(lock, isc_rwlocktype_read);
6829                 }
6830
6831                 rbtdbiter->delete = 0;
6832
6833                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
6834                 if (was_read_locked) {
6835                         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6836                         rbtdbiter->tree_locked = isc_rwlocktype_read;
6837
6838                 } else {
6839                         rbtdbiter->tree_locked = isc_rwlocktype_none;
6840                 }
6841         }
6842 }
6843
6844 static inline void
6845 resume_iteration(rbtdb_dbiterator_t *rbtdbiter) {
6846         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6847
6848         REQUIRE(rbtdbiter->paused);
6849         REQUIRE(rbtdbiter->tree_locked == isc_rwlocktype_none);
6850
6851         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6852         rbtdbiter->tree_locked = isc_rwlocktype_read;
6853
6854         rbtdbiter->paused = ISC_FALSE;
6855 }
6856
6857 static void
6858 dbiterator_destroy(dns_dbiterator_t **iteratorp) {
6859         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)(*iteratorp);
6860         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db;
6861         dns_db_t *db = NULL;
6862
6863         if (rbtdbiter->tree_locked == isc_rwlocktype_read) {
6864                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
6865                 rbtdbiter->tree_locked = isc_rwlocktype_none;
6866         } else
6867                 INSIST(rbtdbiter->tree_locked == isc_rwlocktype_none);
6868
6869         dereference_iter_node(rbtdbiter);
6870
6871         flush_deletions(rbtdbiter);
6872
6873         dns_db_attach(rbtdbiter->common.db, &db);
6874         dns_db_detach(&rbtdbiter->common.db);
6875
6876         dns_rbtnodechain_reset(&rbtdbiter->chain);
6877         isc_mem_put(db->mctx, rbtdbiter, sizeof(*rbtdbiter));
6878         dns_db_detach(&db);
6879
6880         *iteratorp = NULL;
6881 }
6882
6883 static isc_result_t
6884 dbiterator_first(dns_dbiterator_t *iterator) {
6885         isc_result_t result;
6886         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6887         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6888         dns_name_t *name, *origin;
6889
6890         if (rbtdbiter->result != ISC_R_SUCCESS &&
6891             rbtdbiter->result != ISC_R_NOMORE)
6892                 return (rbtdbiter->result);
6893
6894         if (rbtdbiter->paused)
6895                 resume_iteration(rbtdbiter);
6896
6897         dereference_iter_node(rbtdbiter);
6898
6899         name = dns_fixedname_name(&rbtdbiter->name);
6900         origin = dns_fixedname_name(&rbtdbiter->origin);
6901         dns_rbtnodechain_reset(&rbtdbiter->chain);
6902
6903         result = dns_rbtnodechain_first(&rbtdbiter->chain, rbtdb->tree, name,
6904                                         origin);
6905
6906         if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
6907                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6908                                                   NULL, &rbtdbiter->node);
6909                 if (result == ISC_R_SUCCESS) {
6910                         rbtdbiter->new_origin = ISC_TRUE;
6911                         reference_iter_node(rbtdbiter);
6912                 }
6913         } else {
6914                 INSIST(result == ISC_R_NOTFOUND);
6915                 result = ISC_R_NOMORE; /* The tree is empty. */
6916         }
6917
6918         rbtdbiter->result = result;
6919
6920         return (result);
6921 }
6922
6923 static isc_result_t
6924 dbiterator_last(dns_dbiterator_t *iterator) {
6925         isc_result_t result;
6926         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6927         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6928         dns_name_t *name, *origin;
6929
6930         if (rbtdbiter->result != ISC_R_SUCCESS &&
6931             rbtdbiter->result != ISC_R_NOMORE)
6932                 return (rbtdbiter->result);
6933
6934         if (rbtdbiter->paused)
6935                 resume_iteration(rbtdbiter);
6936
6937         dereference_iter_node(rbtdbiter);
6938
6939         name = dns_fixedname_name(&rbtdbiter->name);
6940         origin = dns_fixedname_name(&rbtdbiter->origin);
6941         dns_rbtnodechain_reset(&rbtdbiter->chain);
6942
6943         result = dns_rbtnodechain_last(&rbtdbiter->chain, rbtdb->tree, name,
6944                                        origin);
6945         if (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
6946                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
6947                                                   NULL, &rbtdbiter->node);
6948                 if (result == ISC_R_SUCCESS) {
6949                         rbtdbiter->new_origin = ISC_TRUE;
6950                         reference_iter_node(rbtdbiter);
6951                 }
6952         } else {
6953                 INSIST(result == ISC_R_NOTFOUND);
6954                 result = ISC_R_NOMORE; /* The tree is empty. */
6955         }
6956
6957         rbtdbiter->result = result;
6958
6959         return (result);
6960 }
6961
6962 static isc_result_t
6963 dbiterator_seek(dns_dbiterator_t *iterator, dns_name_t *name) {
6964         isc_result_t result;
6965         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
6966         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
6967         dns_name_t *iname, *origin;
6968
6969         if (rbtdbiter->result != ISC_R_SUCCESS &&
6970             rbtdbiter->result != ISC_R_NOMORE)
6971                 return (rbtdbiter->result);
6972
6973         if (rbtdbiter->paused)
6974                 resume_iteration(rbtdbiter);
6975
6976         dereference_iter_node(rbtdbiter);
6977
6978         iname = dns_fixedname_name(&rbtdbiter->name);
6979         origin = dns_fixedname_name(&rbtdbiter->origin);
6980         dns_rbtnodechain_reset(&rbtdbiter->chain);
6981
6982         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &rbtdbiter->node,
6983                                   &rbtdbiter->chain, DNS_RBTFIND_EMPTYDATA,
6984                                   NULL, NULL);
6985         if (result == ISC_R_SUCCESS) {
6986                 result = dns_rbtnodechain_current(&rbtdbiter->chain, iname,
6987                                                   origin, NULL);
6988                 if (result == ISC_R_SUCCESS) {
6989                         rbtdbiter->new_origin = ISC_TRUE;
6990                         reference_iter_node(rbtdbiter);
6991                 }
6992
6993         } else if (result == DNS_R_PARTIALMATCH)
6994                 result = ISC_R_NOTFOUND;
6995
6996         rbtdbiter->result = result;
6997
6998         return (result);
6999 }
7000
7001 static isc_result_t
7002 dbiterator_prev(dns_dbiterator_t *iterator) {
7003         isc_result_t result;
7004         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7005         dns_name_t *name, *origin;
7006
7007         REQUIRE(rbtdbiter->node != NULL);
7008
7009         if (rbtdbiter->result != ISC_R_SUCCESS)
7010                 return (rbtdbiter->result);
7011
7012         if (rbtdbiter->paused)
7013                 resume_iteration(rbtdbiter);
7014
7015         name = dns_fixedname_name(&rbtdbiter->name);
7016         origin = dns_fixedname_name(&rbtdbiter->origin);
7017         result = dns_rbtnodechain_prev(&rbtdbiter->chain, name, origin);
7018
7019         dereference_iter_node(rbtdbiter);
7020
7021         if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
7022                 rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN);
7023                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
7024                                                   NULL, &rbtdbiter->node);
7025         }
7026
7027         if (result == ISC_R_SUCCESS)
7028                 reference_iter_node(rbtdbiter);
7029
7030         rbtdbiter->result = result;
7031
7032         return (result);
7033 }
7034
7035 static isc_result_t
7036 dbiterator_next(dns_dbiterator_t *iterator) {
7037         isc_result_t result;
7038         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7039         dns_name_t *name, *origin;
7040
7041         REQUIRE(rbtdbiter->node != NULL);
7042
7043         if (rbtdbiter->result != ISC_R_SUCCESS)
7044                 return (rbtdbiter->result);
7045
7046         if (rbtdbiter->paused)
7047                 resume_iteration(rbtdbiter);
7048
7049         name = dns_fixedname_name(&rbtdbiter->name);
7050         origin = dns_fixedname_name(&rbtdbiter->origin);
7051         result = dns_rbtnodechain_next(&rbtdbiter->chain, name, origin);
7052
7053         dereference_iter_node(rbtdbiter);
7054
7055         if (result == DNS_R_NEWORIGIN || result == ISC_R_SUCCESS) {
7056                 rbtdbiter->new_origin = ISC_TF(result == DNS_R_NEWORIGIN);
7057                 result = dns_rbtnodechain_current(&rbtdbiter->chain, NULL,
7058                                                   NULL, &rbtdbiter->node);
7059         }
7060         if (result == ISC_R_SUCCESS)
7061                 reference_iter_node(rbtdbiter);
7062
7063         rbtdbiter->result = result;
7064
7065         return (result);
7066 }
7067
7068 static isc_result_t
7069 dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
7070                    dns_name_t *name)
7071 {
7072         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
7073         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7074         dns_rbtnode_t *node = rbtdbiter->node;
7075         isc_result_t result;
7076         dns_name_t *nodename = dns_fixedname_name(&rbtdbiter->name);
7077         dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
7078
7079         REQUIRE(rbtdbiter->result == ISC_R_SUCCESS);
7080         REQUIRE(rbtdbiter->node != NULL);
7081
7082         if (rbtdbiter->paused)
7083                 resume_iteration(rbtdbiter);
7084
7085         if (name != NULL) {
7086                 if (rbtdbiter->common.relative_names)
7087                         origin = NULL;
7088                 result = dns_name_concatenate(nodename, origin, name, NULL);
7089                 if (result != ISC_R_SUCCESS)
7090                         return (result);
7091                 if (rbtdbiter->common.relative_names && rbtdbiter->new_origin)
7092                         result = DNS_R_NEWORIGIN;
7093         } else
7094                 result = ISC_R_SUCCESS;
7095
7096         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
7097         new_reference(rbtdb, node);
7098         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
7099
7100         *nodep = rbtdbiter->node;
7101
7102         if (iterator->cleaning && result == ISC_R_SUCCESS) {
7103                 isc_result_t expire_result;
7104
7105                 /*
7106                  * If the deletion array is full, flush it before trying
7107                  * to expire the current node.  The current node can't
7108                  * fully deleted while the iteration cursor is still on it.
7109                  */
7110                 if (rbtdbiter->delete == DELETION_BATCH_MAX)
7111                         flush_deletions(rbtdbiter);
7112
7113                 expire_result = expirenode(iterator->db, *nodep, 0);
7114
7115                 /*
7116                  * expirenode() currently always returns success.
7117                  */
7118                 if (expire_result == ISC_R_SUCCESS && node->down == NULL) {
7119                         unsigned int refs;
7120
7121                         rbtdbiter->deletions[rbtdbiter->delete++] = node;
7122                         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
7123                         dns_rbtnode_refincrement(node, &refs);
7124                         INSIST(refs != 0);
7125                         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
7126                 }
7127         }
7128
7129         return (result);
7130 }
7131
7132 static isc_result_t
7133 dbiterator_pause(dns_dbiterator_t *iterator) {
7134         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)iterator->db;
7135         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7136
7137         if (rbtdbiter->result != ISC_R_SUCCESS &&
7138             rbtdbiter->result != ISC_R_NOMORE)
7139                 return (rbtdbiter->result);
7140
7141         if (rbtdbiter->paused)
7142                 return (ISC_R_SUCCESS);
7143
7144         rbtdbiter->paused = ISC_TRUE;
7145
7146         if (rbtdbiter->tree_locked != isc_rwlocktype_none) {
7147                 INSIST(rbtdbiter->tree_locked == isc_rwlocktype_read);
7148                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_read);
7149                 rbtdbiter->tree_locked = isc_rwlocktype_none;
7150         }
7151
7152         flush_deletions(rbtdbiter);
7153
7154         return (ISC_R_SUCCESS);
7155 }
7156
7157 static isc_result_t
7158 dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name) {
7159         rbtdb_dbiterator_t *rbtdbiter = (rbtdb_dbiterator_t *)iterator;
7160         dns_name_t *origin = dns_fixedname_name(&rbtdbiter->origin);
7161
7162         if (rbtdbiter->result != ISC_R_SUCCESS)
7163                 return (rbtdbiter->result);
7164
7165         return (dns_name_copy(origin, name, NULL));
7166 }
7167
7168 /*%
7169  * Additional cache routines.
7170  */
7171 static isc_result_t
7172 rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type,
7173                        dns_rdatatype_t qtype, dns_acache_t *acache,
7174                        dns_zone_t **zonep, dns_db_t **dbp,
7175                        dns_dbversion_t **versionp, dns_dbnode_t **nodep,
7176                        dns_name_t *fname, dns_message_t *msg,
7177                        isc_stdtime_t now)
7178 {
7179         dns_rbtdb_t *rbtdb = rdataset->private1;
7180         dns_rbtnode_t *rbtnode = rdataset->private2;
7181         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7182         unsigned int current_count = rdataset->privateuint4;
7183         unsigned int count;
7184         rdatasetheader_t *header;
7185         nodelock_t *nodelock;
7186         unsigned int total_count;
7187         acachectl_t *acarray;
7188         dns_acacheentry_t *entry;
7189         isc_result_t result;
7190
7191         UNUSED(qtype); /* we do not use this value at least for now */
7192         UNUSED(acache);
7193
7194         header = (struct rdatasetheader *)(raw - sizeof(*header));
7195
7196         total_count = raw[0] * 256 + raw[1];
7197         INSIST(total_count > current_count);
7198         count = total_count - current_count - 1;
7199
7200         acarray = NULL;
7201
7202         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7203         NODE_LOCK(nodelock, isc_rwlocktype_read);
7204
7205         switch (type) {
7206         case dns_rdatasetadditional_fromauth:
7207                 acarray = header->additional_auth;
7208                 break;
7209         case dns_rdatasetadditional_fromcache:
7210                 acarray = NULL;
7211                 break;
7212         case dns_rdatasetadditional_fromglue:
7213                 acarray = header->additional_glue;
7214                 break;
7215         default:
7216                 INSIST(0);
7217         }
7218
7219         if (acarray == NULL) {
7220                 if (type != dns_rdatasetadditional_fromcache)
7221                         dns_acache_countquerymiss(acache);
7222                 NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7223                 return (ISC_R_NOTFOUND);
7224         }
7225
7226         if (acarray[count].entry == NULL) {
7227                 dns_acache_countquerymiss(acache);
7228                 NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7229                 return (ISC_R_NOTFOUND);
7230         }
7231
7232         entry = NULL;
7233         dns_acache_attachentry(acarray[count].entry, &entry);
7234
7235         NODE_UNLOCK(nodelock, isc_rwlocktype_read);
7236
7237         result = dns_acache_getentry(entry, zonep, dbp, versionp,
7238                                      nodep, fname, msg, now);
7239
7240         dns_acache_detachentry(&entry);
7241
7242         return (result);
7243 }
7244
7245 static void
7246 acache_callback(dns_acacheentry_t *entry, void **arg) {
7247         dns_rbtdb_t *rbtdb;
7248         dns_rbtnode_t *rbtnode;
7249         nodelock_t *nodelock;
7250         acachectl_t *acarray = NULL;
7251         acache_cbarg_t *cbarg;
7252         unsigned int count;
7253
7254         REQUIRE(arg != NULL);
7255         cbarg = *arg;
7256
7257         /*
7258          * The caller must hold the entry lock.
7259          */
7260
7261         rbtdb = (dns_rbtdb_t *)cbarg->db;
7262         rbtnode = (dns_rbtnode_t *)cbarg->node;
7263
7264         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7265         NODE_LOCK(nodelock, isc_rwlocktype_write);
7266
7267         switch (cbarg->type) {
7268         case dns_rdatasetadditional_fromauth:
7269                 acarray = cbarg->header->additional_auth;
7270                 break;
7271         case dns_rdatasetadditional_fromglue:
7272                 acarray = cbarg->header->additional_glue;
7273                 break;
7274         default:
7275                 INSIST(0);
7276         }
7277
7278         count = cbarg->count;
7279         if (acarray != NULL && acarray[count].entry == entry) {
7280                 acarray[count].entry = NULL;
7281                 INSIST(acarray[count].cbarg == cbarg);
7282                 isc_mem_put(rbtdb->common.mctx, cbarg, sizeof(acache_cbarg_t));
7283                 acarray[count].cbarg = NULL;
7284         } else
7285                 isc_mem_put(rbtdb->common.mctx, cbarg, sizeof(acache_cbarg_t));
7286
7287         dns_acache_detachentry(&entry);
7288
7289         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7290
7291         dns_db_detachnode((dns_db_t *)rbtdb, (dns_dbnode_t **)(void*)&rbtnode);
7292         dns_db_detach((dns_db_t **)(void*)&rbtdb);
7293
7294         *arg = NULL;
7295 }
7296
7297 static void
7298 acache_cancelentry(isc_mem_t *mctx, dns_acacheentry_t *entry,
7299                       acache_cbarg_t **cbargp)
7300 {
7301         acache_cbarg_t *cbarg;
7302
7303         REQUIRE(mctx != NULL);
7304         REQUIRE(entry != NULL);
7305         REQUIRE(cbargp != NULL && *cbargp != NULL);
7306
7307         cbarg = *cbargp;
7308
7309         dns_acache_cancelentry(entry);
7310         dns_db_detachnode(cbarg->db, &cbarg->node);
7311         dns_db_detach(&cbarg->db);
7312
7313         isc_mem_put(mctx, cbarg, sizeof(acache_cbarg_t));
7314
7315         *cbargp = NULL;
7316 }
7317
7318 static isc_result_t
7319 rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type,
7320                        dns_rdatatype_t qtype, dns_acache_t *acache,
7321                        dns_zone_t *zone, dns_db_t *db,
7322                        dns_dbversion_t *version, dns_dbnode_t *node,
7323                        dns_name_t *fname)
7324 {
7325         dns_rbtdb_t *rbtdb = rdataset->private1;
7326         dns_rbtnode_t *rbtnode = rdataset->private2;
7327         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7328         unsigned int current_count = rdataset->privateuint4;
7329         rdatasetheader_t *header;
7330         unsigned int total_count, count;
7331         nodelock_t *nodelock;
7332         isc_result_t result;
7333         acachectl_t *acarray;
7334         dns_acacheentry_t *newentry, *oldentry = NULL;
7335         acache_cbarg_t *newcbarg, *oldcbarg = NULL;
7336
7337         UNUSED(qtype);
7338
7339         if (type == dns_rdatasetadditional_fromcache)
7340                 return (ISC_R_SUCCESS);
7341
7342         header = (struct rdatasetheader *)(raw - sizeof(*header));
7343
7344         total_count = raw[0] * 256 + raw[1];
7345         INSIST(total_count > current_count);
7346         count = total_count - current_count - 1; /* should be private data */
7347
7348         newcbarg = isc_mem_get(rbtdb->common.mctx, sizeof(*newcbarg));
7349         if (newcbarg == NULL)
7350                 return (ISC_R_NOMEMORY);
7351         newcbarg->type = type;
7352         newcbarg->count = count;
7353         newcbarg->header = header;
7354         newcbarg->db = NULL;
7355         dns_db_attach((dns_db_t *)rbtdb, &newcbarg->db);
7356         newcbarg->node = NULL;
7357         dns_db_attachnode((dns_db_t *)rbtdb, (dns_dbnode_t *)rbtnode,
7358                           &newcbarg->node);
7359         newentry = NULL;
7360         result = dns_acache_createentry(acache, (dns_db_t *)rbtdb,
7361                                         acache_callback, newcbarg, &newentry);
7362         if (result != ISC_R_SUCCESS)
7363                 goto fail;
7364         /* Set cache data in the new entry. */
7365         result = dns_acache_setentry(acache, newentry, zone, db,
7366                                      version, node, fname);
7367         if (result != ISC_R_SUCCESS)
7368                 goto fail;
7369
7370         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7371         NODE_LOCK(nodelock, isc_rwlocktype_write);
7372
7373         acarray = NULL;
7374         switch (type) {
7375         case dns_rdatasetadditional_fromauth:
7376                 acarray = header->additional_auth;
7377                 break;
7378         case dns_rdatasetadditional_fromglue:
7379                 acarray = header->additional_glue;
7380                 break;
7381         default:
7382                 INSIST(0);
7383         }
7384
7385         if (acarray == NULL) {
7386                 unsigned int i;
7387
7388                 acarray = isc_mem_get(rbtdb->common.mctx, total_count *
7389                                       sizeof(acachectl_t));
7390
7391                 if (acarray == NULL) {
7392                         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7393                         goto fail;
7394                 }
7395
7396                 for (i = 0; i < total_count; i++) {
7397                         acarray[i].entry = NULL;
7398                         acarray[i].cbarg = NULL;
7399                 }
7400         }
7401         switch (type) {
7402         case dns_rdatasetadditional_fromauth:
7403                 header->additional_auth = acarray;
7404                 break;
7405         case dns_rdatasetadditional_fromglue:
7406                 header->additional_glue = acarray;
7407                 break;
7408         default:
7409                 INSIST(0);
7410         }
7411
7412         if (acarray[count].entry != NULL) {
7413                 /*
7414                  * Swap the entry.  Delay cleaning-up the old entry since
7415                  * it would require a node lock.
7416                  */
7417                 oldentry = acarray[count].entry;
7418                 INSIST(acarray[count].cbarg != NULL);
7419                 oldcbarg = acarray[count].cbarg;
7420         }
7421         acarray[count].entry = newentry;
7422         acarray[count].cbarg = newcbarg;
7423
7424         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7425
7426         if (oldentry != NULL) {
7427                 acache_cancelentry(rbtdb->common.mctx, oldentry, &oldcbarg);
7428                 dns_acache_detachentry(&oldentry);
7429         }
7430
7431         return (ISC_R_SUCCESS);
7432
7433   fail:
7434         if (newcbarg != NULL) {
7435                 if (newentry != NULL) {
7436                         acache_cancelentry(rbtdb->common.mctx, newentry,
7437                                            &newcbarg);
7438                         dns_acache_detachentry(&newentry);
7439                 } else {
7440                         dns_db_detachnode((dns_db_t *)rbtdb, &newcbarg->node);
7441                         dns_db_detach(&newcbarg->db);
7442                         isc_mem_put(rbtdb->common.mctx, newcbarg,
7443                             sizeof(*newcbarg));
7444                 }
7445         }
7446
7447         return (result);
7448 }
7449
7450 static isc_result_t
7451 rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset,
7452                        dns_rdatasetadditional_t type, dns_rdatatype_t qtype)
7453 {
7454         dns_rbtdb_t *rbtdb = rdataset->private1;
7455         dns_rbtnode_t *rbtnode = rdataset->private2;
7456         unsigned char *raw = rdataset->private3;        /* RDATASLAB */
7457         unsigned int current_count = rdataset->privateuint4;
7458         rdatasetheader_t *header;
7459         nodelock_t *nodelock;
7460         unsigned int total_count, count;
7461         acachectl_t *acarray;
7462         dns_acacheentry_t *entry;
7463         acache_cbarg_t *cbarg;
7464
7465         UNUSED(qtype);          /* we do not use this value at least for now */
7466         UNUSED(acache);
7467
7468         if (type == dns_rdatasetadditional_fromcache)
7469                 return (ISC_R_SUCCESS);
7470
7471         header = (struct rdatasetheader *)(raw - sizeof(*header));
7472
7473         total_count = raw[0] * 256 + raw[1];
7474         INSIST(total_count > current_count);
7475         count = total_count - current_count - 1;
7476
7477         acarray = NULL;
7478         entry = NULL;
7479
7480         nodelock = &rbtdb->node_locks[rbtnode->locknum].lock;
7481         NODE_LOCK(nodelock, isc_rwlocktype_write);
7482
7483         switch (type) {
7484         case dns_rdatasetadditional_fromauth:
7485                 acarray = header->additional_auth;
7486                 break;
7487         case dns_rdatasetadditional_fromglue:
7488                 acarray = header->additional_glue;
7489                 break;
7490         default:
7491                 INSIST(0);
7492         }
7493
7494         if (acarray == NULL) {
7495                 NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7496                 return (ISC_R_NOTFOUND);
7497         }
7498
7499         entry = acarray[count].entry;
7500         if (entry == NULL) {
7501                 NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7502                 return (ISC_R_NOTFOUND);
7503         }
7504
7505         acarray[count].entry = NULL;
7506         cbarg = acarray[count].cbarg;
7507         acarray[count].cbarg = NULL;
7508
7509         NODE_UNLOCK(nodelock, isc_rwlocktype_write);
7510
7511         if (entry != NULL) {
7512                 if (cbarg != NULL)
7513                         acache_cancelentry(rbtdb->common.mctx, entry, &cbarg);
7514                 dns_acache_detachentry(&entry);
7515         }
7516
7517         return (ISC_R_SUCCESS);
7518 }
7519
7520 /*%
7521  * Routines for LRU-based cache management.
7522  */
7523
7524 /*%
7525  * See if a given cache entry that is being reused needs to be updated
7526  * in the LRU-list.  From the LRU management point of view, this function is
7527  * expected to return true for almost all cases.  When used with threads,
7528  * however, this may cause a non-negligible performance penalty because a
7529  * writer lock will have to be acquired before updating the list.
7530  * If DNS_RBTDB_LIMITLRUUPDATE is defined to be non 0 at compilation time, this
7531  * function returns true if the entry has not been updated for some period of
7532  * time.  We differentiate the NS or glue address case and the others since
7533  * experiments have shown that the former tends to be accessed relatively
7534  * infrequently and the cost of cache miss is higher (e.g., a missing NS records
7535  * may cause external queries at a higher level zone, involving more
7536  * transactions).
7537  *
7538  * Caller must hold the node (read or write) lock.
7539  */
7540 static inline isc_boolean_t
7541 need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now) {
7542         if ((header->attributes &
7543              (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0)
7544                 return (ISC_FALSE);
7545
7546 #if DNS_RBTDB_LIMITLRUUPDATE
7547         if (header->type == dns_rdatatype_ns ||
7548             (header->trust == dns_trust_glue &&
7549              (header->type == dns_rdatatype_a ||
7550               header->type == dns_rdatatype_aaaa))) {
7551                 /*
7552                  * Glue records are updated if at least 60 seconds have passed
7553                  * since the previous update time.
7554                  */
7555                 return (header->last_used + 60 <= now);
7556         }
7557
7558         /* Other records are updated if 5 minutes have passed. */
7559         return (header->last_used + 300 <= now);
7560 #else
7561         UNUSED(now);
7562
7563         return (ISC_TRUE);
7564 #endif
7565 }
7566
7567 /*%
7568  * Update the timestamp of a given cache entry and move it to the head
7569  * of the corresponding LRU list.
7570  *
7571  * Caller must hold the node (write) lock.
7572  *
7573  * Note that the we do NOT touch the heap here, as the TTL has not changed.
7574  */
7575 static void
7576 update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
7577               isc_stdtime_t now)
7578 {
7579         /* To be checked: can we really assume this? XXXMLG */
7580         INSIST(ISC_LINK_LINKED(header, lru_link));
7581
7582         ISC_LIST_UNLINK(rbtdb->rdatasets[header->node->locknum],
7583                         header, lru_link);
7584         header->last_used = now;
7585         ISC_LIST_PREPEND(rbtdb->rdatasets[header->node->locknum],
7586                          header, lru_link);
7587 }
7588
7589 /*%
7590  * Purge some expired and/or stale (i.e. unused for some period) cache entries
7591  * under an overmem condition.  To recover from this condition quickly, up to
7592  * 2 entries will be purged.  This process is triggered while adding a new
7593  * entry, and we specifically avoid purging entries in the same LRU bucket as
7594  * the one to which the new entry will belong.  Otherwise, we might purge
7595  * entries of the same name of different RR types while adding RRsets from a
7596  * single response (consider the case where we're adding A and AAAA glue records
7597  * of the same NS name).
7598  */
7599 static void
7600 overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
7601               isc_stdtime_t now, isc_boolean_t tree_locked)
7602 {
7603         rdatasetheader_t *header, *header_prev;
7604         unsigned int locknum;
7605         int purgecount = 2;
7606
7607         for (locknum = (locknum_start + 1) % rbtdb->node_lock_count;
7608              locknum != locknum_start && purgecount > 0;
7609              locknum = (locknum + 1) % rbtdb->node_lock_count) {
7610                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
7611                           isc_rwlocktype_write);
7612
7613                 header = isc_heap_element(rbtdb->heaps[locknum], 1);
7614                 if (header && header->rdh_ttl <= now - RBTDB_VIRTUAL) {
7615                         expire_header(rbtdb, header, tree_locked);
7616                         purgecount--;
7617                 }
7618
7619                 for (header = ISC_LIST_TAIL(rbtdb->rdatasets[locknum]);
7620                      header != NULL && purgecount > 0;
7621                      header = header_prev) {
7622                         header_prev = ISC_LIST_PREV(header, lru_link);
7623                         /*
7624                          * Unlink the entry at this point to avoid checking it
7625                          * again even if it's currently used someone else and
7626                          * cannot be purged at this moment.  This entry won't be
7627                          * referenced any more (so unlinking is safe) since the
7628                          * TTL was reset to 0.
7629                          */
7630                         ISC_LIST_UNLINK(rbtdb->rdatasets[locknum], header,
7631                                         lru_link);
7632                         expire_header(rbtdb, header, tree_locked);
7633                         purgecount--;
7634                 }
7635
7636                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
7637                                     isc_rwlocktype_write);
7638         }
7639 }
7640
7641 static void
7642 expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
7643               isc_boolean_t tree_locked)
7644 {
7645         set_ttl(rbtdb, header, 0);
7646         header->attributes |= RDATASET_ATTR_STALE;
7647         header->node->dirty = 1;
7648
7649         /*
7650          * Caller must hold the node (write) lock.
7651          */
7652
7653         if (dns_rbtnode_refcurrent(header->node) == 0) {
7654                 /*
7655                  * If no one else is using the node, we can clean it up now.
7656                  * We first need to gain a new reference to the node to meet a
7657                  * requirement of decrement_reference().
7658                  */
7659                 new_reference(rbtdb, header->node);
7660                 decrement_reference(rbtdb, header->node, 0,
7661                                     isc_rwlocktype_write,
7662                                     tree_locked ? isc_rwlocktype_write :
7663                                     isc_rwlocktype_none, ISC_FALSE);
7664         }
7665 }