bind - Upgraded vendor branch to 9.5.2-P1
[dragonfly.git] / contrib / bind-9.5.2 / lib / dns / rbtdb.c
1 /*
2  * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
3  * Copyright (C) 1999-2003  Internet Software Consortium.
4  *
5  * Permission to use, copy, modify, and/or distribute this software for any
6  * purpose with or without fee is hereby granted, provided that the above
7  * copyright notice and this permission notice appear in all copies.
8  *
9  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15  * PERFORMANCE OF THIS SOFTWARE.
16  */
17
18 /* $Id: rbtdb.c,v 1.248.12.18.2.1 2009/11/18 23:41:18 marka Exp $ */
19
20 /*! \file */
21
22 /*
23  * Principal Author: Bob Halley
24  */
25
26 #include <config.h>
27
28 #include <isc/heap.h>
29 #include <isc/event.h>
30 #include <isc/mem.h>
31 #include <isc/platform.h>
32 #include <isc/print.h>
33 #include <isc/mutex.h>
34 #include <isc/random.h>
35 #include <isc/refcount.h>
36 #include <isc/rwlock.h>
37 #include <isc/string.h>
38 #include <isc/task.h>
39 #include <isc/time.h>
40 #include <isc/util.h>
41
42 #include <dns/acache.h>
43 #include <dns/db.h>
44 #include <dns/dbiterator.h>
45 #include <dns/events.h>
46 #include <dns/fixedname.h>
47 #include <dns/lib.h>
48 #include <dns/log.h>
49 #include <dns/masterdump.h>
50 #include <dns/rbt.h>
51 #include <dns/rdata.h>
52 #include <dns/rdataset.h>
53 #include <dns/rdatasetiter.h>
54 #include <dns/rdataslab.h>
55 #include <dns/result.h>
56 #include <dns/stats.h>
57 #include <dns/view.h>
58 #include <dns/zone.h>
59 #include <dns/zonekey.h>
60
61 #ifdef DNS_RBTDB_VERSION64
62 #include "rbtdb64.h"
63 #else
64 #include "rbtdb.h"
65 #endif
66
67 #ifdef DNS_RBTDB_VERSION64
68 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
69 #else
70 #define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
71 #endif
72
73 /*%
74  * Note that "impmagic" is not the first four bytes of the struct, so
75  * ISC_MAGIC_VALID cannot be used.
76  */
77 #define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
78                                  (rbtdb)->common.impmagic == RBTDB_MAGIC)
79
80 #ifdef DNS_RBTDB_VERSION64
81 typedef isc_uint64_t                    rbtdb_serial_t;
82 /*%
83  * Make casting easier in symbolic debuggers by using different names
84  * for the 64 bit version.
85  */
86 #define dns_rbtdb_t dns_rbtdb64_t
87 #define rdatasetheader_t rdatasetheader64_t
88 #define rbtdb_version_t rbtdb_version64_t
89 #else
90 typedef isc_uint32_t                    rbtdb_serial_t;
91 #endif
92
93 typedef isc_uint32_t                    rbtdb_rdatatype_t;
94
95 #define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
96 #define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
97 #define RBTDB_RDATATYPE_VALUE(b, e)     (((e) << 16) | (b))
98
99 #define RBTDB_RDATATYPE_SIGNSEC \
100                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
101 #define RBTDB_RDATATYPE_SIGNS \
102                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
103 #define RBTDB_RDATATYPE_SIGCNAME \
104                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
105 #define RBTDB_RDATATYPE_SIGDNAME \
106                 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
107 #define RBTDB_RDATATYPE_NCACHEANY \
108                 RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
109
110 /*
111  * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
112  * Using rwlock is effective with regard to lookup performance only when
113  * it is implemented in an efficient way.
114  * Otherwise, it is generally wise to stick to the simple locking since rwlock
115  * would require more memory or can even make lookups slower due to its own
116  * overhead (when it internally calls mutex locks).
117  */
118 #ifdef ISC_RWLOCK_USEATOMIC
119 #define DNS_RBTDB_USERWLOCK 1
120 #else
121 #define DNS_RBTDB_USERWLOCK 0
122 #endif
123
124 #if DNS_RBTDB_USERWLOCK
125 #define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
126 #define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
127 #define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
128 #define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
129 #else
130 #define RBTDB_INITLOCK(l)       isc_mutex_init(l)
131 #define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
132 #define RBTDB_LOCK(l, t)        LOCK(l)
133 #define RBTDB_UNLOCK(l, t)      UNLOCK(l)
134 #endif
135
136 /*
137  * Since node locking is sensitive to both performance and memory footprint,
138  * we need some trick here.  If we have both high-performance rwlock and
139  * high performance and small-memory reference counters, we use rwlock for
140  * node lock and isc_refcount for node references.  In this case, we don't have
141  * to protect the access to the counters by locks.
142  * Otherwise, we simply use ordinary mutex lock for node locking, and use
143  * simple integers as reference counters which is protected by the lock.
144  * In most cases, we can simply use wrapper macros such as NODE_LOCK and
145  * NODE_UNLOCK.  In some other cases, however, we need to protect reference
146  * counters first and then protect other parts of a node as read-only data.
147  * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
148  * provided for these special cases.  When we can use the efficient backend
149  * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
150  * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
151  * section including the access to the reference counter.
152  * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
153  * section is also protected by NODE_STRONGLOCK().
154  */
155 #if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
156 typedef isc_rwlock_t nodelock_t;
157
158 #define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
159 #define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
160 #define NODE_LOCK(l, t)         RWLOCK((l), (t))
161 #define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
162 #define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)
163
164 #define NODE_STRONGLOCK(l)      ((void)0)
165 #define NODE_STRONGUNLOCK(l)    ((void)0)
166 #define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
167 #define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
168 #define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
169 #else
170 typedef isc_mutex_t nodelock_t;
171
172 #define NODE_INITLOCK(l)        isc_mutex_init(l)
173 #define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
174 #define NODE_LOCK(l, t)         LOCK(l)
175 #define NODE_UNLOCK(l, t)       UNLOCK(l)
176 #define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS
177
178 #define NODE_STRONGLOCK(l)      LOCK(l)
179 #define NODE_STRONGUNLOCK(l)    UNLOCK(l)
180 #define NODE_WEAKLOCK(l, t)     ((void)0)
181 #define NODE_WEAKUNLOCK(l, t)   ((void)0)
182 #define NODE_WEAKDOWNGRADE(l)   ((void)0)
183 #endif
184
185 /*%
186  * Whether to rate-limit updating the LRU to avoid possible thread contention.
187  * Our performance measurement has shown the cost is marginal, so it's defined
188  * to be 0 by default either with or without threads.
189  */
190 #ifndef DNS_RBTDB_LIMITLRUUPDATE
191 #define DNS_RBTDB_LIMITLRUUPDATE 0
192 #endif
193
194 /*
195  * Allow clients with a virtual time of up to 5 minutes in the past to see
196  * records that would have otherwise have expired.
197  */
198 #define RBTDB_VIRTUAL 300
199
200 struct noqname {
201         dns_name_t name;
202         void *     nsec;
203         void *     nsecsig;
204 };
205
206 typedef struct acachectl acachectl_t;
207
208 typedef struct rdatasetheader {
209         /*%
210          * Locked by the owning node's lock.
211          */
212         rbtdb_serial_t                  serial;
213         dns_ttl_t                       rdh_ttl;
214         rbtdb_rdatatype_t               type;
215         isc_uint16_t                    attributes;
216         dns_trust_t                     trust;
217         struct noqname                  *noqname;
218         /*%<
219          * We don't use the LIST macros, because the LIST structure has
220          * both head and tail pointers, and is doubly linked.
221          */
222
223         struct rdatasetheader           *next;
224         /*%<
225          * If this is the top header for an rdataset, 'next' points
226          * to the top header for the next rdataset (i.e., the next type).
227          * Otherwise, it points up to the header whose down pointer points
228          * at this header.
229          */
230
231         struct rdatasetheader           *down;
232         /*%<
233          * Points to the header for the next older version of
234          * this rdataset.
235          */
236
237         isc_uint32_t                    count;
238         /*%<
239          * Monotonously increased every time this rdataset is bound so that
240          * it is used as the base of the starting point in DNS responses
241          * when the "cyclic" rrset-order is required.  Since the ordering
242          * should not be so crucial, no lock is set for the counter for
243          * performance reasons.
244          */
245
246         acachectl_t                     *additional_auth;
247         acachectl_t                     *additional_glue;
248
249         dns_rbtnode_t                   *node;
250         isc_stdtime_t                   last_used;
251         ISC_LINK(struct rdatasetheader) lru_link;
252         /*%<
253          * Used for LRU-based cache management.  We should probably make
254          * these cache-DB specific.  We might also make it a pointer and
255          * ensure only the top header has a valid link to save memory.
256          * The linked-list is locked by the rbtdb->lrulock.
257          */
258
259         /*
260          * It's possible this should not be here anymore, but instead
261          * referenced from the bucket's heap directly.
262          */
263 #if 0
264         isc_heap_t                      *heap;
265 #endif
266         unsigned int                    heap_index;
267         /*%<
268          * Used for TTL-based cache cleaning.
269          */
270 } rdatasetheader_t;
271
272 typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
273 typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;
274
275 #define RDATASET_ATTR_NONEXISTENT       0x0001
276 #define RDATASET_ATTR_STALE             0x0002
277 #define RDATASET_ATTR_IGNORE            0x0004
278 #define RDATASET_ATTR_RETAIN            0x0008
279 #define RDATASET_ATTR_NXDOMAIN          0x0010
280 #define RDATASET_ATTR_RESIGN            0x0020
281 #define RDATASET_ATTR_STATCOUNT         0x0040
282
283 typedef struct acache_cbarg {
284         dns_rdatasetadditional_t        type;
285         unsigned int                    count;
286         dns_db_t                        *db;
287         dns_dbnode_t                    *node;
288         rdatasetheader_t                *header;
289 } acache_cbarg_t;
290
291 struct acachectl {
292         dns_acacheentry_t               *entry;
293         acache_cbarg_t                  *cbarg;
294 };
295
296 /*
297  * XXX
298  * When the cache will pre-expire data (due to memory low or other
299  * situations) before the rdataset's TTL has expired, it MUST
300  * respect the RETAIN bit and not expire the data until its TTL is
301  * expired.
302  */
303
304 #undef IGNORE                   /* WIN32 winbase.h defines this. */
305
306 #define EXISTS(header) \
307         (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
308 #define NONEXISTENT(header) \
309         (((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
310 #define IGNORE(header) \
311         (((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
312 #define RETAIN(header) \
313         (((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
314 #define NXDOMAIN(header) \
315         (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
316
317 #define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
318
319 /*%
320  * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
321  * There is a tradeoff issue about configuring this value: if this is too
322  * small, it may cause heavier contention between threads; if this is too large,
323  * LRU purge algorithm won't work well (entries tend to be purged prematurely).
324  * The default value should work well for most environments, but this can
325  * also be configurable at compilation time via the
326  * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
327  * 1 due to the assumption of overmem_purge().
328  */
329 #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
330 #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
331 #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
332 #else
333 #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
334 #endif
335 #else
336 #define DEFAULT_CACHE_NODE_LOCK_COUNT   16
337 #endif  /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
338
339 typedef struct {
340         nodelock_t                      lock;
341         /* Protected in the refcount routines. */
342         isc_refcount_t                  references;
343         /* Locked by lock. */
344         isc_boolean_t                   exiting;
345 } rbtdb_nodelock_t;
346
347 typedef struct rbtdb_changed {
348         dns_rbtnode_t *                 node;
349         isc_boolean_t                   dirty;
350         ISC_LINK(struct rbtdb_changed)  link;
351 } rbtdb_changed_t;
352
353 typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
354
355 typedef struct rbtdb_version {
356         /* Not locked */
357         rbtdb_serial_t                  serial;
358         /*
359          * Protected in the refcount routines.
360          * XXXJT: should we change the lock policy based on the refcount
361          * performance?
362          */
363         isc_refcount_t                  references;
364         /* Locked by database lock. */
365         isc_boolean_t                   writer;
366         isc_boolean_t                   commit_ok;
367         rbtdb_changedlist_t             changed_list;
368         ISC_LINK(struct rbtdb_version)  link;
369 } rbtdb_version_t;
370
371 typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;
372
373 typedef struct {
374         /* Unlocked. */
375         dns_db_t                        common;
376 #if DNS_RBTDB_USERWLOCK
377         isc_rwlock_t                    lock;
378 #else
379         isc_mutex_t                     lock;
380 #endif
381         isc_rwlock_t                    tree_lock;
382         unsigned int                    node_lock_count;
383         rbtdb_nodelock_t *              node_locks;
384         dns_rbtnode_t *                 origin_node;
385         dns_stats_t *                   rrsetstats; /* cache DB only */
386         /* Locked by lock. */
387         unsigned int                    active;
388         isc_refcount_t                  references;
389         unsigned int                    attributes;
390         rbtdb_serial_t                  current_serial;
391         rbtdb_serial_t                  least_serial;
392         rbtdb_serial_t                  next_serial;
393         rbtdb_version_t *               current_version;
394         rbtdb_version_t *               future_version;
395         rbtdb_versionlist_t             open_versions;
396         isc_boolean_t                   overmem;
397         isc_task_t *                    task;
398         dns_dbnode_t                    *soanode;
399         dns_dbnode_t                    *nsnode;
400
401         /*
402          * This is a linked list used to implement the LRU cache.  There will
403          * be node_lock_count linked lists here.  Nodes in bucket 1 will be
404          * placed on the linked list rdatasets[1].
405          */
406         rdatasetheaderlist_t            *rdatasets;
407
408         /*%
409          * Temporary storage for stale cache nodes and dynamically deleted
410          * nodes that await being cleaned up.
411          */
412         rbtnodelist_t                   *deadnodes;
413
414         /*
415          * Heaps.  Each of these is used for TTL based expiry.
416          */
417         isc_heap_t                      **heaps;
418
419         /* Locked by tree_lock. */
420         dns_rbt_t *                     tree;
421         isc_boolean_t                   secure;
422
423         /* Unlocked */
424         unsigned int                    quantum;
425 } dns_rbtdb_t;
426
427 #define RBTDB_ATTR_LOADED               0x01
428 #define RBTDB_ATTR_LOADING              0x02
429
430 /*%
431  * Search Context
432  */
433 typedef struct {
434         dns_rbtdb_t *           rbtdb;
435         rbtdb_version_t *       rbtversion;
436         rbtdb_serial_t          serial;
437         unsigned int            options;
438         dns_rbtnodechain_t      chain;
439         isc_boolean_t           copy_name;
440         isc_boolean_t           need_cleanup;
441         isc_boolean_t           wild;
442         dns_rbtnode_t *         zonecut;
443         rdatasetheader_t *      zonecut_rdataset;
444         rdatasetheader_t *      zonecut_sigrdataset;
445         dns_fixedname_t         zonecut_name;
446         isc_stdtime_t           now;
447 } rbtdb_search_t;
448
449 /*%
450  * Load Context
451  */
452 typedef struct {
453         dns_rbtdb_t *           rbtdb;
454         isc_stdtime_t           now;
455 } rbtdb_load_t;
456
457 static void rdataset_disassociate(dns_rdataset_t *rdataset);
458 static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
459 static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
460 static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
461 static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
462 static unsigned int rdataset_count(dns_rdataset_t *rdataset);
463 static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
464                                         dns_name_t *name,
465                                         dns_rdataset_t *nsec,
466                                         dns_rdataset_t *nsecsig);
467 static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
468                                            dns_rdatasetadditional_t type,
469                                            dns_rdatatype_t qtype,
470                                            dns_acache_t *acache,
471                                            dns_zone_t **zonep,
472                                            dns_db_t **dbp,
473                                            dns_dbversion_t **versionp,
474                                            dns_dbnode_t **nodep,
475                                            dns_name_t *fname,
476                                            dns_message_t *msg,
477                                            isc_stdtime_t now);
478 static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
479                                            dns_rdatasetadditional_t type,
480                                            dns_rdatatype_t qtype,
481                                            dns_acache_t *acache,
482                                            dns_zone_t *zone,
483                                            dns_db_t *db,
484                                            dns_dbversion_t *version,
485                                            dns_dbnode_t *node,
486                                            dns_name_t *fname);
487 static isc_result_t rdataset_putadditional(dns_acache_t *acache,
488                                            dns_rdataset_t *rdataset,
489                                            dns_rdatasetadditional_t type,
490                                            dns_rdatatype_t qtype);
491 static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
492                                               isc_stdtime_t now);
493 static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
494                           isc_stdtime_t now);
495 static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
496                           isc_boolean_t tree_locked);
497 static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
498                           isc_stdtime_t now, isc_boolean_t tree_locked);
499 static void prune_tree(isc_task_t *task, isc_event_t *event);
500
501 static dns_rdatasetmethods_t rdataset_methods = {
502         rdataset_disassociate,
503         rdataset_first,
504         rdataset_next,
505         rdataset_current,
506         rdataset_clone,
507         rdataset_count,
508         NULL,
509         rdataset_getnoqname,
510         rdataset_getadditional,
511         rdataset_setadditional,
512         rdataset_putadditional
513 };
514
515 static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
516 static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
517 static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
518 static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
519                                  dns_rdataset_t *rdataset);
520
521 static dns_rdatasetitermethods_t rdatasetiter_methods = {
522         rdatasetiter_destroy,
523         rdatasetiter_first,
524         rdatasetiter_next,
525         rdatasetiter_current
526 };
527
528 typedef struct rbtdb_rdatasetiter {
529         dns_rdatasetiter_t              common;
530         rdatasetheader_t *              current;
531 } rbtdb_rdatasetiter_t;
532
533 static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
534 static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
535 static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
536 static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
537                                         dns_name_t *name);
538 static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
539 static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
540 static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
541                                            dns_dbnode_t **nodep,
542                                            dns_name_t *name);
543 static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
544 static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
545                                           dns_name_t *name);
546
547 static dns_dbiteratormethods_t dbiterator_methods = {
548         dbiterator_destroy,
549         dbiterator_first,
550         dbiterator_last,
551         dbiterator_seek,
552         dbiterator_prev,
553         dbiterator_next,
554         dbiterator_current,
555         dbiterator_pause,
556         dbiterator_origin
557 };
558
559 #define DELETION_BATCH_MAX 64
560
561 /*
562  * If 'paused' is ISC_TRUE, then the tree lock is not being held.
563  */
564 typedef struct rbtdb_dbiterator {
565         dns_dbiterator_t                common;
566         isc_boolean_t                   paused;
567         isc_boolean_t                   new_origin;
568         isc_rwlocktype_t                tree_locked;
569         isc_result_t                    result;
570         dns_fixedname_t                 name;
571         dns_fixedname_t                 origin;
572         dns_rbtnodechain_t              chain;
573         dns_rbtnode_t                   *node;
574         dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
575         int                             delete;
576 } rbtdb_dbiterator_t;
577
578
579 #define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
580 #define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
581
582 static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
583                        isc_event_t *event);
584 static void overmem(dns_db_t *db, isc_boolean_t overmem);
585
586 /*%
587  * 'init_count' is used to initialize 'newheader->count' which inturn
588  * is used to determine where in the cycle rrset-order cyclic starts.
589  * We don't lock this as we don't care about simultaneous updates.
590  *
591  * Note:
592  *      Both init_count and header->count can be ISC_UINT32_MAX.
593  *      The count on the returned rdataset however can't be as
594  *      that indicates that the database does not implement cyclic
595  *      processing.
596  */
597 static unsigned int init_count;
598
599 /*
600  * Locking
601  *
602  * If a routine is going to lock more than one lock in this module, then
603  * the locking must be done in the following order:
604  *
605  *      Tree Lock
606  *
607  *      Node Lock       (Only one from the set may be locked at one time by
608  *                       any caller)
609  *
610  *      Database Lock
611  *
612  * Failure to follow this hierarchy can result in deadlock.
613  */
614
615 /*
616  * Deleting Nodes
617  *
618  * For zone databases the node for the origin of the zone MUST NOT be deleted.
619  */
620
621
622 /*
623  * DB Routines
624  */
625
626 static void
627 attach(dns_db_t *source, dns_db_t **targetp) {
628         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
629
630         REQUIRE(VALID_RBTDB(rbtdb));
631
632         isc_refcount_increment(&rbtdb->references, NULL);
633
634         *targetp = source;
635 }
636
637 static void
638 free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
639         dns_rbtdb_t *rbtdb = event->ev_arg;
640
641         UNUSED(task);
642
643         free_rbtdb(rbtdb, ISC_TRUE, event);
644 }
645
646 static void
647 update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
648                   isc_boolean_t increment)
649 {
650         dns_rdatastatstype_t statattributes = 0;
651         dns_rdatastatstype_t base = 0;
652         dns_rdatastatstype_t type;
653
654         /* At the moment we count statistics only for cache DB */
655         INSIST(IS_CACHE(rbtdb));
656
657         if (NXDOMAIN(header))
658                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
659         else if (RBTDB_RDATATYPE_BASE(header->type) == 0) {
660                 statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
661                 base = RBTDB_RDATATYPE_EXT(header->type);
662         } else
663                 base = RBTDB_RDATATYPE_BASE(header->type);
664
665         type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
666         if (increment)
667                 dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
668         else
669                 dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
670 }
671
672 static void
673 set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
674         int idx;
675         isc_heap_t *heap;
676         dns_ttl_t oldttl;
677
678         oldttl = header->rdh_ttl;
679         header->rdh_ttl = newttl;
680
681         /*
682          * It's possible the rbtdb is not a cache.  If this is the case,
683          * we will not have a heap, and we move on.  If we do, though,
684          * we might need to adjust things.
685          */
686         if (header->heap_index == 0 || newttl == oldttl)
687                 return;
688         idx = header->node->locknum;
689         if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
690             return;
691         heap = rbtdb->heaps[idx];
692
693         if (newttl < oldttl)
694                 isc_heap_increased(heap, header->heap_index);
695         else
696                 isc_heap_decreased(heap, header->heap_index);
697 }
698
699 /*%
700  * This function allows the heap code to rank the priority of each
701  * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
702  */
703 static isc_boolean_t
704 ttl_sooner(void *v1, void *v2) {
705         rdatasetheader_t *h1 = v1;
706         rdatasetheader_t *h2 = v2;
707
708         if (h1->rdh_ttl < h2->rdh_ttl)
709                 return (ISC_TRUE);
710         return (ISC_FALSE);
711 }
712
713 /*%
714  * This function sets the heap index into the header.
715  */
716 static void
717 ttl_set_index(void *what, unsigned int index) {
718         rdatasetheader_t *h = what;
719
720         h->heap_index = index;
721 }
722
723 /*%
724  * Work out how many nodes can be deleted in the time between two
725  * requests to the nameserver.  Smooth the resulting number and use it
726  * as a estimate for the number of nodes to be deleted in the next
727  * iteration.
728  */
729 static unsigned int
730 adjust_quantum(unsigned int old, isc_time_t *start) {
731         unsigned int pps = dns_pps;     /* packets per second */
732         unsigned int interval;
733         isc_uint64_t usecs;
734         isc_time_t end;
735         unsigned int new;
736
737         if (pps < 100)
738                 pps = 100;
739         isc_time_now(&end);
740
741         interval = 1000000 / pps;       /* interval in usec */
742         if (interval == 0)
743                 interval = 1;
744         usecs = isc_time_microdiff(&end, start);
745         if (usecs == 0) {
746                 /*
747                  * We were unable to measure the amount of time taken.
748                  * Double the nodes deleted next time.
749                  */
750                 old *= 2;
751                 if (old > 1000)
752                         old = 1000;
753                 return (old);
754         }
755         new = old * interval;
756         new /= (unsigned int)usecs;
757         if (new == 0)
758                 new = 1;
759         else if (new > 1000)
760                 new = 1000;
761
762         /* Smooth */
763         new = (new + old * 3) / 4;
764
765         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
766                       ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
767
768         return (new);
769 }
770
771 static void
772 free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
773         unsigned int i;
774         isc_ondestroy_t ondest;
775         isc_result_t result;
776         char buf[DNS_NAME_FORMATSIZE];
777         isc_time_t start;
778
779         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
780                 overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);
781
782         REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
783         REQUIRE(rbtdb->future_version == NULL);
784
785         if (rbtdb->current_version != NULL) {
786                 unsigned int refs;
787
788                 isc_refcount_decrement(&rbtdb->current_version->references,
789                                        &refs);
790                 INSIST(refs == 0);
791                 UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
792                 isc_refcount_destroy(&rbtdb->current_version->references);
793                 isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
794                             sizeof(rbtdb_version_t));
795         }
796
797         /*
798          * We assume the number of remaining dead nodes is reasonably small;
799          * the overhead of unlinking all nodes here should be negligible.
800          */
801         for (i = 0; i < rbtdb->node_lock_count; i++) {
802                 dns_rbtnode_t *node;
803
804                 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
805                 while (node != NULL) {
806                         ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
807                         node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
808                 }
809         }
810
811         if (event == NULL)
812                 rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
813  again:
814         if (rbtdb->tree != NULL) {
815                 isc_time_now(&start);
816                 result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
817                 if (result == ISC_R_QUOTA) {
818                         INSIST(rbtdb->task != NULL);
819                         if (rbtdb->quantum != 0)
820                                 rbtdb->quantum = adjust_quantum(rbtdb->quantum,
821                                                                 &start);
822                         if (event == NULL)
823                                 event = isc_event_allocate(rbtdb->common.mctx,
824                                                            NULL,
825                                                          DNS_EVENT_FREESTORAGE,
826                                                            free_rbtdb_callback,
827                                                            rbtdb,
828                                                            sizeof(isc_event_t));
829                         if (event == NULL)
830                                 goto again;
831                         isc_task_send(rbtdb->task, &event);
832                         return;
833                 }
834                 INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
835         }
836         if (event != NULL)
837                 isc_event_free(&event);
838         if (log) {
839                 if (dns_name_dynamic(&rbtdb->common.origin))
840                         dns_name_format(&rbtdb->common.origin, buf,
841                                         sizeof(buf));
842                 else
843                         strcpy(buf, "<UNKNOWN>");
844                 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
845                               DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
846                               "done free_rbtdb(%s)", buf);
847         }
848         if (dns_name_dynamic(&rbtdb->common.origin))
849                 dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
850         for (i = 0; i < rbtdb->node_lock_count; i++) {
851                 isc_refcount_destroy(&rbtdb->node_locks[i].references);
852                 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
853         }
854
855         /*
856          * Clean up LRU cache objects.
857          */
858         if (rbtdb->rdatasets != NULL) {
859                 for (i = 0; i < rbtdb->node_lock_count; i++)
860                         INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
861                 isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
862                             rbtdb->node_lock_count *
863                             sizeof(rdatasetheaderlist_t));
864         }
865         /*
866          * Clean up dead node buckets.
867          */
868         if (rbtdb->deadnodes != NULL) {
869                 for (i = 0; i < rbtdb->node_lock_count; i++)
870                         INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
871                 isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
872                     rbtdb->node_lock_count * sizeof(rbtnodelist_t));
873         }
874         /*
875          * Clean up TTL heap cache objects.
876          */
877         if (rbtdb->heaps != NULL) {
878                 for (i = 0; i < rbtdb->node_lock_count; i++)
879                         isc_heap_destroy(&rbtdb->heaps[i]);
880                 isc_mem_put(rbtdb->common.mctx, rbtdb->heaps,
881                             rbtdb->node_lock_count *
882                             sizeof(isc_heap_t *));
883         }
884
885         if (rbtdb->rrsetstats != NULL)
886                 dns_stats_detach(&rbtdb->rrsetstats);
887
888         isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
889                     rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
890         isc_rwlock_destroy(&rbtdb->tree_lock);
891         isc_refcount_destroy(&rbtdb->references);
892         if (rbtdb->task != NULL)
893                 isc_task_detach(&rbtdb->task);
894
895         RBTDB_DESTROYLOCK(&rbtdb->lock);
896         rbtdb->common.magic = 0;
897         rbtdb->common.impmagic = 0;
898         ondest = rbtdb->common.ondest;
899         isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
900         isc_ondestroy_notify(&ondest, rbtdb);
901 }
902
903 static inline void
904 maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
905         isc_boolean_t want_free = ISC_FALSE;
906         unsigned int i;
907         unsigned int inactive = 0;
908
909         /* XXX check for open versions here */
910
911         if (rbtdb->soanode != NULL)
912                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
913         if (rbtdb->nsnode != NULL)
914                 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
915
916         /*
917          * Even though there are no external direct references, there still
918          * may be nodes in use.
919          */
920         for (i = 0; i < rbtdb->node_lock_count; i++) {
921                 NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
922                 rbtdb->node_locks[i].exiting = ISC_TRUE;
923                 NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
924                 if (isc_refcount_current(&rbtdb->node_locks[i].references)
925                     == 0) {
926                         inactive++;
927                 }
928         }
929
930         if (inactive != 0) {
931                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
932                 rbtdb->active -= inactive;
933                 if (rbtdb->active == 0)
934                         want_free = ISC_TRUE;
935                 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
936                 if (want_free) {
937                         char buf[DNS_NAME_FORMATSIZE];
938                         if (dns_name_dynamic(&rbtdb->common.origin))
939                                 dns_name_format(&rbtdb->common.origin, buf,
940                                                 sizeof(buf));
941                         else
942                                 strcpy(buf, "<UNKNOWN>");
943                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
944                                       DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
945                                       "calling free_rbtdb(%s)", buf);
946                         free_rbtdb(rbtdb, ISC_TRUE, NULL);
947                 }
948         }
949 }
950
951 static void
952 detach(dns_db_t **dbp) {
953         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
954         unsigned int refs;
955
956         REQUIRE(VALID_RBTDB(rbtdb));
957
958         isc_refcount_decrement(&rbtdb->references, &refs);
959
960         if (refs == 0)
961                 maybe_free_rbtdb(rbtdb);
962
963         *dbp = NULL;
964 }
965
966 static void
967 currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
968         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
969         rbtdb_version_t *version;
970         unsigned int refs;
971
972         REQUIRE(VALID_RBTDB(rbtdb));
973
974         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
975         version = rbtdb->current_version;
976         isc_refcount_increment(&version->references, &refs);
977         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
978
979         *versionp = (dns_dbversion_t *)version;
980 }
981
982 static inline rbtdb_version_t *
983 allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
984                  unsigned int references, isc_boolean_t writer)
985 {
986         isc_result_t result;
987         rbtdb_version_t *version;
988
989         version = isc_mem_get(mctx, sizeof(*version));
990         if (version == NULL)
991                 return (NULL);
992         version->serial = serial;
993         result = isc_refcount_init(&version->references, references);
994         if (result != ISC_R_SUCCESS) {
995                 isc_mem_put(mctx, version, sizeof(*version));
996                 return (NULL);
997         }
998         version->writer = writer;
999         version->commit_ok = ISC_FALSE;
1000         ISC_LIST_INIT(version->changed_list);
1001         ISC_LINK_INIT(version, link);
1002
1003         return (version);
1004 }
1005
1006 static isc_result_t
1007 newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1008         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1009         rbtdb_version_t *version;
1010
1011         REQUIRE(VALID_RBTDB(rbtdb));
1012         REQUIRE(versionp != NULL && *versionp == NULL);
1013         REQUIRE(rbtdb->future_version == NULL);
1014
1015         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1016         RUNTIME_CHECK(rbtdb->next_serial != 0);         /* XXX Error? */
1017         version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1018                                    ISC_TRUE);
1019         if (version != NULL) {
1020                 version->commit_ok = ISC_TRUE;
1021                 rbtdb->next_serial++;
1022                 rbtdb->future_version = version;
1023         }
1024         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1025
1026         if (version == NULL)
1027                 return (ISC_R_NOMEMORY);
1028
1029         *versionp = version;
1030
1031         return (ISC_R_SUCCESS);
1032 }
1033
1034 static void
1035 attachversion(dns_db_t *db, dns_dbversion_t *source,
1036               dns_dbversion_t **targetp)
1037 {
1038         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1039         rbtdb_version_t *rbtversion = source;
1040         unsigned int refs;
1041
1042         REQUIRE(VALID_RBTDB(rbtdb));
1043
1044         isc_refcount_increment(&rbtversion->references, &refs);
1045         INSIST(refs > 1);
1046
1047         *targetp = rbtversion;
1048 }
1049
1050 static rbtdb_changed_t *
1051 add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1052             dns_rbtnode_t *node)
1053 {
1054         rbtdb_changed_t *changed;
1055         unsigned int refs;
1056
1057         /*
1058          * Caller must be holding the node lock if its reference must be
1059          * protected by the lock.
1060          */
1061
1062         changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1063
1064         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1065
1066         REQUIRE(version->writer);
1067
1068         if (changed != NULL) {
1069                 dns_rbtnode_refincrement(node, &refs);
1070                 INSIST(refs != 0);
1071                 changed->node = node;
1072                 changed->dirty = ISC_FALSE;
1073                 ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1074         } else
1075                 version->commit_ok = ISC_FALSE;
1076
1077         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1078
1079         return (changed);
1080 }
1081
1082 static void
1083 free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
1084                  acachectl_t *array)
1085 {
1086         unsigned int count;
1087         unsigned int i;
1088         unsigned char *raw;     /* RDATASLAB */
1089
1090         /*
1091          * The caller must be holding the corresponding node lock.
1092          */
1093
1094         if (array == NULL)
1095                 return;
1096
1097         raw = (unsigned char *)header + sizeof(*header);
1098         count = raw[0] * 256 + raw[1];
1099
1100         /*
1101          * Sanity check: since an additional cache entry has a reference to
1102          * the original DB node (in the callback arg), there should be no
1103          * acache entries when the node can be freed.
1104          */
1105         for (i = 0; i < count; i++)
1106                 INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1107
1108         isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1109 }
1110
1111 static inline void
1112 free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1113
1114         if (dns_name_dynamic(&(*noqname)->name))
1115                 dns_name_free(&(*noqname)->name, mctx);
1116         if ((*noqname)->nsec != NULL)
1117                 isc_mem_put(mctx, (*noqname)->nsec,
1118                             dns_rdataslab_size((*noqname)->nsec, 0));
1119         if ((*noqname)->nsecsig != NULL)
1120                 isc_mem_put(mctx, (*noqname)->nsecsig,
1121                             dns_rdataslab_size((*noqname)->nsecsig, 0));
1122         isc_mem_put(mctx, *noqname, sizeof(**noqname));
1123         *noqname = NULL;
1124 }
1125
1126 static inline void
1127 init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
1128 {
1129         ISC_LINK_INIT(h, lru_link);
1130         h->heap_index = 0;
1131
1132 #if TRACE_HEADER
1133         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1134                 fprintf(stderr, "initialized header: %p\n", h);
1135 #else
1136         UNUSED(rbtdb);
1137 #endif
1138 }
1139
1140 static inline rdatasetheader_t *
1141 new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
1142 {
1143         rdatasetheader_t *h;
1144
1145         h = isc_mem_get(mctx, sizeof(*h));
1146         if (h == NULL)
1147                 return (NULL);
1148
1149 #if TRACE_HEADER
1150         if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1151                 fprintf(stderr, "allocated header: %p\n", h);
1152 #endif
1153         init_rdataset(rbtdb, h);
1154         return (h);
1155 }
1156
1157 static inline void
1158 free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
1159 {
1160         unsigned int size;
1161
1162         if (EXISTS(rdataset) &&
1163             (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
1164                 update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
1165         }
1166
1167         if (IS_CACHE(rbtdb) && ISC_LINK_LINKED(rdataset, lru_link)) {
1168                 int idx = rdataset->node->locknum;
1169                 ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link);
1170                 if (rdataset->heap_index != 0) {
1171                         isc_heap_delete(rbtdb->heaps[idx],
1172                                         rdataset->heap_index);
1173                 }
1174                 rdataset->heap_index = 0;
1175         }
1176
1177         if (rdataset->noqname != NULL)
1178                 free_noqname(mctx, &rdataset->noqname);
1179
1180         free_acachearray(mctx, rdataset, rdataset->additional_auth);
1181         free_acachearray(mctx, rdataset, rdataset->additional_glue);
1182
1183         if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
1184                 size = sizeof(*rdataset);
1185         else
1186                 size = dns_rdataslab_size((unsigned char *)rdataset,
1187                                           sizeof(*rdataset));
1188         isc_mem_put(mctx, rdataset, size);
1189 }
1190
1191 static inline void
1192 rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1193         rdatasetheader_t *header, *dcurrent;
1194         isc_boolean_t make_dirty = ISC_FALSE;
1195
1196         /*
1197          * Caller must hold the node lock.
1198          */
1199
1200         /*
1201          * We set the IGNORE attribute on rdatasets with serial number
1202          * 'serial'.  When the reference count goes to zero, these rdatasets
1203          * will be cleaned up; until that time, they will be ignored.
1204          */
1205         for (header = node->data; header != NULL; header = header->next) {
1206                 if (header->serial == serial) {
1207                         header->attributes |= RDATASET_ATTR_IGNORE;
1208                         make_dirty = ISC_TRUE;
1209                 }
1210                 for (dcurrent = header->down;
1211                      dcurrent != NULL;
1212                      dcurrent = dcurrent->down) {
1213                         if (dcurrent->serial == serial) {
1214                                 dcurrent->attributes |= RDATASET_ATTR_IGNORE;
1215                                 make_dirty = ISC_TRUE;
1216                         }
1217                 }
1218         }
1219         if (make_dirty)
1220                 node->dirty = 1;
1221 }
1222
1223 static inline void
1224 clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
1225 {
1226         rdatasetheader_t *d, *down_next;
1227
1228         for (d = top->down; d != NULL; d = down_next) {
1229                 down_next = d->down;
1230                 free_rdataset(rbtdb, mctx, d);
1231         }
1232         top->down = NULL;
1233 }
1234
1235 static inline void
1236 clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1237         rdatasetheader_t *current, *top_prev, *top_next;
1238         isc_mem_t *mctx = rbtdb->common.mctx;
1239
1240         /*
1241          * Caller must be holding the node lock.
1242          */
1243
1244         top_prev = NULL;
1245         for (current = node->data; current != NULL; current = top_next) {
1246                 top_next = current->next;
1247                 clean_stale_headers(rbtdb, mctx, current);
1248                 /*
1249                  * If current is nonexistent or stale, we can clean it up.
1250                  */
1251                 if ((current->attributes &
1252                      (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
1253                         if (top_prev != NULL)
1254                                 top_prev->next = current->next;
1255                         else
1256                                 node->data = current->next;
1257                         free_rdataset(rbtdb, mctx, current);
1258                 } else
1259                         top_prev = current;
1260         }
1261         node->dirty = 0;
1262 }
1263
1264 static inline void
1265 clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1266                 rbtdb_serial_t least_serial)
1267 {
1268         rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1269         rdatasetheader_t *top_prev, *top_next;
1270         isc_mem_t *mctx = rbtdb->common.mctx;
1271         isc_boolean_t still_dirty = ISC_FALSE;
1272
1273         /*
1274          * Caller must be holding the node lock.
1275          */
1276         REQUIRE(least_serial != 0);
1277
1278         top_prev = NULL;
1279         for (current = node->data; current != NULL; current = top_next) {
1280                 top_next = current->next;
1281
1282                 /*
1283                  * First, we clean up any instances of multiple rdatasets
1284                  * with the same serial number, or that have the IGNORE
1285                  * attribute.
1286                  */
1287                 dparent = current;
1288                 for (dcurrent = current->down;
1289                      dcurrent != NULL;
1290                      dcurrent = down_next) {
1291                         down_next = dcurrent->down;
1292                         INSIST(dcurrent->serial <= dparent->serial);
1293                         if (dcurrent->serial == dparent->serial ||
1294                             IGNORE(dcurrent)) {
1295                                 if (down_next != NULL)
1296                                         down_next->next = dparent;
1297                                 dparent->down = down_next;
1298                                 free_rdataset(rbtdb, mctx, dcurrent);
1299                         } else
1300                                 dparent = dcurrent;
1301                 }
1302
1303                 /*
1304                  * We've now eliminated all IGNORE datasets with the possible
1305                  * exception of current, which we now check.
1306                  */
1307                 if (IGNORE(current)) {
1308                         down_next = current->down;
1309                         if (down_next == NULL) {
1310                                 if (top_prev != NULL)
1311                                         top_prev->next = current->next;
1312                                 else
1313                                         node->data = current->next;
1314                                 free_rdataset(rbtdb, mctx, current);
1315                                 /*
1316                                  * current no longer exists, so we can
1317                                  * just continue with the loop.
1318                                  */
1319                                 continue;
1320                         } else {
1321                                 /*
1322                                  * Pull up current->down, making it the new
1323                                  * current.
1324                                  */
1325                                 if (top_prev != NULL)
1326                                         top_prev->next = down_next;
1327                                 else
1328                                         node->data = down_next;
1329                                 down_next->next = top_next;
1330                                 free_rdataset(rbtdb, mctx, current);
1331                                 current = down_next;
1332                         }
1333                 }
1334
1335                 /*
1336                  * We now try to find the first down node less than the
1337                  * least serial.
1338                  */
1339                 dparent = current;
1340                 for (dcurrent = current->down;
1341                      dcurrent != NULL;
1342                      dcurrent = down_next) {
1343                         down_next = dcurrent->down;
1344                         if (dcurrent->serial < least_serial)
1345                                 break;
1346                         dparent = dcurrent;
1347                 }
1348
1349                 /*
1350                  * If there is a such an rdataset, delete it and any older
1351                  * versions.
1352                  */
1353                 if (dcurrent != NULL) {
1354                         do {
1355                                 down_next = dcurrent->down;
1356                                 INSIST(dcurrent->serial <= least_serial);
1357                                 free_rdataset(rbtdb, mctx, dcurrent);
1358                                 dcurrent = down_next;
1359                         } while (dcurrent != NULL);
1360                         dparent->down = NULL;
1361                 }
1362
1363                 /*
1364                  * Note.  The serial number of 'current' might be less than
1365                  * least_serial too, but we cannot delete it because it is
1366                  * the most recent version, unless it is a NONEXISTENT
1367                  * rdataset.
1368                  */
1369                 if (current->down != NULL) {
1370                         still_dirty = ISC_TRUE;
1371                         top_prev = current;
1372                 } else {
1373                         /*
1374                          * If this is a NONEXISTENT rdataset, we can delete it.
1375                          */
1376                         if (NONEXISTENT(current)) {
1377                                 if (top_prev != NULL)
1378                                         top_prev->next = current->next;
1379                                 else
1380                                         node->data = current->next;
1381                                 free_rdataset(rbtdb, mctx, current);
1382                         } else
1383                                 top_prev = current;
1384                 }
1385         }
1386         if (!still_dirty)
1387                 node->dirty = 0;
1388 }
1389
1390 /*%
1391  * Clean up dead nodes.  These are nodes which have no references, and
1392  * have no data.  They are dead but we could not or chose not to delete
1393  * them when we deleted all the data at that node because we did not want
1394  * to wait for the tree write lock.
1395  *
1396  * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1397  */
1398 static void
1399 cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1400         dns_rbtnode_t *node;
1401         isc_result_t result;
1402         int count = 10;         /* XXXJT: should be adjustable */
1403
1404         node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1405         while (node != NULL && count > 0) {
1406                 ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1407
1408                 /*
1409                  * Since we're holding a tree write lock, it should be
1410                  * impossible for this node to be referenced by others.
1411                  */
1412                 INSIST(dns_rbtnode_refcurrent(node) == 0 &&
1413                        node->data == NULL);
1414
1415                 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1416                 if (result != ISC_R_SUCCESS)
1417                         isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1418                                       DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1419                                       "cleanup_dead_nodes: "
1420                                       "dns_rbt_deletenode: %s",
1421                                       isc_result_totext(result));
1422                 node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1423                 count--;
1424         }
1425 }
1426
1427 /*
1428  * Caller must be holding the node lock if its reference must be protected
1429  * by the lock.
1430  */
1431 static inline void
1432 new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1433         unsigned int lockrefs, noderefs;
1434         isc_refcount_t *lockref;
1435
1436         dns_rbtnode_refincrement0(node, &noderefs);
1437         if (noderefs == 1) {    /* this is the first reference to the node */
1438                 lockref = &rbtdb->node_locks[node->locknum].references;
1439                 isc_refcount_increment0(lockref, &lockrefs);
1440                 INSIST(lockrefs != 0);
1441         }
1442         INSIST(noderefs != 0);
1443 }
1444
1445 /*
1446  * This function is assumed to be called when a node is newly referenced
1447  * and can be in the deadnode list.  In that case the node must be retrieved
1448  * from the list because it is going to be used.  In addition, if the caller
1449  * happens to hold a write lock on the tree, it's a good chance to purge dead
1450  * nodes.
1451  * Note: while a new reference is gained in multiple places, there are only very
1452  * few cases where the node can be in the deadnode list (only empty nodes can
1453  * have been added to the list).
1454  */
1455 static inline void
1456 reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1457                 isc_rwlocktype_t treelocktype)
1458 {
1459         isc_boolean_t need_relock = ISC_FALSE;
1460
1461         NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
1462         new_reference(rbtdb, node);
1463
1464         NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1465                       isc_rwlocktype_read);
1466         if (ISC_LINK_LINKED(node, deadlink))
1467                 need_relock = ISC_TRUE;
1468         else if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1469                  treelocktype == isc_rwlocktype_write)
1470                 need_relock = ISC_TRUE;
1471         NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1472                         isc_rwlocktype_read);
1473         if (need_relock) {
1474                 NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1475                               isc_rwlocktype_write);
1476                 if (ISC_LINK_LINKED(node, deadlink))
1477                         ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
1478                                         node, deadlink);
1479                 if (treelocktype == isc_rwlocktype_write)
1480                         cleanup_dead_nodes(rbtdb, node->locknum);
1481                 NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1482                                 isc_rwlocktype_write);
1483         }
1484
1485         NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
1486 }
1487
1488 /*
1489  * Caller must be holding the node lock; either the "strong", read or write
1490  * lock.  Note that the lock must be held even when node references are
1491  * atomically modified; in that case the decrement operation itself does not
1492  * have to be protected, but we must avoid a race condition where multiple
1493  * threads are decreasing the reference to zero simultaneously and at least
1494  * one of them is going to free the node.
1495  * This function returns ISC_TRUE if and only if the node reference decreases
1496  * to zero.
1497  */
1498 static isc_boolean_t
1499 decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1500                     rbtdb_serial_t least_serial,
1501                     isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
1502                     isc_boolean_t pruning)
1503 {
1504         isc_result_t result;
1505         isc_boolean_t write_locked;
1506         rbtdb_nodelock_t *nodelock;
1507         unsigned int refs, nrefs;
1508         int bucket = node->locknum;
1509         isc_boolean_t no_reference;
1510
1511         nodelock = &rbtdb->node_locks[bucket];
1512
1513         /* Handle easy and typical case first. */
1514         if (!node->dirty && (node->data != NULL || node->down != NULL)) {
1515                 dns_rbtnode_refdecrement(node, &nrefs);
1516                 INSIST((int)nrefs >= 0);
1517                 if (nrefs == 0) {
1518                         isc_refcount_decrement(&nodelock->references, &refs);
1519                         INSIST((int)refs >= 0);
1520                 }
1521                 return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
1522         }
1523
1524         /* Upgrade the lock? */
1525         if (nlock == isc_rwlocktype_read) {
1526                 NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
1527                 NODE_WEAKLOCK(&nodelock->lock, isc_rwlocktype_write);
1528         }
1529         dns_rbtnode_refdecrement(node, &nrefs);
1530         INSIST((int)nrefs >= 0);
1531         if (nrefs > 0) {
1532                 /* Restore the lock? */
1533                 if (nlock == isc_rwlocktype_read)
1534                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1535                 return (ISC_FALSE);
1536         }
1537
1538         if (node->dirty && dns_rbtnode_refcurrent(node) == 0) {
1539                 if (IS_CACHE(rbtdb))
1540                         clean_cache_node(rbtdb, node);
1541                 else {
1542                         if (least_serial == 0) {
1543                                 /*
1544                                  * Caller doesn't know the least serial.
1545                                  * Get it.
1546                                  */
1547                                 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1548                                 least_serial = rbtdb->least_serial;
1549                                 RBTDB_UNLOCK(&rbtdb->lock,
1550                                              isc_rwlocktype_read);
1551                         }
1552                         clean_zone_node(rbtdb, node, least_serial);
1553                 }
1554         }
1555
1556         isc_refcount_decrement(&nodelock->references, &refs);
1557         INSIST((int)refs >= 0);
1558
1559         /*
1560          * XXXDCL should this only be done for cache zones?
1561          */
1562         if (node->data != NULL || node->down != NULL) {
1563                 /* Restore the lock? */
1564                 if (nlock == isc_rwlocktype_read)
1565                         NODE_WEAKDOWNGRADE(&nodelock->lock);
1566                 return (ISC_TRUE);
1567         }
1568
1569         /*
1570          * Attempt to switch to a write lock on the tree.  If this fails,
1571          * we will add this node to a linked list of nodes in this locking
1572          * bucket which we will free later.
1573          */
1574         if (tlock != isc_rwlocktype_write) {
1575                 /*
1576                  * Locking hierarchy notwithstanding, we don't need to free
1577                  * the node lock before acquiring the tree write lock because
1578                  * we only do a trylock.
1579                  */
1580                 if (tlock == isc_rwlocktype_read)
1581                         result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
1582                 else
1583                         result = isc_rwlock_trylock(&rbtdb->tree_lock,
1584                                                     isc_rwlocktype_write);
1585                 RUNTIME_CHECK(result == ISC_R_SUCCESS ||
1586                               result == ISC_R_LOCKBUSY);
1587
1588                 write_locked = ISC_TF(result == ISC_R_SUCCESS);
1589         } else
1590                 write_locked = ISC_TRUE;
1591
1592         no_reference = ISC_TRUE;
1593         if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
1594                 /*
1595                  * We can now delete the node if the reference counter is
1596                  * zero.  This should be typically the case, but a different
1597                  * thread may still gain a (new) reference just before the
1598                  * current thread locks the tree (e.g., in findnode()).
1599                  */
1600
1601                 /*
1602                  * If this node is the only one in the level it's in, deleting
1603                  * this node may recursively make its parent the only node in
1604                  * the parent level; if so, and if no one is currently using
1605                  * the parent node, this is almost the only opportunity to
1606                  * clean it up.  But the recursive cleanup is not that trivial
1607                  * since the child and parent may be in different lock buckets,
1608                  * which would cause a lock order reversal problem.  To avoid
1609                  * the trouble, we'll dispatch a separate event for batch
1610                  * cleaning.  We need to check whether we're deleting the node
1611                  * as a result of pruning to avoid infinite dispatching.
1612                  * Note: pruning happens only when a task has been set for the
1613                  * rbtdb.  If the user of the rbtdb chooses not to set a task,
1614                  * it's their responsibility to purge stale leaves (e.g. by
1615                  * periodic walk-through).
1616                  */
1617                 if (!pruning && node->parent != NULL &&
1618                     node->parent->down == node && node->left == NULL &&
1619                     node->right == NULL && rbtdb->task != NULL) {
1620                         isc_event_t *ev;
1621                         dns_db_t *db;
1622
1623                         ev = isc_event_allocate(rbtdb->common.mctx, NULL,
1624                                                 DNS_EVENT_RBTPRUNE,
1625                                                 prune_tree, node,
1626                                                 sizeof(isc_event_t));
1627                         if (ev != NULL) {
1628                                 new_reference(rbtdb, node);
1629                                 db = NULL;
1630                                 attach((dns_db_t *)rbtdb, &db);
1631                                 ev->ev_sender = db;
1632                                 isc_task_send(rbtdb->task, &ev);
1633                                 no_reference = ISC_FALSE;
1634                         } else {
1635                                 /*
1636                                  * XXX: this is a weird situation.  We could
1637                                  * ignore this error case, but then the stale
1638                                  * node will unlikely be purged except via a
1639                                  * rare condition such as manual cleanup.  So
1640                                  * we queue it in the deadnodes list, hoping
1641                                  * the memory shortage is temporary and the node
1642                                  * will be deleted later.
1643                                  */
1644                                 isc_log_write(dns_lctx,
1645                                               DNS_LOGCATEGORY_DATABASE,
1646                                               DNS_LOGMODULE_CACHE,
1647                                               ISC_LOG_INFO,
1648                                               "decrement_reference: failed to "
1649                                               "allocate pruning event");
1650                                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1651                                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
1652                                                 deadlink);
1653                         }
1654                 } else {
1655                         if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1656                                 char printname[DNS_NAME_FORMATSIZE];
1657
1658                                 isc_log_write(dns_lctx,
1659                                               DNS_LOGCATEGORY_DATABASE,
1660                                               DNS_LOGMODULE_CACHE,
1661                                               ISC_LOG_DEBUG(1),
1662                                               "decrement_reference: "
1663                                               "delete from rbt: %p %s",
1664                                               node,
1665                                               dns_rbt_formatnodename(node,
1666                                                         printname,
1667                                                         sizeof(printname)));
1668                         }
1669
1670                         INSIST(!ISC_LINK_LINKED(node, deadlink));
1671                         result = dns_rbt_deletenode(rbtdb->tree, node,
1672                                                     ISC_FALSE);
1673                         if (result != ISC_R_SUCCESS) {
1674                                 isc_log_write(dns_lctx,
1675                                               DNS_LOGCATEGORY_DATABASE,
1676                                               DNS_LOGMODULE_CACHE,
1677                                               ISC_LOG_WARNING,
1678                                               "decrement_reference: "
1679                                               "dns_rbt_deletenode: %s",
1680                                               isc_result_totext(result));
1681                         }
1682                 }
1683         } else if (dns_rbtnode_refcurrent(node) == 0) {
1684                 INSIST(!ISC_LINK_LINKED(node, deadlink));
1685                 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
1686         }
1687
1688         /* Restore the lock? */
1689         if (nlock == isc_rwlocktype_read)
1690                 NODE_WEAKDOWNGRADE(&nodelock->lock);
1691
1692         /*
1693          * Relock a read lock, or unlock the write lock if no lock was held.
1694          */
1695         if (tlock == isc_rwlocktype_none)
1696                 if (write_locked)
1697                         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1698
1699         if (tlock == isc_rwlocktype_read)
1700                 if (write_locked)
1701                         isc_rwlock_downgrade(&rbtdb->tree_lock);
1702
1703         return (no_reference);
1704 }
1705
1706 /*
1707  * Prune the tree by recursively cleaning-up single leaves.  In the worst
1708  * case, the number of iteration is the number of tree levels, which is at
1709  * most the maximum number of domain name labels, i.e, 127.  In practice, this
1710  * should be much smaller (only a few times), and even the worst case would be
1711  * acceptable for a single event.
1712  */
1713 static void
1714 prune_tree(isc_task_t *task, isc_event_t *event) {
1715         dns_rbtdb_t *rbtdb = event->ev_sender;
1716         dns_rbtnode_t *node = event->ev_arg;
1717         dns_rbtnode_t *parent;
1718         unsigned int locknum;
1719
1720         UNUSED(task);
1721
1722         isc_event_free(&event);
1723
1724         RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1725         locknum = node->locknum;
1726         NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1727         do {
1728                 parent = node->parent;
1729                 decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
1730                                     isc_rwlocktype_write, ISC_TRUE);
1731
1732                 if (parent != NULL && parent->down == NULL) {
1733                         /*
1734                          * node was the only down child of the parent and has
1735                          * just been removed.  We'll then need to examine the
1736                          * parent.  Keep the lock if possible; otherwise,
1737                          * release the old lock and acquire one for the parent.
1738                          */
1739                         if (parent->locknum != locknum) {
1740                                 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
1741                                             isc_rwlocktype_write);
1742                                 locknum = parent->locknum;
1743                                 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
1744                                           isc_rwlocktype_write);
1745                         }
1746
1747                         /*
1748                          * We need to gain a reference to the node before
1749                          * decrementing it in the next iteration.  In addition,
1750                          * if the node is in the dead-nodes list, extract it
1751                          * from the list beforehand as we do in
1752                          * reactivate_node().
1753                          */
1754                         new_reference(rbtdb, parent);
1755                         if (ISC_LINK_LINKED(parent, deadlink)) {
1756                                 ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
1757                                                 parent, deadlink);
1758                         }
1759                 } else
1760                         parent = NULL;
1761
1762                 node = parent;
1763         } while (node != NULL);
1764         NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1765         RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1766
1767         detach((dns_db_t **)&rbtdb);
1768 }
1769
1770 static inline void
1771 make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1772                    rbtdb_changedlist_t *cleanup_list)
1773 {
1774         /*
1775          * Caller must be holding the database lock.
1776          */
1777
1778         rbtdb->least_serial = version->serial;
1779         *cleanup_list = version->changed_list;
1780         ISC_LIST_INIT(version->changed_list);
1781 }
1782
1783 static inline void
1784 cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
1785         rbtdb_changed_t *changed, *next_changed;
1786
1787         /*
1788          * If the changed record is dirty, then
1789          * an update created multiple versions of
1790          * a given rdataset.  We keep this list
1791          * until we're the least open version, at
1792          * which point it's safe to get rid of any
1793          * older versions.
1794          *
1795          * If the changed record isn't dirty, then
1796          * we don't need it anymore since we're
1797          * committing and not rolling back.
1798          *
1799          * The caller must be holding the database lock.
1800          */
1801         for (changed = HEAD(version->changed_list);
1802              changed != NULL;
1803              changed = next_changed) {
1804                 next_changed = NEXT(changed, link);
1805                 if (!changed->dirty) {
1806                         UNLINK(version->changed_list,
1807                                changed, link);
1808                         APPEND(*cleanup_list,
1809                                changed, link);
1810                 }
1811         }
1812 }
1813
1814 static isc_boolean_t
1815 iszonesecure(dns_db_t *db, dns_dbnode_t *origin) {
1816         dns_rdataset_t keyset;
1817         dns_rdataset_t nsecset, signsecset;
1818         isc_boolean_t haszonekey = ISC_FALSE;
1819         isc_boolean_t hasnsec = ISC_FALSE;
1820         isc_result_t result;
1821
1822         dns_rdataset_init(&keyset);
1823         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_dnskey, 0,
1824                                      0, &keyset, NULL);
1825         if (result == ISC_R_SUCCESS) {
1826                 dns_rdata_t keyrdata = DNS_RDATA_INIT;
1827                 result = dns_rdataset_first(&keyset);
1828                 while (result == ISC_R_SUCCESS) {
1829                         dns_rdataset_current(&keyset, &keyrdata);
1830                         if (dns_zonekey_iszonekey(&keyrdata)) {
1831                                 haszonekey = ISC_TRUE;
1832                                 break;
1833                         }
1834                         result = dns_rdataset_next(&keyset);
1835                 }
1836                 dns_rdataset_disassociate(&keyset);
1837         }
1838         if (!haszonekey)
1839                 return (ISC_FALSE);
1840
1841         dns_rdataset_init(&nsecset);
1842         dns_rdataset_init(&signsecset);
1843         result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_nsec, 0,
1844                                      0, &nsecset, &signsecset);
1845         if (result == ISC_R_SUCCESS) {
1846                 if (dns_rdataset_isassociated(&signsecset)) {
1847                         hasnsec = ISC_TRUE;
1848                         dns_rdataset_disassociate(&signsecset);
1849                 }
1850                 dns_rdataset_disassociate(&nsecset);
1851         }
1852         return (hasnsec);
1853 }
1854
1855 static void
1856 closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
1857         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1858         rbtdb_version_t *version, *cleanup_version, *least_greater;
1859         isc_boolean_t rollback = ISC_FALSE;
1860         rbtdb_changedlist_t cleanup_list;
1861         rbtdb_changed_t *changed, *next_changed;
1862         rbtdb_serial_t serial, least_serial;
1863         dns_rbtnode_t *rbtnode;
1864         unsigned int refs;
1865         isc_boolean_t writer;
1866
1867         REQUIRE(VALID_RBTDB(rbtdb));
1868         version = (rbtdb_version_t *)*versionp;
1869
1870         cleanup_version = NULL;
1871         ISC_LIST_INIT(cleanup_list);
1872
1873         isc_refcount_decrement(&version->references, &refs);
1874         if (refs > 0) {         /* typical and easy case first */
1875                 if (commit) {
1876                         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1877                         INSIST(!version->writer);
1878                         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1879                 }
1880                 goto end;
1881         }
1882
1883         RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1884         serial = version->serial;
1885         writer = version->writer;
1886         if (version->writer) {
1887                 if (commit) {
1888                         unsigned cur_ref;
1889                         rbtdb_version_t *cur_version;
1890
1891                         INSIST(version->commit_ok);
1892                         INSIST(version == rbtdb->future_version);
1893                         /*
1894                          * The current version is going to be replaced.
1895                          * Release the (likely last) reference to it from the
1896                          * DB itself and unlink it from the open list.
1897                          */
1898                         cur_version = rbtdb->current_version;
1899                         isc_refcount_decrement(&cur_version->references,
1900                                                &cur_ref);
1901                         if (cur_ref == 0) {
1902                                 if (cur_version->serial == rbtdb->least_serial)
1903                                         INSIST(EMPTY(cur_version->changed_list));
1904                                 UNLINK(rbtdb->open_versions,
1905                                        cur_version, link);
1906                         }
1907                         if (EMPTY(rbtdb->open_versions)) {
1908                                 /*
1909                                  * We're going to become the least open
1910                                  * version.
1911                                  */
1912                                 make_least_version(rbtdb, version,
1913                                                    &cleanup_list);
1914                         } else {
1915                                 /*
1916                                  * Some other open version is the
1917                                  * least version.  We can't cleanup
1918                                  * records that were changed in this
1919                                  * version because the older versions
1920                                  * may still be in use by an open
1921                                  * version.
1922                                  *
1923                                  * We can, however, discard the
1924                                  * changed records for things that
1925                                  * we've added that didn't exist in
1926                                  * prior versions.
1927                                  */
1928                                 cleanup_nondirty(version, &cleanup_list);
1929                         }
1930                         /*
1931                          * If the (soon to be former) current version
1932                          * isn't being used by anyone, we can clean
1933                          * it up.
1934                          */
1935                         if (cur_ref == 0) {
1936                                 cleanup_version = cur_version;
1937                                 APPENDLIST(version->changed_list,
1938                                            cleanup_version->changed_list,
1939                                            link);
1940                         }
1941                         /*
1942                          * Become the current version.
1943                          */
1944                         version->writer = ISC_FALSE;
1945                         rbtdb->current_version = version;
1946                         rbtdb->current_serial = version->serial;
1947                         rbtdb->future_version = NULL;
1948
1949                         /*
1950                          * Keep the current version in the open list, and
1951                          * gain a reference for the DB itself (see the DB
1952                          * creation function below).  This must be the only
1953                          * case where we need to increment the counter from
1954                          * zero and need to use isc_refcount_increment0().
1955                          */
1956                         isc_refcount_increment0(&version->references,
1957                                                 &cur_ref);
1958                         INSIST(cur_ref == 1);
1959                         PREPEND(rbtdb->open_versions,
1960                                 rbtdb->current_version, link);
1961                 } else {
1962                         /*
1963                          * We're rolling back this transaction.
1964                          */
1965                         cleanup_list = version->changed_list;
1966                         ISC_LIST_INIT(version->changed_list);
1967                         rollback = ISC_TRUE;
1968                         cleanup_version = version;
1969                         rbtdb->future_version = NULL;
1970                 }
1971         } else {
1972                 if (version != rbtdb->current_version) {
1973                         /*
1974                          * There are no external or internal references
1975                          * to this version and it can be cleaned up.
1976                          */
1977                         cleanup_version = version;
1978
1979                         /*
1980                          * Find the version with the least serial
1981                          * number greater than ours.
1982                          */
1983                         least_greater = PREV(version, link);
1984                         if (least_greater == NULL)
1985                                 least_greater = rbtdb->current_version;
1986
1987                         INSIST(version->serial < least_greater->serial);
1988                         /*
1989                          * Is this the least open version?
1990                          */
1991                         if (version->serial == rbtdb->least_serial) {
1992                                 /*
1993                                  * Yes.  Install the new least open
1994                                  * version.
1995                                  */
1996                                 make_least_version(rbtdb,
1997                                                    least_greater,
1998                                                    &cleanup_list);
1999                         } else {
2000                                 /*
2001                                  * Add any unexecuted cleanups to
2002                                  * those of the least greater version.
2003                                  */
2004                                 APPENDLIST(least_greater->changed_list,
2005                                            version->changed_list,
2006                                            link);
2007                         }
2008                 } else if (version->serial == rbtdb->least_serial)
2009                         INSIST(EMPTY(version->changed_list));
2010                 UNLINK(rbtdb->open_versions, version, link);
2011         }
2012         least_serial = rbtdb->least_serial;
2013         RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2014
2015         /*
2016          * Update the zone's secure status.
2017          */
2018         if (writer && commit && !IS_CACHE(rbtdb))
2019                 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
2020
2021         if (cleanup_version != NULL) {
2022                 INSIST(EMPTY(cleanup_version->changed_list));
2023                 isc_mem_put(rbtdb->common.mctx, cleanup_version,
2024                             sizeof(*cleanup_version));
2025         }
2026
2027         if (!EMPTY(cleanup_list)) {
2028                 /*
2029                  * We acquire a tree write lock here in order to make sure
2030                  * that stale nodes will be removed in decrement_reference().
2031                  * If we didn't have the lock, those nodes could miss the
2032                  * chance to be removed until the server stops.  The write lock
2033                  * is expensive, but this event should be rare enough to justify
2034                  * the cost.
2035                  */
2036                 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2037                 for (changed = HEAD(cleanup_list);
2038                      changed != NULL;
2039                      changed = next_changed) {
2040                         nodelock_t *lock;
2041
2042                         next_changed = NEXT(changed, link);
2043                         rbtnode = changed->node;
2044                         lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2045
2046                         NODE_LOCK(lock, isc_rwlocktype_write);
2047                         /*
2048                          * This is a good opportunity to purge any dead nodes,
2049                          * so use it.
2050                          */
2051                         cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2052
2053                         if (rollback)
2054                                 rollback_node(rbtnode, serial);
2055                         decrement_reference(rbtdb, rbtnode, least_serial,
2056                                             isc_rwlocktype_write,
2057                                             isc_rwlocktype_write, ISC_FALSE);
2058
2059                         NODE_UNLOCK(lock, isc_rwlocktype_write);
2060
2061                         isc_mem_put(rbtdb->common.mctx, changed,
2062                                     sizeof(*changed));
2063                 }
2064                 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2065         }
2066
2067   end:
2068         *versionp = NULL;
2069 }
2070
2071 /*
2072  * Add the necessary magic for the wildcard name 'name'
2073  * to be found in 'rbtdb'.
2074  *
2075  * In order for wildcard matching to work correctly in
2076  * zone_find(), we must ensure that a node for the wildcarding
2077  * level exists in the database, and has its 'find_callback'
2078  * and 'wild' bits set.
2079  *
2080  * E.g. if the wildcard name is "*.sub.example." then we
2081  * must ensure that "sub.example." exists and is marked as
2082  * a wildcard level.
2083  */
2084 static isc_result_t
2085 add_wildcard_magic(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2086         isc_result_t result;
2087         dns_name_t foundname;
2088         dns_offsets_t offsets;
2089         unsigned int n;
2090         dns_rbtnode_t *node = NULL;
2091
2092         dns_name_init(&foundname, offsets);
2093         n = dns_name_countlabels(name);
2094         INSIST(n >= 2);
2095         n--;
2096         dns_name_getlabelsequence(name, 1, n, &foundname);
2097         result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2098         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2099                 return (result);
2100         node->find_callback = 1;
2101         node->wild = 1;
2102         return (ISC_R_SUCCESS);
2103 }
2104
2105 static isc_result_t
2106 add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2107         isc_result_t result;
2108         dns_name_t foundname;
2109         dns_offsets_t offsets;
2110         unsigned int n, l, i;
2111
2112         dns_name_init(&foundname, offsets);
2113         n = dns_name_countlabels(name);
2114         l = dns_name_countlabels(&rbtdb->common.origin);
2115         i = l + 1;
2116         while (i < n) {
2117                 dns_rbtnode_t *node = NULL;     /* dummy */
2118                 dns_name_getlabelsequence(name, n - i, i, &foundname);
2119                 if (dns_name_iswildcard(&foundname)) {
2120                         result = add_wildcard_magic(rbtdb, &foundname);
2121                         if (result != ISC_R_SUCCESS)
2122                                 return (result);
2123                         result = dns_rbt_addnode(rbtdb->tree, &foundname,
2124                                                  &node);
2125                         if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2126                                 return (result);
2127                 }
2128                 i++;
2129         }
2130         return (ISC_R_SUCCESS);
2131 }
2132
2133 static isc_result_t
2134 findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create,
2135          dns_dbnode_t **nodep)
2136 {
2137         dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2138         dns_rbtnode_t *node = NULL;
2139         dns_name_t nodename;
2140         isc_result_t result;
2141         isc_rwlocktype_t locktype = isc_rwlocktype_read;
2142
2143         REQUIRE(VALID_RBTDB(rbtdb));
2144
2145         dns_name_init(&nodename, NULL);
2146         RWLOCK(&rbtdb->tree_lock, locktype);
2147         result = dns_rbt_findnode(rbtdb->tree, name, NULL, &node, NULL,
2148                                   DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2149         if (result != ISC_R_SUCCESS) {
2150                 RWUNLOCK(&rbtdb->tree_lock, locktype);
2151                 if (!create) {
2152                         if (result == DNS_R_PARTIALMATCH)
2153                                 result = ISC_R_NOTFOUND;
2154                         return (result);
2155                 }
2156                 /*
2157                  * It would be nice to try to upgrade the lock instead of
2158                  * unlocking then relocking.
2159                  */
2160                 locktype = isc_rwlocktype_write;
2161                 RWLOCK(&rbtdb->tree_lock, locktype);
2162                 node = NULL;
2163                 result = dns_rbt_addnode(rbtdb->tree, name, &node);
2164                 if (result == ISC_R_SUCCESS) {
2165                         dns_rbt_namefromnode(node, &nodename);
2166 #ifdef DNS_RBT_USEHASH
2167                         node->locknum = node->hashval % rbtdb->node_lock_count;
2168 #else
2169                         node->locknum = dns_name_hash(&nodename, ISC_TRUE) %
2170                                 rbtdb->node_lock_count;
2171 #endif
2172                         add_empty_wildcards(rbtdb, name);
2173
2174                         if (dns_name_iswildcard(name)) {
2175                                 result = add_wildcard_magic(rbtdb, name);
2176                                 if (result != ISC_R_SUCCESS) {
2177                                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2178                                         return (result);
2179                                 }
2180                         }
2181                 } else if (result != ISC_R_EXISTS) {
2182                         RWUNLOCK(&rbtdb->tree_lock, locktype);
2183                         return (result);
2184                 }
2185         }
2186         reactivate_node(rbtdb, node, locktype);
2187         RWUNLOCK(&rbtdb->tree_lock, locktype);
2188
2189         *nodep = (dns_dbnode_t *)node;
2190
2191         return (ISC_R_SUCCESS);
2192 }
2193
2194 static isc_result_t
2195 zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2196         rbtdb_search_t *search = arg;
2197         rdatasetheader_t *header, *header_next;
2198         rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2199         rdatasetheader_t *found;
2200         isc_result_t result;
2201         dns_rbtnode_t *onode;
2202
2203         /*
2204          * We only want to remember the topmost zone cut, since it's the one
2205          * that counts, so we'll just continue if we've already found a
2206          * zonecut.
2207          */
2208         if (search->zonecut != NULL)
2209                 return (DNS_R_CONTINUE);
2210
2211         found = NULL;
2212         result = DNS_R_CONTINUE;
2213         onode = search->rbtdb->origin_node;
2214
2215         NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2216                   isc_rwlocktype_read);
2217
2218         /*
2219          * Look for an NS or DNAME rdataset active in our version.
2220          */
2221         ns_header = NULL;
2222         dname_header = NULL;
2223         sigdname_header = NULL;
2224         for (header = node->data; header != NULL; header = header_next) {
2225                 header_next = header->next;
2226                 if (header->type == dns_rdatatype_ns ||
2227                     header->type == dns_rdatatype_dname ||
2228                     header->type == RBTDB_RDATATYPE_SIGDNAME) {
2229                         do {
2230                                 if (header->serial <= search->serial &&
2231                                     !IGNORE(header)) {
2232                                         /*
2233                                          * Is this a "this rdataset doesn't
2234                                          * exist" record?
2235                                          */
2236                                         if (NONEXISTENT(header))
2237                                                 header = NULL;
2238                                         break;
2239                                 } else
2240                                         header = header->down;
2241                         } while (header != NULL);
2242                         if (header != NULL) {
2243                                 if (header->type == dns_rdatatype_dname)
2244                                         dname_header = header;
2245                                 else if (header->type ==
2246                                            RBTDB_RDATATYPE_SIGDNAME)
2247                                         sigdname_header = header;
2248                                 else if (node != onode ||
2249                                          IS_STUB(search->rbtdb)) {
2250                                         /*
2251                                          * We've found an NS rdataset that
2252                                          * isn't at the origin node.  We check
2253                                          * that they're not at the origin node,
2254                                          * because otherwise we'd erroneously
2255                                          * treat the zone top as if it were
2256                                          * a delegation.
2257                                          */
2258                                         ns_header = header;
2259                                 }
2260                         }
2261                 }
2262         }
2263
2264         /*
2265          * Did we find anything?
2266          */
2267         if (dname_header != NULL) {
2268                 /*
2269                  * Note that DNAME has precedence over NS if both exist.
2270                  */
2271                 found = dname_header;
2272                 search->zonecut_sigrdataset = sigdname_header;
2273         } else if (ns_header != NULL) {
2274                 found = ns_header;
2275                 search->zonecut_sigrdataset = NULL;
2276         }
2277
2278         if (found != NULL) {
2279                 /*
2280                  * We increment the reference count on node to ensure that
2281                  * search->zonecut_rdataset will still be valid later.
2282                  */
2283                 new_reference(search->rbtdb, node);
2284                 search->zonecut = node;
2285                 search->zonecut_rdataset = found;
2286                 search->need_cleanup = ISC_TRUE;
2287                 /*
2288                  * Since we've found a zonecut, anything beneath it is
2289                  * glue and is not subject to wildcard matching, so we
2290                  * may clear search->wild.
2291                  */
2292                 search->wild = ISC_FALSE;
2293                 if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
2294                         /*
2295                          * If the caller does not want to find glue, then
2296                          * this is the best answer and the search should
2297                          * stop now.
2298                          */
2299                         result = DNS_R_PARTIALMATCH;
2300                 } else {
2301                         dns_name_t *zcname;
2302
2303                         /*
2304                          * The search will continue beneath the zone cut.
2305                          * This may or may not be the best match.  In case it
2306                          * is, we need to remember the node name.
2307                          */
2308                         zcname = dns_fixedname_name(&search->zonecut_name);
2309                         RUNTIME_CHECK(dns_name_copy(name, zcname, NULL) ==
2310                                       ISC_R_SUCCESS);
2311                         search->copy_name = ISC_TRUE;
2312                 }
2313         } else {
2314                 /*
2315                  * There is no zonecut at this node which is active in this
2316                  * version.
2317                  *
2318                  * If this is a "wild" node and the caller hasn't disabled
2319                  * wildcard matching, remember that we've seen a wild node
2320                  * in case we need to go searching for wildcard matches
2321                  * later on.
2322                  */
2323                 if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0)
2324                         search->wild = ISC_TRUE;
2325         }
2326
2327         NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2328                     isc_rwlocktype_read);
2329
2330         return (result);
2331 }
2332
2333 static inline void
2334 bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2335               rdatasetheader_t *header, isc_stdtime_t now,
2336               dns_rdataset_t *rdataset)
2337 {
2338         unsigned char *raw;     /* RDATASLAB */
2339
2340         /*
2341          * Caller must be holding the node reader lock.
2342          * XXXJT: technically, we need a writer lock, since we'll increment
2343          * the header count below.  However, since the actual counter value
2344          * doesn't matter, we prioritize performance here.  (We may want to
2345          * use atomic increment when available).
2346          */
2347
2348         if (rdataset == NULL)
2349                 return;
2350
2351         new_reference(rbtdb, node);
2352
2353         INSIST(rdataset->methods == NULL);      /* We must be disassociated. */
2354
2355         rdataset->methods = &rdataset_methods;
2356         rdataset->rdclass = rbtdb->common.rdclass;
2357         rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
2358         rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
2359         rdataset->ttl = header->rdh_ttl - now;
2360         rdataset->trust = header->trust;
2361         if (NXDOMAIN(header))
2362                 rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
2363         rdataset->private1 = rbtdb;
2364         rdataset->private2 = node;
2365         raw = (unsigned char *)header + sizeof(*header);
2366         rdataset->private3 = raw;
2367         rdataset->count = header->count++;
2368         if (rdataset->count == ISC_UINT32_MAX)
2369                 rdataset->count = 0;
2370
2371         /*
2372          * Reset iterator state.
2373          */
2374         rdataset->privateuint4 = 0;
2375         rdataset->private5 = NULL;
2376
2377         /*
2378          * Add noqname proof.
2379          */
2380         rdataset->private6 = header->noqname;
2381         if (rdataset->private6 != NULL)
2382                 rdataset->attributes |=  DNS_RDATASETATTR_NOQNAME;
2383 }
2384
2385 static inline isc_result_t
2386 setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
2387                  dns_name_t *foundname, dns_rdataset_t *rdataset,
2388                  dns_rdataset_t *sigrdataset)
2389 {
2390         isc_result_t result;
2391         dns_name_t *zcname;
2392         rbtdb_rdatatype_t type;
2393         dns_rbtnode_t *node;
2394
2395         /*
2396          * The caller MUST NOT be holding any node locks.
2397          */
2398
2399         node = search->zonecut;
2400         type = search->zonecut_rdataset->type;
2401
2402         /*
2403          * If we have to set foundname, we do it before anything else.
2404          * If we were to set foundname after we had set nodep or bound the
2405          * rdataset, then we'd have to undo that work if dns_name_copy()
2406          * failed.  By setting foundname first, there's nothing to undo if
2407          * we have trouble.
2408          */
2409         if (foundname != NULL && search->copy_name) {
2410                 zcname = dns_fixedname_name(&search->zonecut_name);
2411                 result = dns_name_copy(zcname, foundname, NULL);
2412                 if (result != ISC_R_SUCCESS)
2413                         return (result);
2414         }
2415         if (nodep != NULL) {
2416                 /*
2417                  * Note that we don't have to increment the node's reference
2418                  * count here because we're going to use the reference we
2419                  * already have in the search block.
2420                  */
2421                 *nodep = node;
2422                 search->need_cleanup = ISC_FALSE;
2423         }
2424         if (rdataset != NULL) {
2425                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2426                           isc_rwlocktype_read);
2427                 bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
2428                               search->now, rdataset);
2429                 if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
2430                         bind_rdataset(search->rbtdb, node,
2431                                       search->zonecut_sigrdataset,
2432                                       search->now, sigrdataset);
2433                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2434                             isc_rwlocktype_read);
2435         }
2436
2437         if (type == dns_rdatatype_dname)
2438                 return (DNS_R_DNAME);
2439         return (DNS_R_DELEGATION);
2440 }
2441
2442 static inline isc_boolean_t
2443 valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
2444            dns_rbtnode_t *node)
2445 {
2446         unsigned char *raw;     /* RDATASLAB */
2447         unsigned int count, size;
2448         dns_name_t ns_name;
2449         isc_boolean_t valid = ISC_FALSE;
2450         dns_offsets_t offsets;
2451         isc_region_t region;
2452         rdatasetheader_t *header;
2453
2454         /*
2455          * No additional locking is required.
2456          */
2457
2458         /*
2459          * Valid glue types are A, AAAA, A6.  NS is also a valid glue type
2460          * if it occurs at a zone cut, but is not valid below it.
2461          */
2462         if (type == dns_rdatatype_ns) {
2463                 if (node != search->zonecut) {
2464                         return (ISC_FALSE);
2465                 }
2466         } else if (type != dns_rdatatype_a &&
2467                    type != dns_rdatatype_aaaa &&
2468                    type != dns_rdatatype_a6) {
2469                 return (ISC_FALSE);
2470         }
2471
2472         header = search->zonecut_rdataset;
2473         raw = (unsigned char *)header + sizeof(*header);
2474         count = raw[0] * 256 + raw[1];
2475 #if DNS_RDATASET_FIXED
2476         raw += 2 + (4 * count);
2477 #else
2478         raw += 2;
2479 #endif
2480
2481         while (count > 0) {
2482                 count--;
2483                 size = raw[0] * 256 + raw[1];
2484 #if DNS_RDATASET_FIXED
2485                 raw += 4;
2486 #else
2487                 raw += 2;
2488 #endif
2489                 region.base = raw;
2490                 region.length = size;
2491                 raw += size;
2492                 /*
2493                  * XXX Until we have rdata structures, we have no choice but
2494                  * to directly access the rdata format.
2495                  */
2496                 dns_name_init(&ns_name, offsets);
2497                 dns_name_fromregion(&ns_name, &region);
2498                 if (dns_name_compare(&ns_name, name) == 0) {
2499                         valid = ISC_TRUE;
2500                         break;
2501                 }
2502         }
2503
2504         return (valid);
2505 }
2506
2507 static inline isc_boolean_t
2508 activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
2509             dns_name_t *name)
2510 {
2511         dns_fixedname_t fnext;
2512         dns_fixedname_t forigin;
2513         dns_name_t *next;
2514         dns_name_t *origin;
2515         dns_name_t prefix;
2516         dns_rbtdb_t *rbtdb;
2517         dns_rbtnode_t *node;
2518         isc_result_t result;
2519         isc_boolean_t answer = ISC_FALSE;
2520         rdatasetheader_t *header;
2521
2522         rbtdb = search->rbtdb;
2523
2524         dns_name_init(&prefix, NULL);
2525         dns_fixedname_init(&fnext);
2526         next = dns_fixedname_name(&fnext);
2527         dns_fixedname_init(&forigin);
2528         origin = dns_fixedname_name(&forigin);
2529
2530         result = dns_rbtnodechain_next(chain, NULL, NULL);
2531         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2532                 node = NULL;
2533                 result = dns_rbtnodechain_current(chain, &prefix,
2534                                                   origin, &node);
2535                 if (result != ISC_R_SUCCESS)
2536                         break;
2537                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2538                           isc_rwlocktype_read);
2539                 for (header = node->data;
2540                      header != NULL;
2541                      header = header->next) {
2542                         if (header->serial <= search->serial &&
2543                             !IGNORE(header) && EXISTS(header))
2544                                 break;
2545                 }
2546                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2547                             isc_rwlocktype_read);
2548                 if (header != NULL)
2549                         break;
2550                 result = dns_rbtnodechain_next(chain, NULL, NULL);
2551         }
2552         if (result == ISC_R_SUCCESS)
2553                 result = dns_name_concatenate(&prefix, origin, next, NULL);
2554         if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name))
2555                 answer = ISC_TRUE;
2556         return (answer);
2557 }
2558
2559 static inline isc_boolean_t
2560 activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) {
2561         dns_fixedname_t fnext;
2562         dns_fixedname_t forigin;
2563         dns_fixedname_t fprev;
2564         dns_name_t *next;
2565         dns_name_t *origin;
2566         dns_name_t *prev;
2567         dns_name_t name;
2568         dns_name_t rname;
2569         dns_name_t tname;
2570         dns_rbtdb_t *rbtdb;
2571         dns_rbtnode_t *node;
2572         dns_rbtnodechain_t chain;
2573         isc_boolean_t check_next = ISC_TRUE;
2574         isc_boolean_t check_prev = ISC_TRUE;
2575         isc_boolean_t answer = ISC_FALSE;
2576         isc_result_t result;
2577         rdatasetheader_t *header;
2578         unsigned int n;
2579
2580         rbtdb = search->rbtdb;
2581
2582         dns_name_init(&name, NULL);
2583         dns_name_init(&tname, NULL);
2584         dns_name_init(&rname, NULL);
2585         dns_fixedname_init(&fnext);
2586         next = dns_fixedname_name(&fnext);
2587         dns_fixedname_init(&fprev);
2588         prev = dns_fixedname_name(&fprev);
2589         dns_fixedname_init(&forigin);
2590         origin = dns_fixedname_name(&forigin);
2591
2592         /*
2593          * Find if qname is at or below a empty node.
2594          * Use our own copy of the chain.
2595          */
2596
2597         chain = search->chain;
2598         do {
2599                 node = NULL;
2600                 result = dns_rbtnodechain_current(&chain, &name,
2601                                                   origin, &node);
2602                 if (result != ISC_R_SUCCESS)
2603                         break;
2604                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2605                           isc_rwlocktype_read);
2606                 for (header = node->data;
2607                      header != NULL;
2608                      header = header->next) {
2609                         if (header->serial <= search->serial &&
2610                             !IGNORE(header) && EXISTS(header))
2611                                 break;
2612                 }
2613                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2614                             isc_rwlocktype_read);
2615                 if (header != NULL)
2616                         break;
2617                 result = dns_rbtnodechain_prev(&chain, NULL, NULL);
2618         } while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
2619         if (result == ISC_R_SUCCESS)
2620                 result = dns_name_concatenate(&name, origin, prev, NULL);
2621         if (result != ISC_R_SUCCESS)
2622                 check_prev = ISC_FALSE;
2623
2624         result = dns_rbtnodechain_next(&chain, NULL, NULL);
2625         while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2626                 node = NULL;
2627                 result = dns_rbtnodechain_current(&chain, &name,
2628                                                   origin, &node);
2629                 if (result != ISC_R_SUCCESS)
2630                         break;
2631                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2632                           isc_rwlocktype_read);
2633                 for (header = node->data;
2634                      header != NULL;
2635                      header = header->next) {
2636                         if (header->serial <= search->serial &&
2637                             !IGNORE(header) && EXISTS(header))
2638                                 break;
2639                 }
2640                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2641                             isc_rwlocktype_read);
2642                 if (header != NULL)
2643                         break;
2644                 result = dns_rbtnodechain_next(&chain, NULL, NULL);
2645         }
2646         if (result == ISC_R_SUCCESS)
2647                 result = dns_name_concatenate(&name, origin, next, NULL);
2648         if (result != ISC_R_SUCCESS)
2649                 check_next = ISC_FALSE;
2650
2651         dns_name_clone(qname, &rname);
2652
2653         /*
2654          * Remove the wildcard label to find the terminal name.
2655          */
2656         n = dns_name_countlabels(wname);
2657         dns_name_getlabelsequence(wname, 1, n - 1, &tname);
2658
2659         do {
2660                 if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
2661                     (check_next && dns_name_issubdomain(next, &rname))) {
2662                         answer = ISC_TRUE;
2663                         break;
2664                 }
2665                 /*
2666                  * Remove the left hand label.
2667                  */
2668                 n = dns_name_countlabels(&rname);
2669                 dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
2670         } while (!dns_name_equal(&rname, &tname));
2671         return (answer);
2672 }
2673
2674 static inline isc_result_t
2675 find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
2676               dns_name_t *qname)
2677 {
2678         unsigned int i, j;
2679         dns_rbtnode_t *node, *level_node, *wnode;
2680         rdatasetheader_t *header;
2681         isc_result_t result = ISC_R_NOTFOUND;
2682         dns_name_t name;
2683         dns_name_t *wname;
2684         dns_fixedname_t fwname;
2685         dns_rbtdb_t *rbtdb;
2686         isc_boolean_t done, wild, active;
2687         dns_rbtnodechain_t wchain;
2688
2689         /*
2690          * Caller must be holding the tree lock and MUST NOT be holding
2691          * any node locks.
2692          */
2693
2694         /*
2695          * Examine each ancestor level.  If the level's wild bit
2696          * is set, then construct the corresponding wildcard name and
2697          * search for it.  If the wildcard node exists, and is active in
2698          * this version, we're done.  If not, then we next check to see
2699          * if the ancestor is active in this version.  If so, then there
2700          * can be no possible wildcard match and again we're done.  If not,
2701          * continue the search.
2702          */
2703
2704         rbtdb = search->rbtdb;
2705         i = search->chain.level_matches;
2706         done = ISC_FALSE;
2707         node = *nodep;
2708         do {
2709                 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2710                           isc_rwlocktype_read);
2711
2712                 /*
2713                  * First we try to figure out if this node is active in
2714                  * the search's version.  We do this now, even though we
2715                  * may not need the information, because it simplifies the
2716                  * locking and code flow.
2717                  */
2718                 for (header = node->data;
2719                      header != NULL;
2720                      header = header->next) {
2721                         if (header->serial <= search->serial &&
2722                             !IGNORE(header) && EXISTS(header))
2723                                 break;
2724                 }
2725                 if (header != NULL)
2726                         active = ISC_TRUE;
2727                 else
2728                         active = ISC_FALSE;
2729
2730                 if (node->wild)
2731                         wild = ISC_TRUE;
2732                 else
2733                         wild = ISC_FALSE;
2734
2735                 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2736                             isc_rwlocktype_read);
2737
2738                 if (wild) {
2739                         /*
2740                          * Construct the wildcard name for this level.
2741                          */
2742                         dns_name_init(&name, NULL);
2743                         dns_rbt_namefromnode(node, &name);
2744                         dns_fixedname_init(&fwname);
2745                         wname = dns_fixedname_name(&fwname);
2746                         result = dns_name_concatenate(dns_wildcardname, &name,
2747                                                       wname, NULL);
2748                         j = i;
2749                         while (result == ISC_R_SUCCESS && j != 0) {
2750                                 j--;
2751                                 level_node = search->chain.levels[j];
2752                                 dns_name_init(&name, NULL);
2753                                 dns_rbt_namefromnode(level_node, &name);
2754                                 result = dns_name_concatenate(wname,
2755                                                               &name,
2756                                                               wname,
2757                                                               NULL);
2758                         }
2759                         if (result != ISC_R_SUCCESS)
2760                                 break;
2761
2762                         wnode = NULL;
2763                         dns_rbtnodechain_init(&wchain, NULL);
2764                         result = dns_rbt_findnode(rbtdb->tree, wname,
2765                                                   NULL, &wnode, &wchain,
2766                                                   DNS_RBTFIND_EMPTYDATA,
2767                                                   NULL, NULL);
2768                         if (result == ISC_R_SUCCESS) {
2769                                 nodelock_t *lock;
2770
2771                                 /*
2772                                  * We have found the wildcard node.  If it
2773                                  * is active in the search's version, we're
2774                                  * done.
2775                                  */
2776                                 lock = &rbtdb->node_locks[wnode->locknum].lock;
2777                                 NODE_LOCK(lock, isc_rwlocktype_read);
2778                                 for (header = wnode->data;
2779                                      header != NULL;
2780                                      header = header->next) {
2781                                         if (header->serial <= search->serial &&
2782                                             !IGNORE(header) && EXISTS(header))
2783                                                 break;
2784                                 }
2785                                 NODE_UNLOCK(lock, isc_rwlocktype_read);
2786                                 if (header != NULL ||
2787                                     activeempty(search, &wchain, wname)) {
2788                                         if (activeemtpynode(search, qname,
2789                                                             wname)) {
2790                                                 return (ISC_R_NOTFOUND);
2791                                         }
2792                                         /*
2793                                          * The wildcard node is active!
2794                                          *
2795                                          * Note: result is still ISC_R_SUCCESS
2796                                          * so we don't have to set it.
2797                                          */
2798                                         *nodep = wnode;
2799                                         break;
2800                                 }
2801                         } else if (result != ISC_R_NOTFOUND &&
2802                                    result != DNS_R_PARTIALMATCH) {
2803                                 /*
2804                                  * An error has occurred.  Bail out.
2805                                  */
2806                                 break;
2807                         }
2808                 }
2809
2810                 if (active) {
2811                         /*
2812                          * The level node is active.  Any wildcarding
2813                          * present at higher levels has no
2814                          * effect and we're done.
2815                          */
2816                         result = ISC_R_NOTFOUND;
2817                         break;
2818                 }
2819
2820                 if (i > 0) {
2821                         i--;
2822                         node = search->chain.levels[i];
2823                 } else
2824                         done = ISC_TRUE;
2825         } while (!done);
2826
2827         return (result);
2828 }
2829
2830 static inline isc_result_t
2831 find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
2832                   dns_name_t *foundname, dns_rdataset_t *rdataset,
2833                   dns_rdataset_t *sigrdataset, isc_boolean_t need_sig)
2834 {
2835         dns_rbtnode_t *node;
2836         rdatasetheader_t *header, *header_next, *found, *foundsig;
2837         isc_boolean_t empty_node;
2838         isc_result_t result;
2839         dns_fixedname_t fname, forigin;
2840         dns_name_t *name, *origin;
2841
2842         do {
2843                 node = NULL;
2844                 dns_fixedname_init(&fname);
2845                 name = dns_fixedname_name(&fname);
2846                 dns_fixedname_init(&forigin);
2847                 origin = dns_fixedname_name(&forigin);
2848                 result = dns_rbtnodechain_current(&search->chain, name,
2849                                                   origin, &node);
2850                 if (result != ISC_R_SUCCESS)
2851                         return (result);
2852                 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2853                           isc_rwlocktype_read);
2854                 found = NULL;
2855                 foundsig = NULL;
2856                 empty_node = ISC_TRUE;
2857                 for (header = node->data;
2858                      header != NULL;
2859                      header = header_next) {
2860                         header_next = header->next;
2861                         /*
2862                          * Look for an active, extant NSEC or RRSIG NSEC.
2863                          */
2864                         do {
2865                                 if (header->serial <= search->serial &&
2866                                     !IGNORE(header)) {
2867                                         /*
2868                                          * Is this a "this rdataset doesn't
2869                                          * exist" record?
2870                                          */
2871                                         if (NONEXISTENT(header))
2872                                                 header = NULL;
2873                                         break;
2874                                 } else
2875                                         header = header->down;
2876                         } while (header != NULL);
2877                         if (header != NULL) {
2878                                 /*
2879                                  * We now know that there is at least one
2880                                  * active rdataset at this node.
2881                                  */
2882                                 empty_node = ISC_FALSE;
2883                                 if (header->type == dns_rdatatype_nsec) {
2884                                         found = header;
2885                                         if (foundsig != NULL)
2886                                                 break;
2887                                 } else if (header->type ==
2888                                            RBTDB_RDATATYPE_SIGNSEC) {
2889                                         foundsig = header;
2890                                         if (found != NULL)
2891                                                 break;
2892                                 }
2893                         }
2894                 }
2895                 if (!empty_node) {
2896                         if (found != NULL &&
2897                             (foundsig != NULL || !need_sig))
2898                         {
2899                                 /*
2900                                  * We've found the right NSEC record.
2901                                  *
2902                                  * Note: for this to really be the right
2903                                  * NSEC record, it's essential that the NSEC
2904                                  * records of any nodes obscured by a zone
2905                                  * cut have been removed; we assume this is
2906                                  * the case.
2907                                  */
2908                                 result = dns_name_concatenate(name, origin,
2909                                                               foundname, NULL);
2910                                 if (result == ISC_R_SUCCESS) {
2911                                         if (nodep != NULL) {
2912                                                 new_reference(search->rbtdb,
2913                                                               node);
2914                                                 *nodep = node;
2915                                         }
2916                                         bind_rdataset(search->rbtdb, node,
2917                                                       found, search->now,
2918                                                       rdataset);
2919                                         if (foundsig != NULL)
2920                                                 bind_rdataset(search->rbtdb,
2921                                                               node,
2922                                                               foundsig,
2923                                                               search->now,
2924                                                               sigrdataset);
2925                                 }
2926                         } else if (found == NULL && foundsig == NULL) {
2927                                 /*
2928                                  * This node is active, but has no NSEC or
2929                                  * RRSIG NSEC.  That means it's glue or
2930                                  * other obscured zone data that isn't
2931                                  * relevant for our search.  Treat the
2932                                  * node as if it were empty and keep looking.
2933                                  */
2934                                 empty_node = ISC_TRUE;
2935                                 result = dns_rbtnodechain_prev(&search->chain,
2936                                                                NULL, NULL);
2937                         } else {
2938                                 /*
2939                                  * We found an active node, but either the
2940                                  * NSEC or the RRSIG NSEC is missing.  This
2941                                  * shouldn't happen.
2942                                  */
2943                                 result = DNS_R_BADDB;
2944                         }
2945                 } else {
2946                         /*
2947                          * This node isn't active.  We've got to keep
2948                          * looking.
2949                          */
2950                         result = dns_rbtnodechain_prev(&search->chain, NULL,
2951                                                        NULL);
2952                 }
2953                 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2954                             isc_rwlocktype_read);
2955         } while (empty_node && result == ISC_R_SUCCESS);
2956
2957         /*
2958          * If the result is ISC_R_NOMORE, then we got to the beginning of
2959          * the database and didn't find a NSEC record.  This shouldn't
2960          * happen.
2961          */
2962         if (result == ISC_R_NOMORE)
2963                 result = DNS_R_BADDB;
2964
2965         return (result);
2966 }
2967
2968 static isc_result_t
2969 zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
2970           dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
2971           dns_dbnode_t **nodep, dns_name_t *foundname,
2972           dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
2973 {
2974         dns_rbtnode_t *node = NULL;
2975         isc_result_t result;
2976         rbtdb_search_t search;
2977         isc_boolean_t cname_ok = ISC_TRUE;
2978         isc_boolean_t close_version = ISC_FALSE;
2979         isc_boolean_t maybe_zonecut = ISC_FALSE;
2980         isc_boolean_t at_zonecut = ISC_FALSE;
2981         isc_boolean_t wild;
2982         isc_boolean_t empty_node;
2983         rdatasetheader_t *header, *header_next, *found, *nsecheader;
2984         rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
2985         rbtdb_rdatatype_t sigtype;
2986         isc_boolean_t active;
2987         dns_rbtnodechain_t chain;
2988         nodelock_t *lock;
2989
2990
2991         search.rbtdb = (dns_rbtdb_t *)db;
2992
2993         REQUIRE(VALID_RBTDB(search.rbtdb));
2994
2995         /*
2996          * We don't care about 'now'.
2997          */
2998         UNUSED(now);
2999
3000         /*
3001          * If the caller didn't supply a version, attach to the current
3002          * version.
3003          */
3004         if (version == NULL) {
3005                 currentversion(db, &version);
3006                 close_version = ISC_TRUE;
3007         }
3008
3009         search.rbtversion = version;
3010         search.serial = search.rbtversion->serial;
3011         search.options = options;
3012         search.copy_name = ISC_FALSE;
3013         search.need_cleanup = ISC_FALSE;
3014         search.wild = ISC_FALSE;
3015         search.zonecut = NULL;
3016         dns_fixedname_init(&search.zonecut_name);
3017         dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3018         search.now = 0;
3019
3020         /*
3021          * 'wild' will be true iff. we've matched a wildcard.
3022          */
3023         wild = ISC_FALSE;
3024
3025         RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3026
3027         /*
3028          * Search down from the root of the tree.  If, while going down, we
3029          * encounter a callback node, zone_zonecut_callback() will search the
3030          * rdatasets at the zone cut for active DNAME or NS rdatasets.
3031          */
3032         result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3033                                   &search.chain, DNS_RBTFIND_EMPTYDATA,
3034                                   zone_zonecut_callback, &search);
3035
3036         if (result == DNS_R_PARTIALMATCH) {
3037         partial_match:
3038                 if (search.zonecut != NULL) {
3039                     result = setup_delegation(&search, nodep, foundname,
3040                                               rdataset, sigrdataset);
3041                     goto tree_exit;
3042                 }
3043
3044                 if (search.wild) {
3045                         /*
3046                          * At least one of the levels in the search chain
3047                          * potentially has a wildcard.  For each such level,
3048                          * we must see if there's a matching wildcard active
3049                          * in the current version.
3050                          */
3051                         result = find_wildcard(&search, &node, name);
3052                         if (result == ISC_R_SUCCESS) {
3053                                 result = dns_name_copy(name, foundname, NULL);
3054                                 if (result != ISC_R_SUCCESS)
3055                                         goto tree_exit;
3056                                 wild = ISC_TRUE;
3057                                 goto found;
3058                         }
3059                         else if (result != ISC_R_NOTFOUND)
3060                                 goto tree_exit;
3061                 }
3062
3063                 chain = search.chain;
3064                 active = activeempty(&search, &chain, name);
3065
3066                 /*
3067                  * If we're here, then the name does not exist, is not
3068                  * beneath a zonecut, and there's no matching wildcard.
3069                  */
3070                 if (search.rbtdb->secure ||
3071                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3072                 {
3073                         result = find_closest_nsec(&search, nodep, foundname,
3074                                                    rdataset, sigrdataset,
3075                                                    search.rbtdb->secure);
3076                         if (result == ISC_R_SUCCESS)
3077                                 result = active ? DNS_R_EMPTYNAME :
3078                                                   DNS_R_NXDOMAIN;
3079                 } else
3080                         result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
3081                 goto tree_exit;
3082         } else if (result != ISC_R_SUCCESS)
3083                 goto tree_exit;
3084
3085  found:
3086         /*
3087          * We have found a node whose name is the desired name, or we
3088          * have matched a wildcard.
3089          */
3090
3091         if (search.zonecut != NULL) {
3092                 /*
3093                  * If we're beneath a zone cut, we don't want to look for
3094                  * CNAMEs because they're not legitimate zone glue.
3095                  */
3096                 cname_ok = ISC_FALSE;
3097         } else {
3098                 /*
3099                  * The node may be a zone cut itself.  If it might be one,
3100                  * make sure we check for it later.
3101                  */
3102                 if (node->find_callback &&
3103                     (node != search.rbtdb->origin_node ||
3104                      IS_STUB(search.rbtdb)) &&
3105                     !dns_rdatatype_atparent(type))
3106                         maybe_zonecut = ISC_TRUE;
3107         }
3108
3109         /*
3110          * Certain DNSSEC types are not subject to CNAME matching
3111          * (RFC4035, section 2.5 and RFC3007).
3112          *
3113          * We don't check for RRSIG, because we don't store RRSIG records
3114          * directly.
3115          */
3116         if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3117                 cname_ok = ISC_FALSE;
3118
3119         /*
3120          * We now go looking for rdata...
3121          */
3122
3123         NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3124                   isc_rwlocktype_read);
3125
3126         found = NULL;
3127         foundsig = NULL;
3128         sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3129         nsecheader = NULL;
3130         nsecsig = NULL;
3131         cnamesig = NULL;
3132         empty_node = ISC_TRUE;
3133         for (header = node->data; header != NULL; header = header_next) {
3134                 header_next = header->next;
3135                 /*
3136                  * Look for an active, extant rdataset.
3137                  */
3138                 do {
3139                         if (header->serial <= search.serial &&
3140                             !IGNORE(header)) {
3141                                 /*
3142                                  * Is this a "this rdataset doesn't
3143                                  * exist" record?
3144                                  */
3145                                 if (NONEXISTENT(header))
3146                                         header = NULL;
3147                                 break;
3148                         } else
3149                                 header = header->down;
3150                 } while (header != NULL);
3151                 if (header != NULL) {
3152                         /*
3153                          * We now know that there is at least one active
3154                          * rdataset at this node.
3155                          */
3156                         empty_node = ISC_FALSE;
3157
3158                         /*
3159                          * Do special zone cut handling, if requested.
3160                          */
3161                         if (maybe_zonecut &&
3162                             header->type == dns_rdatatype_ns) {
3163                                 /*
3164                                  * We increment the reference count on node to
3165                                  * ensure that search->zonecut_rdataset will
3166                                  * still be valid later.
3167                                  */
3168                                 new_reference(search.rbtdb, node);
3169                                 search.zonecut = node;
3170                                 search.zonecut_rdataset = header;
3171                                 search.zonecut_sigrdataset = NULL;
3172                                 search.need_cleanup = ISC_TRUE;
3173                                 maybe_zonecut = ISC_FALSE;
3174                                 at_zonecut = ISC_TRUE;
3175                                 /*
3176                                  * It is not clear if KEY should still be
3177                                  * allowed at the parent side of the zone
3178                                  * cut or not.  It is needed for RFC3007
3179                                  * validated updates.
3180                                  */
3181                                 if ((search.options & DNS_DBFIND_GLUEOK) == 0
3182                                     && type != dns_rdatatype_nsec
3183                                     && type != dns_rdatatype_key) {
3184                                         /*
3185                                          * Glue is not OK, but any answer we
3186                                          * could return would be glue.  Return
3187                                          * the delegation.
3188                                          */
3189                                         found = NULL;
3190                                         break;
3191                                 }
3192                                 if (found != NULL && foundsig != NULL)
3193                                         break;
3194                         }
3195
3196                         /*
3197                          * If we found a type we were looking for,
3198                          * remember it.
3199                          */
3200                         if (header->type == type ||
3201                             type == dns_rdatatype_any ||
3202                             (header->type == dns_rdatatype_cname &&
3203                              cname_ok)) {
3204                                 /*
3205                                  * We've found the answer!
3206                                  */
3207                                 found = header;
3208                                 if (header->type == dns_rdatatype_cname &&
3209                                     cname_ok) {
3210                                         /*
3211                                          * We may be finding a CNAME instead
3212                                          * of the desired type.
3213                                          *
3214                                          * If we've already got the CNAME RRSIG,
3215                                          * use it, otherwise change sigtype
3216                                          * so that we find it.
3217                                          */
3218                                         if (cnamesig != NULL)
3219                                                 foundsig = cnamesig;
3220                                         else
3221                                                 sigtype =
3222                                                     RBTDB_RDATATYPE_SIGCNAME;
3223                                 }
3224                                 /*
3225                                  * If we've got all we need, end the search.
3226                                  */
3227                                 if (!maybe_zonecut && foundsig != NULL)
3228                                         break;
3229                         } else if (header->type == sigtype) {
3230                                 /*
3231                                  * We've found the RRSIG rdataset for our
3232                                  * target type.  Remember it.
3233                                  */
3234                                 foundsig = header;
3235                                 /*
3236                                  * If we've got all we need, end the search.
3237                                  */
3238                                 if (!maybe_zonecut && found != NULL)
3239                                         break;
3240                         } else if (header->type == dns_rdatatype_nsec) {
3241                                 /*
3242                                  * Remember a NSEC rdataset even if we're
3243                                  * not specifically looking for it, because
3244                                  * we might need it later.
3245                                  */
3246                                 nsecheader = header;
3247                         } else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
3248                                 /*
3249                                  * If we need the NSEC rdataset, we'll also
3250                                  * need its signature.
3251                                  */
3252                                 nsecsig = header;
3253                         } else if (cname_ok &&
3254                                    header->type == RBTDB_RDATATYPE_SIGCNAME) {
3255                                 /*
3256                                  * If we get a CNAME match, we'll also need
3257                                  * its signature.
3258                                  */
3259                                 cnamesig = header;
3260                         }
3261                 }
3262         }
3263
3264         if (empty_node) {
3265                 /*
3266                  * We have an exact match for the name, but there are no
3267                  * active rdatasets in the desired version.  That means that
3268                  * this node doesn't exist in the desired version, and that
3269                  * we really have a partial match.
3270                  */
3271                 if (!wild) {
3272                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3273                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3274                         goto partial_match;
3275                 }
3276         }
3277
3278         /*
3279          * If we didn't find what we were looking for...
3280          */
3281         if (found == NULL) {
3282                 if (search.zonecut != NULL) {
3283                         /*
3284                          * We were trying to find glue at a node beneath a
3285                          * zone cut, but didn't.
3286                          *
3287                          * Return the delegation.
3288                          */
3289                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3290                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3291                         result = setup_delegation(&search, nodep, foundname,
3292                                                   rdataset, sigrdataset);
3293                         goto tree_exit;
3294                 }
3295                 /*
3296                  * The desired type doesn't exist.
3297                  */
3298                 result = DNS_R_NXRRSET;
3299                 if (search.rbtdb->secure &&
3300                     (nsecheader == NULL || nsecsig == NULL)) {
3301                         /*
3302                          * The zone is secure but there's no NSEC,
3303                          * or the NSEC has no signature!
3304                          */
3305                         if (!wild) {
3306                                 result = DNS_R_BADDB;
3307                                 goto node_exit;
3308                         }
3309
3310                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3311                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3312                         result = find_closest_nsec(&search, nodep, foundname,
3313                                                    rdataset, sigrdataset,
3314                                                    search.rbtdb->secure);
3315                         if (result == ISC_R_SUCCESS)
3316                                 result = DNS_R_EMPTYWILD;
3317                         goto tree_exit;
3318                 }
3319                 if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
3320                     nsecheader == NULL)
3321                 {
3322                         /*
3323                          * There's no NSEC record, and we were told
3324                          * to find one.
3325                          */
3326                         result = DNS_R_BADDB;
3327                         goto node_exit;
3328                 }
3329                 if (nodep != NULL) {
3330                         new_reference(search.rbtdb, node);
3331                         *nodep = node;
3332                 }
3333                 if (search.rbtdb->secure ||
3334                     (search.options & DNS_DBFIND_FORCENSEC) != 0)
3335                 {
3336                         bind_rdataset(search.rbtdb, node, nsecheader,
3337                                       0, rdataset);
3338                         if (nsecsig != NULL)
3339                                 bind_rdataset(search.rbtdb, node,
3340                                               nsecsig, 0, sigrdataset);
3341                 }
3342                 if (wild)
3343                         foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3344                 goto node_exit;
3345         }
3346
3347         /*
3348          * We found what we were looking for, or we found a CNAME.
3349          */
3350
3351         if (type != found->type &&
3352             type != dns_rdatatype_any &&
3353             found->type == dns_rdatatype_cname) {
3354                 /*
3355                  * We weren't doing an ANY query and we found a CNAME instead
3356                  * of the type we were looking for, so we need to indicate
3357                  * that result to the caller.
3358                  */
3359                 result = DNS_R_CNAME;
3360         } else if (search.zonecut != NULL) {
3361                 /*
3362                  * If we're beneath a zone cut, we must indicate that the
3363                  * result is glue, unless we're actually at the zone cut
3364                  * and the type is NSEC or KEY.
3365                  */
3366                 if (search.zonecut == node) {
3367                         /*
3368                          * It is not clear if KEY should still be
3369                          * allowed at the parent side of the zone
3370                          * cut or not.  It is needed for RFC3007
3371                          * validated updates.
3372                          */
3373                         if (type == dns_rdatatype_nsec ||
3374                             type == dns_rdatatype_key)
3375                                 result = ISC_R_SUCCESS;
3376                         else if (type == dns_rdatatype_any)
3377                                 result = DNS_R_ZONECUT;
3378                         else
3379                                 result = DNS_R_GLUE;
3380                 } else
3381                         result = DNS_R_GLUE;
3382                 /*
3383                  * We might have found data that isn't glue, but was occluded
3384                  * by a dynamic update.  If the caller cares about this, they
3385                  * will have told us to validate glue.
3386                  *
3387                  * XXX We should cache the glue validity state!
3388                  */
3389                 if (result == DNS_R_GLUE &&
3390                     (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
3391                     !valid_glue(&search, foundname, type, node)) {
3392                         lock = &search.rbtdb->node_locks[node->locknum].lock;
3393                         NODE_UNLOCK(lock, isc_rwlocktype_read);
3394                         result = setup_delegation(&search, nodep, foundname,
3395                                                   rdataset, sigrdataset);
3396                     goto tree_exit;
3397                 }
3398         } else {
3399                 /*
3400                  * An ordinary successful query!
3401                  */
3402                 result = ISC_R_SUCCESS;
3403         }
3404
3405         if (nodep != NULL) {
3406                 if (!at_zonecut)
3407                         new_reference(search.rbtdb, node);
3408                 else
3409                         search.need_cleanup = ISC_FALSE;
3410                 *nodep = node;
3411         }
3412
3413         if (type != dns_rdatatype_any) {
3414                 bind_rdataset(search.rbtdb, node, found, 0, rdataset);
3415                 if (foundsig != NULL)
3416                         bind_rdataset(search.rbtdb, node, foundsig, 0,
3417                                       sigrdataset);
3418         }
3419
3420         if (wild)
3421                 foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3422
3423  node_exit:
3424         NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3425                     isc_rwlocktype_read);
3426
3427  tree_exit:
3428         RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3429
3430         /*
3431          * If we found a zonecut but aren't going to use it, we have to
3432          * let go of it.
3433          */
3434         if (search.need_cleanup) {
3435                 node = search.zonecut;
3436                 lock = &(search.rbtdb->node_locks[node->locknum].lock);
3437
3438                 NODE_LOCK(lock, isc_rwlocktype_read);
3439                 decrement_reference(search.rbtdb, node, 0,
3440                                     isc_rwlocktype_read, isc_rwlocktype_none,
3441                                     ISC_FALSE);
3442                 NODE_UNLOCK(lock, isc_rwlocktype_read);
3443         }
3444
3445         if (close_version)
3446                 closeversion(db, &version, ISC_FALSE);
3447
3448         dns_rbtnodechain_reset(&search.chain);
3449
3450         return (result);
3451 }
3452
3453 static isc_result_t
3454 zone_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
3455                  isc_stdtime_t now, dns_dbnode_t **nodep,
3456                  dns_name_t *foundname,
3457                  dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3458 {
3459         UNUSED(db);
3460         UNUSED(name);
3461         UNUSED(options);
3462         UNUSED(now);
3463         UNUSED(nodep);
3464         UNUSED(foundname);
3465         UNUSED(rdataset);
3466         UNUSED(sigrdataset);
3467
3468         FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
3469
3470         return (ISC_R_NOTIMPLEMENTED);
3471 }
3472
3473 static isc_result_t
3474 cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
3475         rbtdb_search_t *search = arg;
3476         rdatasetheader_t *header, *header_prev, *header_next;
3477         rdatasetheader_t *dname_header, *sigdname_header;
3478         isc_result_t result;
3479         nodelock_t *lock;
3480         isc_rwlocktype_t locktype;
3481
3482         /* XXX comment */
3483
3484         REQUIRE(search->zonecut == NULL);
3485
3486         /*
3487          * Keep compiler silent.
3488          */
3489         UNUSED(name);
3490
3491         lock = &(search->rbtdb->node_locks[node->locknum].lock);
3492         locktype = isc_rwlocktype_read;
3493         NODE_LOCK(lock, locktype);
3494
3495         /*
3496          * Look for a DNAME or RRSIG DNAME rdataset.
3497          */
3498         dname_header = NULL;
3499         sigdname_header = NULL;
3500         header_prev = NULL;
3501         for (header = node->data; header != NULL; header = header_next) {
3502                 header_next = header->next;
3503                 if (header->rdh_ttl <= search->now) {
3504                         /*
3505                          * This rdataset is stale.  If no one else is
3506                          * using the node, we can clean it up right
3507                          * now, otherwise we mark it as stale, and
3508                          * the node as dirty, so it will get cleaned
3509                          * up later.
3510                          */
3511                         if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) &&
3512                             (locktype == isc_rwlocktype_write ||
3513                              NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3514                                 /*
3515                                  * We update the node's status only when we
3516                                  * can get write access; otherwise, we leave
3517                                  * others to this work.  Periodical cleaning
3518                                  * will eventually take the job as the last
3519                                  * resort.
3520                                  * We won't downgrade the lock, since other
3521                                  * rdatasets are probably stale, too.
3522                                  */
3523                                 locktype = isc_rwlocktype_write;
3524
3525                                 if (dns_rbtnode_refcurrent(node) == 0) {
3526                                         isc_mem_t *mctx;
3527
3528                                         /*
3529                                          * header->down can be non-NULL if the
3530