bind - Removed version tag from contrib directory and updated README.DRAGONFLY.
[dragonfly.git] / contrib / bind / lib / dns / rbtdb.c
CommitLineData
bbbf71a3
JL
1/*
2 * Copyright (C) 2004-2009 Internet Systems Consortium, Inc. ("ISC")
3 * Copyright (C) 1999-2003 Internet Software Consortium.
4 *
5 * Permission to use, copy, modify, and/or distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
15 * PERFORMANCE OF THIS SOFTWARE.
16 */
17
c4067435 18/* $Id: rbtdb.c,v 1.248.12.18.2.1 2009/11/18 23:41:18 marka Exp $ */
bbbf71a3
JL
19
20/*! \file */
21
22/*
23 * Principal Author: Bob Halley
24 */
25
26#include <config.h>
27
28#include <isc/heap.h>
29#include <isc/event.h>
30#include <isc/mem.h>
31#include <isc/platform.h>
32#include <isc/print.h>
33#include <isc/mutex.h>
34#include <isc/random.h>
35#include <isc/refcount.h>
36#include <isc/rwlock.h>
37#include <isc/string.h>
38#include <isc/task.h>
39#include <isc/time.h>
40#include <isc/util.h>
41
42#include <dns/acache.h>
43#include <dns/db.h>
44#include <dns/dbiterator.h>
45#include <dns/events.h>
46#include <dns/fixedname.h>
47#include <dns/lib.h>
48#include <dns/log.h>
49#include <dns/masterdump.h>
50#include <dns/rbt.h>
51#include <dns/rdata.h>
52#include <dns/rdataset.h>
53#include <dns/rdatasetiter.h>
54#include <dns/rdataslab.h>
55#include <dns/result.h>
56#include <dns/stats.h>
57#include <dns/view.h>
58#include <dns/zone.h>
59#include <dns/zonekey.h>
60
61#ifdef DNS_RBTDB_VERSION64
62#include "rbtdb64.h"
63#else
64#include "rbtdb.h"
65#endif
66
67#ifdef DNS_RBTDB_VERSION64
68#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '8')
69#else
70#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4')
71#endif
72
73/*%
74 * Note that "impmagic" is not the first four bytes of the struct, so
75 * ISC_MAGIC_VALID cannot be used.
76 */
77#define VALID_RBTDB(rbtdb) ((rbtdb) != NULL && \
78 (rbtdb)->common.impmagic == RBTDB_MAGIC)
79
80#ifdef DNS_RBTDB_VERSION64
81typedef isc_uint64_t rbtdb_serial_t;
82/*%
83 * Make casting easier in symbolic debuggers by using different names
84 * for the 64 bit version.
85 */
86#define dns_rbtdb_t dns_rbtdb64_t
87#define rdatasetheader_t rdatasetheader64_t
88#define rbtdb_version_t rbtdb_version64_t
89#else
90typedef isc_uint32_t rbtdb_serial_t;
91#endif
92
93typedef isc_uint32_t rbtdb_rdatatype_t;
94
95#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
96#define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16))
97#define RBTDB_RDATATYPE_VALUE(b, e) (((e) << 16) | (b))
98
99#define RBTDB_RDATATYPE_SIGNSEC \
100 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
101#define RBTDB_RDATATYPE_SIGNS \
102 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
103#define RBTDB_RDATATYPE_SIGCNAME \
104 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
105#define RBTDB_RDATATYPE_SIGDNAME \
106 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
107#define RBTDB_RDATATYPE_NCACHEANY \
108 RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
109
110/*
111 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
112 * Using rwlock is effective with regard to lookup performance only when
113 * it is implemented in an efficient way.
114 * Otherwise, it is generally wise to stick to the simple locking since rwlock
115 * would require more memory or can even make lookups slower due to its own
116 * overhead (when it internally calls mutex locks).
117 */
118#ifdef ISC_RWLOCK_USEATOMIC
119#define DNS_RBTDB_USERWLOCK 1
120#else
121#define DNS_RBTDB_USERWLOCK 0
122#endif
123
124#if DNS_RBTDB_USERWLOCK
125#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0)
126#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l)
127#define RBTDB_LOCK(l, t) RWLOCK((l), (t))
128#define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t))
129#else
130#define RBTDB_INITLOCK(l) isc_mutex_init(l)
131#define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l)
132#define RBTDB_LOCK(l, t) LOCK(l)
133#define RBTDB_UNLOCK(l, t) UNLOCK(l)
134#endif
135
136/*
137 * Since node locking is sensitive to both performance and memory footprint,
138 * we need some trick here. If we have both high-performance rwlock and
139 * high performance and small-memory reference counters, we use rwlock for
140 * node lock and isc_refcount for node references. In this case, we don't have
141 * to protect the access to the counters by locks.
142 * Otherwise, we simply use ordinary mutex lock for node locking, and use
143 * simple integers as reference counters which is protected by the lock.
144 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
145 * NODE_UNLOCK. In some other cases, however, we need to protect reference
146 * counters first and then protect other parts of a node as read-only data.
147 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
148 * provided for these special cases. When we can use the efficient backend
149 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
150 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
151 * section including the access to the reference counter.
152 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
153 * section is also protected by NODE_STRONGLOCK().
154 */
155#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
156typedef isc_rwlock_t nodelock_t;
157
158#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0)
159#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l)
160#define NODE_LOCK(l, t) RWLOCK((l), (t))
161#define NODE_UNLOCK(l, t) RWUNLOCK((l), (t))
162#define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l)
163
164#define NODE_STRONGLOCK(l) ((void)0)
165#define NODE_STRONGUNLOCK(l) ((void)0)
166#define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t)
167#define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t)
168#define NODE_WEAKDOWNGRADE(l) isc_rwlock_downgrade(l)
169#else
170typedef isc_mutex_t nodelock_t;
171
172#define NODE_INITLOCK(l) isc_mutex_init(l)
173#define NODE_DESTROYLOCK(l) DESTROYLOCK(l)
174#define NODE_LOCK(l, t) LOCK(l)
175#define NODE_UNLOCK(l, t) UNLOCK(l)
176#define NODE_TRYUPGRADE(l) ISC_R_SUCCESS
177
178#define NODE_STRONGLOCK(l) LOCK(l)
179#define NODE_STRONGUNLOCK(l) UNLOCK(l)
180#define NODE_WEAKLOCK(l, t) ((void)0)
181#define NODE_WEAKUNLOCK(l, t) ((void)0)
182#define NODE_WEAKDOWNGRADE(l) ((void)0)
183#endif
184
185/*%
186 * Whether to rate-limit updating the LRU to avoid possible thread contention.
187 * Our performance measurement has shown the cost is marginal, so it's defined
188 * to be 0 by default either with or without threads.
189 */
190#ifndef DNS_RBTDB_LIMITLRUUPDATE
191#define DNS_RBTDB_LIMITLRUUPDATE 0
192#endif
193
194/*
195 * Allow clients with a virtual time of up to 5 minutes in the past to see
196 * records that would have otherwise have expired.
197 */
198#define RBTDB_VIRTUAL 300
199
200struct noqname {
201 dns_name_t name;
202 void * nsec;
203 void * nsecsig;
204};
205
206typedef struct acachectl acachectl_t;
207
208typedef struct rdatasetheader {
209 /*%
210 * Locked by the owning node's lock.
211 */
212 rbtdb_serial_t serial;
213 dns_ttl_t rdh_ttl;
214 rbtdb_rdatatype_t type;
215 isc_uint16_t attributes;
216 dns_trust_t trust;
217 struct noqname *noqname;
218 /*%<
219 * We don't use the LIST macros, because the LIST structure has
220 * both head and tail pointers, and is doubly linked.
221 */
222
223 struct rdatasetheader *next;
224 /*%<
225 * If this is the top header for an rdataset, 'next' points
226 * to the top header for the next rdataset (i.e., the next type).
227 * Otherwise, it points up to the header whose down pointer points
228 * at this header.
229 */
230
231 struct rdatasetheader *down;
232 /*%<
233 * Points to the header for the next older version of
234 * this rdataset.
235 */
236
237 isc_uint32_t count;
238 /*%<
239 * Monotonously increased every time this rdataset is bound so that
240 * it is used as the base of the starting point in DNS responses
241 * when the "cyclic" rrset-order is required. Since the ordering
242 * should not be so crucial, no lock is set for the counter for
243 * performance reasons.
244 */
245
246 acachectl_t *additional_auth;
247 acachectl_t *additional_glue;
248
249 dns_rbtnode_t *node;
250 isc_stdtime_t last_used;
251 ISC_LINK(struct rdatasetheader) lru_link;
252 /*%<
253 * Used for LRU-based cache management. We should probably make
254 * these cache-DB specific. We might also make it a pointer and
255 * ensure only the top header has a valid link to save memory.
256 * The linked-list is locked by the rbtdb->lrulock.
257 */
258
259 /*
260 * It's possible this should not be here anymore, but instead
261 * referenced from the bucket's heap directly.
262 */
263#if 0
264 isc_heap_t *heap;
265#endif
266 unsigned int heap_index;
267 /*%<
268 * Used for TTL-based cache cleaning.
269 */
270} rdatasetheader_t;
271
272typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t;
273typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t;
274
275#define RDATASET_ATTR_NONEXISTENT 0x0001
276#define RDATASET_ATTR_STALE 0x0002
277#define RDATASET_ATTR_IGNORE 0x0004
278#define RDATASET_ATTR_RETAIN 0x0008
279#define RDATASET_ATTR_NXDOMAIN 0x0010
280#define RDATASET_ATTR_RESIGN 0x0020
281#define RDATASET_ATTR_STATCOUNT 0x0040
282
283typedef struct acache_cbarg {
284 dns_rdatasetadditional_t type;
285 unsigned int count;
286 dns_db_t *db;
287 dns_dbnode_t *node;
288 rdatasetheader_t *header;
289} acache_cbarg_t;
290
291struct acachectl {
292 dns_acacheentry_t *entry;
293 acache_cbarg_t *cbarg;
294};
295
296/*
297 * XXX
298 * When the cache will pre-expire data (due to memory low or other
299 * situations) before the rdataset's TTL has expired, it MUST
300 * respect the RETAIN bit and not expire the data until its TTL is
301 * expired.
302 */
303
304#undef IGNORE /* WIN32 winbase.h defines this. */
305
306#define EXISTS(header) \
307 (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
308#define NONEXISTENT(header) \
309 (((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
310#define IGNORE(header) \
311 (((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
312#define RETAIN(header) \
313 (((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
314#define NXDOMAIN(header) \
315 (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
316
317#define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */
318
319/*%
320 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
321 * There is a tradeoff issue about configuring this value: if this is too
322 * small, it may cause heavier contention between threads; if this is too large,
323 * LRU purge algorithm won't work well (entries tend to be purged prematurely).
324 * The default value should work well for most environments, but this can
325 * also be configurable at compilation time via the
326 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable. This value must be larger than
327 * 1 due to the assumption of overmem_purge().
328 */
329#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
330#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
331#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
332#else
333#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
334#endif
335#else
336#define DEFAULT_CACHE_NODE_LOCK_COUNT 16
337#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
338
339typedef struct {
340 nodelock_t lock;
341 /* Protected in the refcount routines. */
342 isc_refcount_t references;
343 /* Locked by lock. */
344 isc_boolean_t exiting;
345} rbtdb_nodelock_t;
346
347typedef struct rbtdb_changed {
348 dns_rbtnode_t * node;
349 isc_boolean_t dirty;
350 ISC_LINK(struct rbtdb_changed) link;
351} rbtdb_changed_t;
352
353typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t;
354
355typedef struct rbtdb_version {
356 /* Not locked */
357 rbtdb_serial_t serial;
358 /*
359 * Protected in the refcount routines.
360 * XXXJT: should we change the lock policy based on the refcount
361 * performance?
362 */
363 isc_refcount_t references;
364 /* Locked by database lock. */
365 isc_boolean_t writer;
366 isc_boolean_t commit_ok;
367 rbtdb_changedlist_t changed_list;
368 ISC_LINK(struct rbtdb_version) link;
369} rbtdb_version_t;
370
371typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t;
372
373typedef struct {
374 /* Unlocked. */
375 dns_db_t common;
376#if DNS_RBTDB_USERWLOCK
377 isc_rwlock_t lock;
378#else
379 isc_mutex_t lock;
380#endif
381 isc_rwlock_t tree_lock;
382 unsigned int node_lock_count;
383 rbtdb_nodelock_t * node_locks;
384 dns_rbtnode_t * origin_node;
385 dns_stats_t * rrsetstats; /* cache DB only */
386 /* Locked by lock. */
387 unsigned int active;
388 isc_refcount_t references;
389 unsigned int attributes;
390 rbtdb_serial_t current_serial;
391 rbtdb_serial_t least_serial;
392 rbtdb_serial_t next_serial;
393 rbtdb_version_t * current_version;
394 rbtdb_version_t * future_version;
395 rbtdb_versionlist_t open_versions;
396 isc_boolean_t overmem;
397 isc_task_t * task;
398 dns_dbnode_t *soanode;
399 dns_dbnode_t *nsnode;
400
401 /*
402 * This is a linked list used to implement the LRU cache. There will
403 * be node_lock_count linked lists here. Nodes in bucket 1 will be
404 * placed on the linked list rdatasets[1].
405 */
406 rdatasetheaderlist_t *rdatasets;
407
408 /*%
409 * Temporary storage for stale cache nodes and dynamically deleted
410 * nodes that await being cleaned up.
411 */
412 rbtnodelist_t *deadnodes;
413
414 /*
415 * Heaps. Each of these is used for TTL based expiry.
416 */
417 isc_heap_t **heaps;
418
419 /* Locked by tree_lock. */
420 dns_rbt_t * tree;
421 isc_boolean_t secure;
422
423 /* Unlocked */
424 unsigned int quantum;
425} dns_rbtdb_t;
426
427#define RBTDB_ATTR_LOADED 0x01
428#define RBTDB_ATTR_LOADING 0x02
429
430/*%
431 * Search Context
432 */
433typedef struct {
434 dns_rbtdb_t * rbtdb;
435 rbtdb_version_t * rbtversion;
436 rbtdb_serial_t serial;
437 unsigned int options;
438 dns_rbtnodechain_t chain;
439 isc_boolean_t copy_name;
440 isc_boolean_t need_cleanup;
441 isc_boolean_t wild;
442 dns_rbtnode_t * zonecut;
443 rdatasetheader_t * zonecut_rdataset;
444 rdatasetheader_t * zonecut_sigrdataset;
445 dns_fixedname_t zonecut_name;
446 isc_stdtime_t now;
447} rbtdb_search_t;
448
449/*%
450 * Load Context
451 */
452typedef struct {
453 dns_rbtdb_t * rbtdb;
454 isc_stdtime_t now;
455} rbtdb_load_t;
456
457static void rdataset_disassociate(dns_rdataset_t *rdataset);
458static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
459static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
460static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
461static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
462static unsigned int rdataset_count(dns_rdataset_t *rdataset);
463static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
464 dns_name_t *name,
465 dns_rdataset_t *nsec,
466 dns_rdataset_t *nsecsig);
467static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
468 dns_rdatasetadditional_t type,
469 dns_rdatatype_t qtype,
470 dns_acache_t *acache,
471 dns_zone_t **zonep,
472 dns_db_t **dbp,
473 dns_dbversion_t **versionp,
474 dns_dbnode_t **nodep,
475 dns_name_t *fname,
476 dns_message_t *msg,
477 isc_stdtime_t now);
478static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
479 dns_rdatasetadditional_t type,
480 dns_rdatatype_t qtype,
481 dns_acache_t *acache,
482 dns_zone_t *zone,
483 dns_db_t *db,
484 dns_dbversion_t *version,
485 dns_dbnode_t *node,
486 dns_name_t *fname);
487static isc_result_t rdataset_putadditional(dns_acache_t *acache,
488 dns_rdataset_t *rdataset,
489 dns_rdatasetadditional_t type,
490 dns_rdatatype_t qtype);
491static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
492 isc_stdtime_t now);
493static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
494 isc_stdtime_t now);
495static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
496 isc_boolean_t tree_locked);
497static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
498 isc_stdtime_t now, isc_boolean_t tree_locked);
499static void prune_tree(isc_task_t *task, isc_event_t *event);
500
501static dns_rdatasetmethods_t rdataset_methods = {
502 rdataset_disassociate,
503 rdataset_first,
504 rdataset_next,
505 rdataset_current,
506 rdataset_clone,
507 rdataset_count,
508 NULL,
509 rdataset_getnoqname,
510 rdataset_getadditional,
511 rdataset_setadditional,
512 rdataset_putadditional
513};
514
515static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
516static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
517static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
518static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
519 dns_rdataset_t *rdataset);
520
521static dns_rdatasetitermethods_t rdatasetiter_methods = {
522 rdatasetiter_destroy,
523 rdatasetiter_first,
524 rdatasetiter_next,
525 rdatasetiter_current
526};
527
528typedef struct rbtdb_rdatasetiter {
529 dns_rdatasetiter_t common;
530 rdatasetheader_t * current;
531} rbtdb_rdatasetiter_t;
532
533static void dbiterator_destroy(dns_dbiterator_t **iteratorp);
534static isc_result_t dbiterator_first(dns_dbiterator_t *iterator);
535static isc_result_t dbiterator_last(dns_dbiterator_t *iterator);
536static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator,
537 dns_name_t *name);
538static isc_result_t dbiterator_prev(dns_dbiterator_t *iterator);
539static isc_result_t dbiterator_next(dns_dbiterator_t *iterator);
540static isc_result_t dbiterator_current(dns_dbiterator_t *iterator,
541 dns_dbnode_t **nodep,
542 dns_name_t *name);
543static isc_result_t dbiterator_pause(dns_dbiterator_t *iterator);
544static isc_result_t dbiterator_origin(dns_dbiterator_t *iterator,
545 dns_name_t *name);
546
547static dns_dbiteratormethods_t dbiterator_methods = {
548 dbiterator_destroy,
549 dbiterator_first,
550 dbiterator_last,
551 dbiterator_seek,
552 dbiterator_prev,
553 dbiterator_next,
554 dbiterator_current,
555 dbiterator_pause,
556 dbiterator_origin
557};
558
559#define DELETION_BATCH_MAX 64
560
561/*
562 * If 'paused' is ISC_TRUE, then the tree lock is not being held.
563 */
564typedef struct rbtdb_dbiterator {
565 dns_dbiterator_t common;
566 isc_boolean_t paused;
567 isc_boolean_t new_origin;
568 isc_rwlocktype_t tree_locked;
569 isc_result_t result;
570 dns_fixedname_t name;
571 dns_fixedname_t origin;
572 dns_rbtnodechain_t chain;
573 dns_rbtnode_t *node;
574 dns_rbtnode_t *deletions[DELETION_BATCH_MAX];
575 int delete;
576} rbtdb_dbiterator_t;
577
578
579#define IS_STUB(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0)
580#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
581
582static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
583 isc_event_t *event);
584static void overmem(dns_db_t *db, isc_boolean_t overmem);
585
586/*%
587 * 'init_count' is used to initialize 'newheader->count' which inturn
588 * is used to determine where in the cycle rrset-order cyclic starts.
589 * We don't lock this as we don't care about simultaneous updates.
590 *
591 * Note:
592 * Both init_count and header->count can be ISC_UINT32_MAX.
593 * The count on the returned rdataset however can't be as
594 * that indicates that the database does not implement cyclic
595 * processing.
596 */
597static unsigned int init_count;
598
599/*
600 * Locking
601 *
602 * If a routine is going to lock more than one lock in this module, then
603 * the locking must be done in the following order:
604 *
605 * Tree Lock
606 *
607 * Node Lock (Only one from the set may be locked at one time by
608 * any caller)
609 *
610 * Database Lock
611 *
612 * Failure to follow this hierarchy can result in deadlock.
613 */
614
615/*
616 * Deleting Nodes
617 *
618 * For zone databases the node for the origin of the zone MUST NOT be deleted.
619 */
620
621
622/*
623 * DB Routines
624 */
625
626static void
627attach(dns_db_t *source, dns_db_t **targetp) {
628 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
629
630 REQUIRE(VALID_RBTDB(rbtdb));
631
632 isc_refcount_increment(&rbtdb->references, NULL);
633
634 *targetp = source;
635}
636
637static void
638free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
639 dns_rbtdb_t *rbtdb = event->ev_arg;
640
641 UNUSED(task);
642
643 free_rbtdb(rbtdb, ISC_TRUE, event);
644}
645
646static void
647update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
648 isc_boolean_t increment)
649{
650 dns_rdatastatstype_t statattributes = 0;
651 dns_rdatastatstype_t base = 0;
652 dns_rdatastatstype_t type;
653
654 /* At the moment we count statistics only for cache DB */
655 INSIST(IS_CACHE(rbtdb));
656
657 if (NXDOMAIN(header))
658 statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
659 else if (RBTDB_RDATATYPE_BASE(header->type) == 0) {
660 statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
661 base = RBTDB_RDATATYPE_EXT(header->type);
662 } else
663 base = RBTDB_RDATATYPE_BASE(header->type);
664
665 type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
666 if (increment)
667 dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
668 else
669 dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
670}
671
672static void
673set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
674 int idx;
675 isc_heap_t *heap;
676 dns_ttl_t oldttl;
677
678 oldttl = header->rdh_ttl;
679 header->rdh_ttl = newttl;
680
681 /*
682 * It's possible the rbtdb is not a cache. If this is the case,
683 * we will not have a heap, and we move on. If we do, though,
684 * we might need to adjust things.
685 */
686 if (header->heap_index == 0 || newttl == oldttl)
687 return;
688 idx = header->node->locknum;
689 if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
690 return;
691 heap = rbtdb->heaps[idx];
692
693 if (newttl < oldttl)
694 isc_heap_increased(heap, header->heap_index);
695 else
696 isc_heap_decreased(heap, header->heap_index);
697}
698
699/*%
700 * This function allows the heap code to rank the priority of each
701 * element. It returns ISC_TRUE if v1 happens "sooner" than v2.
702 */
703static isc_boolean_t
704ttl_sooner(void *v1, void *v2) {
705 rdatasetheader_t *h1 = v1;
706 rdatasetheader_t *h2 = v2;
707
708 if (h1->rdh_ttl < h2->rdh_ttl)
709 return (ISC_TRUE);
710 return (ISC_FALSE);
711}
712
713/*%
714 * This function sets the heap index into the header.
715 */
716static void
717ttl_set_index(void *what, unsigned int index) {
718 rdatasetheader_t *h = what;
719
720 h->heap_index = index;
721}
722
723/*%
724 * Work out how many nodes can be deleted in the time between two
725 * requests to the nameserver. Smooth the resulting number and use it
726 * as a estimate for the number of nodes to be deleted in the next
727 * iteration.
728 */
729static unsigned int
730adjust_quantum(unsigned int old, isc_time_t *start) {
731 unsigned int pps = dns_pps; /* packets per second */
732 unsigned int interval;
733 isc_uint64_t usecs;
734 isc_time_t end;
735 unsigned int new;
736
737 if (pps < 100)
738 pps = 100;
739 isc_time_now(&end);
740
741 interval = 1000000 / pps; /* interval in usec */
742 if (interval == 0)
743 interval = 1;
744 usecs = isc_time_microdiff(&end, start);
745 if (usecs == 0) {
746 /*
747 * We were unable to measure the amount of time taken.
748 * Double the nodes deleted next time.
749 */
750 old *= 2;
751 if (old > 1000)
752 old = 1000;
753 return (old);
754 }
755 new = old * interval;
756 new /= (unsigned int)usecs;
757 if (new == 0)
758 new = 1;
759 else if (new > 1000)
760 new = 1000;
761
762 /* Smooth */
763 new = (new + old * 3) / 4;
764
765 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
766 ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
767
768 return (new);
769}
770
771static void
772free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
773 unsigned int i;
774 isc_ondestroy_t ondest;
775 isc_result_t result;
776 char buf[DNS_NAME_FORMATSIZE];
777 isc_time_t start;
778
779 if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
780 overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);
781
782 REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
783 REQUIRE(rbtdb->future_version == NULL);
784
785 if (rbtdb->current_version != NULL) {
786 unsigned int refs;
787
788 isc_refcount_decrement(&rbtdb->current_version->references,
789 &refs);
790 INSIST(refs == 0);
791 UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
792 isc_refcount_destroy(&rbtdb->current_version->references);
793 isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
794 sizeof(rbtdb_version_t));
795 }
796
797 /*
798 * We assume the number of remaining dead nodes is reasonably small;
799 * the overhead of unlinking all nodes here should be negligible.
800 */
801 for (i = 0; i < rbtdb->node_lock_count; i++) {
802 dns_rbtnode_t *node;
803
804 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
805 while (node != NULL) {
806 ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
807 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
808 }
809 }
810
811 if (event == NULL)
812 rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
813 again:
814 if (rbtdb->tree != NULL) {
815 isc_time_now(&start);
816 result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
817 if (result == ISC_R_QUOTA) {
818 INSIST(rbtdb->task != NULL);
819 if (rbtdb->quantum != 0)
820 rbtdb->quantum = adjust_quantum(rbtdb->quantum,
821 &start);
822 if (event == NULL)
823 event = isc_event_allocate(rbtdb->common.mctx,
824 NULL,
825 DNS_EVENT_FREESTORAGE,
826 free_rbtdb_callback,
827 rbtdb,
828 sizeof(isc_event_t));
829 if (event == NULL)
830 goto again;
831 isc_task_send(rbtdb->task, &event);
832 return;
833 }
834 INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
835 }
836 if (event != NULL)
837 isc_event_free(&event);
838 if (log) {
839 if (dns_name_dynamic(&rbtdb->common.origin))
840 dns_name_format(&rbtdb->common.origin, buf,
841 sizeof(buf));
842 else
843 strcpy(buf, "<UNKNOWN>");
844 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
845 DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
846 "done free_rbtdb(%s)", buf);
847 }
848 if (dns_name_dynamic(&rbtdb->common.origin))
849 dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
850 for (i = 0; i < rbtdb->node_lock_count; i++) {
851 isc_refcount_destroy(&rbtdb->node_locks[i].references);
852 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
853 }
854
855 /*
856 * Clean up LRU cache objects.
857 */
858 if (rbtdb->rdatasets != NULL) {
859 for (i = 0; i < rbtdb->node_lock_count; i++)
860 INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
861 isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
862 rbtdb->node_lock_count *
863 sizeof(rdatasetheaderlist_t));
864 }
865 /*
866 * Clean up dead node buckets.
867 */
868 if (rbtdb->deadnodes != NULL) {
869 for (i = 0; i < rbtdb->node_lock_count; i++)
870 INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
871 isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
872 rbtdb->node_lock_count * sizeof(rbtnodelist_t));
873 }
874 /*
875 * Clean up TTL heap cache objects.
876 */
877 if (rbtdb->heaps != NULL) {
878 for (i = 0; i < rbtdb->node_lock_count; i++)
879 isc_heap_destroy(&rbtdb->heaps[i]);
880 isc_mem_put(rbtdb->common.mctx, rbtdb->heaps,
881 rbtdb->node_lock_count *
882 sizeof(isc_heap_t *));
883 }
884
885 if (rbtdb->rrsetstats != NULL)
886 dns_stats_detach(&rbtdb->rrsetstats);
887
888 isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
889 rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
890 isc_rwlock_destroy(&rbtdb->tree_lock);
891 isc_refcount_destroy(&rbtdb->references);
892 if (rbtdb->task != NULL)
893 isc_task_detach(&rbtdb->task);
894
895 RBTDB_DESTROYLOCK(&rbtdb->lock);
896 rbtdb->common.magic = 0;
897 rbtdb->common.impmagic = 0;
898 ondest = rbtdb->common.ondest;
899 isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
900 isc_ondestroy_notify(&ondest, rbtdb);
901}
902
903static inline void
904maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
905 isc_boolean_t want_free = ISC_FALSE;
906 unsigned int i;
907 unsigned int inactive = 0;
908
909 /* XXX check for open versions here */
910
911 if (rbtdb->soanode != NULL)
912 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
913 if (rbtdb->nsnode != NULL)
914 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
915
916 /*
917 * Even though there are no external direct references, there still
918 * may be nodes in use.
919 */
920 for (i = 0; i < rbtdb->node_lock_count; i++) {
921 NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
922 rbtdb->node_locks[i].exiting = ISC_TRUE;
923 NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
924 if (isc_refcount_current(&rbtdb->node_locks[i].references)
925 == 0) {
926 inactive++;
927 }
928 }
929
930 if (inactive != 0) {
931 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
932 rbtdb->active -= inactive;
933 if (rbtdb->active == 0)
934 want_free = ISC_TRUE;
935 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
936 if (want_free) {
937 char buf[DNS_NAME_FORMATSIZE];
938 if (dns_name_dynamic(&rbtdb->common.origin))
939 dns_name_format(&rbtdb->common.origin, buf,
940 sizeof(buf));
941 else
942 strcpy(buf, "<UNKNOWN>");
943 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
944 DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
945 "calling free_rbtdb(%s)", buf);
946 free_rbtdb(rbtdb, ISC_TRUE, NULL);
947 }
948 }
949}
950
951static void
952detach(dns_db_t **dbp) {
953 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
954 unsigned int refs;
955
956 REQUIRE(VALID_RBTDB(rbtdb));
957
958 isc_refcount_decrement(&rbtdb->references, &refs);
959
960 if (refs == 0)
961 maybe_free_rbtdb(rbtdb);
962
963 *dbp = NULL;
964}
965
966static void
967currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
968 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
969 rbtdb_version_t *version;
970 unsigned int refs;
971
972 REQUIRE(VALID_RBTDB(rbtdb));
973
974 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
975 version = rbtdb->current_version;
976 isc_refcount_increment(&version->references, &refs);
977 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
978
979 *versionp = (dns_dbversion_t *)version;
980}
981
982static inline rbtdb_version_t *
983allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
984 unsigned int references, isc_boolean_t writer)
985{
986 isc_result_t result;
987 rbtdb_version_t *version;
988
989 version = isc_mem_get(mctx, sizeof(*version));
990 if (version == NULL)
991 return (NULL);
992 version->serial = serial;
993 result = isc_refcount_init(&version->references, references);
994 if (result != ISC_R_SUCCESS) {
995 isc_mem_put(mctx, version, sizeof(*version));
996 return (NULL);
997 }
998 version->writer = writer;
999 version->commit_ok = ISC_FALSE;
1000 ISC_LIST_INIT(version->changed_list);
1001 ISC_LINK_INIT(version, link);
1002
1003 return (version);
1004}
1005
1006static isc_result_t
1007newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1008 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1009 rbtdb_version_t *version;
1010
1011 REQUIRE(VALID_RBTDB(rbtdb));
1012 REQUIRE(versionp != NULL && *versionp == NULL);
1013 REQUIRE(rbtdb->future_version == NULL);
1014
1015 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1016 RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */
1017 version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1018 ISC_TRUE);
1019 if (version != NULL) {
1020 version->commit_ok = ISC_TRUE;
1021 rbtdb->next_serial++;
1022 rbtdb->future_version = version;
1023 }
1024 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1025
1026 if (version == NULL)
1027 return (ISC_R_NOMEMORY);
1028
1029 *versionp = version;
1030
1031 return (ISC_R_SUCCESS);
1032}
1033
1034static void
1035attachversion(dns_db_t *db, dns_dbversion_t *source,
1036 dns_dbversion_t **targetp)
1037{
1038 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1039 rbtdb_version_t *rbtversion = source;
1040 unsigned int refs;
1041
1042 REQUIRE(VALID_RBTDB(rbtdb));
1043
1044 isc_refcount_increment(&rbtversion->references, &refs);
1045 INSIST(refs > 1);
1046
1047 *targetp = rbtversion;
1048}
1049
1050static rbtdb_changed_t *
1051add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1052 dns_rbtnode_t *node)
1053{
1054 rbtdb_changed_t *changed;
1055 unsigned int refs;
1056
1057 /*
1058 * Caller must be holding the node lock if its reference must be
1059 * protected by the lock.
1060 */
1061
1062 changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1063
1064 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1065
1066 REQUIRE(version->writer);
1067
1068 if (changed != NULL) {
1069 dns_rbtnode_refincrement(node, &refs);
1070 INSIST(refs != 0);
1071 changed->node = node;
1072 changed->dirty = ISC_FALSE;
1073 ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1074 } else
1075 version->commit_ok = ISC_FALSE;
1076
1077 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1078
1079 return (changed);
1080}
1081
1082static void
1083free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
1084 acachectl_t *array)
1085{
1086 unsigned int count;
1087 unsigned int i;
1088 unsigned char *raw; /* RDATASLAB */
1089
1090 /*
1091 * The caller must be holding the corresponding node lock.
1092 */
1093
1094 if (array == NULL)
1095 return;
1096
1097 raw = (unsigned char *)header + sizeof(*header);
1098 count = raw[0] * 256 + raw[1];
1099
1100 /*
1101 * Sanity check: since an additional cache entry has a reference to
1102 * the original DB node (in the callback arg), there should be no
1103 * acache entries when the node can be freed.
1104 */
1105 for (i = 0; i < count; i++)
1106 INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1107
1108 isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1109}
1110
1111static inline void
1112free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1113
1114 if (dns_name_dynamic(&(*noqname)->name))
1115 dns_name_free(&(*noqname)->name, mctx);
1116 if ((*noqname)->nsec != NULL)
1117 isc_mem_put(mctx, (*noqname)->nsec,
1118 dns_rdataslab_size((*noqname)->nsec, 0));
1119 if ((*noqname)->nsecsig != NULL)
1120 isc_mem_put(mctx, (*noqname)->nsecsig,
1121 dns_rdataslab_size((*noqname)->nsecsig, 0));
1122 isc_mem_put(mctx, *noqname, sizeof(**noqname));
1123 *noqname = NULL;
1124}
1125
1126static inline void
1127init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
1128{
1129 ISC_LINK_INIT(h, lru_link);
1130 h->heap_index = 0;
1131
1132#if TRACE_HEADER
1133 if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1134 fprintf(stderr, "initialized header: %p\n", h);
1135#else
1136 UNUSED(rbtdb);
1137#endif
1138}
1139
1140static inline rdatasetheader_t *
1141new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
1142{
1143 rdatasetheader_t *h;
1144
1145 h = isc_mem_get(mctx, sizeof(*h));
1146 if (h == NULL)
1147 return (NULL);
1148
1149#if TRACE_HEADER
1150 if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1151 fprintf(stderr, "allocated header: %p\n", h);
1152#endif
1153 init_rdataset(rbtdb, h);
1154 return (h);
1155}
1156
1157static inline void
1158free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
1159{
1160 unsigned int size;
1161
1162 if (EXISTS(rdataset) &&
1163 (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
1164 update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
1165 }
1166
1167 if (IS_CACHE(rbtdb) && ISC_LINK_LINKED(rdataset, lru_link)) {
1168 int idx = rdataset->node->locknum;
1169 ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link);
1170 if (rdataset->heap_index != 0) {
1171 isc_heap_delete(rbtdb->heaps[idx],
1172 rdataset->heap_index);
1173 }
1174 rdataset->heap_index = 0;
1175 }
1176
1177 if (rdataset->noqname != NULL)
1178 free_noqname(mctx, &rdataset->noqname);
1179
1180 free_acachearray(mctx, rdataset, rdataset->additional_auth);
1181 free_acachearray(mctx, rdataset, rdataset->additional_glue);
1182
1183 if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
1184 size = sizeof(*rdataset);
1185 else
1186 size = dns_rdataslab_size((unsigned char *)rdataset,
1187 sizeof(*rdataset));
1188 isc_mem_put(mctx, rdataset, size);
1189}
1190
1191static inline void
1192rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1193 rdatasetheader_t *header, *dcurrent;
1194 isc_boolean_t make_dirty = ISC_FALSE;
1195
1196 /*
1197 * Caller must hold the node lock.
1198 */
1199
1200 /*
1201 * We set the IGNORE attribute on rdatasets with serial number
1202 * 'serial'. When the reference count goes to zero, these rdatasets
1203 * will be cleaned up; until that time, they will be ignored.
1204 */
1205 for (header = node->data; header != NULL; header = header->next) {
1206 if (header->serial == serial) {
1207 header->attributes |= RDATASET_ATTR_IGNORE;
1208 make_dirty = ISC_TRUE;
1209 }
1210 for (dcurrent = header->down;
1211 dcurrent != NULL;
1212 dcurrent = dcurrent->down) {
1213 if (dcurrent->serial == serial) {
1214 dcurrent->attributes |= RDATASET_ATTR_IGNORE;
1215 make_dirty = ISC_TRUE;
1216 }
1217 }
1218 }
1219 if (make_dirty)
1220 node->dirty = 1;
1221}
1222
1223static inline void
1224clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
1225{
1226 rdatasetheader_t *d, *down_next;
1227
1228 for (d = top->down; d != NULL; d = down_next) {
1229 down_next = d->down;
1230 free_rdataset(rbtdb, mctx, d);
1231 }
1232 top->down = NULL;
1233}
1234
1235static inline void
1236clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1237 rdatasetheader_t *current, *top_prev, *top_next;
1238 isc_mem_t *mctx = rbtdb->common.mctx;
1239
1240 /*
1241 * Caller must be holding the node lock.
1242 */
1243
1244 top_prev = NULL;
1245 for (current = node->data; current != NULL; current = top_next) {
1246 top_next = current->next;
1247 clean_stale_headers(rbtdb, mctx, current);
1248 /*
1249 * If current is nonexistent or stale, we can clean it up.
1250 */
1251 if ((current->attributes &
1252 (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
1253 if (top_prev != NULL)
1254 top_prev->next = current->next;
1255 else
1256 node->data = current->next;
1257 free_rdataset(rbtdb, mctx, current);
1258 } else
1259 top_prev = current;
1260 }
1261 node->dirty = 0;
1262}
1263
1264static inline void
1265clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1266 rbtdb_serial_t least_serial)
1267{
1268 rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1269 rdatasetheader_t *top_prev, *top_next;
1270 isc_mem_t *mctx = rbtdb->common.mctx;
1271 isc_boolean_t still_dirty = ISC_FALSE;
1272
1273 /*
1274 * Caller must be holding the node lock.
1275 */
1276 REQUIRE(least_serial != 0);
1277
1278 top_prev = NULL;
1279 for (current = node->data; current != NULL; current = top_next) {
1280 top_next = current->next;
1281
1282 /*
1283 * First, we clean up any instances of multiple rdatasets
1284 * with the same serial number, or that have the IGNORE
1285 * attribute.
1286 */
1287 dparent = current;
1288 for (dcurrent = current->down;
1289 dcurrent != NULL;
1290 dcurrent = down_next) {
1291 down_next = dcurrent->down;
1292 INSIST(dcurrent->serial <= dparent->serial);
1293 if (dcurrent->serial == dparent->serial ||
1294 IGNORE(dcurrent)) {
1295 if (down_next != NULL)
1296 down_next->next = dparent;
1297 dparent->down = down_next;
1298 free_rdataset(rbtdb, mctx, dcurrent);
1299 } else
1300 dparent = dcurrent;
1301 }
1302
1303 /*
1304 * We've now eliminated all IGNORE datasets with the possible
1305 * exception of current, which we now check.
1306 */
1307 if (IGNORE(current)) {
1308 down_next = current->down;
1309 if (down_next == NULL) {
1310 if (top_prev != NULL)
1311 top_prev->next = current->next;
1312 else
1313 node->data = current->next;
1314 free_rdataset(rbtdb, mctx, current);
1315 /*
1316 * current no longer exists, so we can
1317 * just continue with the loop.
1318 */
1319 continue;
1320 } else {
1321 /*
1322 * Pull up current->down, making it the new
1323 * current.
1324 */
1325 if (top_prev != NULL)
1326 top_prev->next = down_next;
1327 else
1328 node->data = down_next;
1329 down_next->next = top_next;
1330 free_rdataset(rbtdb, mctx, current);
1331 current = down_next;
1332 }
1333 }
1334
1335 /*
1336 * We now try to find the first down node less than the
1337 * least serial.
1338 */
1339 dparent = current;
1340 for (dcurrent = current->down;
1341 dcurrent != NULL;
1342 dcurrent = down_next) {
1343 down_next = dcurrent->down;
1344 if (dcurrent->serial < least_serial)
1345 break;
1346 dparent = dcurrent;
1347 }
1348
1349 /*
1350 * If there is a such an rdataset, delete it and any older
1351 * versions.
1352 */
1353 if (dcurrent != NULL) {
1354 do {
1355 down_next = dcurrent->down;
1356 INSIST(dcurrent->serial <= least_serial);
1357 free_rdataset(rbtdb, mctx, dcurrent);
1358 dcurrent = down_next;
1359 } while (dcurrent != NULL);
1360 dparent->down = NULL;
1361 }
1362
1363 /*
1364 * Note. The serial number of 'current' might be less than
1365 * least_serial too, but we cannot delete it because it is
1366 * the most recent version, unless it is a NONEXISTENT
1367 * rdataset.
1368 */
1369 if (current->down != NULL) {
1370 still_dirty = ISC_TRUE;
1371 top_prev = current;
1372 } else {
1373 /*
1374 * If this is a NONEXISTENT rdataset, we can delete it.
1375 */
1376 if (NONEXISTENT(current)) {
1377 if (top_prev != NULL)
1378 top_prev->next = current->next;
1379 else
1380 node->data = current->next;
1381 free_rdataset(rbtdb, mctx, current);
1382 } else
1383 top_prev = current;
1384 }
1385 }
1386 if (!still_dirty)
1387 node->dirty = 0;
1388}
1389
1390/*%
1391 * Clean up dead nodes. These are nodes which have no references, and
1392 * have no data. They are dead but we could not or chose not to delete
1393 * them when we deleted all the data at that node because we did not want
1394 * to wait for the tree write lock.
1395 *
1396 * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1397 */
1398static void
1399cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1400 dns_rbtnode_t *node;
1401 isc_result_t result;
1402 int count = 10; /* XXXJT: should be adjustable */
1403
1404 node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1405 while (node != NULL && count > 0) {
1406 ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1407
1408 /*
1409 * Since we're holding a tree write lock, it should be
1410 * impossible for this node to be referenced by others.
1411 */
1412 INSIST(dns_rbtnode_refcurrent(node) == 0 &&
1413 node->data == NULL);
1414
1415 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1416 if (result != ISC_R_SUCCESS)
1417 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1418 DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1419 "cleanup_dead_nodes: "
1420 "dns_rbt_deletenode: %s",
1421 isc_result_totext(result));
1422 node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1423 count--;
1424 }
1425}
1426
1427/*
1428 * Caller must be holding the node lock if its reference must be protected
1429 * by the lock.
1430 */
1431static inline void
1432new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1433 unsigned int lockrefs, noderefs;
1434 isc_refcount_t *lockref;
1435
1436 dns_rbtnode_refincrement0(node, &noderefs);
1437 if (noderefs == 1) { /* this is the first reference to the node */
1438 lockref = &rbtdb->node_locks[node->locknum].references;
1439 isc_refcount_increment0(lockref, &lockrefs);
1440 INSIST(lockrefs != 0);
1441 }
1442 INSIST(noderefs != 0);
1443}
1444
1445/*
1446 * This function is assumed to be called when a node is newly referenced
1447 * and can be in the deadnode list. In that case the node must be retrieved
1448 * from the list because it is going to be used. In addition, if the caller
1449 * happens to hold a write lock on the tree, it's a good chance to purge dead
1450 * nodes.
1451 * Note: while a new reference is gained in multiple places, there are only very
1452 * few cases where the node can be in the deadnode list (only empty nodes can
1453 * have been added to the list).
1454 */
1455static inline void
1456reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1457 isc_rwlocktype_t treelocktype)
1458{
1459 isc_boolean_t need_relock = ISC_FALSE;
1460
1461 NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock);
1462 new_reference(rbtdb, node);
1463
1464 NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1465 isc_rwlocktype_read);
1466 if (ISC_LINK_LINKED(node, deadlink))
1467 need_relock = ISC_TRUE;
1468 else if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1469 treelocktype == isc_rwlocktype_write)
1470 need_relock = ISC_TRUE;
1471 NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1472 isc_rwlocktype_read);
1473 if (need_relock) {
1474 NODE_WEAKLOCK(&rbtdb->node_locks[node->locknum].lock,
1475 isc_rwlocktype_write);
1476 if (ISC_LINK_LINKED(node, deadlink))
1477 ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
1478 node, deadlink);
1479 if (treelocktype == isc_rwlocktype_write)
1480 cleanup_dead_nodes(rbtdb, node->locknum);
1481 NODE_WEAKUNLOCK(&rbtdb->node_locks[node->locknum].lock,
1482 isc_rwlocktype_write);
1483 }
1484
1485 NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock);
1486}
1487
1488/*
1489 * Caller must be holding the node lock; either the "strong", read or write
1490 * lock. Note that the lock must be held even when node references are
1491 * atomically modified; in that case the decrement operation itself does not
1492 * have to be protected, but we must avoid a race condition where multiple
1493 * threads are decreasing the reference to zero simultaneously and at least
1494 * one of them is going to free the node.
1495 * This function returns ISC_TRUE if and only if the node reference decreases
1496 * to zero.
1497 */
1498static isc_boolean_t
1499decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1500 rbtdb_serial_t least_serial,
1501 isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
1502 isc_boolean_t pruning)
1503{
1504 isc_result_t result;
1505 isc_boolean_t write_locked;
1506 rbtdb_nodelock_t *nodelock;
1507 unsigned int refs, nrefs;
1508 int bucket = node->locknum;
1509 isc_boolean_t no_reference;
1510
1511 nodelock = &rbtdb->node_locks[bucket];
1512
1513 /* Handle easy and typical case first. */
1514 if (!node->dirty && (node->data != NULL || node->down != NULL)) {
1515 dns_rbtnode_refdecrement(node, &nrefs);
1516 INSIST((int)nrefs >= 0);
1517 if (nrefs == 0) {
1518 isc_refcount_decrement(&nodelock->references, &refs);
1519 INSIST((int)refs >= 0);
1520 }
1521 return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
1522 }
1523
1524 /* Upgrade the lock? */
1525 if (nlock == isc_rwlocktype_read) {
1526 NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
1527 NODE_WEAKLOCK(&nodelock->lock, isc_rwlocktype_write);
1528 }
1529 dns_rbtnode_refdecrement(node, &nrefs);
1530 INSIST((int)nrefs >= 0);
1531 if (nrefs > 0) {
1532 /* Restore the lock? */
1533 if (nlock == isc_rwlocktype_read)
1534 NODE_WEAKDOWNGRADE(&nodelock->lock);
1535 return (ISC_FALSE);
1536 }
1537
1538 if (node->dirty && dns_rbtnode_refcurrent(node) == 0) {
1539 if (IS_CACHE(rbtdb))
1540 clean_cache_node(rbtdb, node);
1541 else {
1542 if (least_serial == 0) {
1543 /*
1544 * Caller doesn't know the least serial.
1545 * Get it.
1546 */
1547 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1548 least_serial = rbtdb->least_serial;
1549 RBTDB_UNLOCK(&rbtdb->lock,
1550 isc_rwlocktype_read);
1551 }
1552 clean_zone_node(rbtdb, node, least_serial);
1553 }
1554 }
1555
1556 isc_refcount_decrement(&nodelock->references, &refs);
1557 INSIST((int)refs >= 0);
1558
1559 /*
1560 * XXXDCL should this only be done for cache zones?
1561 */
1562 if (node->data != NULL || node->down != NULL) {
1563 /* Restore the lock? */
1564 if (nlock == isc_rwlocktype_read)
1565 NODE_WEAKDOWNGRADE(&nodelock->lock);
1566 return (ISC_TRUE);
1567 }
1568
1569 /*
1570 * Attempt to switch to a write lock on the tree. If this fails,
1571 * we will add this node to a linked list of nodes in this locking
1572 * bucket which we will free later.
1573 */
1574 if (tlock != isc_rwlocktype_write) {
1575 /*
1576 * Locking hierarchy notwithstanding, we don't need to free
1577 * the node lock before acquiring the tree write lock because
1578 * we only do a trylock.
1579 */
1580 if (tlock == isc_rwlocktype_read)
1581 result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
1582 else
1583 result = isc_rwlock_trylock(&rbtdb->tree_lock,
1584 isc_rwlocktype_write);
1585 RUNTIME_CHECK(result == ISC_R_SUCCESS ||
1586 result == ISC_R_LOCKBUSY);
1587
1588 write_locked = ISC_TF(result == ISC_R_SUCCESS);
1589 } else
1590 write_locked = ISC_TRUE;
1591
1592 no_reference = ISC_TRUE;
1593 if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
1594 /*
1595 * We can now delete the node if the reference counter is
1596 * zero. This should be typically the case, but a different
1597 * thread may still gain a (new) reference just before the
1598 * current thread locks the tree (e.g., in findnode()).
1599 */
1600
1601 /*
1602 * If this node is the only one in the level it's in, deleting
1603 * this node may recursively make its parent the only node in
1604 * the parent level; if so, and if no one is currently using
1605 * the parent node, this is almost the only opportunity to
1606 * clean it up. But the recursive cleanup is not that trivial
1607 * since the child and parent may be in different lock buckets,
1608 * which would cause a lock order reversal problem. To avoid
1609 * the trouble, we'll dispatch a separate event for batch
1610 * cleaning. We need to check whether we're deleting the node
1611 * as a result of pruning to avoid infinite dispatching.
1612 * Note: pruning happens only when a task has been set for the
1613 * rbtdb. If the user of the rbtdb chooses not to set a task,
1614 * it's their responsibility to purge stale leaves (e.g. by
1615 * periodic walk-through).
1616 */
1617 if (!pruning && node->parent != NULL &&
1618 node->parent->down == node && node->left == NULL &&
1619 node->right == NULL && rbtdb->task != NULL) {
1620 isc_event_t *ev;
1621 dns_db_t *db;
1622
1623 ev = isc_event_allocate(rbtdb->common.mctx, NULL,
1624 DNS_EVENT_RBTPRUNE,
1625 prune_tree, node,
1626 sizeof(isc_event_t));
1627 if (ev != NULL) {
1628 new_reference(rbtdb, node);
1629 db = NULL;
1630 attach((dns_db_t *)rbtdb, &db);
1631 ev->ev_sender = db;
1632 isc_task_send(rbtdb->task, &ev);
1633 no_reference = ISC_FALSE;
1634 } else {
1635 /*
1636 * XXX: this is a weird situation. We could
1637 * ignore this error case, but then the stale
1638 * node will unlikely be purged except via a
1639 * rare condition such as manual cleanup. So
1640 * we queue it in the deadnodes list, hoping
1641 * the memory shortage is temporary and the node
1642 * will be deleted later.
1643 */
1644 isc_log_write(dns_lctx,
1645 DNS_LOGCATEGORY_DATABASE,
1646 DNS_LOGMODULE_CACHE,
1647 ISC_LOG_INFO,
1648 "decrement_reference: failed to "
1649 "allocate pruning event");
1650 INSIST(!ISC_LINK_LINKED(node, deadlink));
1651 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
1652 deadlink);
1653 }
1654 } else {
1655 if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
1656 char printname[DNS_NAME_FORMATSIZE];
1657
1658 isc_log_write(dns_lctx,
1659 DNS_LOGCATEGORY_DATABASE,
1660 DNS_LOGMODULE_CACHE,
1661 ISC_LOG_DEBUG(1),
1662 "decrement_reference: "
1663 "delete from rbt: %p %s",
1664 node,
1665 dns_rbt_formatnodename(node,
1666 printname,
1667 sizeof(printname)));
1668 }
1669
1670 INSIST(!ISC_LINK_LINKED(node, deadlink));
1671 result = dns_rbt_deletenode(rbtdb->tree, node,
1672 ISC_FALSE);
1673 if (result != ISC_R_SUCCESS) {
1674 isc_log_write(dns_lctx,
1675 DNS_LOGCATEGORY_DATABASE,
1676 DNS_LOGMODULE_CACHE,
1677 ISC_LOG_WARNING,
1678 "decrement_reference: "
1679 "dns_rbt_deletenode: %s",
1680 isc_result_totext(result));
1681 }
1682 }
1683 } else if (dns_rbtnode_refcurrent(node) == 0) {
1684 INSIST(!ISC_LINK_LINKED(node, deadlink));
1685 ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
1686 }
1687
1688 /* Restore the lock? */
1689 if (nlock == isc_rwlocktype_read)
1690 NODE_WEAKDOWNGRADE(&nodelock->lock);
1691
1692 /*
1693 * Relock a read lock, or unlock the write lock if no lock was held.
1694 */
1695 if (tlock == isc_rwlocktype_none)
1696 if (write_locked)
1697 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1698
1699 if (tlock == isc_rwlocktype_read)
1700 if (write_locked)
1701 isc_rwlock_downgrade(&rbtdb->tree_lock);
1702
1703 return (no_reference);
1704}
1705
1706/*
1707 * Prune the tree by recursively cleaning-up single leaves. In the worst
1708 * case, the number of iteration is the number of tree levels, which is at
1709 * most the maximum number of domain name labels, i.e, 127. In practice, this
1710 * should be much smaller (only a few times), and even the worst case would be
1711 * acceptable for a single event.
1712 */
1713static void
1714prune_tree(isc_task_t *task, isc_event_t *event) {
1715 dns_rbtdb_t *rbtdb = event->ev_sender;
1716 dns_rbtnode_t *node = event->ev_arg;
1717 dns_rbtnode_t *parent;
1718 unsigned int locknum;
1719
1720 UNUSED(task);
1721
1722 isc_event_free(&event);
1723
1724 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1725 locknum = node->locknum;
1726 NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1727 do {
1728 parent = node->parent;
1729 decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
1730 isc_rwlocktype_write, ISC_TRUE);
1731
1732 if (parent != NULL && parent->down == NULL) {
1733 /*
1734 * node was the only down child of the parent and has
1735 * just been removed. We'll then need to examine the
1736 * parent. Keep the lock if possible; otherwise,
1737 * release the old lock and acquire one for the parent.
1738 */
1739 if (parent->locknum != locknum) {
1740 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
1741 isc_rwlocktype_write);
1742 locknum = parent->locknum;
1743 NODE_LOCK(&rbtdb->node_locks[locknum].lock,
1744 isc_rwlocktype_write);
1745 }
1746
1747 /*
1748 * We need to gain a reference to the node before
1749 * decrementing it in the next iteration. In addition,
1750 * if the node is in the dead-nodes list, extract it
1751 * from the list beforehand as we do in
1752 * reactivate_node().
1753 */
1754 new_reference(rbtdb, parent);
1755 if (ISC_LINK_LINKED(parent, deadlink)) {
1756 ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
1757 parent, deadlink);
1758 }
1759 } else
1760 parent = NULL;
1761
1762 node = parent;
1763 } while (node != NULL);
1764 NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1765 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
1766
1767 detach((dns_db_t **)&rbtdb);
1768}
1769
1770static inline void
1771make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1772 rbtdb_changedlist_t *cleanup_list)
1773{
1774 /*
1775 * Caller must be holding the database lock.
1776 */
1777
1778 rbtdb->least_serial = version->serial;
1779 *cleanup_list = version->changed_list;
1780 ISC_LIST_INIT(version->changed_list);
1781}
1782
1783static inline void
1784cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
1785 rbtdb_changed_t *changed, *next_changed;
1786
1787 /*
1788 * If the changed record is dirty, then
1789 * an update created multiple versions of
1790 * a given rdataset. We keep this list
1791 * until we're the least open version, at
1792 * which point it's safe to get rid of any
1793 * older versions.
1794 *
1795 * If the changed record isn't dirty, then
1796 * we don't need it anymore since we're
1797 * committing and not rolling back.
1798 *
1799 * The caller must be holding the database lock.
1800 */
1801 for (changed = HEAD(version->changed_list);
1802 changed != NULL;
1803 changed = next_changed) {
1804 next_changed = NEXT(changed, link);
1805 if (!changed->dirty) {
1806 UNLINK(version->changed_list,
1807 changed, link);
1808 APPEND(*cleanup_list,
1809 changed, link);
1810 }
1811 }
1812}
1813
1814static isc_boolean_t
1815iszonesecure(dns_db_t *db, dns_dbnode_t *origin) {
1816 dns_rdataset_t keyset;
1817 dns_rdataset_t nsecset, signsecset;
1818 isc_boolean_t haszonekey = ISC_FALSE;
1819 isc_boolean_t hasnsec = ISC_FALSE;
1820 isc_result_t result;
1821
1822 dns_rdataset_init(&keyset);
1823 result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_dnskey, 0,
1824 0, &keyset, NULL);
1825 if (result == ISC_R_SUCCESS) {
1826 dns_rdata_t keyrdata = DNS_RDATA_INIT;
1827 result = dns_rdataset_first(&keyset);
1828 while (result == ISC_R_SUCCESS) {
1829 dns_rdataset_current(&keyset, &keyrdata);
1830 if (dns_zonekey_iszonekey(&keyrdata)) {
1831 haszonekey = ISC_TRUE;
1832 break;
1833 }
1834 result = dns_rdataset_next(&keyset);
1835 }
1836 dns_rdataset_disassociate(&keyset);
1837 }
1838 if (!haszonekey)
1839 return (ISC_FALSE);
1840
1841 dns_rdataset_init(&nsecset);
1842 dns_rdataset_init(&signsecset);
1843 result = dns_db_findrdataset(db, origin, NULL, dns_rdatatype_nsec, 0,
1844 0, &nsecset, &signsecset);
1845 if (result == ISC_R_SUCCESS) {
1846 if (dns_rdataset_isassociated(&signsecset)) {
1847 hasnsec = ISC_TRUE;
1848 dns_rdataset_disassociate(&signsecset);
1849 }
1850 dns_rdataset_disassociate(&nsecset);
1851 }
1852 return (hasnsec);
1853}
1854
1855static void
1856closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
1857 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1858 rbtdb_version_t *version, *cleanup_version, *least_greater;
1859 isc_boolean_t rollback = ISC_FALSE;
1860 rbtdb_changedlist_t cleanup_list;
1861 rbtdb_changed_t *changed, *next_changed;
1862 rbtdb_serial_t serial, least_serial;
1863 dns_rbtnode_t *rbtnode;
1864 unsigned int refs;
1865 isc_boolean_t writer;
1866
1867 REQUIRE(VALID_RBTDB(rbtdb));
1868 version = (rbtdb_version_t *)*versionp;
1869
1870 cleanup_version = NULL;
1871 ISC_LIST_INIT(cleanup_list);
1872
1873 isc_refcount_decrement(&version->references, &refs);
1874 if (refs > 0) { /* typical and easy case first */
1875 if (commit) {
1876 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1877 INSIST(!version->writer);
1878 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1879 }
1880 goto end;
1881 }
1882
1883 RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1884 serial = version->serial;
1885 writer = version->writer;
1886 if (version->writer) {
1887 if (commit) {
1888 unsigned cur_ref;
1889 rbtdb_version_t *cur_version;
1890
1891 INSIST(version->commit_ok);
1892 INSIST(version == rbtdb->future_version);
1893 /*
1894 * The current version is going to be replaced.
1895 * Release the (likely last) reference to it from the
1896 * DB itself and unlink it from the open list.
1897 */
1898 cur_version = rbtdb->current_version;
1899 isc_refcount_decrement(&cur_version->references,
1900 &cur_ref);
1901 if (cur_ref == 0) {
1902 if (cur_version->serial == rbtdb->least_serial)
1903 INSIST(EMPTY(cur_version->changed_list));
1904 UNLINK(rbtdb->open_versions,
1905 cur_version, link);
1906 }
1907 if (EMPTY(rbtdb->open_versions)) {
1908 /*
1909 * We're going to become the least open
1910 * version.
1911 */
1912 make_least_version(rbtdb, version,
1913 &cleanup_list);
1914 } else {
1915 /*
1916 * Some other open version is the
1917 * least version. We can't cleanup
1918 * records that were changed in this
1919 * version because the older versions
1920 * may still be in use by an open
1921 * version.
1922 *
1923 * We can, however, discard the
1924 * changed records for things that
1925 * we've added that didn't exist in
1926 * prior versions.
1927 */
1928 cleanup_nondirty(version, &cleanup_list);
1929 }
1930 /*
1931 * If the (soon to be former) current version
1932 * isn't being used by anyone, we can clean
1933 * it up.
1934 */
1935 if (cur_ref == 0) {
1936 cleanup_version = cur_version;
1937 APPENDLIST(version->changed_list,
1938 cleanup_version->changed_list,
1939 link);
1940 }
1941 /*
1942 * Become the current version.
1943 */
1944 version->writer = ISC_FALSE;
1945 rbtdb->current_version = version;
1946 rbtdb->current_serial = version->serial;
1947 rbtdb->future_version = NULL;
1948
1949 /*
1950 * Keep the current version in the open list, and
1951 * gain a reference for the DB itself (see the DB
1952 * creation function below). This must be the only
1953 * case where we need to increment the counter from
1954 * zero and need to use isc_refcount_increment0().
1955 */
1956 isc_refcount_increment0(&version->references,
1957 &cur_ref);
1958 INSIST(cur_ref == 1);
1959 PREPEND(rbtdb->open_versions,
1960 rbtdb->current_version, link);
1961 } else {
1962 /*
1963 * We're rolling back this transaction.
1964 */
1965 cleanup_list = version->changed_list;
1966 ISC_LIST_INIT(version->changed_list);
1967 rollback = ISC_TRUE;
1968 cleanup_version = version;
1969 rbtdb->future_version = NULL;
1970 }
1971 } else {
1972 if (version != rbtdb->current_version) {
1973 /*
1974 * There are no external or internal references
1975 * to this version and it can be cleaned up.
1976 */
1977 cleanup_version = version;
1978
1979 /*
1980 * Find the version with the least serial
1981 * number greater than ours.
1982 */
1983 least_greater = PREV(version, link);
1984 if (least_greater == NULL)
1985 least_greater = rbtdb->current_version;
1986
1987 INSIST(version->serial < least_greater->serial);
1988 /*
1989 * Is this the least open version?
1990 */
1991 if (version->serial == rbtdb->least_serial) {
1992 /*
1993 * Yes. Install the new least open
1994 * version.
1995 */
1996 make_least_version(rbtdb,
1997 least_greater,
1998 &cleanup_list);
1999 } else {
2000 /*
2001 * Add any unexecuted cleanups to
2002 * those of the least greater version.
2003 */
2004 APPENDLIST(least_greater->changed_list,
2005 version->changed_list,
2006 link);
2007 }
2008 } else if (version->serial == rbtdb->least_serial)
2009 INSIST(EMPTY(version->changed_list));
2010 UNLINK(rbtdb->open_versions, version, link);
2011 }
2012 least_serial = rbtdb->least_serial;
2013 RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
2014
2015 /*
2016 * Update the zone's secure status.
2017 */
2018 if (writer && commit && !IS_CACHE(rbtdb))
2019 rbtdb->secure = iszonesecure(db, rbtdb->origin_node);
2020
2021 if (cleanup_version != NULL) {
2022 INSIST(EMPTY(cleanup_version->changed_list));
2023 isc_mem_put(rbtdb->common.mctx, cleanup_version,
2024 sizeof(*cleanup_version));
2025 }
2026
2027 if (!EMPTY(cleanup_list)) {
2028 /*
2029 * We acquire a tree write lock here in order to make sure
2030 * that stale nodes will be removed in decrement_reference().
2031 * If we didn't have the lock, those nodes could miss the
2032 * chance to be removed until the server stops. The write lock
2033 * is expensive, but this event should be rare enough to justify
2034 * the cost.
2035 */
2036 RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2037 for (changed = HEAD(cleanup_list);
2038 changed != NULL;
2039 changed = next_changed) {
2040 nodelock_t *lock;
2041
2042 next_changed = NEXT(changed, link);
2043 rbtnode = changed->node;
2044 lock = &rbtdb->node_locks[rbtnode->locknum].lock;
2045
2046 NODE_LOCK(lock, isc_rwlocktype_write);
2047 /*
2048 * This is a good opportunity to purge any dead nodes,
2049 * so use it.
2050 */
2051 cleanup_dead_nodes(rbtdb, rbtnode->locknum);
2052
2053 if (rollback)
2054 rollback_node(rbtnode, serial);
2055 decrement_reference(rbtdb, rbtnode, least_serial,
2056 isc_rwlocktype_write,
2057 isc_rwlocktype_write, ISC_FALSE);
2058
2059 NODE_UNLOCK(lock, isc_rwlocktype_write);
2060
2061 isc_mem_put(rbtdb->common.mctx, changed,
2062 sizeof(*changed));
2063 }
2064 RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
2065 }
2066
2067 end:
2068 *versionp = NULL;
2069}
2070
2071/*
2072 * Add the necessary magic for the wildcard name 'name'
2073 * to be found in 'rbtdb'.
2074 *
2075 * In order for wildcard matching to work correctly in
2076 * zone_find(), we must ensure that a node for the wildcarding
2077 * level exists in the database, and has its 'find_callback'
2078 * and 'wild' bits set.
2079 *
2080 * E.g. if the wildcard name is "*.sub.example." then we
2081 * must ensure that "sub.example." exists and is marked as
2082 * a wildcard level.
2083 */
2084static isc_result_t
2085add_wildcard_magic(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2086 isc_result_t result;
2087 dns_name_t foundname;
2088 dns_offsets_t offsets;
2089 unsigned int n;
2090 dns_rbtnode_t *node = NULL;
2091
2092 dns_name_init(&foundname, offsets);
2093 n = dns_name_countlabels(name);
2094 INSIST(n >= 2);
2095 n--;
2096 dns_name_getlabelsequence(name, 1, n, &foundname);
2097 result = dns_rbt_addnode(rbtdb->tree, &foundname, &node);
2098 if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2099 return (result);
2100 node->find_callback = 1;
2101 node->wild = 1;
2102 return (ISC_R_SUCCESS);
2103}
2104
2105static isc_result_t
2106add_empty_wildcards(dns_rbtdb_t *rbtdb, dns_name_t *name) {
2107 isc_result_t result;
2108 dns_name_t foundname;
2109 dns_offsets_t offsets;
2110 unsigned int n, l, i;
2111
2112 dns_name_init(&foundname, offsets);
2113 n = dns_name_countlabels(name);
2114 l = dns_name_countlabels(&rbtdb->common.origin);
2115 i = l + 1;
2116 while (i < n) {
2117 dns_rbtnode_t *node = NULL; /* dummy */
2118 dns_name_getlabelsequence(name, n - i, i, &foundname);
2119 if (dns_name_iswildcard(&foundname)) {
2120 result = add_wildcard_magic(rbtdb, &foundname);
2121 if (result != ISC_R_SUCCESS)
2122 return (result);
2123 result = dns_rbt_addnode(rbtdb->tree, &foundname,
2124 &node);
2125 if (result != ISC_R_SUCCESS && result != ISC_R_EXISTS)
2126 return (result);
2127 }
2128 i++;
2129 }
2130 return (ISC_R_SUCCESS);
2131}
2132
2133static isc_result_t
2134findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create,
2135 dns_dbnode_t **nodep)
2136{
2137 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
2138 dns_rbtnode_t *node = NULL;
2139 dns_name_t nodename;
2140 isc_result_t result;
2141 isc_rwlocktype_t locktype = isc_rwlocktype_read;
2142
2143 REQUIRE(VALID_RBTDB(rbtdb));
2144
2145 dns_name_init(&nodename, NULL);
2146 RWLOCK(&rbtdb->tree_lock, locktype);
2147 result = dns_rbt_findnode(rbtdb->tree, name, NULL, &node, NULL,
2148 DNS_RBTFIND_EMPTYDATA, NULL, NULL);
2149 if (result != ISC_R_SUCCESS) {
2150 RWUNLOCK(&rbtdb->tree_lock, locktype);
2151 if (!create) {
2152 if (result == DNS_R_PARTIALMATCH)
2153 result = ISC_R_NOTFOUND;
2154 return (result);
2155 }
2156 /*
2157 * It would be nice to try to upgrade the lock instead of
2158 * unlocking then relocking.
2159 */
2160 locktype = isc_rwlocktype_write;
2161 RWLOCK(&rbtdb->tree_lock, locktype);
2162 node = NULL;
2163 result = dns_rbt_addnode(rbtdb->tree, name, &node);
2164 if (result == ISC_R_SUCCESS) {
2165 dns_rbt_namefromnode(node, &nodename);
2166#ifdef DNS_RBT_USEHASH
2167 node->locknum = node->hashval % rbtdb->node_lock_count;
2168#else
2169 node->locknum = dns_name_hash(&nodename, ISC_TRUE) %
2170 rbtdb->node_lock_count;
2171#endif
2172 add_empty_wildcards(rbtdb, name);
2173
2174 if (dns_name_iswildcard(name)) {
2175 result = add_wildcard_magic(rbtdb, name);
2176 if (result != ISC_R_SUCCESS) {
2177 RWUNLOCK(&rbtdb->tree_lock, locktype);
2178 return (result);
2179 }
2180 }
2181 } else if (result != ISC_R_EXISTS) {
2182 RWUNLOCK(&rbtdb->tree_lock, locktype);
2183 return (result);
2184 }
2185 }
2186 reactivate_node(rbtdb, node, locktype);
2187 RWUNLOCK(&rbtdb->tree_lock, locktype);
2188
2189 *nodep = (dns_dbnode_t *)node;
2190
2191 return (ISC_R_SUCCESS);
2192}
2193
2194static isc_result_t
2195zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
2196 rbtdb_search_t *search = arg;
2197 rdatasetheader_t *header, *header_next;
2198 rdatasetheader_t *dname_header, *sigdname_header, *ns_header;
2199 rdatasetheader_t *found;
2200 isc_result_t result;
2201 dns_rbtnode_t *onode;
2202
2203 /*
2204 * We only want to remember the topmost zone cut, since it's the one
2205 * that counts, so we'll just continue if we've already found a
2206 * zonecut.
2207 */
2208 if (search->zonecut != NULL)
2209 return (DNS_R_CONTINUE);
2210
2211 found = NULL;
2212 result = DNS_R_CONTINUE;
2213 onode = search->rbtdb->origin_node;
2214
2215 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2216 isc_rwlocktype_read);
2217
2218 /*
2219 * Look for an NS or DNAME rdataset active in our version.
2220 */
2221 ns_header = NULL;
2222 dname_header = NULL;
2223 sigdname_header = NULL;
2224 for (header = node->data; header != NULL; header = header_next) {
2225 header_next = header->next;
2226 if (header->type == dns_rdatatype_ns ||
2227 header->type == dns_rdatatype_dname ||
2228 header->type == RBTDB_RDATATYPE_SIGDNAME) {
2229 do {
2230 if (header->serial <= search->serial &&
2231 !IGNORE(header)) {
2232 /*
2233 * Is this a "this rdataset doesn't
2234 * exist" record?
2235 */
2236 if (NONEXISTENT(header))
2237 header = NULL;
2238 break;
2239 } else
2240 header = header->down;
2241 } while (header != NULL);
2242 if (header != NULL) {
2243 if (header->type == dns_rdatatype_dname)
2244 dname_header = header;
2245 else if (header->type ==
2246 RBTDB_RDATATYPE_SIGDNAME)
2247 sigdname_header = header;
2248 else if (node != onode ||
2249 IS_STUB(search->rbtdb)) {
2250 /*
2251 * We've found an NS rdataset that
2252 * isn't at the origin node. We check
2253 * that they're not at the origin node,
2254 * because otherwise we'd erroneously
2255 * treat the zone top as if it were
2256 * a delegation.
2257 */
2258 ns_header = header;
2259 }
2260 }
2261 }
2262 }
2263
2264 /*
2265 * Did we find anything?
2266 */
2267 if (dname_header != NULL) {
2268 /*
2269 * Note that DNAME has precedence over NS if both exist.
2270 */
2271 found = dname_header;
2272 search->zonecut_sigrdataset = sigdname_header;
2273 } else if (ns_header != NULL) {
2274 found = ns_header;
2275 search->zonecut_sigrdataset = NULL;
2276 }
2277
2278 if (found != NULL) {
2279 /*
2280 * We increment the reference count on node to ensure that
2281 * search->zonecut_rdataset will still be valid later.
2282 */
2283 new_reference(search->rbtdb, node);
2284 search->zonecut = node;
2285 search->zonecut_rdataset = found;
2286 search->need_cleanup = ISC_TRUE;
2287 /*
2288 * Since we've found a zonecut, anything beneath it is
2289 * glue and is not subject to wildcard matching, so we
2290 * may clear search->wild.
2291 */
2292 search->wild = ISC_FALSE;
2293 if ((search->options & DNS_DBFIND_GLUEOK) == 0) {
2294 /*
2295 * If the caller does not want to find glue, then
2296 * this is the best answer and the search should
2297 * stop now.
2298 */
2299 result = DNS_R_PARTIALMATCH;
2300 } else {
2301 dns_name_t *zcname;
2302
2303 /*
2304 * The search will continue beneath the zone cut.
2305 * This may or may not be the best match. In case it
2306 * is, we need to remember the node name.
2307 */
2308 zcname = dns_fixedname_name(&search->zonecut_name);
2309 RUNTIME_CHECK(dns_name_copy(name, zcname, NULL) ==
2310 ISC_R_SUCCESS);
2311 search->copy_name = ISC_TRUE;
2312 }
2313 } else {
2314 /*
2315 * There is no zonecut at this node which is active in this
2316 * version.
2317 *
2318 * If this is a "wild" node and the caller hasn't disabled
2319 * wildcard matching, remember that we've seen a wild node
2320 * in case we need to go searching for wildcard matches
2321 * later on.
2322 */
2323 if (node->wild && (search->options & DNS_DBFIND_NOWILD) == 0)
2324 search->wild = ISC_TRUE;
2325 }
2326
2327 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2328 isc_rwlocktype_read);
2329
2330 return (result);
2331}
2332
2333static inline void
2334bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
2335 rdatasetheader_t *header, isc_stdtime_t now,
2336 dns_rdataset_t *rdataset)
2337{
2338 unsigned char *raw; /* RDATASLAB */
2339
2340 /*
2341 * Caller must be holding the node reader lock.
2342 * XXXJT: technically, we need a writer lock, since we'll increment
2343 * the header count below. However, since the actual counter value
2344 * doesn't matter, we prioritize performance here. (We may want to
2345 * use atomic increment when available).
2346 */
2347
2348 if (rdataset == NULL)
2349 return;
2350
2351 new_reference(rbtdb, node);
2352
2353 INSIST(rdataset->methods == NULL); /* We must be disassociated. */
2354
2355 rdataset->methods = &rdataset_methods;
2356 rdataset->rdclass = rbtdb->common.rdclass;
2357 rdataset->type = RBTDB_RDATATYPE_BASE(header->type);
2358 rdataset->covers = RBTDB_RDATATYPE_EXT(header->type);
2359 rdataset->ttl = header->rdh_ttl - now;
2360 rdataset->trust = header->trust;
2361 if (NXDOMAIN(header))
2362 rdataset->attributes |= DNS_RDATASETATTR_NXDOMAIN;
2363 rdataset->private1 = rbtdb;
2364 rdataset->private2 = node;
2365 raw = (unsigned char *)header + sizeof(*header);
2366 rdataset->private3 = raw;
2367 rdataset->count = header->count++;
2368 if (rdataset->count == ISC_UINT32_MAX)
2369 rdataset->count = 0;
2370
2371 /*
2372 * Reset iterator state.
2373 */
2374 rdataset->privateuint4 = 0;
2375 rdataset->private5 = NULL;
2376
2377 /*
2378 * Add noqname proof.
2379 */
2380 rdataset->private6 = header->noqname;
2381 if (rdataset->private6 != NULL)
2382 rdataset->attributes |= DNS_RDATASETATTR_NOQNAME;
2383}
2384
2385static inline isc_result_t
2386setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep,
2387 dns_name_t *foundname, dns_rdataset_t *rdataset,
2388 dns_rdataset_t *sigrdataset)
2389{
2390 isc_result_t result;
2391 dns_name_t *zcname;
2392 rbtdb_rdatatype_t type;
2393 dns_rbtnode_t *node;
2394
2395 /*
2396 * The caller MUST NOT be holding any node locks.
2397 */
2398
2399 node = search->zonecut;
2400 type = search->zonecut_rdataset->type;
2401
2402 /*
2403 * If we have to set foundname, we do it before anything else.
2404 * If we were to set foundname after we had set nodep or bound the
2405 * rdataset, then we'd have to undo that work if dns_name_copy()
2406 * failed. By setting foundname first, there's nothing to undo if
2407 * we have trouble.
2408 */
2409 if (foundname != NULL && search->copy_name) {
2410 zcname = dns_fixedname_name(&search->zonecut_name);
2411 result = dns_name_copy(zcname, foundname, NULL);
2412 if (result != ISC_R_SUCCESS)
2413 return (result);
2414 }
2415 if (nodep != NULL) {
2416 /*
2417 * Note that we don't have to increment the node's reference
2418 * count here because we're going to use the reference we
2419 * already have in the search block.
2420 */
2421 *nodep = node;
2422 search->need_cleanup = ISC_FALSE;
2423 }
2424 if (rdataset != NULL) {
2425 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2426 isc_rwlocktype_read);
2427 bind_rdataset(search->rbtdb, node, search->zonecut_rdataset,
2428 search->now, rdataset);
2429 if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL)
2430 bind_rdataset(search->rbtdb, node,
2431 search->zonecut_sigrdataset,
2432 search->now, sigrdataset);
2433 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2434 isc_rwlocktype_read);
2435 }
2436
2437 if (type == dns_rdatatype_dname)
2438 return (DNS_R_DNAME);
2439 return (DNS_R_DELEGATION);
2440}
2441
2442static inline isc_boolean_t
2443valid_glue(rbtdb_search_t *search, dns_name_t *name, rbtdb_rdatatype_t type,
2444 dns_rbtnode_t *node)
2445{
2446 unsigned char *raw; /* RDATASLAB */
2447 unsigned int count, size;
2448 dns_name_t ns_name;
2449 isc_boolean_t valid = ISC_FALSE;
2450 dns_offsets_t offsets;
2451 isc_region_t region;
2452 rdatasetheader_t *header;
2453
2454 /*
2455 * No additional locking is required.
2456 */
2457
2458 /*
2459 * Valid glue types are A, AAAA, A6. NS is also a valid glue type
2460 * if it occurs at a zone cut, but is not valid below it.
2461 */
2462 if (type == dns_rdatatype_ns) {
2463 if (node != search->zonecut) {
2464 return (ISC_FALSE);
2465 }
2466 } else if (type != dns_rdatatype_a &&
2467 type != dns_rdatatype_aaaa &&
2468 type != dns_rdatatype_a6) {
2469 return (ISC_FALSE);
2470 }
2471
2472 header = search->zonecut_rdataset;
2473 raw = (unsigned char *)header + sizeof(*header);
2474 count = raw[0] * 256 + raw[1];
2475#if DNS_RDATASET_FIXED
2476 raw += 2 + (4 * count);
2477#else
2478 raw += 2;
2479#endif
2480
2481 while (count > 0) {
2482 count--;
2483 size = raw[0] * 256 + raw[1];
2484#if DNS_RDATASET_FIXED
2485 raw += 4;
2486#else
2487 raw += 2;
2488#endif
2489 region.base = raw;
2490 region.length = size;
2491 raw += size;
2492 /*
2493 * XXX Until we have rdata structures, we have no choice but
2494 * to directly access the rdata format.
2495 */
2496 dns_name_init(&ns_name, offsets);
2497 dns_name_fromregion(&ns_name, &region);
2498 if (dns_name_compare(&ns_name, name) == 0) {
2499 valid = ISC_TRUE;
2500 break;
2501 }
2502 }
2503
2504 return (valid);
2505}
2506
2507static inline isc_boolean_t
2508activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain,
2509 dns_name_t *name)
2510{
2511 dns_fixedname_t fnext;
2512 dns_fixedname_t forigin;
2513 dns_name_t *next;
2514 dns_name_t *origin;
2515 dns_name_t prefix;
2516 dns_rbtdb_t *rbtdb;
2517 dns_rbtnode_t *node;
2518 isc_result_t result;
2519 isc_boolean_t answer = ISC_FALSE;
2520 rdatasetheader_t *header;
2521
2522 rbtdb = search->rbtdb;
2523
2524 dns_name_init(&prefix, NULL);
2525 dns_fixedname_init(&fnext);
2526 next = dns_fixedname_name(&fnext);
2527 dns_fixedname_init(&forigin);
2528 origin = dns_fixedname_name(&forigin);
2529
2530 result = dns_rbtnodechain_next(chain, NULL, NULL);
2531 while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2532 node = NULL;
2533 result = dns_rbtnodechain_current(chain, &prefix,
2534 origin, &node);
2535 if (result != ISC_R_SUCCESS)
2536 break;
2537 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2538 isc_rwlocktype_read);
2539 for (header = node->data;
2540 header != NULL;
2541 header = header->next) {
2542 if (header->serial <= search->serial &&
2543 !IGNORE(header) && EXISTS(header))
2544 break;
2545 }
2546 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2547 isc_rwlocktype_read);
2548 if (header != NULL)
2549 break;
2550 result = dns_rbtnodechain_next(chain, NULL, NULL);
2551 }
2552 if (result == ISC_R_SUCCESS)
2553 result = dns_name_concatenate(&prefix, origin, next, NULL);
2554 if (result == ISC_R_SUCCESS && dns_name_issubdomain(next, name))
2555 answer = ISC_TRUE;
2556 return (answer);
2557}
2558
2559static inline isc_boolean_t
2560activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) {
2561 dns_fixedname_t fnext;
2562 dns_fixedname_t forigin;
2563 dns_fixedname_t fprev;
2564 dns_name_t *next;
2565 dns_name_t *origin;
2566 dns_name_t *prev;
2567 dns_name_t name;
2568 dns_name_t rname;
2569 dns_name_t tname;
2570 dns_rbtdb_t *rbtdb;
2571 dns_rbtnode_t *node;
2572 dns_rbtnodechain_t chain;
2573 isc_boolean_t check_next = ISC_TRUE;
2574 isc_boolean_t check_prev = ISC_TRUE;
2575 isc_boolean_t answer = ISC_FALSE;
2576 isc_result_t result;
2577 rdatasetheader_t *header;
2578 unsigned int n;
2579
2580 rbtdb = search->rbtdb;
2581
2582 dns_name_init(&name, NULL);
2583 dns_name_init(&tname, NULL);
2584 dns_name_init(&rname, NULL);
2585 dns_fixedname_init(&fnext);
2586 next = dns_fixedname_name(&fnext);
2587 dns_fixedname_init(&fprev);
2588 prev = dns_fixedname_name(&fprev);
2589 dns_fixedname_init(&forigin);
2590 origin = dns_fixedname_name(&forigin);
2591
2592 /*
2593 * Find if qname is at or below a empty node.
2594 * Use our own copy of the chain.
2595 */
2596
2597 chain = search->chain;
2598 do {
2599 node = NULL;
2600 result = dns_rbtnodechain_current(&chain, &name,
2601 origin, &node);
2602 if (result != ISC_R_SUCCESS)
2603 break;
2604 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2605 isc_rwlocktype_read);
2606 for (header = node->data;
2607 header != NULL;
2608 header = header->next) {
2609 if (header->serial <= search->serial &&
2610 !IGNORE(header) && EXISTS(header))
2611 break;
2612 }
2613 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2614 isc_rwlocktype_read);
2615 if (header != NULL)
2616 break;
2617 result = dns_rbtnodechain_prev(&chain, NULL, NULL);
2618 } while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN);
2619 if (result == ISC_R_SUCCESS)
2620 result = dns_name_concatenate(&name, origin, prev, NULL);
2621 if (result != ISC_R_SUCCESS)
2622 check_prev = ISC_FALSE;
2623
2624 result = dns_rbtnodechain_next(&chain, NULL, NULL);
2625 while (result == ISC_R_SUCCESS || result == DNS_R_NEWORIGIN) {
2626 node = NULL;
2627 result = dns_rbtnodechain_current(&chain, &name,
2628 origin, &node);
2629 if (result != ISC_R_SUCCESS)
2630 break;
2631 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2632 isc_rwlocktype_read);
2633 for (header = node->data;
2634 header != NULL;
2635 header = header->next) {
2636 if (header->serial <= search->serial &&
2637 !IGNORE(header) && EXISTS(header))
2638 break;
2639 }
2640 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2641 isc_rwlocktype_read);
2642 if (header != NULL)
2643 break;
2644 result = dns_rbtnodechain_next(&chain, NULL, NULL);
2645 }
2646 if (result == ISC_R_SUCCESS)
2647 result = dns_name_concatenate(&name, origin, next, NULL);
2648 if (result != ISC_R_SUCCESS)
2649 check_next = ISC_FALSE;
2650
2651 dns_name_clone(qname, &rname);
2652
2653 /*
2654 * Remove the wildcard label to find the terminal name.
2655 */
2656 n = dns_name_countlabels(wname);
2657 dns_name_getlabelsequence(wname, 1, n - 1, &tname);
2658
2659 do {
2660 if ((check_prev && dns_name_issubdomain(prev, &rname)) ||
2661 (check_next && dns_name_issubdomain(next, &rname))) {
2662 answer = ISC_TRUE;
2663 break;
2664 }
2665 /*
2666 * Remove the left hand label.
2667 */
2668 n = dns_name_countlabels(&rname);
2669 dns_name_getlabelsequence(&rname, 1, n - 1, &rname);
2670 } while (!dns_name_equal(&rname, &tname));
2671 return (answer);
2672}
2673
2674static inline isc_result_t
2675find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep,
2676 dns_name_t *qname)
2677{
2678 unsigned int i, j;
2679 dns_rbtnode_t *node, *level_node, *wnode;
2680 rdatasetheader_t *header;
2681 isc_result_t result = ISC_R_NOTFOUND;
2682 dns_name_t name;
2683 dns_name_t *wname;
2684 dns_fixedname_t fwname;
2685 dns_rbtdb_t *rbtdb;
2686 isc_boolean_t done, wild, active;
2687 dns_rbtnodechain_t wchain;
2688
2689 /*
2690 * Caller must be holding the tree lock and MUST NOT be holding
2691 * any node locks.
2692 */
2693
2694 /*
2695 * Examine each ancestor level. If the level's wild bit
2696 * is set, then construct the corresponding wildcard name and
2697 * search for it. If the wildcard node exists, and is active in
2698 * this version, we're done. If not, then we next check to see
2699 * if the ancestor is active in this version. If so, then there
2700 * can be no possible wildcard match and again we're done. If not,
2701 * continue the search.
2702 */
2703
2704 rbtdb = search->rbtdb;
2705 i = search->chain.level_matches;
2706 done = ISC_FALSE;
2707 node = *nodep;
2708 do {
2709 NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock),
2710 isc_rwlocktype_read);
2711
2712 /*
2713 * First we try to figure out if this node is active in
2714 * the search's version. We do this now, even though we
2715 * may not need the information, because it simplifies the
2716 * locking and code flow.
2717 */
2718 for (header = node->data;
2719 header != NULL;
2720 header = header->next) {
2721 if (header->serial <= search->serial &&
2722 !IGNORE(header) && EXISTS(header))
2723 break;
2724 }
2725 if (header != NULL)
2726 active = ISC_TRUE;
2727 else
2728 active = ISC_FALSE;
2729
2730 if (node->wild)
2731 wild = ISC_TRUE;
2732 else
2733 wild = ISC_FALSE;
2734
2735 NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock),
2736 isc_rwlocktype_read);
2737
2738 if (wild) {
2739 /*
2740 * Construct the wildcard name for this level.
2741 */
2742 dns_name_init(&name, NULL);
2743 dns_rbt_namefromnode(node, &name);
2744 dns_fixedname_init(&fwname);
2745 wname = dns_fixedname_name(&fwname);
2746 result = dns_name_concatenate(dns_wildcardname, &name,
2747 wname, NULL);
2748 j = i;
2749 while (result == ISC_R_SUCCESS && j != 0) {
2750 j--;
2751 level_node = search->chain.levels[j];
2752 dns_name_init(&name, NULL);
2753 dns_rbt_namefromnode(level_node, &name);
2754 result = dns_name_concatenate(wname,
2755 &name,
2756 wname,
2757 NULL);
2758 }
2759 if (result != ISC_R_SUCCESS)
2760 break;
2761
2762 wnode = NULL;
2763 dns_rbtnodechain_init(&wchain, NULL);
2764 result = dns_rbt_findnode(rbtdb->tree, wname,
2765 NULL, &wnode, &wchain,
2766 DNS_RBTFIND_EMPTYDATA,
2767 NULL, NULL);
2768 if (result == ISC_R_SUCCESS) {
2769 nodelock_t *lock;
2770
2771 /*
2772 * We have found the wildcard node. If it
2773 * is active in the search's version, we're
2774 * done.
2775 */
2776 lock = &rbtdb->node_locks[wnode->locknum].lock;
2777 NODE_LOCK(lock, isc_rwlocktype_read);
2778 for (header = wnode->data;
2779 header != NULL;
2780 header = header->next) {
2781 if (header->serial <= search->serial &&
2782 !IGNORE(header) && EXISTS(header))
2783 break;
2784 }
2785 NODE_UNLOCK(lock, isc_rwlocktype_read);
2786 if (header != NULL ||
2787 activeempty(search, &wchain, wname)) {
2788 if (activeemtpynode(search, qname,
2789 wname)) {
2790 return (ISC_R_NOTFOUND);
2791 }
2792 /*
2793 * The wildcard node is active!
2794 *
2795 * Note: result is still ISC_R_SUCCESS
2796 * so we don't have to set it.
2797 */
2798 *nodep = wnode;
2799 break;
2800 }
2801 } else if (result != ISC_R_NOTFOUND &&
2802 result != DNS_R_PARTIALMATCH) {
2803 /*
2804 * An error has occurred. Bail out.
2805 */
2806 break;
2807 }
2808 }
2809
2810 if (active) {
2811 /*
2812 * The level node is active. Any wildcarding
2813 * present at higher levels has no
2814 * effect and we're done.
2815 */
2816 result = ISC_R_NOTFOUND;
2817 break;
2818 }
2819
2820 if (i > 0) {
2821 i--;
2822 node = search->chain.levels[i];
2823 } else
2824 done = ISC_TRUE;
2825 } while (!done);
2826
2827 return (result);
2828}
2829
2830static inline isc_result_t
2831find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
2832 dns_name_t *foundname, dns_rdataset_t *rdataset,
2833 dns_rdataset_t *sigrdataset, isc_boolean_t need_sig)
2834{
2835 dns_rbtnode_t *node;
2836 rdatasetheader_t *header, *header_next, *found, *foundsig;
2837 isc_boolean_t empty_node;
2838 isc_result_t result;
2839 dns_fixedname_t fname, forigin;
2840 dns_name_t *name, *origin;
2841
2842 do {
2843 node = NULL;
2844 dns_fixedname_init(&fname);
2845 name = dns_fixedname_name(&fname);
2846 dns_fixedname_init(&forigin);
2847 origin = dns_fixedname_name(&forigin);
2848 result = dns_rbtnodechain_current(&search->chain, name,
2849 origin, &node);
2850 if (result != ISC_R_SUCCESS)
2851 return (result);
2852 NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2853 isc_rwlocktype_read);
2854 found = NULL;
2855 foundsig = NULL;
2856 empty_node = ISC_TRUE;
2857 for (header = node->data;
2858 header != NULL;
2859 header = header_next) {
2860 header_next = header->next;
2861 /*
2862 * Look for an active, extant NSEC or RRSIG NSEC.
2863 */
2864 do {
2865 if (header->serial <= search->serial &&
2866 !IGNORE(header)) {
2867 /*
2868 * Is this a "this rdataset doesn't
2869 * exist" record?
2870 */
2871 if (NONEXISTENT(header))
2872 header = NULL;
2873 break;
2874 } else
2875 header = header->down;
2876 } while (header != NULL);
2877 if (header != NULL) {
2878 /*
2879 * We now know that there is at least one
2880 * active rdataset at this node.
2881 */
2882 empty_node = ISC_FALSE;
2883 if (header->type == dns_rdatatype_nsec) {
2884 found = header;
2885 if (foundsig != NULL)
2886 break;
2887 } else if (header->type ==
2888 RBTDB_RDATATYPE_SIGNSEC) {
2889 foundsig = header;
2890 if (found != NULL)
2891 break;
2892 }
2893 }
2894 }
2895 if (!empty_node) {
2896 if (found != NULL &&
2897 (foundsig != NULL || !need_sig))
2898 {
2899 /*
2900 * We've found the right NSEC record.
2901 *
2902 * Note: for this to really be the right
2903 * NSEC record, it's essential that the NSEC
2904 * records of any nodes obscured by a zone
2905 * cut have been removed; we assume this is
2906 * the case.
2907 */
2908 result = dns_name_concatenate(name, origin,
2909 foundname, NULL);
2910 if (result == ISC_R_SUCCESS) {
2911 if (nodep != NULL) {
2912 new_reference(search->rbtdb,
2913 node);
2914 *nodep = node;
2915 }
2916 bind_rdataset(search->rbtdb, node,
2917 found, search->now,
2918 rdataset);
2919 if (foundsig != NULL)
2920 bind_rdataset(search->rbtdb,
2921 node,
2922 foundsig,
2923 search->now,
2924 sigrdataset);
2925 }
2926 } else if (found == NULL && foundsig == NULL) {
2927 /*
2928 * This node is active, but has no NSEC or
2929 * RRSIG NSEC. That means it's glue or
2930 * other obscured zone data that isn't
2931 * relevant for our search. Treat the
2932 * node as if it were empty and keep looking.
2933 */
2934 empty_node = ISC_TRUE;
2935 result = dns_rbtnodechain_prev(&search->chain,
2936 NULL, NULL);
2937 } else {
2938 /*
2939 * We found an active node, but either the
2940 * NSEC or the RRSIG NSEC is missing. This
2941 * shouldn't happen.
2942 */
2943 result = DNS_R_BADDB;
2944 }
2945 } else {
2946 /*
2947 * This node isn't active. We've got to keep
2948 * looking.
2949 */
2950 result = dns_rbtnodechain_prev(&search->chain, NULL,
2951 NULL);
2952 }
2953 NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock),
2954 isc_rwlocktype_read);
2955 } while (empty_node && result == ISC_R_SUCCESS);
2956
2957 /*
2958 * If the result is ISC_R_NOMORE, then we got to the beginning of
2959 * the database and didn't find a NSEC record. This shouldn't
2960 * happen.
2961 */
2962 if (result == ISC_R_NOMORE)
2963 result = DNS_R_BADDB;
2964
2965 return (result);
2966}
2967
2968static isc_result_t
2969zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
2970 dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
2971 dns_dbnode_t **nodep, dns_name_t *foundname,
2972 dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
2973{
2974 dns_rbtnode_t *node = NULL;
2975 isc_result_t result;
2976 rbtdb_search_t search;
2977 isc_boolean_t cname_ok = ISC_TRUE;
2978 isc_boolean_t close_version = ISC_FALSE;
2979 isc_boolean_t maybe_zonecut = ISC_FALSE;
2980 isc_boolean_t at_zonecut = ISC_FALSE;
2981 isc_boolean_t wild;
2982 isc_boolean_t empty_node;
2983 rdatasetheader_t *header, *header_next, *found, *nsecheader;
2984 rdatasetheader_t *foundsig, *cnamesig, *nsecsig;
2985 rbtdb_rdatatype_t sigtype;
2986 isc_boolean_t active;
2987 dns_rbtnodechain_t chain;
2988 nodelock_t *lock;
2989
2990
2991 search.rbtdb = (dns_rbtdb_t *)db;
2992
2993 REQUIRE(VALID_RBTDB(search.rbtdb));
2994
2995 /*
2996 * We don't care about 'now'.
2997 */
2998 UNUSED(now);
2999
3000 /*
3001 * If the caller didn't supply a version, attach to the current
3002 * version.
3003 */
3004 if (version == NULL) {
3005 currentversion(db, &version);
3006 close_version = ISC_TRUE;
3007 }
3008
3009 search.rbtversion = version;
3010 search.serial = search.rbtversion->serial;
3011 search.options = options;
3012 search.copy_name = ISC_FALSE;
3013 search.need_cleanup = ISC_FALSE;
3014 search.wild = ISC_FALSE;
3015 search.zonecut = NULL;
3016 dns_fixedname_init(&search.zonecut_name);
3017 dns_rbtnodechain_init(&search.chain, search.rbtdb->common.mctx);
3018 search.now = 0;
3019
3020 /*
3021 * 'wild' will be true iff. we've matched a wildcard.
3022 */
3023 wild = ISC_FALSE;
3024
3025 RWLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3026
3027 /*
3028 * Search down from the root of the tree. If, while going down, we
3029 * encounter a callback node, zone_zonecut_callback() will search the
3030 * rdatasets at the zone cut for active DNAME or NS rdatasets.
3031 */
3032 result = dns_rbt_findnode(search.rbtdb->tree, name, foundname, &node,
3033 &search.chain, DNS_RBTFIND_EMPTYDATA,
3034 zone_zonecut_callback, &search);
3035
3036 if (result == DNS_R_PARTIALMATCH) {
3037 partial_match:
3038 if (search.zonecut != NULL) {
3039 result = setup_delegation(&search, nodep, foundname,
3040 rdataset, sigrdataset);
3041 goto tree_exit;
3042 }
3043
3044 if (search.wild) {
3045 /*
3046 * At least one of the levels in the search chain
3047 * potentially has a wildcard. For each such level,
3048 * we must see if there's a matching wildcard active
3049 * in the current version.
3050 */
3051 result = find_wildcard(&search, &node, name);
3052 if (result == ISC_R_SUCCESS) {
3053 result = dns_name_copy(name, foundname, NULL);
3054 if (result != ISC_R_SUCCESS)
3055 goto tree_exit;
3056 wild = ISC_TRUE;
3057 goto found;
3058 }
3059 else if (result != ISC_R_NOTFOUND)
3060 goto tree_exit;
3061 }
3062
3063 chain = search.chain;
3064 active = activeempty(&search, &chain, name);
3065
3066 /*
3067 * If we're here, then the name does not exist, is not
3068 * beneath a zonecut, and there's no matching wildcard.
3069 */
3070 if (search.rbtdb->secure ||
3071 (search.options & DNS_DBFIND_FORCENSEC) != 0)
3072 {
3073 result = find_closest_nsec(&search, nodep, foundname,
3074 rdataset, sigrdataset,
3075 search.rbtdb->secure);
3076 if (result == ISC_R_SUCCESS)
3077 result = active ? DNS_R_EMPTYNAME :
3078 DNS_R_NXDOMAIN;
3079 } else
3080 result = active ? DNS_R_EMPTYNAME : DNS_R_NXDOMAIN;
3081 goto tree_exit;
3082 } else if (result != ISC_R_SUCCESS)
3083 goto tree_exit;
3084
3085 found:
3086 /*
3087 * We have found a node whose name is the desired name, or we
3088 * have matched a wildcard.
3089 */
3090
3091 if (search.zonecut != NULL) {
3092 /*
3093 * If we're beneath a zone cut, we don't want to look for
3094 * CNAMEs because they're not legitimate zone glue.
3095 */
3096 cname_ok = ISC_FALSE;
3097 } else {
3098 /*
3099 * The node may be a zone cut itself. If it might be one,
3100 * make sure we check for it later.
3101 */
3102 if (node->find_callback &&
3103 (node != search.rbtdb->origin_node ||
3104 IS_STUB(search.rbtdb)) &&
3105 !dns_rdatatype_atparent(type))
3106 maybe_zonecut = ISC_TRUE;
3107 }
3108
3109 /*
3110 * Certain DNSSEC types are not subject to CNAME matching
3111 * (RFC4035, section 2.5 and RFC3007).
3112 *
3113 * We don't check for RRSIG, because we don't store RRSIG records
3114 * directly.
3115 */
3116 if (type == dns_rdatatype_key || type == dns_rdatatype_nsec)
3117 cname_ok = ISC_FALSE;
3118
3119 /*
3120 * We now go looking for rdata...
3121 */
3122
3123 NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3124 isc_rwlocktype_read);
3125
3126 found = NULL;
3127 foundsig = NULL;
3128 sigtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, type);
3129 nsecheader = NULL;
3130 nsecsig = NULL;
3131 cnamesig = NULL;
3132 empty_node = ISC_TRUE;
3133 for (header = node->data; header != NULL; header = header_next) {
3134 header_next = header->next;
3135 /*
3136 * Look for an active, extant rdataset.
3137 */
3138 do {
3139 if (header->serial <= search.serial &&
3140 !IGNORE(header)) {
3141 /*
3142 * Is this a "this rdataset doesn't
3143 * exist" record?
3144 */
3145 if (NONEXISTENT(header))
3146 header = NULL;
3147 break;
3148 } else
3149 header = header->down;
3150 } while (header != NULL);
3151 if (header != NULL) {
3152 /*
3153 * We now know that there is at least one active
3154 * rdataset at this node.
3155 */
3156 empty_node = ISC_FALSE;
3157
3158 /*
3159 * Do special zone cut handling, if requested.
3160 */
3161 if (maybe_zonecut &&
3162 header->type == dns_rdatatype_ns) {
3163 /*
3164 * We increment the reference count on node to
3165 * ensure that search->zonecut_rdataset will
3166 * still be valid later.
3167 */
3168 new_reference(search.rbtdb, node);
3169 search.zonecut = node;
3170 search.zonecut_rdataset = header;
3171 search.zonecut_sigrdataset = NULL;
3172 search.need_cleanup = ISC_TRUE;
3173 maybe_zonecut = ISC_FALSE;
3174 at_zonecut = ISC_TRUE;
3175 /*
3176 * It is not clear if KEY should still be
3177 * allowed at the parent side of the zone
3178 * cut or not. It is needed for RFC3007
3179 * validated updates.
3180 */
3181 if ((search.options & DNS_DBFIND_GLUEOK) == 0
3182 && type != dns_rdatatype_nsec
3183 && type != dns_rdatatype_key) {
3184 /*
3185 * Glue is not OK, but any answer we
3186 * could return would be glue. Return
3187 * the delegation.
3188 */
3189 found = NULL;
3190 break;
3191 }
3192 if (found != NULL && foundsig != NULL)
3193 break;
3194 }
3195
3196 /*
3197 * If we found a type we were looking for,
3198 * remember it.
3199 */
3200 if (header->type == type ||
3201 type == dns_rdatatype_any ||
3202 (header->type == dns_rdatatype_cname &&
3203 cname_ok)) {
3204 /*
3205 * We've found the answer!
3206 */
3207 found = header;
3208 if (header->type == dns_rdatatype_cname &&
3209 cname_ok) {
3210 /*
3211 * We may be finding a CNAME instead
3212 * of the desired type.
3213 *
3214 * If we've already got the CNAME RRSIG,
3215 * use it, otherwise change sigtype
3216 * so that we find it.
3217 */
3218 if (cnamesig != NULL)
3219 foundsig = cnamesig;
3220 else
3221 sigtype =
3222 RBTDB_RDATATYPE_SIGCNAME;
3223 }
3224 /*
3225 * If we've got all we need, end the search.
3226 */
3227 if (!maybe_zonecut && foundsig != NULL)
3228 break;
3229 } else if (header->type == sigtype) {
3230 /*
3231 * We've found the RRSIG rdataset for our
3232 * target type. Remember it.
3233 */
3234 foundsig = header;
3235 /*
3236 * If we've got all we need, end the search.
3237 */
3238 if (!maybe_zonecut && found != NULL)
3239 break;
3240 } else if (header->type == dns_rdatatype_nsec) {
3241 /*
3242 * Remember a NSEC rdataset even if we're
3243 * not specifically looking for it, because
3244 * we might need it later.
3245 */
3246 nsecheader = header;
3247 } else if (header->type == RBTDB_RDATATYPE_SIGNSEC) {
3248 /*
3249 * If we need the NSEC rdataset, we'll also
3250 * need its signature.
3251 */
3252 nsecsig = header;
3253 } else if (cname_ok &&
3254 header->type == RBTDB_RDATATYPE_SIGCNAME) {
3255 /*
3256 * If we get a CNAME match, we'll also need
3257 * its signature.
3258 */
3259 cnamesig = header;
3260 }
3261 }
3262 }
3263
3264 if (empty_node) {
3265 /*
3266 * We have an exact match for the name, but there are no
3267 * active rdatasets in the desired version. That means that
3268 * this node doesn't exist in the desired version, and that
3269 * we really have a partial match.
3270 */
3271 if (!wild) {
3272 lock = &search.rbtdb->node_locks[node->locknum].lock;
3273 NODE_UNLOCK(lock, isc_rwlocktype_read);
3274 goto partial_match;
3275 }
3276 }
3277
3278 /*
3279 * If we didn't find what we were looking for...
3280 */
3281 if (found == NULL) {
3282 if (search.zonecut != NULL) {
3283 /*
3284 * We were trying to find glue at a node beneath a
3285 * zone cut, but didn't.
3286 *
3287 * Return the delegation.
3288 */
3289 lock = &search.rbtdb->node_locks[node->locknum].lock;
3290 NODE_UNLOCK(lock, isc_rwlocktype_read);
3291 result = setup_delegation(&search, nodep, foundname,
3292 rdataset, sigrdataset);
3293 goto tree_exit;
3294 }
3295 /*
3296 * The desired type doesn't exist.
3297 */
3298 result = DNS_R_NXRRSET;
3299 if (search.rbtdb->secure &&
3300 (nsecheader == NULL || nsecsig == NULL)) {
3301 /*
3302 * The zone is secure but there's no NSEC,
3303 * or the NSEC has no signature!
3304 */
3305 if (!wild) {
3306 result = DNS_R_BADDB;
3307 goto node_exit;
3308 }
3309
3310 lock = &search.rbtdb->node_locks[node->locknum].lock;
3311 NODE_UNLOCK(lock, isc_rwlocktype_read);
3312 result = find_closest_nsec(&search, nodep, foundname,
3313 rdataset, sigrdataset,
3314 search.rbtdb->secure);
3315 if (result == ISC_R_SUCCESS)
3316 result = DNS_R_EMPTYWILD;
3317 goto tree_exit;
3318 }
3319 if ((search.options & DNS_DBFIND_FORCENSEC) != 0 &&
3320 nsecheader == NULL)
3321 {
3322 /*
3323 * There's no NSEC record, and we were told
3324 * to find one.
3325 */
3326 result = DNS_R_BADDB;
3327 goto node_exit;
3328 }
3329 if (nodep != NULL) {
3330 new_reference(search.rbtdb, node);
3331 *nodep = node;
3332 }
3333 if (search.rbtdb->secure ||
3334 (search.options & DNS_DBFIND_FORCENSEC) != 0)
3335 {
3336 bind_rdataset(search.rbtdb, node, nsecheader,
3337 0, rdataset);
3338 if (nsecsig != NULL)
3339 bind_rdataset(search.rbtdb, node,
3340 nsecsig, 0, sigrdataset);
3341 }
3342 if (wild)
3343 foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3344 goto node_exit;
3345 }
3346
3347 /*
3348 * We found what we were looking for, or we found a CNAME.
3349 */
3350
3351 if (type != found->type &&
3352 type != dns_rdatatype_any &&
3353 found->type == dns_rdatatype_cname) {
3354 /*
3355 * We weren't doing an ANY query and we found a CNAME instead
3356 * of the type we were looking for, so we need to indicate
3357 * that result to the caller.
3358 */
3359 result = DNS_R_CNAME;
3360 } else if (search.zonecut != NULL) {
3361 /*
3362 * If we're beneath a zone cut, we must indicate that the
3363 * result is glue, unless we're actually at the zone cut
3364 * and the type is NSEC or KEY.
3365 */
3366 if (search.zonecut == node) {
3367 /*
3368 * It is not clear if KEY should still be
3369 * allowed at the parent side of the zone
3370 * cut or not. It is needed for RFC3007
3371 * validated updates.
3372 */
3373 if (type == dns_rdatatype_nsec ||
3374 type == dns_rdatatype_key)
3375 result = ISC_R_SUCCESS;
3376 else if (type == dns_rdatatype_any)
3377 result = DNS_R_ZONECUT;
3378 else
3379 result = DNS_R_GLUE;
3380 } else
3381 result = DNS_R_GLUE;
3382 /*
3383 * We might have found data that isn't glue, but was occluded
3384 * by a dynamic update. If the caller cares about this, they
3385 * will have told us to validate glue.
3386 *
3387 * XXX We should cache the glue validity state!
3388 */
3389 if (result == DNS_R_GLUE &&
3390 (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 &&
3391 !valid_glue(&search, foundname, type, node)) {
3392 lock = &search.rbtdb->node_locks[node->locknum].lock;
3393 NODE_UNLOCK(lock, isc_rwlocktype_read);
3394 result = setup_delegation(&search, nodep, foundname,
3395 rdataset, sigrdataset);
3396 goto tree_exit;
3397 }
3398 } else {
3399 /*
3400 * An ordinary successful query!
3401 */
3402 result = ISC_R_SUCCESS;
3403 }
3404
3405 if (nodep != NULL) {
3406 if (!at_zonecut)
3407 new_reference(search.rbtdb, node);
3408 else
3409 search.need_cleanup = ISC_FALSE;
3410 *nodep = node;
3411 }
3412
3413 if (type != dns_rdatatype_any) {
3414 bind_rdataset(search.rbtdb, node, found, 0, rdataset);
3415 if (foundsig != NULL)
3416 bind_rdataset(search.rbtdb, node, foundsig, 0,
3417 sigrdataset);
3418 }
3419
3420 if (wild)
3421 foundname->attributes |= DNS_NAMEATTR_WILDCARD;
3422
3423 node_exit:
3424 NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock),
3425 isc_rwlocktype_read);
3426
3427 tree_exit:
3428 RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read);
3429
3430 /*
3431 * If we found a zonecut but aren't going to use it, we have to
3432 * let go of it.
3433 */
3434 if (search.need_cleanup) {
3435 node = search.zonecut;
3436 lock = &(search.rbtdb->node_locks[node->locknum].lock);
3437
3438 NODE_LOCK(lock, isc_rwlocktype_read);
3439 decrement_reference(search.rbtdb, node, 0,
3440 isc_rwlocktype_read, isc_rwlocktype_none,
3441 ISC_FALSE);
3442 NODE_UNLOCK(lock, isc_rwlocktype_read);
3443 }
3444
3445 if (close_version)
3446 closeversion(db, &version, ISC_FALSE);
3447
3448 dns_rbtnodechain_reset(&search.chain);
3449
3450 return (result);
3451}
3452
3453static isc_result_t
3454zone_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options,
3455 isc_stdtime_t now, dns_dbnode_t **nodep,
3456 dns_name_t *foundname,
3457 dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3458{
3459 UNUSED(db);
3460 UNUSED(name);
3461 UNUSED(options);
3462 UNUSED(now);
3463 UNUSED(nodep);
3464 UNUSED(foundname);
3465 UNUSED(rdataset);
3466 UNUSED(sigrdataset);
3467
3468 FATAL_ERROR(__FILE__, __LINE__, "zone_findzonecut() called!");
3469
3470 return (ISC_R_NOTIMPLEMENTED);
3471}
3472
3473static isc_result_t
3474cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) {
3475 rbtdb_search_t *search = arg;
3476 rdatasetheader_t *header, *header_prev, *header_next;
3477 rdatasetheader_t *dname_header, *sigdname_header;
3478 isc_result_t result;
3479 nodelock_t *lock;
3480 isc_rwlocktype_t locktype;
3481
3482 /* XXX comment */
3483
3484 REQUIRE(search->zonecut == NULL);
3485
3486 /*
3487 * Keep compiler silent.
3488 */
3489 UNUSED(name);
3490
3491 lock = &(search->rbtdb->node_locks[node->locknum].lock);
3492 locktype = isc_rwlocktype_read;
3493 NODE_LOCK(lock, locktype);
3494
3495 /*
3496 * Look for a DNAME or RRSIG DNAME rdataset.
3497 */
3498 dname_header = NULL;
3499 sigdname_header = NULL;
3500 header_prev = NULL;
3501 for (header = node->data; header != NULL; header = header_next) {
3502 header_next = header->next;
3503 if (header->rdh_ttl <= search->now) {
3504 /*
3505 * This rdataset is stale. If no one else is
3506 * using the node, we can clean it up right
3507 * now, otherwise we mark it as stale, and
3508 * the node as dirty, so it will get cleaned
3509 * up later.
3510 */
3511 if ((header->rdh_ttl <= search->now - RBTDB_VIRTUAL) &&
3512 (locktype == isc_rwlocktype_write ||
3513 NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3514 /*
3515 * We update the node's status only when we
3516 * can get write access; otherwise, we leave
3517 * others to this work. Periodical cleaning
3518 * will eventually take the job as the last
3519 * resort.
3520 * We won't downgrade the lock, since other
3521 * rdatasets are probably stale, too.
3522 */
3523 locktype = isc_rwlocktype_write;
3524
3525 if (dns_rbtnode_refcurrent(node) == 0) {
3526 isc_mem_t *mctx;
3527
3528 /*
3529 * header->down can be non-NULL if the
3530 * refcount has just decremented to 0
3531 * but decrement_reference() has not
3532 * performed clean_cache_node(), in
3533 * which case we need to purge the
3534 * stale headers first.
3535 */
3536 mctx = search->rbtdb->common.mctx;
3537 clean_stale_headers(search->rbtdb,
3538 mctx,
3539 header);
3540 if (header_prev != NULL)
3541 header_prev->next =
3542 header->next;
3543 else
3544 node->data = header->next;
3545 free_rdataset(search->rbtdb, mctx,
3546 header);
3547 } else {
3548 header->attributes |=
3549 RDATASET_ATTR_STALE;
3550 node->dirty = 1;
3551 header_prev = header;
3552 }
3553 } else
3554 header_prev = header;
3555 } else if (header->type == dns_rdatatype_dname &&
3556 EXISTS(header)) {
3557 dname_header = header;
3558 header_prev = header;
3559 } else if (header->type == RBTDB_RDATATYPE_SIGDNAME &&
3560 EXISTS(header)) {
3561 sigdname_header = header;
3562 header_prev = header;
3563 } else
3564 header_prev = header;
3565 }
3566
3567 if (dname_header != NULL &&
c4067435 3568 (!DNS_TRUST_PENDING(dname_header->trust) ||
bbbf71a3
JL
3569 (search->options & DNS_DBFIND_PENDINGOK) != 0)) {
3570 /*
3571 * We increment the reference count on node to ensure that
3572 * search->zonecut_rdataset will still be valid later.
3573 */
3574 new_reference(search->rbtdb, node);
3575 INSIST(!ISC_LINK_LINKED(node, deadlink));
3576 search->zonecut = node;
3577 search->zonecut_rdataset = dname_header;
3578 search->zonecut_sigrdataset = sigdname_header;
3579 search->need_cleanup = ISC_TRUE;
3580 result = DNS_R_PARTIALMATCH;
3581 } else
3582 result = DNS_R_CONTINUE;
3583
3584 NODE_UNLOCK(lock, locktype);
3585
3586 return (result);
3587}
3588
3589static inline isc_result_t
3590find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node,
3591 dns_dbnode_t **nodep, dns_name_t *foundname,
3592 dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3593{
3594 unsigned int i;
3595 dns_rbtnode_t *level_node;
3596 rdatasetheader_t *header, *header_prev, *header_next;
3597 rdatasetheader_t *found, *foundsig;
3598 isc_result_t result = ISC_R_NOTFOUND;
3599 dns_name_t name;
3600 dns_rbtdb_t *rbtdb;
3601 isc_boolean_t done;
3602 nodelock_t *lock;
3603 isc_rwlocktype_t locktype;
3604
3605 /*
3606 * Caller must be holding the tree lock.
3607 */
3608
3609 rbtdb = search->rbtdb;
3610 i = search->chain.level_matches;
3611 done = ISC_FALSE;
3612 do {
3613 locktype = isc_rwlocktype_read;
3614 lock = &rbtdb->node_locks[node->locknum].lock;
3615 NODE_LOCK(lock, locktype);
3616
3617 /*
3618 * Look for NS and RRSIG NS rdatasets.
3619 */
3620 found = NULL;
3621 foundsig = NULL;
3622 header_prev = NULL;
3623 for (header = node->data;
3624 header != NULL;
3625 header = header_next) {
3626 header_next = header->next;
3627 if (header->rdh_ttl <= search->now) {
3628 /*
3629 * This rdataset is stale. If no one else is
3630 * using the node, we can clean it up right
3631 * now, otherwise we mark it as stale, and
3632 * the node as dirty, so it will get cleaned
3633 * up later.
3634 */
3635 if ((header->rdh_ttl <= search->now -
3636 RBTDB_VIRTUAL) &&
3637 (locktype == isc_rwlocktype_write ||
3638 NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3639 /*
3640 * We update the node's status only
3641 * when we can get write access.
3642 */
3643 locktype = isc_rwlocktype_write;
3644
3645 if (dns_rbtnode_refcurrent(node)
3646 == 0) {
3647 isc_mem_t *m;
3648
3649 m = search->rbtdb->common.mctx;
3650 clean_stale_headers(
3651 search->rbtdb,
3652 m, header);
3653 if (header_prev != NULL)
3654 header_prev->next =
3655 header->next;
3656 else
3657 node->data =
3658 header->next;
3659 free_rdataset(rbtdb, m,
3660 header);
3661 } else {
3662 header->attributes |=
3663 RDATASET_ATTR_STALE;
3664 node->dirty = 1;
3665 header_prev = header;
3666 }
3667 } else
3668 header_prev = header;
3669 } else if (EXISTS(header)) {
3670 /*
3671 * We've found an extant rdataset. See if
3672 * we're interested in it.
3673 */
3674 if (header->type == dns_rdatatype_ns) {
3675 found = header;
3676 if (foundsig != NULL)
3677 break;
3678 } else if (header->type ==
3679 RBTDB_RDATATYPE_SIGNS) {
3680 foundsig = header;
3681 if (found != NULL)
3682 break;
3683 }
3684 header_prev = header;
3685 } else
3686 header_prev = header;
3687 }
3688
3689 if (found != NULL) {
3690 /*
3691 * If we have to set foundname, we do it before
3692 * anything else. If we were to set foundname after
3693 * we had set nodep or bound the rdataset, then we'd
3694 * have to undo that work if dns_name_concatenate()
3695 * failed. By setting foundname first, there's
3696 * nothing to undo if we have trouble.
3697 */
3698 if (foundname != NULL) {
3699 dns_name_init(&name, NULL);
3700 dns_rbt_namefromnode(node, &name);
3701 result = dns_name_copy(&name, foundname, NULL);
3702 while (result == ISC_R_SUCCESS && i > 0) {
3703 i--;
3704 level_node = search->chain.levels[i];
3705 dns_name_init(&name, NULL);
3706 dns_rbt_namefromnode(level_node,
3707 &name);
3708 result =
3709 dns_name_concatenate(foundname,
3710 &name,
3711 foundname,
3712 NULL);
3713 }
3714 if (result != ISC_R_SUCCESS) {
3715 *nodep = NULL;
3716 goto node_exit;
3717 }
3718 }
3719 result = DNS_R_DELEGATION;
3720 if (nodep != NULL) {
3721 new_reference(search->rbtdb, node);
3722 *nodep = node;
3723 }
3724 bind_rdataset(search->rbtdb, node, found, search->now,
3725 rdataset);
3726 if (foundsig != NULL)
3727 bind_rdataset(search->rbtdb, node, foundsig,
3728 search->now, sigrdataset);
3729 if (need_headerupdate(found, search->now) ||
3730 (foundsig != NULL &&
3731 need_headerupdate(foundsig, search->now))) {
3732 if (locktype != isc_rwlocktype_write) {
3733 NODE_UNLOCK(lock, locktype);
3734 NODE_LOCK(lock, isc_rwlocktype_write);
3735 locktype = isc_rwlocktype_write;
3736 }
3737 if (need_headerupdate(found, search->now))
3738 update_header(search->rbtdb, found,
3739 search->now);
3740 if (foundsig != NULL &&
3741 need_headerupdate(foundsig, search->now)) {
3742 update_header(search->rbtdb, foundsig,
3743 search->now);
3744 }
3745 }
3746 }
3747
3748 node_exit:
3749 NODE_UNLOCK(lock, locktype);
3750
3751 if (found == NULL && i > 0) {
3752 i--;
3753 node = search->chain.levels[i];
3754 } else
3755 done = ISC_TRUE;
3756
3757 } while (!done);
3758
3759 return (result);
3760}
3761
3762static isc_result_t
3763find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep,
3764 isc_stdtime_t now, dns_name_t *foundname,
3765 dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3766{
3767 dns_rbtnode_t *node;
3768 rdatasetheader_t *header, *header_next, *header_prev;
3769 rdatasetheader_t *found, *foundsig;
3770 isc_boolean_t empty_node;
3771 isc_result_t result;
3772 dns_fixedname_t fname, forigin;
3773 dns_name_t *name, *origin;
3774 rbtdb_rdatatype_t matchtype, sigmatchtype;
3775 nodelock_t *lock;
3776 isc_rwlocktype_t locktype;
3777
3778 matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0);
3779 sigmatchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig,
3780 dns_rdatatype_nsec);
3781
3782 do {
3783 node = NULL;
3784 dns_fixedname_init(&fname);
3785 name = dns_fixedname_name(&fname);
3786 dns_fixedname_init(&forigin);
3787 origin = dns_fixedname_name(&forigin);
3788 result = dns_rbtnodechain_current(&search->chain, name,
3789 origin, &node);
3790 if (result != ISC_R_SUCCESS)
3791 return (result);
3792 locktype = isc_rwlocktype_read;
3793 lock = &(search->rbtdb->node_locks[node->locknum].lock);
3794 NODE_LOCK(lock, locktype);
3795 found = NULL;
3796 foundsig = NULL;
3797 empty_node = ISC_TRUE;
3798 header_prev = NULL;
3799 for (header = node->data;
3800 header != NULL;
3801 header = header_next) {
3802 header_next = header->next;
3803 if (header->rdh_ttl <= now) {
3804 /*
3805 * This rdataset is stale. If no one else is
3806 * using the node, we can clean it up right
3807 * now, otherwise we mark it as stale, and the
3808 * node as dirty, so it will get cleaned up
3809 * later.
3810 */
3811 if ((header->rdh_ttl <= now - RBTDB_VIRTUAL) &&
3812 (locktype == isc_rwlocktype_write ||
3813 NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS)) {
3814 /*
3815 * We update the node's status only
3816 * when we can get write access.
3817 */
3818 locktype = isc_rwlocktype_write;
3819
3820 if (dns_rbtnode_refcurrent(node)
3821 == 0) {
3822 isc_mem_t *m;
3823
3824 m = search->rbtdb->common.mctx;
3825 clean_stale_headers(
3826 search->rbtdb,
3827 m, header);
3828 if (header_prev != NULL)
3829 header_prev->next =
3830 header->next;
3831 else
3832 node->data = header->next;
3833 free_rdataset(search->rbtdb, m,
3834 header);
3835 } else {
3836 header->attributes |=
3837 RDATASET_ATTR_STALE;
3838 node->dirty = 1;
3839 header_prev = header;
3840 }
3841 } else
3842 header_prev = header;
3843 continue;
3844 }
3845 if (NONEXISTENT(header) ||
3846 RBTDB_RDATATYPE_BASE(header->type) == 0) {
3847 header_prev = header;
3848 continue;
3849 }
3850 empty_node = ISC_FALSE;
3851 if (header->type == matchtype)
3852 found = header;
3853 else if (header->type == sigmatchtype)
3854 foundsig = header;
3855 header_prev = header;
3856 }
3857 if (found != NULL) {
3858 result = dns_name_concatenate(name, origin,
3859 foundname, NULL);
3860 if (result != ISC_R_SUCCESS)
3861 goto unlock_node;
3862 bind_rdataset(search->rbtdb, node, found,
3863 now, rdataset);
3864 if (foundsig != NULL)
3865 bind_rdataset(search->rbtdb, node, foundsig,
3866 now, sigrdataset);
3867 new_reference(search->rbtdb, node);
3868 *nodep = node;
3869 result = DNS_R_COVERINGNSEC;
3870 } else if (!empty_node) {
3871 result = ISC_R_NOTFOUND;
3872 } else
3873 result = dns_rbtnodechain_prev(&search->chain, NULL,
3874 NULL);
3875 unlock_node:
3876 NODE_UNLOCK(lock, locktype);
3877 } while (empty_node && result == ISC_R_SUCCESS);
3878 return (result);
3879}
3880
3881static isc_result_t
3882cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
3883 dns_rdatatype_t type, unsigned int options, isc_stdtime_t now,
3884 dns_dbnode_t **nodep, dns_name_t *foundname,
3885 dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset)
3886{
3887 dns_rbtnode_t *node = NULL;
3888 isc_result_t result;
3889 rbtdb_search_t search;
3890 isc_boolean_t cname_ok = ISC_TRUE;
3891 isc_boolean_t empty_node;
3892 nodelock_t *lock;
3893 isc_rwlocktype_t locktype;
3894 rdatasetheader_t *header, *header_prev, *header_next;
3895 rdatasetheader_t *found, *nsheader;
3896 rdatasetheader_t *foundsig, *nssig, *cnamesig;
3897 rdatasetheader_t *update, *updatesig;