2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.6 2007/12/14 08:05:39 dillon Exp $
40 * HAMMER implements a modified B+Tree. B+Trees store records only
41 * at their leaves and HAMMER's modification is to adjust the internal
42 * elements so there is a boundary element on each side instead of sub-tree
45 * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to
48 * A B-Tree internal node looks like this:
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
53 * A B-Tree leaf node looks like this:
55 * L L L L L L L L <-- leaf elemenets
56 * (there is also a previous and next-leaf pointer)
58 * The recursion radix of an internal node is reduced by 1 relative to
59 * a normal B-Tree in order to accomodate the right-hand boundary.
61 * The big benefit to using a B-Tree with built-in bounds information is
62 * that it makes it possible to cache pointers into the middle of the tree
63 * and not have to start searches, insertions, OR deletions at the root node.
64 * The boundary elements allow searches to progress in a definitive direction
65 * from any point in the tree without revisting nodes. It is also possible
66 * to terminate searches early and make minor adjustments to the boundaries
67 * (within the confines of the parent's boundaries) on the fly. This greatly
68 * improves the efficiency of many operations, most especially record appends.
70 * HAMMER B-Trees are per-cluster. The global multi-cluster B-Tree is
71 * constructed by allowing internal nodes to link to the roots of other
72 * clusters. Fields in the cluster header then reference back to its
73 * parent and use the cluster generation number to detect stale linkages.
75 * The B-Tree balancing code can operate within a cluster or across the
76 * filesystem's ENTIRE B-Tree super-structure. A cluster's B-Tree root
77 * can be a leaf node in the worse case. A cluster is guarenteed to have
78 * sufficient free space to hold a single completely full leaf in the
81 * All of the structures below are on-disk structures.
85 * Common base for all B-Tree element types (40 bytes)
87 * obj_type is set to the object type the record represents if an inode,
88 * directory entry, or an inter-cluster reference. A cluster range is
89 * special in that the B-Tree nodes represent a range within the B-Tree
90 * inclusive of rec_type field, so obj_type must be used to detect the
91 * cluster range entries.
93 * subtree_type is only used by internal B-Tree elements and indicates
94 * whether we are referencing a cluster, a leaf, or an internal node.
96 struct hammer_base_elm {
97 int64_t obj_id; /* 00 object record is associated with */
98 int64_t key; /* 08 indexing key (offset or namekey) */
100 hammer_tid_t create_tid; /* 10 transaction id for record creation */
101 hammer_tid_t delete_tid; /* 18 transaction id for record update/del */
103 u_int16_t rec_type; /* 20 _RECTYPE_ */
104 u_int8_t obj_type; /* 22 _OBJTYPE_ (restricted) */
105 u_int8_t subtree_type; /* 23 B-Tree element type */
106 int32_t reserved07; /* 24 (future) */
110 typedef struct hammer_base_elm *hammer_base_elm_t;
113 * Internal element (40 + 16 = 56 bytes).
115 * An internal element contains the left-hand boundary and a recursion to
116 * another B-Tree node. If rec_offset is 0 it points to another node in the
117 * current cluster at subtree_offset. Otherwise it points to the root
118 * of another cluster at volno/cluid.
120 * sub-cluster references have an associated record while intra-cluster
123 * subtree_count is the number of elements in an intra-cluster reference.
124 * For inter-cluster references subtree_count is chaining depth heuristic
125 * used to help balance the B-Tree globally.
127 struct hammer_btree_internal_elm {
128 struct hammer_base_elm base;
129 int32_t rec_offset; /* 0 indicates internal reference */
130 int32_t subtree_offset; /* offset or cluster id */
131 int32_t subtree_volno; /* unused or volume number */
132 int32_t subtree_count; /* hint: can be too small, but not too big */
135 #define subtree_cluid subtree_offset
136 #define subtree_type base.subtree_type
139 * Leaf B-Tree element (40 + 16 = 56 bytes).
143 struct hammer_btree_leaf_elm {
144 struct hammer_base_elm base;
152 * Rollup btree leaf element types - 56 byte structure
154 union hammer_btree_elm {
155 struct hammer_base_elm base;
156 struct hammer_btree_leaf_elm leaf;
157 struct hammer_btree_internal_elm internal;
160 typedef union hammer_btree_elm *hammer_btree_elm_t;
163 * B-Tree node (normal or meta) - 24 + 56 * 14 = 808 bytes (8-byte aligned)
165 * 20 B-Tree nodes fit in a 16K filesystem buffer, leaving us room for
166 * the 128 byte filesystem buffer header and another 96 bytes of filler.
168 * Each node contains 14 elements. The last element for an internal node
169 * is the right-boundary so internal nodes have one fewer logical elements
172 * 'count' always refers to the number of elements and is non-inclusive of
173 * the right-hand boundary for an internal node.
175 * NOTE: The node head for an internal does not contain the subtype
176 * (The B-Tree node type for the nodes referenced by its elements).
177 * Instead, each element specifies the subtype (elm->base.subtype).
178 * This allows us to maintain an unbalanced B-Tree and to easily identify
179 * special inter-cluster link elements.
181 #define HAMMER_BTREE_LEAF_ELMS 14
182 #define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1)
183 #define HAMMER_BTREE_NODES 20
186 * It is safe to combine two adjacent nodes if the total number of elements
187 * is less then or equal to the *_FILL constant.
189 #define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3)
190 #define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3)
192 #define HAMMER_BTREE_TYPE_INTERNAL ((u_int8_t)'I')
193 #define HAMMER_BTREE_TYPE_LEAF ((u_int8_t)'L')
194 #define HAMMER_BTREE_TYPE_CLUSTER ((u_int8_t)'C')
196 struct hammer_node_ondisk {
198 * B-Tree node header (24 bytes)
201 int32_t parent; /* 0 if at root of cluster */
204 u_int16_t reserved02;
205 int32_t reserved03; /* future heuristic */
206 int32_t reserved04; /* future link_left */
207 int32_t reserved05; /* future link_right */
210 * Element array. Internal nodes have one less logical element
211 * (meaning: the same number of physical elements) in order to
212 * accomodate the right-hand boundary. The left-hand boundary
213 * is integrated into the first element. Leaf nodes have no
216 union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS];
219 typedef struct hammer_node_ondisk *hammer_node_ondisk_t;
222 * B-Tree filesystem buffer (16K exactly)
224 struct hammer_fsbuf_btree {
225 struct hammer_fsbuf_head head; /* 128 */
227 struct hammer_node_ondisk nodes[HAMMER_BTREE_NODES];
230 #if HAMMER_BTREE_NODES * (HAMMER_BTREE_LEAF_ELMS * 56 + 24) + 128 + 96 != 16384
231 #error "Sanity check hammer_fsbuf_btree"