2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.3 2007/11/01 20:53:05 dillon Exp $
38 * HAMMER BH-Tree index
40 * HAMMER implements a modified B+Tree. In all documentation this will
41 * simply be refered to as the HAMMER BH-Tree. Basically la BH-Tree
42 * looks like a B+Tree (A B-Tree which stores its records only at the leafs
43 * of the tree), but adds two additional boundary elements which describe
44 * the left-most and right-most element a node is able to represent. In
45 * otherwords, we have boundary elements at the two ends of a BH-Tree node
46 * instead of sub-tree pointers.
48 * A BH-Tree internal node looks like this:
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
53 * A BH-Tree leaf node looks like this:
55 * L L L L L L L L <-- leaf elemenets
56 * (there is also a previous and next-leaf pointer)
58 * The recursion radix is reduced by 2 relative to a normal B-Tree but
59 * as a bonus we treat internal nodes completely differently from leaf nodes,
60 * which allows us to pack in more elements.
62 * The big benefit to using a BH-Tree is that it is possible to cache
63 * pointers into the middle of the tree and not have to start searches,
64 * insertions, OR deletions at the root node. In particular, searches are
65 * able to progress in a definitive direction from any point in the tree
66 * without revisting nodes. This greatly improves the efficiency of many
67 * operations, most especially record appends.
69 * BH-Trees also make stacking of trees fairly straightforward.
71 * Most of the structures below are on-disk structures.
75 * Common base for all B+Tree element types (40 bytes)
77 * Note that obj_type is set to the object type of the record represents
78 * an inode or a cluster range. A cluster range is special in that the
79 * B-Tree nodes represent a range within the B-Tree inclusive of rec_type
80 * field, so obj_type must be used to detect the cluster range entries.
82 * The special field is used as a subtree pointer for internal nodes,
83 * and reserved for leaf nodes.
85 struct hammer_base_elm {
86 int64_t obj_id; /* 00 object record is associated with */
87 int64_t key; /* 08 indexing key (offset or namekey) */
89 hammer_tid_t create_tid; /* 10 transaction id for record creation */
90 hammer_tid_t delete_tid; /* 18 transaction id for record update/del */
92 u_int16_t rec_type; /* 20 */
93 u_int16_t obj_type; /* 22 (special) */
94 int32_t reserved07; /* 24 (future) */
98 typedef struct hammer_base_elm *hammer_base_elm_t;
101 * Internal element - 48 bytes. Internal elements are independantly
102 * organized. Note that internal elements are a different size than
105 struct hammer_btree_internal_elm {
106 struct hammer_base_elm base;
107 int32_t subtree_offset;
108 int8_t subtree_count; /* hint: can be too small, but not too big */
115 * Leaf B-Tree element (40 + 16 = 56 bytes)
117 struct hammer_btree_record_elm {
118 struct hammer_base_elm base;
126 * Leaf Cluster-reference B-Tree element (40 + 16 = 56 bytes)
128 struct hammer_btree_cluster_elm {
129 struct hammer_base_elm base;
130 int32_t rec_offset; /* cluster recursion record */
131 u_int32_t verifier; /* low 32 bits of target clu_id */
137 * Rollup btree leaf element types - 56 byte structure
139 union hammer_btree_leaf_elm {
140 struct hammer_base_elm base;
141 struct hammer_btree_record_elm record;
142 struct hammer_btree_cluster_elm cluster;
145 typedef union hammer_btree_leaf_elm *hammer_btree_leaf_elm_t;
148 * B-Tree node (normal or meta) - 56 + 56 * 8 = 504 bytes
150 * 32 B-Tree nodes fit in a 16K filesystem buffer. Remember, the filesystem
151 * buffer has a 128 byte header so (16384 - 128) / 32 = 508, but we want
152 * our structures to be 8-byte aligned so we use 504.
154 * We store B-Tree records as elements in internal nodes, which means we
155 * must have a separate index of sub-tree pointers. The sub-tree pointers
156 * interlace the elements (that is, the elements act as separators). Thus
157 * we have to have one more pointer then we do elements so we can represent
158 * the gamut - the sub-tree to the left of the first element AND the sub-tree
159 * to the right of the last element.
161 * subcount represents the number of elements in the subnode (1-8)
163 * 'count' always refers to the number of elements.
165 #define HAMMER_BTREE_LEAF_ELMS 8
166 #define HAMMER_BTREE_INT_ELMS 9 /* +1 more for the right boundary */
169 * It is safe to combine two adjacent nodes if the total number of elements
170 * is less then or equal to the *_FILL constant.
172 #define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3)
173 #define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3)
175 #define HAMMER_BTREE_INTERNAL_NODE ((u_int32_t)'I')
176 #define HAMMER_BTREE_LEAF_NODE ((u_int32_t)'L')
178 struct hammer_base_node {
183 u_int16_t reserved02;
187 struct hammer_btree_internal_node {
189 * B-Tree internal node header (24 bytes)
191 struct hammer_base_node base;
195 * Internal element array. The subtree fields are logically to
196 * the right of the element key. These fields are unused and
197 * must be 0 for the right-hand boundary element.
199 * The left-hand boundary element is at elms[0], the right-hand
200 * boundary element is at elms[count]. count does not include
201 * the right-hand boundary (so count at the termination of a
202 * standard for() loop will point at the right-hand boundary).
204 struct hammer_btree_internal_elm elms[HAMMER_BTREE_INT_ELMS+1];
208 * The hammer leaf node contains a smaller header
210 struct hammer_btree_leaf_node {
212 * B-Tree leaf node header
214 struct hammer_base_node base;
222 union hammer_btree_leaf_elm elms[HAMMER_BTREE_LEAF_ELMS];
225 union hammer_btree_node {
226 struct hammer_base_node base;
227 struct hammer_btree_internal_node internal;
228 struct hammer_btree_leaf_node leaf;
231 typedef union hammer_btree_node *hammer_btree_node_t;
234 * This in-memory structure is used by btree_rebalance_node(). Note
235 * that internal elements are a different size than record and cluster
238 struct hammer_btree_elm_inmemory {
239 hammer_btree_node_t owner;
241 struct hammer_base_elm base;
242 struct hammer_btree_internal_elm internal;
243 union hammer_btree_leaf_elm leaf;
247 typedef struct hammer_btree_elm_inmemory *hammer_btree_elm_inmemory_t;
251 * B-Tree filesystem buffer
253 #define HAMMER_BTREE_NODES \
254 ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \
255 sizeof(union hammer_btree_node))
257 struct hammer_fsbuf_btree {
258 struct hammer_fsbuf_head head;
260 union hammer_btree_node nodes[HAMMER_BTREE_NODES];
264 * Key and caching structures used for HAMMER B-Tree lookups. These are
265 * in-memory structures.
267 struct hammer_btree_cursor {
268 hammer_base_elm_t left_bound;
269 hammer_base_elm_t right_bound;
270 struct hammer_cluster *cluster;
271 struct hammer_buffer *parent_buffer;
272 struct hammer_btree_internal_node *parent;
273 struct hammer_buffer *node_buffer;
274 hammer_btree_node_t node; /* internal or leaf node */
279 typedef struct hammer_btree_cursor *hammer_btree_cursor_t;
281 struct hammer_btree_info {
282 struct hammer_btree_cursor cursor;
283 struct hammer_buffer *data_buffer;
284 struct hammer_buffer *record_buffer;
285 union hammer_data_ondisk *data; /* returned data pointer */
286 union hammer_record_ondisk *rec;/* returned record pointer */
289 typedef struct hammer_btree_info *hammer_btree_info_t;
291 #define HAMMER_BTREE_GET_RECORD 0x0001
292 #define HAMMER_BTREE_GET_DATA 0x0002
293 #define HAMMER_BTREE_CLUSTER_TAG 0x0004 /* stop at the cluster tag */
294 #define HAMMER_BTREE_INSERT 0x0008 /* adjust for insert */
295 #define HAMMER_BTREE_DELETE 0x0010 /* adjust for delete */