kernel - Fix excessive mbuf use in nfs_realign()
[dragonfly.git] / sys / vfs / hammer / hammer_btree.h
CommitLineData
8750964d
MD
1/*
2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
c82af904 34 * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.24 2008/06/26 04:06:22 dillon Exp $
8750964d
MD
35 */
36
37/*
8cd0a023 38 * HAMMER B-Tree index
8750964d 39 *
8cd0a023
MD
40 * HAMMER implements a modified B+Tree. B+Trees store records only
41 * at their leaves and HAMMER's modification is to adjust the internal
42 * elements so there is a boundary element on each side instead of sub-tree
43 * pointers.
8750964d 44 *
8cd0a023
MD
45 * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to
46 * reduce confusion.
47 *
48 * A B-Tree internal node looks like this:
427e5fc6
MD
49 *
50 * B N N N N N N B <-- boundary and internal elements
51 * S S S S S S S <-- subtree pointers
52 *
8cd0a023 53 * A B-Tree leaf node looks like this:
427e5fc6
MD
54 *
55 * L L L L L L L L <-- leaf elemenets
56 * (there is also a previous and next-leaf pointer)
57 *
8cd0a023
MD
58 * The recursion radix of an internal node is reduced by 1 relative to
59 * a normal B-Tree in order to accomodate the right-hand boundary.
60 *
61 * The big benefit to using a B-Tree with built-in bounds information is
62 * that it makes it possible to cache pointers into the middle of the tree
63 * and not have to start searches, insertions, OR deletions at the root node.
64 * The boundary elements allow searches to progress in a definitive direction
fbc6e32a
MD
65 * from any point in the tree without revisting nodes. It is also possible
66 * to terminate searches early and make minor adjustments to the boundaries
67 * (within the confines of the parent's boundaries) on the fly. This greatly
11ad5ade 68 * improves the efficiency of many operations.
427e5fc6 69 *
8cd0a023
MD
70 * HAMMER B-Trees are per-cluster. The global multi-cluster B-Tree is
71 * constructed by allowing internal nodes to link to the roots of other
72 * clusters. Fields in the cluster header then reference back to its
73 * parent and use the cluster generation number to detect stale linkages.
427e5fc6 74 *
8cd0a023
MD
75 * The B-Tree balancing code can operate within a cluster or across the
76 * filesystem's ENTIRE B-Tree super-structure. A cluster's B-Tree root
77 * can be a leaf node in the worse case. A cluster is guarenteed to have
78 * sufficient free space to hold a single completely full leaf in the
79 * degenerate case.
427e5fc6 80 *
8cd0a023 81 * All of the structures below are on-disk structures.
8750964d
MD
82 */
83
84/*
8cd0a023 85 * Common base for all B-Tree element types (40 bytes)
c60bb2c5 86 *
8cd0a023
MD
87 * obj_type is set to the object type the record represents if an inode,
88 * directory entry, or an inter-cluster reference. A cluster range is
66325755
MD
89 * special in that the B-Tree nodes represent a range within the B-Tree
90 * inclusive of rec_type field, so obj_type must be used to detect the
91 * cluster range entries.
427e5fc6 92 *
fe7678ee
MD
93 * btype is only used by the elements making up an internal or leaf B-Tree
94 * node and applies to the node rather then to the key. This means that
95 * btype must be assigned/reassigned after any update to the base_elm making
96 * up a B-Tree element.
8750964d
MD
97 */
98struct hammer_base_elm {
99 int64_t obj_id; /* 00 object record is associated with */
100 int64_t key; /* 08 indexing key (offset or namekey) */
101
102 hammer_tid_t create_tid; /* 10 transaction id for record creation */
103 hammer_tid_t delete_tid; /* 18 transaction id for record update/del */
104
8cd0a023
MD
105 u_int16_t rec_type; /* 20 _RECTYPE_ */
106 u_int8_t obj_type; /* 22 _OBJTYPE_ (restricted) */
fe7678ee 107 u_int8_t btype; /* 23 B-Tree element type */
2f85fa4d 108 u_int32_t localization; /* 24 B-Tree localization parameter */
8750964d
MD
109 /* 28 */
110};
111
427e5fc6
MD
112typedef struct hammer_base_elm *hammer_base_elm_t;
113
2f85fa4d
MD
114/*
115 * Localization has sorting priority over the obj_id and is used to
116 * localize inodes for very fast directory scans.
ddfdf542
MD
117 *
118 * Localization can also be used to create pseudo-filesystems within
119 * a HAMMER filesystem. Pseudo-filesystems would be suitable
120 * replication targets.
2f85fa4d
MD
121 */
122#define HAMMER_LOCALIZE_RESERVED00 0x00000000
123#define HAMMER_LOCALIZE_INODE 0x00000001
124#define HAMMER_LOCALIZE_MISC 0x00000002
125#define HAMMER_LOCALIZE_RESERVED03 0x00000003
ddfdf542 126#define HAMMER_LOCALIZE_MASK 0x0000FFFF
5a930e66
MD
127#define HAMMER_LOCALIZE_PSEUDOFS_MASK 0xFFFF0000
128#define HAMMER_LOCALIZE_PSEUDOFS_INC 0x00010000
2f85fa4d
MD
129
130#define HAMMER_MIN_LOCALIZATION 0x00000000U
dd94f1b1 131#define HAMMER_MAX_LOCALIZATION 0x0000FFFFU
ddfdf542 132#define HAMMER_DEF_LOCALIZATION 0x00000000U
2f85fa4d 133
427e5fc6 134/*
47197d71 135 * Internal element (40 + 24 = 64 bytes).
8cd0a023 136 *
fe7678ee
MD
137 * An internal element contains the left-hand boundary, right-hand boundary,
138 * and a recursion to another B-Tree node.
427e5fc6
MD
139 */
140struct hammer_btree_internal_elm {
141 struct hammer_base_elm base;
5de0c0e5 142 hammer_tid_t mirror_tid; /* mirroring support */
ddfdf542
MD
143 hammer_off_t subtree_offset;
144 int32_t unused02;
145 int32_t unused03;
427e5fc6
MD
146};
147
c82af904
MD
148typedef struct hammer_btree_internal_elm *hammer_btree_internal_elm_t;
149
8750964d 150/*
47197d71 151 * Leaf B-Tree element (40 + 24 = 64 bytes).
8cd0a023 152 *
dd94f1b1
MD
153 * NOTE: create_ts/delete_ts are not used by any core algorithms, they are
154 * used only by userland to present nominal real-time date strings
155 * to the user.
8750964d 156 */
8cd0a023 157struct hammer_btree_leaf_elm {
8750964d 158 struct hammer_base_elm base;
dd94f1b1
MD
159 u_int32_t create_ts;
160 u_int32_t delete_ts;
19619882
MD
161 hammer_off_t data_offset;
162 int32_t data_len;
163 hammer_crc_t data_crc;
8750964d
MD
164};
165
11ad5ade
MD
166typedef struct hammer_btree_leaf_elm *hammer_btree_leaf_elm_t;
167
8750964d 168/*
47197d71 169 * Rollup btree leaf element types - 64 byte structure
8750964d 170 */
8cd0a023 171union hammer_btree_elm {
11ad5ade
MD
172 struct hammer_base_elm base;
173 struct hammer_btree_leaf_elm leaf;
8cd0a023 174 struct hammer_btree_internal_elm internal;
8750964d
MD
175};
176
8cd0a023 177typedef union hammer_btree_elm *hammer_btree_elm_t;
427e5fc6 178
8750964d 179/*
af209b0f 180 * B-Tree node (normal or meta) (64x64 = 4K structure)
427e5fc6 181 *
af209b0f 182 * Each node contains 63 elements. The last element for an internal node
8cd0a023
MD
183 * is the right-boundary so internal nodes have one fewer logical elements
184 * then leaf nodes.
427e5fc6 185 *
8cd0a023
MD
186 * 'count' always refers to the number of elements and is non-inclusive of
187 * the right-hand boundary for an internal node.
427e5fc6 188 *
af209b0f
MD
189 * The use of a fairly large radix is designed to reduce the number of
190 * discrete disk accesses required to locate something. Keep in mind
191 * that nodes are allocated out of 16K hammer buffers so supported values
192 * are (256-1), (128-1), (64-1), (32-1), or (16-1).
193 *
8cd0a023
MD
194 * NOTE: The node head for an internal does not contain the subtype
195 * (The B-Tree node type for the nodes referenced by its elements).
196 * Instead, each element specifies the subtype (elm->base.subtype).
197 * This allows us to maintain an unbalanced B-Tree and to easily identify
198 * special inter-cluster link elements.
eaeff70d
MD
199 *
200 * NOTE: FUTURE EXPANSION: The reserved fields in hammer_node_ondisk are
201 * reserved for left/right leaf linkage fields, flags, and other future
202 * features.
427e5fc6 203 */
af209b0f 204#define HAMMER_BTREE_LEAF_ELMS 63
8cd0a023 205#define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1)
427e5fc6
MD
206
207/*
208 * It is safe to combine two adjacent nodes if the total number of elements
209 * is less then or equal to the *_FILL constant.
8750964d 210 */
427e5fc6
MD
211#define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3)
212#define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3)
213
8cd0a023
MD
214#define HAMMER_BTREE_TYPE_INTERNAL ((u_int8_t)'I')
215#define HAMMER_BTREE_TYPE_LEAF ((u_int8_t)'L')
fe7678ee 216#define HAMMER_BTREE_TYPE_RECORD ((u_int8_t)'R')
f36a9737 217#define HAMMER_BTREE_TYPE_DELETED ((u_int8_t)'D')
427e5fc6 218
8cd0a023
MD
219struct hammer_node_ondisk {
220 /*
47197d71 221 * B-Tree node header (64 bytes)
8cd0a023 222 */
19619882 223 hammer_crc_t crc; /* MUST BE FIRST FIELD OF STRUCTURE */
40043e7f 224 u_int32_t signature;
47197d71 225 hammer_off_t parent; /* 0 if at root of cluster */
427e5fc6 226 int32_t count;
427e5fc6 227 u_int8_t type;
8cd0a023 228 u_int8_t reserved01;
427e5fc6 229 u_int16_t reserved02;
47197d71
MD
230 hammer_off_t reserved03; /* future link_left */
231 hammer_off_t reserved04; /* future link_right */
232 hammer_off_t reserved05;
233 hammer_off_t reserved06;
5de0c0e5 234 hammer_tid_t mirror_tid; /* mirroring support (aggregator) */
8750964d 235
8750964d 236 /*
8cd0a023
MD
237 * Element array. Internal nodes have one less logical element
238 * (meaning: the same number of physical elements) in order to
239 * accomodate the right-hand boundary. The left-hand boundary
240 * is integrated into the first element. Leaf nodes have no
241 * boundary elements.
8750964d 242 */
8cd0a023 243 union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS];
427e5fc6 244};
8750964d 245
19619882
MD
246#define HAMMER_BTREE_SIGNATURE_GOOD 0xB3A49586
247#define HAMMER_BTREE_SIGNATURE_DESTROYED 0x4A3B2C1D
248#define HAMMER_BTREE_CRCSIZE \
249 (sizeof(struct hammer_node_ondisk) - sizeof(hammer_crc_t))
250
8cd0a023 251typedef struct hammer_node_ondisk *hammer_node_ondisk_t;
427e5fc6 252