| 1 | /* |
| 2 | * Copyright (c) 2007 The DragonFly Project. All rights reserved. |
| 3 | * |
| 4 | * This code is derived from software contributed to The DragonFly Project |
| 5 | * by Matthew Dillon <dillon@backplane.com> |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * |
| 11 | * 1. Redistributions of source code must retain the above copyright |
| 12 | * notice, this list of conditions and the following disclaimer. |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer in |
| 15 | * the documentation and/or other materials provided with the |
| 16 | * distribution. |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its |
| 18 | * contributors may be used to endorse or promote products derived |
| 19 | * from this software without specific, prior written permission. |
| 20 | * |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 32 | * SUCH DAMAGE. |
| 33 | * |
| 34 | * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.8 2007/11/20 22:55:40 dillon Exp $ |
| 35 | */ |
| 36 | |
| 37 | #ifndef _SYS_UUID_H_ |
| 38 | #include <sys/uuid.h> |
| 39 | #endif |
| 40 | |
| 41 | /* |
| 42 | * The structures below represent the on-disk format for a HAMMER |
| 43 | * filesystem. Note that all fields for on-disk structures are naturally |
| 44 | * aligned. The host endian format is used - compatibility is possible |
| 45 | * if the implementation detects reversed endian and adjusts data accordingly. |
| 46 | * |
| 47 | * Most of HAMMER revolves around the concept of an object identifier. An |
| 48 | * obj_id is a 64 bit quantity which uniquely identifies a filesystem object |
| 49 | * FOR THE ENTIRE LIFE OF THE FILESYSTEM. This uniqueness allows backups |
| 50 | * and mirrors to retain varying amounts of filesystem history by removing |
| 51 | * any possibility of conflict through identifier reuse. |
| 52 | * |
| 53 | * A HAMMER filesystem may span multiple volumes. |
| 54 | * |
| 55 | * A HAMMER filesystem uses a 16K filesystem buffer size. All filesystem |
| 56 | * I/O is done in multiples of 16K. Most buffer-sized headers such as those |
| 57 | * used by volumes, super-clusters, clusters, and basic filesystem buffers |
| 58 | * use fixed-sized A-lists which are heavily dependent on HAMMER_BUFSIZE. |
| 59 | */ |
| 60 | #define HAMMER_BUFSIZE 16384 /* filesystem buffer size in bytes (16K) */ |
| 61 | #define HAMMER_BUFMASK (HAMMER_BUFSIZE - 1) /* mask of the bits below HAMMER_BUFSIZE (0x3FFF) */ |
| 62 | |
| 63 | /* |
| 64 | * Hammer transaction ids are 64 bit unsigned integers and are usually |
| 65 | * synchronized with the time of day in nanoseconds. |
| 66 | */ |
| 67 | typedef u_int64_t hammer_tid_t; |
| 68 | |
/*
 * Limits for transaction ids (unsigned 64 bit) and keys (signed 64 bit).
 *
 * HAMMER_MIN_KEY is written as (-MAX - 1) on purpose.  The literal
 * 0x8000000000000000LL does not fit in a signed 64 bit type, so the C
 * integer constant rules give it type unsigned long long; negating it
 * yields the large positive unsigned value 2^63 rather than the most
 * negative signed key, and comparisons such as (key < HAMMER_MIN_KEY)
 * are then silently performed as unsigned.  The bit pattern of the
 * constant is unchanged by this spelling; only its type becomes signed.
 */
#define HAMMER_MAX_TID		0xFFFFFFFFFFFFFFFFULL
#define HAMMER_MIN_KEY		(-0x7FFFFFFFFFFFFFFFLL - 1)
#define HAMMER_MAX_KEY		0x7FFFFFFFFFFFFFFFLL
| 72 | |
| 73 | /* |
| 74 | * Most HAMMER data structures are embedded in 16K filesystem buffers. |
| 75 | * All filesystem buffers except those designated as pure-data buffers |
| 76 | * contain this 128-byte header. |
| 77 | * |
| 78 | * This structure contains an embedded A-List used to manage space within |
| 79 | * the filesystem buffer. It is not used by volume or cluster header |
| 80 | * buffers, or by pure-data buffers. The granularity is variable and |
| 81 | * depends on the type of filesystem buffer. BLKSIZE is just a minimum. |
| 82 | */ |
| 83 | |
| 84 | #define HAMMER_FSBUF_HEAD_SIZE 128 /* documented size of struct hammer_fsbuf_head in bytes */ |
| 85 | #define HAMMER_FSBUF_MAXBLKS 256 /* NOTE(review): presumably the A-list element count per buffer (matches 256_1LYR) - confirm */ |
| 86 | #define HAMMER_FSBUF_BLKMASK (HAMMER_FSBUF_MAXBLKS - 1) /* mask of the bits below HAMMER_FSBUF_MAXBLKS (0xFF) */ |
| 87 | #define HAMMER_FSBUF_METAELMS HAMMER_ALIST_METAELMS_256_1LYR /* 11 */ |
| 88 | |
| 89 | struct hammer_fsbuf_head { /* header placed first in every filesystem buffer that is not pure data */ |
| 90 | u_int64_t buf_type; /* one of the HAMMER_FSBUF_* magic values */ |
| 91 | u_int32_t buf_crc; /* buffer CRC; NOTE(review): covered byte range is not shown in this header */ |
| 92 | u_int32_t buf_reserved07; /* reserved */ |
| 93 | u_int32_t reserved[6]; /* reserved */ |
| 94 | struct hammer_almeta buf_almeta[HAMMER_FSBUF_METAELMS]; /* embedded A-list managing space within this buffer */ |
| 95 | }; |
| 96 | |
| 97 | typedef struct hammer_fsbuf_head *hammer_fsbuf_head_t; /* pointer alias */ |
| 98 | |
| 99 | /* |
| 100 | * Note: Pure-data buffers contain pure-data and have no buf_type. |
| 101 | * Piecemeal data buffers do have a header and use HAMMER_FSBUF_DATA. |
| 102 | */ |
| 103 | #define HAMMER_FSBUF_VOLUME 0xC8414D4DC5523031ULL /* HAMMER01 (ASCII with bit 7 set on the 1st and 5th characters) */ |
| 104 | #define HAMMER_FSBUF_SUPERCL 0xC8414D52C3555052ULL /* HAMRSUPR intended; NOTE(review): byte 0xC3 encodes C, so the value actually spells HAMRCUPR */ |
| 105 | #define HAMMER_FSBUF_CLUSTER 0xC8414D52C34C5553ULL /* HAMRCLUS */ |
| 106 | #define HAMMER_FSBUF_RECORDS 0xC8414D52D2454353ULL /* HAMRRECS */ |
| 107 | #define HAMMER_FSBUF_BTREE 0xC8414D52C2545245ULL /* HAMRBTRE */ |
| 108 | #define HAMMER_FSBUF_DATA 0xC8414D52C4415441ULL /* HAMRDATA */ |
| 109 | |
| 110 | #define HAMMER_FSBUF_VOLUME_REV 0x313052C54D4D41C8ULL /* HAMMER_FSBUF_VOLUME with its bytes swapped (reverse endian) */ |
| 111 | |
| 112 | /* |
| 113 | * The B-Tree structures need hammer_fsbuf_head. |
| 114 | */ |
| 115 | #include "hammer_btree.h" |
| 116 | |
| 117 | /* |
| 118 | * HAMMER Volume header |
| 119 | * |
| 120 | * A HAMMER filesystem is built from any number of block devices. Each block |
| 121 | * device contains a volume header followed by however many super-clusters |
| 122 | * and clusters fit into the volume. Clusters cannot be migrated but the |
| 123 | * data they contain can, so HAMMER can use a truncated cluster for any |
| 124 | * extra space at the end of the volume. |
| 125 | * |
| 126 | * The volume containing the root cluster is designated as the master volume. |
| 127 | * The root cluster designation can be moved to any volume. |
| 128 | * |
| 129 | * The volume header takes up an entire 16K filesystem buffer and includes |
| 130 | * a one or two-layered A-list to manage the clusters making up the volume. |
| 131 | * A volume containing up to 32768 clusters (2TB) can be managed with a |
| 132 | * single-layered A-list. A two-layer A-list is capable of managing up |
| 133 | * to 16384 super-clusters with each super-cluster containing 32768 clusters |
| 134 | * (32768 TB per volume total). The number of volumes is limited to 32768 |
| 135 | * but it only takes 512 to fill out a 64 bit address space so for all |
| 136 | * intents and purposes the filesystem has no limits. |
| 137 | * |
| 138 | * Cluster addressing within a volume depends on whether a single or |
| 139 | * dual-layer A-list is used. If a dual-layer A-list is used a 16K |
| 140 | * super-cluster buffer is needed for every 32768 clusters in the volume. |
| 141 | * However, because the A-list's hinting is grouped in multiples of 16 |
| 142 | * we group 16 super-cluster buffers together (starting just after the |
| 143 | * volume header), followed by 32768x16 clusters, and repeat. |
| 144 | * |
| 145 | * NOTE: A 32768-element single-layer and a 16384-element dual-layer A-list |
| 146 | * are the same size. |
| 147 | * |
| 148 | * Special field notes: |
| 149 | * |
| 150 | * vol_bot_beg - offset of boot area (vol_mem_beg - vol_bot_beg bytes) |
| 151 | * vol_mem_beg - offset of memory log (vol_clo_beg - vol_mem_beg bytes) |
| 152 | * vol_clo_beg - offset of cluster #0 in volume |
| 153 | * |
| 154 | * The memory log area allows a kernel to cache new records and data |
| 155 | * in memory without allocating space in the actual filesystem to hold |
| 156 | * the records and data. In the event that a filesystem becomes full, |
| 157 | * any records remaining in memory can be flushed to the memory log |
| 158 | * area. This allows the kernel to immediately return success. |
| 159 | */ |
| 160 | #define HAMMER_VOL_MAXCLUSTERS 32768 /* 1-layer: max clusters managed directly by a volume header */ |
| 161 | #define HAMMER_VOL_MAXSUPERCLUSTERS 16384 /* 2-layer: max super-clusters managed by a volume header */ |
| 162 | #define HAMMER_VOL_SUPERCLUSTER_GROUP 16 /* super-cluster buffers are laid out in groups of 16 */ |
| 163 | #define HAMMER_VOL_METAELMS_1LYR HAMMER_ALIST_METAELMS_32K_1LYR /* meta element count, 1-layer volume A-list */ |
| 164 | #define HAMMER_VOL_METAELMS_2LYR HAMMER_ALIST_METAELMS_16K_2LYR /* meta element count, 2-layer volume A-list */ |
| 165 | |
| 166 | #define HAMMER_BOOT_MINBYTES (32*1024) /* 32KB; NOTE(review): MIN/NOM/MAX presumably bound the boot area size - confirm */ |
| 167 | #define HAMMER_BOOT_NOMBYTES (64LL*1024*1024) /* 64MB */ |
| 168 | #define HAMMER_BOOT_MAXBYTES (256LL*1024*1024) /* 256MB */ |
| 169 | |
| 170 | #define HAMMER_MEM_MINBYTES (256*1024) /* 256KB; NOTE(review): MIN/NOM/MAX presumably bound the memory log size - confirm */ |
| 171 | #define HAMMER_MEM_NOMBYTES (1LL*1024*1024*1024) /* 1GB */ |
| 172 | #define HAMMER_MEM_MAXBYTES (64LL*1024*1024*1024) /* 64GB */ |
| 173 | |
| 174 | struct hammer_volume_ondisk { /* on-disk volume header; documented to occupy one 16K filesystem buffer */ |
| 175 | struct hammer_fsbuf_head head; /* common filesystem buffer header */ |
| 176 | int64_t vol_bot_beg; /* byte offset of boot area or 0 */ |
| 177 | int64_t vol_mem_beg; /* byte offset of memory log or 0 */ |
| 178 | int64_t vol_clo_beg; /* byte offset of first cl/supercl in volume */ |
| 179 | int64_t vol_clo_end; /* byte offset of volume EOF */ |
| 180 | int64_t vol_locked; /* reserved clusters are >= this offset */ |
| 181 | |
| 182 | uuid_t vol_fsid; /* identify filesystem */ |
| 183 | uuid_t vol_fstype; /* identify filesystem type */ |
| 184 | char vol_name[64]; /* Name of volume */ |
| 185 | |
| 186 | int32_t vol_no; /* volume number within filesystem */ |
| 187 | int32_t vol_count; /* number of volumes making up FS */ |
| 188 | |
| 189 | u_int32_t vol_version; /* version control information */ |
| 190 | u_int32_t vol_reserved01; /* reserved */ |
| 191 | u_int32_t vol_flags; /* volume flags */ |
| 192 | u_int32_t vol_rootvol; /* which volume is the root volume? */ |
| 193 | |
| 194 | int32_t vol_clsize; /* cluster size (same for all volumes) */ |
| 195 | int32_t vol_nclusters; /* NOTE(review): presumably the number of clusters in this volume - confirm */ |
| 196 | u_int32_t vol_reserved06; /* reserved */ |
| 197 | u_int32_t vol_reserved07; /* reserved */ |
| 198 | |
| 199 | int32_t vol_stat_blocksize; /* for statfs only; NOTE(review): 32-bit field followed by a 64-bit field - ABIs that align int64_t to 8 bytes likely insert 4 bytes of implicit padding here, so the layout is ABI dependent */ |
| 200 | int64_t vol_stat_bytes; /* for statfs only */ |
| 201 | int64_t vol_stat_inodes; /* for statfs only */ |
| 202 | |
| 203 | /* |
| 204 | * These fields are initialized and space is reserved in every |
| 205 | * volume making up a HAMMER filesystem, but only the master volume |
| 206 | * contains valid data. |
| 207 | */ |
| 208 | int32_t vol0_root_clu_no; /* root cluster no (index) in rootvol; NOTE(review): same 32-bit-then-64-bit pattern as vol_stat_blocksize, implicit padding likely follows */ |
| 209 | hammer_tid_t vol0_root_clu_id; /* root cluster id */ |
| 210 | hammer_tid_t vol0_nexttid; /* next TID */ |
| 211 | u_int64_t vol0_recid; /* fs-wide record id allocator */ |
| 212 | u_int64_t vol0_synchronized_rec_id; /* XXX */ |
| 213 | |
| 214 | char reserved[1024]; /* reserved */ |
| 215 | |
| 216 | /* |
| 217 | * Meta elements for the volume header's A-list, which is either a |
| 218 | * 1-layer A-list capable of managing 32768 clusters, or a 2-layer |
| 219 | * A-list capable of managing 16384 super-clusters (each of which |
| 220 | * can handle 32768 clusters). |
| 221 | */ |
| 222 | union { |
| 223 | struct hammer_almeta super[HAMMER_VOL_METAELMS_2LYR]; /* 2-layer form (super-clusters) */ |
| 224 | struct hammer_almeta normal[HAMMER_VOL_METAELMS_1LYR]; /* 1-layer form (direct clusters) */ |
| 225 | } vol_almeta; |
| 226 | u_int32_t vol0_bitmap[1024]; /* NOTE(review): purpose is not documented in this header */ |
| 227 | }; |
| 228 | |
| 229 | typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; /* pointer alias */ |
| 230 | |
| 231 | #define HAMMER_VOLF_VALID 0x0001 /* valid entry (flag bits presumably kept in vol_flags - confirm) */ |
| 232 | #define HAMMER_VOLF_OPEN 0x0002 /* volume is open */ |
| 233 | #define HAMMER_VOLF_USINGSUPERCL 0x0004 /* using superclusters */ |
| 234 | |
| 235 | /* |
| 236 | * HAMMER Super-cluster header |
| 237 | * |
| 238 | * A super-cluster is used to increase the maximum size of a volume. |
| 239 | * HAMMER's volume header can manage up to 32768 direct clusters or |
| 240 | * 16384 super-clusters. Each super-cluster (which is basically just |
| 241 | * a 16K filesystem buffer) can manage up to 32768 clusters. So adding |
| 242 | * a super-cluster layer allows a HAMMER volume to be sized upwards of |
| 243 | * around 32768TB instead of 2TB. |
| 244 | * |
| 245 | * Any volume initially formatted to be over 32G reserves space for the layer |
| 246 | * but the layer is only enabled if the volume exceeds 2TB. |
| 247 | */ |
| 248 | #define HAMMER_SUPERCL_METAELMS HAMMER_ALIST_METAELMS_32K_1LYR /* meta element count for the super-cluster A-list */ |
| 249 | #define HAMMER_SCL_MAXCLUSTERS HAMMER_VOL_MAXCLUSTERS /* max clusters per super-cluster (32768) */ |
| 250 | |
| 251 | struct hammer_supercl_ondisk { /* on-disk super-cluster header */ |
| 252 | struct hammer_fsbuf_head head; /* common filesystem buffer header */ |
| 253 | uuid_t vol_fsid; /* identify filesystem - sanity check */ |
| 254 | uuid_t vol_fstype; /* identify filesystem type - sanity check */ |
| 255 | int32_t reserved[1024]; /* reserved (1024 x 4 = 4096 bytes) */ |
| 256 | |
| 257 | struct hammer_almeta scl_meta[HAMMER_SUPERCL_METAELMS]; /* A-list meta elements managing the clusters of this super-cluster */ |
| 258 | }; |
| 259 | |
| 260 | typedef struct hammer_supercl_ondisk *hammer_supercl_ondisk_t; /* pointer alias */ |
| 261 | |
| 262 | /* |
| 263 | * HAMMER Cluster header |
| 264 | * |
| 265 | * A cluster is limited to 64MB and is made up of 4096 16K filesystem |
| 266 | * buffers. The cluster header contains four A-lists to manage these |
| 267 | * buffers. |
| 268 | * |
| 269 | * master_alist - This is a non-layered A-list which manages pure-data |
| 270 | * allocations and allocations on behalf of other A-lists. |
| 271 | * |
| 272 | * btree_alist - This is a layered A-list which manages filesystem buffers |
| 273 | * containing B-Tree nodes. |
| 274 | * |
| 275 | * record_alist - This is a layered A-list which manages filesystem buffers |
| 276 | * containing records. |
| 277 | * |
| 278 | * mdata_alist - This is a layered A-list which manages filesystem buffers |
| 279 | * containing piecemeal record data. |
| 280 | * |
| 281 | * General storage management works like this: All the A-lists except the |
| 282 | * master start in an all-allocated state. Now let's say you wish to allocate |
| 283 | * a B-Tree node out of the btree_alist. If the allocation fails you allocate |
| 284 | * a pure data block out of master_alist and then free that block in |
| 285 | * btree_alist, thereby assigning more space to the btree_alist, and then |
| 286 | * retry your allocation out of the btree_alist. In the reverse direction, |
| 287 | * filesystem buffers can be garbage collected back to master_alist simply |
| 288 | * by doing whole-buffer allocations in btree_alist and then freeing the |
| 289 | * space in master_alist. The whole-buffer-allocation approach to garbage |
| 290 | * collection works because A-list allocations are always power-of-2 sized |
| 291 | * and aligned. |
| 292 | */ |
| 293 | #define HAMMER_CLU_MAXBUFFERS 4096 /* max 16K filesystem buffers per cluster */ |
| 294 | #define HAMMER_CLU_MASTER_METAELMS HAMMER_ALIST_METAELMS_4K_1LYR /* meta element count for the non-layered master A-list */ |
| 295 | #define HAMMER_CLU_SLAVE_METAELMS HAMMER_ALIST_METAELMS_4K_2LYR /* meta element count for each layered (btree/record/mdata) A-list */ |
| 296 | #define HAMMER_CLU_MAXBYTES (HAMMER_CLU_MAXBUFFERS * HAMMER_BUFSIZE) /* 4096 x 16K = 64MB */ |
| 297 | |
| 298 | struct hammer_cluster_ondisk { /* on-disk cluster header */ |
| 299 | struct hammer_fsbuf_head head; /* common filesystem buffer header */ |
| 300 | uuid_t vol_fsid; /* identify filesystem - sanity check */ |
| 301 | uuid_t vol_fstype; /* identify filesystem type - sanity check */ |
| 302 | |
| 303 | hammer_tid_t clu_id; /* unique cluster self identification */ |
| 304 | hammer_tid_t clu_gen; /* generation number */ |
| 305 | int32_t vol_no; /* cluster contained in volume (sanity) */ |
| 306 | u_int32_t clu_flags; /* cluster flags */ |
| 307 | |
| 308 | int32_t clu_start; /* start of data (byte offset) */ |
| 309 | int32_t clu_limit; /* end of data (byte offset) */ |
| 310 | int32_t clu_no; /* cluster index in volume (sanity) */ |
| 311 | u_int32_t clu_reserved03; /* reserved */ |
| 312 | |
| 313 | u_int32_t clu_reserved04; /* reserved */ |
| 314 | u_int32_t clu_reserved05; /* reserved */ |
| 315 | u_int32_t clu_reserved06; /* reserved */ |
| 316 | u_int32_t clu_reserved07; /* reserved */ |
| 317 | |
| 318 | int32_t idx_data; /* data append point (element no) */ |
| 319 | int32_t idx_index; /* index append point (element no) */ |
| 320 | int32_t idx_record; /* record prepend point (element no) */ |
| 321 | u_int32_t idx_reserved03; /* reserved */ |
| 322 | |
| 323 | /* |
| 324 | * Specify the range of information stored in this cluster as two |
| 325 | * btree elements. These elements match the left and right |
| 326 | * boundary elements in the internal B-Tree node of the parent |
| 327 | * cluster that points to the root of our cluster. Because these |
| 328 | * are boundary elements, the right boundary is range-NONinclusive. |
| 329 | */ |
| 330 | struct hammer_base_elm clu_btree_beg; /* left boundary element */ |
| 331 | struct hammer_base_elm clu_btree_end; /* right boundary element (not included in the range) */ |
| 332 | |
| 333 | /* |
| 334 | * The cluster's B-Tree root can change as a side effect of insertion |
| 335 | * and deletion operations so store an offset instead of embedding |
| 336 | * the root node. The parent_offset is stale if the generation number |
| 337 | * does not match. |
| 338 | * |
| 339 | * Parent linkages are explicit. |
| 340 | */ |
| 341 | int32_t clu_btree_root; /* offset of the B-Tree root node */ |
| 342 | int32_t clu_btree_parent_vol_no; /* NOTE(review): presumably the volume holding the parent cluster - confirm */ |
| 343 | int32_t clu_btree_parent_clu_no; /* NOTE(review): presumably the parent cluster index within that volume - confirm */ |
| 344 | int32_t clu_btree_parent_offset; /* offset into the parent; stale if the generation does not match */ |
| 345 | hammer_tid_t clu_btree_parent_clu_gen; /* parent generation number used for the staleness check */ |
| 346 | |
| 347 | u_int64_t synchronized_rec_id; /* NOTE(review): undocumented here; compare vol0_synchronized_rec_id in the volume header */ |
| 348 | |
| 349 | struct hammer_almeta clu_master_meta[HAMMER_CLU_MASTER_METAELMS]; /* master_alist */ |
| 350 | struct hammer_almeta clu_btree_meta[HAMMER_CLU_SLAVE_METAELMS]; /* btree_alist */ |
| 351 | struct hammer_almeta clu_record_meta[HAMMER_CLU_SLAVE_METAELMS]; /* record_alist */ |
| 352 | struct hammer_almeta clu_mdata_meta[HAMMER_CLU_SLAVE_METAELMS]; /* mdata_alist */ |
| 353 | }; |
| 354 | |
| 355 | typedef struct hammer_cluster_ondisk *hammer_cluster_ondisk_t; /* pointer alias */ |
| 356 | |
| 357 | /* |
| 358 | * HAMMER records are 96 byte entities encoded into 16K filesystem buffers. |
| 359 | * Each record has a 64 byte header and a 32 byte extension. 169 records |
| 360 | * fit into each buffer after the 128-byte buffer header. Storage is managed by the buffer's A-List. |
| 361 | * |
| 362 | * Each record may have an explicit data reference to a block of data up |
| 363 | * to 2^31-1 bytes in size within the current cluster. Note that multiple |
| 364 | * records may share the same or overlapping data references. |
| 365 | */ |
| 366 | |
| 367 | /* |
| 368 | * All HAMMER records have a common 64-byte base and a 32-byte extension. |
| 369 | * |
| 370 | * Many HAMMER record types reference out-of-band data within the cluster. |
| 371 | * This data can also be stored in-band in the record itself if it is small |
| 372 | * enough. Either way, (data_offset, data_len) points to it. |
| 373 | * |
| 374 | * Key comparison order: obj_id, rec_type, key, create_tid |
| 375 | */ |
| 376 | struct hammer_base_record { /* common record header, 0x40 (64) bytes; offsets in the field comments are hexadecimal */ |
| 377 | /* |
| 378 | * 40 byte base element info - same base as used in B-Tree internal |
| 379 | * and leaf node element arrays. |
| 380 | * |
| 381 | * Fields: obj_id, key, create_tid, delete_tid, rec_type, obj_type, |
| 382 | * reserved07. |
| 383 | */ |
| 384 | struct hammer_base_elm base; /* 00 base element info */ |
| 385 | |
| 386 | int32_t data_len; /* 28 size of data (remainder zero-fill) */ |
| 387 | u_int32_t data_crc; /* 2C data sanity check */ |
| 388 | u_int64_t rec_id; /* 30 record id (iterator for recovery) */ |
| 389 | int32_t data_offset; /* 38 cluster-relative data reference or 0 */ |
| 390 | u_int32_t reserved07; /* 3C reserved */ |
| 391 | /* 40 */ |
| 392 | }; |
| 393 | |
| 394 | /* |
| 395 | * Record types are fairly straightforward. The B-Tree includes the record |
| 396 | * type in its index sort. |
| 397 | * |
| 398 | * In particular please note that it is possible to create a pseudo- |
| 399 | * filesystem within a HAMMER filesystem by creating a special object |
| 400 | * type within a directory. Pseudo-filesystems are used as replication |
| 401 | * targets and even though they are built within a HAMMER filesystem they |
| 402 | * get their own obj_id space (and thus can serve as a replication target) |
| 403 | * and look like a mount point to the system. |
| 404 | * |
| 405 | * Inter-cluster records are special-cased in the B-Tree. These records |
| 406 | * are referenced from a B-Tree INTERNAL node, NOT A LEAF. This means |
| 407 | * that the element in the B-Tree node is actually a boundary element whose |
| 408 | * base element fields, including rec_type, reflect the boundary, NOT |
| 409 | * the inter-cluster record type. |
| 410 | * |
| 411 | * HAMMER_RECTYPE_CLUSTER - only set in the actual inter-cluster record, |
| 412 | * not set in the left or right boundary elements around the inter-cluster |
| 413 | * reference of an internal node in the B-Tree (because doing so would |
| 414 | * interfere with the boundary tests). |
| 415 | */ |
| 416 | #define HAMMER_RECTYPE_UNKNOWN 0 |
| 417 | #define HAMMER_RECTYPE_LOWEST 1 /* lowest record type avail (same value as HAMMER_RECTYPE_INODE) */ |
| 418 | #define HAMMER_RECTYPE_INODE 1 /* inode in obj_id space */ |
| 419 | #define HAMMER_RECTYPE_PSEUDO_INODE 2 /* pseudo filesystem */ |
| 420 | #define HAMMER_RECTYPE_CLUSTER 3 /* inter-cluster reference */ |
| 421 | #define HAMMER_RECTYPE_DATA 0x10 /* data record */ |
| 422 | #define HAMMER_RECTYPE_DIRENTRY 0x11 /* directory entry record */ |
| 423 | #define HAMMER_RECTYPE_DB 0x12 /* NOTE(review): presumably a database record - not documented in this header */ |
| 424 | #define HAMMER_RECTYPE_EXT 0x13 /* ext attributes */ |
| 425 | |
| 426 | #define HAMMER_OBJTYPE_UNKNOWN 0 /* (never exists on-disk) */ |
| 427 | #define HAMMER_OBJTYPE_DIRECTORY 1 /* directory */ |
| 428 | #define HAMMER_OBJTYPE_REGFILE 2 /* regular file */ |
| 429 | #define HAMMER_OBJTYPE_DBFILE 3 /* NOTE(review): presumably a database file - not documented in this header */ |
| 430 | #define HAMMER_OBJTYPE_FIFO 4 /* fifo */ |
| 431 | #define HAMMER_OBJTYPE_CDEV 5 /* character device */ |
| 432 | #define HAMMER_OBJTYPE_BDEV 6 /* block device */ |
| 433 | #define HAMMER_OBJTYPE_SOFTLINK 7 /* symbolic link */ |
| 434 | #define HAMMER_OBJTYPE_PSEUDOFS 8 /* pseudo filesystem obj */ |
| 435 | |
| 436 | /* |
| 437 | * Generic full-sized record |
| 438 | */ |
| 439 | struct hammer_generic_record { /* full-sized record with an untyped extension */ |
| 440 | struct hammer_base_record base; /* common 64-byte record header */ |
| 441 | char filler[32]; /* untyped 32-byte extension area */ |
| 442 | }; |
| 443 | |
| 444 | /* |
| 445 | * A HAMMER inode record. |
| 446 | * |
| 447 | * This forms the basis for a filesystem object. obj_id is the inode number, |
| 448 | * key1 represents the pseudo filesystem id for security partitioning |
| 449 | * (preventing cross-links and/or restricting a NFS export and specifying the |
| 450 | * security policy), and key2 represents the data retention policy id. |
| 451 | * |
| 452 | * Inode numbers are 64 bit quantities which uniquely identify a filesystem |
| 453 | * object for the ENTIRE life of the filesystem, even after the object has |
| 454 | * been deleted. For all intents and purposes inode numbers are simply |
| 455 | * allocated by incrementing a sequence space. |
| 456 | * |
| 457 | * There is an important distinction between the data stored in the inode |
| 458 | * record and the record's data reference. The record references a |
| 459 | * hammer_inode_data structure but the filesystem object size and hard link |
| 460 | * count is stored in the inode record itself. This allows multiple inodes |
| 461 | * to share the same hammer_inode_data structure. This is possible because |
| 462 | * any modifications will lay out new data. The HAMMER implementation need |
| 463 | * not use the data-sharing ability when laying down new records. |
| 464 | * |
| 465 | * A HAMMER inode is subject to the same historical storage requirements |
| 466 | * as any other record. In particular any change in filesystem object size or hard link |
| 467 | * count will lay down a new inode record when the filesystem is synced to |
| 468 | * disk. This can lead to a lot of junk records which get cleaned up by |
| 469 | * the data retention policy. |
| 470 | * |
| 471 | * The ino_atime and ino_mtime fields are a special case. Modifications to |
| 472 | * these fields do NOT lay down a new record by default, though the values |
| 473 | * are effectively frozen for snapshots which access historical versions |
| 474 | * of the inode record due to other operations. This means that atime will |
| 475 | * not necessarily be accurate in snapshots, backups, or mirrors. mtime |
| 476 | * will be accurate in backups and mirrors since it can be regenerated from |
| 477 | * the mirroring stream. |
| 478 | * |
| 479 | * Because nlinks is historically retained the hardlink count will be |
| 480 | * accurate when accessing a HAMMER filesystem snapshot. |
| 481 | */ |
| 482 | struct hammer_inode_record { /* inode record: common header plus a 32-byte inode extension */ |
| 483 | struct hammer_base_record base; /* common 64-byte record header; its data reference points at a hammer_inode_data */ |
| 484 | u_int64_t ino_atime; /* last access time (not historical) */ |
| 485 | u_int64_t ino_mtime; /* last modified time (not historical) */ |
| 486 | u_int64_t ino_size; /* filesystem object size */ |
| 487 | u_int64_t ino_nlinks; /* hard links */ |
| 488 | }; |
| 489 | |
| 490 | /* |
| 491 | * Data records specify the entire contents of a regular file object, |
| 492 | * including attributes. Small amounts of data can theoretically be |
| 493 | * embedded in the record itself but the use of this ability versus using |
| 494 | * an out-of-band data reference depends on the implementation. |
| 495 | */ |
| 496 | struct hammer_data_record { /* data record for a regular file object */ |
| 497 | struct hammer_base_record base; /* common 64-byte record header */ |
| 498 | char filler[32]; /* 32-byte extension; whether small data is embedded here is implementation dependent */ |
| 499 | }; |
| 500 | |
| 501 | /* |
| 502 | * A directory entry specifies the HAMMER filesystem object id, a copy of |
| 503 | * the file type, and file name (either embedded or as out-of-band data). |
| 504 | * If the file name is short enough to fit into den_name[] (including a |
| 505 | * terminating nul) then it will be embedded in the record, otherwise it |
| 506 | * is stored out-of-band. The base record's data reference always points |
| 507 | * to the nul-terminated filename regardless. |
| 508 | * |
| 509 | * Directory entries are indexed with a 128 bit namekey rather than an |
| 510 | * offset. A portion of the namekey is an iterator or randomizer to deal |
| 511 | * with collisions. |
| 512 | * |
| 513 | * NOTE: base.base.obj_type holds the filesystem object type of obj_id, |
| 514 | * e.g. a den_type equivalent. |
| 515 | * |
| 516 | * NOTE: den_name / the filename data reference is NOT terminated with \0. |
| 517 | * |
| 518 | */ |
| 519 | struct hammer_entry_record { /* directory entry record */ |
| 520 | struct hammer_base_record base; /* common 64-byte record header; its data reference points at the file name */ |
| 521 | u_int64_t obj_id; /* object being referenced */ |
| 522 | u_int64_t reserved01; /* reserved */ |
| 523 | char den_name[16]; /* short file names fit in record; NOTE(review): the comment above this struct contradicts itself on whether the name is nul terminated - confirm */ |
| 524 | }; |
| 525 | |
| 526 | /* |
| 527 | * Hammer rollup record |
| 528 | */ |
| 529 | union hammer_record_ondisk { /* overlay of every record layout; all members begin with a hammer_base_record */ |
| 530 | struct hammer_base_record base; /* header-only view */ |
| 531 | struct hammer_generic_record generic; /* header plus untyped extension */ |
| 532 | struct hammer_inode_record inode; /* inode record view */ |
| 533 | struct hammer_data_record data; /* data record view */ |
| 534 | struct hammer_entry_record entry; /* directory entry view */ |
| 535 | }; |
| 536 | |
| 537 | typedef union hammer_record_ondisk *hammer_record_ondisk_t; /* pointer alias */ |
| 538 | |
| 539 | /* |
| 540 | * Filesystem buffer for records. HAMMER_RECORD_NODES is the number of whole record unions that fit in a buffer after the fsbuf header. |
| 541 | */ |
| 542 | #define HAMMER_RECORD_NODES \ |
| 543 | ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \ |
| 544 | sizeof(union hammer_record_ondisk)) |
| 545 | |
| 546 | struct hammer_fsbuf_recs { /* layout of a filesystem buffer holding records */ |
| 547 | struct hammer_fsbuf_head head; /* common filesystem buffer header */ |
| 548 | char unused[32]; /* NOTE(review): presumably pads the buffer to HAMMER_BUFSIZE (128 + 32 + 169 * 96 = 16384) - confirm the struct sizes */ |
| 549 | union hammer_record_ondisk recs[HAMMER_RECORD_NODES]; /* record slots */ |
| 550 | }; |
| 551 | |
| 552 | /* |
| 553 | * Filesystem buffer for piecemeal data. Note that this does not apply |
| 554 | * to dedicated pure-data buffers as such buffers do not have a header. |
| 555 | */ |
| 556 | |
| 557 | #define HAMMER_DATA_SIZE (HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) /* bytes left in a buffer after the header */ |
| 558 | #define HAMMER_DATA_BLKSIZE 64 /* size of one piecemeal data block in bytes */ |
| 559 | #define HAMMER_DATA_BLKMASK (HAMMER_DATA_BLKSIZE-1) /* mask of the bits below HAMMER_DATA_BLKSIZE (0x3F) */ |
| 560 | #define HAMMER_DATA_NODES (HAMMER_DATA_SIZE / HAMMER_DATA_BLKSIZE) /* number of data blocks per buffer */ |
| 561 | |
| 562 | struct hammer_fsbuf_data { /* layout of a filesystem buffer holding piecemeal data */ |
| 563 | struct hammer_fsbuf_head head; /* common filesystem buffer header */ |
| 564 | u_int8_t data[HAMMER_DATA_NODES][HAMMER_DATA_BLKSIZE]; /* HAMMER_DATA_NODES blocks of HAMMER_DATA_BLKSIZE bytes each */ |
| 565 | }; |
| 566 | |
| 567 | /* |
| 568 | * Filesystem buffer rollup |
| 569 | */ |
| 570 | union hammer_fsbuf_ondisk { /* overlay of the header-bearing filesystem buffer layouts */ |
| 571 | struct hammer_fsbuf_head head; /* header-only view */ |
| 572 | struct hammer_fsbuf_btree btree; /* B-Tree buffer (declared in hammer_btree.h) */ |
| 573 | struct hammer_fsbuf_recs record; /* record buffer */ |
| 574 | struct hammer_fsbuf_data data; /* piecemeal data buffer */ |
| 575 | }; |
| 576 | |
| 577 | typedef union hammer_fsbuf_ondisk *hammer_fsbuf_ondisk_t; /* pointer alias */ |
| 578 | |
| 579 | /* |
| 580 | * HAMMER UNIX Attribute data |
| 581 | * |
| 582 | * The data reference in a HAMMER inode record points to this structure. Any |
| 583 | * modifications to the contents of this structure will result in a record |
| 584 | * replacement operation. |
| 585 | * |
| 586 | * state_sum allows a filesystem object to be validated to a degree by |
| 587 | * generating a checksum of all of its pieces (in no particular order) and |
| 588 | * checking it against this field. |
| 589 | * |
| 590 | * short_data_off allows a small amount of data to be embedded in the |
| 591 | * hammer_inode_data structure. HAMMER typically uses this to represent |
| 592 | * up to 64 bytes of data, or to hold symlinks. Remember that allocations |
| 593 | * are in powers of 2 so 64, 192, 448, or 960 bytes of embedded data is |
| 594 | * supported (64+64, 64+192, 64+448, 64+960). |
| 595 | * |
| 596 | * parent_obj_id is only valid for directories (which cannot be hard-linked), |
| 597 | * and specifies the parent directory obj_id. This field will also be set |
| 598 | * for non-directory inodes as a recovery aid, but can wind up specifying |
| 599 | * stale information. However, since object ids are not reused, the worst |
| 600 | * that happens is that the recovery code is unable to use it. |
| 601 | */ |
| 602 | struct hammer_inode_data { /* UNIX attribute data referenced by an inode record */ |
| 603 | u_int16_t version; /* inode data version */ |
| 604 | u_int16_t mode; /* basic unix permissions */ |
| 605 | u_int32_t uflags; /* chflags */ |
| 606 | u_int16_t short_data_off; /* degenerate data case */ |
| 607 | u_int16_t short_data_len; /* NOTE(review): presumably the length of the embedded short data - confirm */ |
| 608 | u_int32_t state_sum; /* checksum over all pieces of the object, in no particular order */ |
| 609 | u_int64_t ctime; /* NOTE(review): presumably inode change time; units are not shown in this header */ |
| 610 | u_int64_t parent_obj_id;/* parent directory obj_id */ |
| 611 | uuid_t uid; /* owner, stored as a uuid_t */ |
| 612 | uuid_t gid; /* group, stored as a uuid_t */ |
| 613 | /* XXX device, softlink extension */ |
| 614 | }; |
| 615 | |
| 616 | #define HAMMER_INODE_DATA_VERSION 1 /* NOTE(review): presumably the value stored in hammer_inode_data.version - confirm */ |
| 617 | |
| 618 | /* |
| 619 | * Rollup various structures embedded as record data |
| 620 | */ |
| 621 | union hammer_data_ondisk { /* overlay of the structures stored as record data; currently a single member */ |
| 622 | struct hammer_inode_data inode; /* data referenced by an inode record */ |
| 623 | }; |
| 624 | |