2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/Attic/hammerfs.h,v 1.1 2007/10/10 19:37:25 dillon Exp $
42 * The structures below represent the on-disk format for a HAMMER
43 * filesystem. Note that all fields for on-disk structures are naturally
44 * aligned. The host endian format is used - compatibility is possible
45 * if the implementation detects reversed endian and adjusts data accordingly.
47 * Most of HAMMER revolves around the concept of an object identifier. An
48 * obj_id is a 64 bit quantity which uniquely identifies a filesystem object
49 * FOR THE ENTIRE LIFE OF THE FILESYSTEM. This uniqueness allows backups
50 * and mirrors to retain varying amounts of filesystem history by removing
51 * any possibility of conflict through identifier reuse.
53 * A HAMMER filesystem may span multiple volumes.
55 * A HAMMER filesystem uses a 16K filesystem buffer size. All filesystem
56 * I/O is done in multiples of 16K.
/*
 * HAMMER_BUFSIZE is the filesystem buffer size in bytes (16384 = 16K).
 * HAMMER_BUFMASK is HAMMER_BUFSIZE - 1 (0x3FFF).  Because the buffer size
 * is a power of two, ANDing a byte offset with the mask gives the offset
 * within its buffer.
 */
58 #define HAMMER_BUFSIZE 16384
59 #define HAMMER_BUFMASK (HAMMER_BUFSIZE - 1)
62 * Hammer transaction ids are 64 bit unsigned integers and are usually
63 * synchronized with the time of day in nanoseconds.
/*
 * Transaction id type: an unsigned 64-bit integer.  In this header it is
 * the type of create_tid/delete_tid in hammer_base_record, clu_id in the
 * cluster header and vol0_nexttid in the volume header.
 */
65 typedef u_int64_t hammer_tid_t;
68 * Storage allocations are managed in powers of 2 with a hinted radix tree
69 * based in volume and cluster headers. The tree is not necessarily
70 * contained within the header and may recurse into other storage elements.
72 * The allocator's basic storage element is the hammer_almeta structure
73 * which is laid out recursively in a buffer. Allocations are driven using
74 * a template called hammer_alist which is constructed in memory and NOT
75 * stored in the filesystem.
77 struct hammer_almeta {
82 #define HAMMER_ALMETA_SIZE 8
85 int32_t bl_blocks; /* area of coverage */
86 int32_t bl_radix; /* coverage radix */
87 int32_t bl_skip; /* starting skip for linear layout */
88 int32_t bl_free; /* number of free blocks */
89 int32_t bl_rootblks; /* meta-blocks allocated for tree */
/*
 * Handle types used by the hammer_alist_*() prototypes in this header.
 * Note the asymmetry: hammer_almeta_t names the structure itself (the
 * prototypes take a hammer_almeta_t *), whereas hammer_alist_t is already
 * a pointer to struct hammer_alist.
 */
92 typedef struct hammer_almeta hammer_almeta_t;
93 typedef struct hammer_alist *hammer_alist_t;
/*
 * A-list constants.
 *
 * META_RADIX and BMAP_RADIX are derived from sizeof(u_int32_t), so they
 * evaluate to 16 and 32 respectively and have type size_t (unsigned).
 * BMAP_RADIX equals the number of bits in one u_int32_t.
 *
 * BLOCK_NONE is -1 as an int32_t.
 * NOTE(review): presumably the "no block available" result of the
 * int32_t-returning allocation functions -- confirm against the allocator.
 *
 * FORWARDS and BACKWARDS are distinct single-bit values.
 * NOTE(review): presumably direction bits for the 'flags' argument of
 * hammer_alist_alloc_from() -- confirm against the allocator.
 */
95 #define HAMMER_ALIST_META_RADIX (sizeof(u_int32_t) * 4) /* 16 */
96 #define HAMMER_ALIST_BMAP_RADIX (sizeof(u_int32_t) * 8) /* 32 */
97 #define HAMMER_ALIST_BLOCK_NONE ((int32_t)-1)
98 #define HAMMER_ALIST_FORWARDS 0x0001
99 #define HAMMER_ALIST_BACKWARDS 0x0002
102 * Most HAMMER data structures are embedded in 16K filesystem buffers.
103 * All filesystem buffers except those designated as pure-data buffers
104 * contain this 128-byte header.
106 * This structure contains an embedded A-List used to manage space within
107 * the filesystem buffer. It is not used by volume or cluster header
108 * buffers, or by pure-data buffers. The granularity is variable and
109 * depends on the type of filesystem buffer. BLKSIZE is just a minimum.
/*
 * Sizing constants for the per-buffer header, struct hammer_fsbuf_head.
 *   HEAD_SIZE  size of the header in bytes (128).
 *   MAXBLKS    block count (256) that the METAELMS figure is sized for.
 *   METAELMS   number of hammer_almeta elements in the header's
 *              buf_almeta[] array.
 */
112 #define HAMMER_FSBUF_HEAD_SIZE 128
113 #define HAMMER_FSBUF_MAXBLKS 256
114 #define HAMMER_FSBUF_METAELMS 10 /* 10 elements needed for 256 blks */
116 struct hammer_fsbuf_head {
119 u_int32_t buf_reserved07;
120 u_int32_t reserved[8];
121 struct hammer_almeta buf_almeta[HAMMER_FSBUF_METAELMS];
124 typedef struct hammer_fsbuf_head *hammer_fsbuf_head_t;
/*
 * Magic numbers identifying the kind of filesystem buffer.  Each 64-bit
 * value is the 8-character ASCII tag shown beside it, first character in
 * the most significant byte, with bit 7 (0x80) additionally set on the
 * first and fifth characters (e.g. 0xC8 == 'H' | 0x80).
 * NOTE(review): presumably stored in the buffer header's type field; that
 * field is not visible in this view -- confirm.
 */
126 #define HAMMER_FSBUF_VOLUME 0xC8414D4DC5523031ULL /* HAMMER01 */
127 #define HAMMER_FSBUF_CLUSTER 0xC8414D52C34C5553ULL /* HAMRCLUS */
128 #define HAMMER_FSBUF_RECORDS 0xC8414D52D2454353ULL /* HAMRRECS */
129 #define HAMMER_FSBUF_BTREE 0xC8414D52C2545245ULL /* HAMRBTRE */
130 #define HAMMER_FSBUF_DATA 0xC8414D52C4415441ULL /* HAMRDATA */
/*
 * HAMMER_FSBUF_VOLUME with its eight bytes in reverse order.
 * NOTE(review): presumably what a reader sees in place of
 * HAMMER_FSBUF_VOLUME when the volume was written on a host of the
 * opposite byte order -- confirm against the mount code.
 */
132 #define HAMMER_FSBUF_VOLUME_REV 0x313052C54D4D41C8ULL /* (reverse endian) */
135 * The B-Tree structures need hammer_fsbuf_head.
137 #include "hammer_btree.h"
140 * HAMMER Volume header
142 * A HAMMER filesystem is built from any number of block devices. Each block
143 * device contains a volume header followed by however many clusters
144 * fit in the volume. Clusters cannot be migrated but the data they contain
145 * can, so HAMMER can use a truncated cluster for any extra space at the
148 * The volume containing the root cluster is designated as the master volume.
149 * The root cluster designation can be moved to any volume.
151 * The volume header takes up an entire 16K filesystem buffer and includes
152 * an A-list to manage the clusters contained within the volume (up to 32768).
153 * With 512M clusters a volume will be limited to 16TB.
/*
 * Volume header A-list sizing.
 *   MAXCLUSTERS  upper bound on the number of clusters in one volume.
 *   METAELMS     dimension of vol_almeta[] in struct hammer_volume_ondisk.
 * NOTE(review): 1094 is presumably the number of A-list meta elements
 * needed to cover 32768 blocks -- confirm against the allocator.
 */
155 #define HAMMER_VOL_MAXCLUSTERS 32768
156 #define HAMMER_VOL_METAELMS 1094
158 struct hammer_volume_ondisk {
159 struct hammer_fsbuf_head head;
160 int64_t vol_beg; /* byte offset of first cluster in volume */
161 int64_t vol_end; /* byte offset of volume EOF */
162 int64_t vol_locked; /* reserved clusters are >= this offset */
164 uuid_t vol_fsid; /* identify filesystem */
165 uuid_t vol_fstype; /* identify filesystem type */
166 char vol_name[64]; /* Name of volume */
168 int32_t vol_no; /* volume number within filesystem */
169 int32_t vol_count; /* number of volumes making up FS */
171 u_int32_t vol_version; /* version control information */
172 u_int32_t vol_segsize; /* cluster size power of 2, 512M max */
173 u_int32_t vol_flags; /* volume flags */
174 u_int32_t vol_rootvol; /* which volume is the root volume? */
176 int32_t vol_clsize; /* cluster size (same for all volumes) */
177 u_int32_t vol_reserved05;
178 u_int32_t vol_reserved06;
179 u_int32_t vol_reserved07;
182 * These fields are initialized and space is reserved in every
183 * volume making up a HAMMER filesystem, but only the master volume
184 * contains valid data.
186 int32_t vol0_rootcluster; /* root cluster no (index) in rootvol */
187 u_int32_t vol0_reserved02;
188 u_int32_t vol0_reserved03;
189 hammer_tid_t vol0_nexttid; /* next TID */
190 u_int64_t vol0_recid; /* fs-wide record id allocator */
194 hammer_almeta_t vol_almeta[HAMMER_VOL_METAELMS];
195 u_int32_t vol0_bitmap[1024];
/*
 * Volume flag bits; each is a distinct single bit so they can be OR'ed.
 * NOTE(review): presumably the values held in vol_flags of
 * struct hammer_volume_ondisk -- confirm.
 */
198 #define HAMMER_VOLF_VALID 0x0001 /* valid entry */
199 #define HAMMER_VOLF_OPEN 0x0002 /* volume is open */
202 * HAMMER Cluster header
204 * The cluster header contains all the information required to identify a
205 * cluster, locate critical information areas within the cluster, and
206 * to manage space within the cluster.
208 * A Cluster contains pure data, incremental data, b-tree nodes, and records.
/*
 * Cluster header sizing.
 *   MAXBUFFERS  upper bound on filesystem buffers in one cluster.  At 16K
 *               per buffer, 32768 buffers is 512M.  The cluster header's
 *               managed_buffers_bitmap[] is dimensioned MAXBUFFERS/32,
 *               i.e. one bit per buffer in 1024 32-bit words.
 *   METAELMS    dimension of clu_almeta[] in struct hammer_cluster_ondisk.
 */
210 #define HAMMER_CLU_MAXBUFFERS 32768
211 #define HAMMER_CLU_METAELMS 1094
213 struct hammer_cluster_ondisk {
214 struct hammer_fsbuf_head head;
215 uuid_t vol_fsid; /* identify filesystem - sanity check */
216 uuid_t vol_fstype; /* identify filesystem type - sanity check */
218 u_int64_t clu_gen; /* identify generation number of cluster */
219 u_int64_t clu_unused01;
221 hammer_tid_t clu_id; /* unique cluster self identification */
222 int32_t vol_no; /* cluster contained in volume (sanity) */
223 u_int32_t clu_flags; /* cluster flags */
225 int32_t clu_start; /* start of data (byte offset) */
226 int32_t clu_limit; /* end of data (byte offset) */
227 int32_t clu_no; /* cluster index in volume (sanity) */
228 u_int32_t clu_reserved03;
230 u_int32_t clu_reserved04;
231 u_int32_t clu_reserved05;
232 u_int32_t clu_reserved06;
233 u_int32_t clu_reserved07;
235 int32_t idx_data; /* data append point (byte offset) */
236 int32_t idx_index; /* index append point (byte offset) */
237 int32_t idx_record; /* record prepend point (byte offset) */
238 u_int32_t idx_reserved03;
241 * Specify the range of information stored in this cluster. These
242 * structures match the B-Tree elements in our parent cluster
243 * (if any) that point to us. Note that clu_objend is
244 * range-inclusive, not range-exclusive so e.g. 0-1023 instead
247 int64_t clu_parent; /* parent vol & cluster */
248 struct hammer_base_elm clu_objstart;
249 struct hammer_base_elm clu_objend;
252 * The root node of the cluster's B-Tree is embedded in the
253 * cluster header. The node is 504 bytes.
255 struct hammer_btree_node clu_btree_root;
258 * HAMMER needs a separate bitmap to indicate which buffers are
259 * managed (contain a hammer_fsbuf_head). Any buffers not so
260 * designated are either unused or contain pure data.
262 * synchronized_rec_id is the synchronization point for the
263 * cluster. Any records with a greater or equal rec_id found
264 * when recovering a cluster are likely incomplete and will be
267 u_int64_t synchronized_rec_id;
268 u_int32_t managed_buffers_bitmap[HAMMER_CLU_MAXBUFFERS/32];
271 hammer_almeta_t clu_almeta[HAMMER_CLU_METAELMS];
275 * HAMMER records are 96 byte entities encoded into 16K filesystem buffers.
276 * Each record has a 64 byte header and a 32 byte extension. 169 records
277 * fit into each buffer after its 128-byte header (see HAMMER_RECORD_NODES).
277 * Storage is managed by the buffer's A-List.
279 * Each record may have an explicit data reference to a block of data up
280 * to 2^31-1 bytes in size within the current cluster. Note that multiple
281 * records may share the same or overlapping data references.
285 * All HAMMER records have a common 64-byte base and a 32-byte extension.
287 * Many HAMMER record types reference out-of-band data within the cluster.
288 * This data can also be stored in-band in the record itself if it is small
289 * enough. Either way, (data_offset, data_len) points to it.
291 * Key comparison order: obj_id, rec_type, key, create_tid
293 struct hammer_base_record {
294 int64_t obj_id; /* 00 object record is associated with */
295 int64_t key; /* 08 indexing key (offset or namekey) */
297 hammer_tid_t create_tid;/* 10 transaction id for record creation */
298 hammer_tid_t delete_tid;/* 18 transaction id for record update/delete */
300 u_int16_t rec_type; /* 20 type of record */
301 u_int16_t obj_type; /* 22 type of object (if inode) */
302 u_int32_t data_offset; /* 24 intra-cluster data reference */
303 /* An offset of 0 indicates zero-fill */
304 int32_t data_len; /* 28 size of data (remainder zero-fill) */
305 u_int32_t data_crc; /* 2C data sanity check */
306 u_int64_t rec_id; /* 30 record id (iterator for recovery) */
307 u_int64_t reserved07; /* 38 */
/*
 * Record type codes.
 * NOTE(review): by name these are the values of
 * hammer_base_record.rec_type -- confirm.
 *
 * Codes 0-3 are UNKNOWN, INODE, SLAVE and OBJZONE.  From 0x10 upward the
 * high nibble selects the family (0x1 DATA, 0x2 DIR, 0x3 DB, 0x4 EXT) and
 * the low nibble the operation: 0 CREATE, 2 DELETE, 3 UPDATE.  Low nibble
 * 1 is used only by DATA (ZEROFILL).
 */
311 #define HAMMER_RECTYPE_UNKNOWN 0
312 #define HAMMER_RECTYPE_INODE 1 /* inode in obj_id space */
313 #define HAMMER_RECTYPE_SLAVE 2 /* slave inode */
314 #define HAMMER_RECTYPE_OBJZONE 3 /* subdivide obj_id space */
315 #define HAMMER_RECTYPE_DATA_CREATE 0x10
316 #define HAMMER_RECTYPE_DATA_ZEROFILL 0x11
317 #define HAMMER_RECTYPE_DATA_DELETE 0x12
318 #define HAMMER_RECTYPE_DATA_UPDATE 0x13
319 #define HAMMER_RECTYPE_DIR_CREATE 0x20
320 #define HAMMER_RECTYPE_DIR_DELETE 0x22
321 #define HAMMER_RECTYPE_DIR_UPDATE 0x23
322 #define HAMMER_RECTYPE_DB_CREATE 0x30
323 #define HAMMER_RECTYPE_DB_DELETE 0x32
324 #define HAMMER_RECTYPE_DB_UPDATE 0x33
325 #define HAMMER_RECTYPE_EXT_CREATE 0x40 /* ext attributes */
326 #define HAMMER_RECTYPE_EXT_DELETE 0x42
327 #define HAMMER_RECTYPE_EXT_UPDATE 0x43
/*
 * Filesystem object type codes, numbered consecutively from 1; 0 is not
 * given a name here.
 * NOTE(review): by name these are the values of
 * hammer_base_record.obj_type, and presumably also of the cached den_type
 * in a directory entry -- confirm.
 */
329 #define HAMMER_OBJTYPE_DIRECTORY 1
330 #define HAMMER_OBJTYPE_REGFILE 2
331 #define HAMMER_OBJTYPE_DBFILE 3
332 #define HAMMER_OBJTYPE_FIFO 4
333 #define HAMMER_OBJTYPE_DEVNODE 5
334 #define HAMMER_OBJTYPE_SOFTLINK 6
337 * Generic full-sized record
339 struct hammer_generic_record {
340 struct hammer_base_record base;
345 * A HAMMER inode record.
347 * This forms the basis for a filesystem object. obj_id is the inode number,
348 * key1 represents the pseudo filesystem id for security partitioning
349 * (preventing cross-links and/or restricting an NFS export and specifying the
350 * security policy), and key2 represents the data retention policy id.
352 * Inode numbers are 64 bit quantities which uniquely identify a filesystem
353 * object for the ENTIRE life of the filesystem, even after the object has
354 * been deleted. For all intents and purposes inode numbers are simply
355 * allocated by incrementing a sequence space.
357 * There is an important distinction between the data stored in the inode
358 * record and the record's data reference. The record references a
359 * hammer_inode_data structure but the filesystem object size and hard link
360 * count is stored in the inode record itself. This allows multiple inodes
361 * to share the same hammer_inode_data structure. This is possible because
362 * any modifications will lay out new data. The HAMMER implementation need
363 * not use the data-sharing ability when laying down new records.
365 * A HAMMER inode is subject to the same historical storage requirements
366 * as any other record. In particular any change in object size or hard link
367 * count will lay down a new inode record when the filesystem is synced to
368 * disk. This can lead to a lot of junk records which get cleaned up by
369 * the data retention policy.
371 * The ino_atime and ino_mtime fields are a special case. Modifications to
372 * these fields do NOT lay down a new record by default, though the values
373 * are effectively frozen for snapshots which access historical versions
374 * of the inode record due to other operations. This means that atime will
375 * not necessarily be accurate in snapshots, backups, or mirrors. mtime
376 * will be accurate in backups and mirrors since it can be regenerated from
377 * the mirroring stream.
379 * Because nlinks is historically retained the hardlink count will be
380 * accurate when accessing a HAMMER filesystem snapshot.
382 struct hammer_inode_record {
383 struct hammer_base_record base;
384 u_int64_t ino_atime; /* last access time (not historical) */
385 u_int64_t ino_mtime; /* last modified time (not historical) */
386 u_int64_t ino_size; /* filesystem object size */
387 u_int64_t ino_nlinks; /* hard links */
391 * Data records specify the entire contents of a regular file object,
392 * including attributes. Small amounts of data can theoretically be
393 * embedded in the record itself but the use of this ability versus using
394 * an out-of-band data reference depends on the implementation.
396 struct hammer_data_record {
397 struct hammer_base_record base;
402 * A directory entry specifies the HAMMER filesystem object id, a copy of
403 * the file type, and file name (either embedded or as out-of-band data).
404 * If the file name is short enough to fit into den_name[] (including a
405 * terminating nul) then it will be embedded in the record, otherwise it
406 * is stored out-of-band. The base record's data reference always points
407 * to the nul-terminated filename regardless.
409 * Directory entries are indexed with a 128 bit namekey rather than an
410 * offset. A portion of the namekey is an iterator or randomizer to deal
413 struct hammer_entry_record {
414 struct hammer_base_record base;
415 u_int64_t obj_id; /* object being referenced */
416 u_int64_t reserved01;
417 u_int8_t den_type; /* cached file type */
418 char den_name[15]; /* short file names fit in record */
422 * Hammer rollup record
424 union hammer_record {
425 struct hammer_base_record base;
426 struct hammer_generic_record generic;
427 struct hammer_inode_record inode;
428 struct hammer_data_record data;
429 struct hammer_entry_record entry;
432 typedef union hammer_record *hammer_record_t;
435 * Filesystem buffer for records
/*
 * Number of union hammer_record slots that fit in one filesystem buffer
 * after its struct hammer_fsbuf_head.  The division is integral, so any
 * leftover bytes at the end of the buffer are unused.  The result has
 * type size_t and dimensions recs[] in struct hammer_fsbuf_recs.
 */
437 #define HAMMER_RECORD_NODES \
438 ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \
439 sizeof(union hammer_record))
441 struct hammer_fsbuf_recs {
442 struct hammer_fsbuf_head head;
444 union hammer_record recs[HAMMER_RECORD_NODES];
448 * Filesystem buffer for piecemeal data. Note that this does not apply
449 * to dedicated pure-data buffers as such buffers do not have a header.
/*
 * Layout of a piecemeal-data buffer (struct hammer_fsbuf_data).
 *   DATA_SIZE     bytes left in a buffer after struct hammer_fsbuf_head.
 *   DATA_BLKSIZE  size in bytes of one data block (64).
 *   DATA_NODES    number of whole blocks that fit in DATA_SIZE; this is
 *                 the first dimension of data[][] in hammer_fsbuf_data.
 * DATA_SIZE and DATA_NODES involve sizeof and therefore have type size_t.
 */
452 #define HAMMER_DATA_SIZE (HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head))
453 #define HAMMER_DATA_BLKSIZE 64
454 #define HAMMER_DATA_NODES (HAMMER_DATA_SIZE / HAMMER_DATA_BLKSIZE)
456 struct hammer_fsbuf_data {
457 struct hammer_fsbuf_head head;
458 u_int8_t data[HAMMER_DATA_NODES][HAMMER_DATA_BLKSIZE];
463 * HAMMER UNIX Attribute data
465 * The data reference in a HAMMER inode record points to this structure. Any
466 * modifications to the contents of this structure will result in a record
467 * replacement operation.
469 * state_sum allows a filesystem object to be validated to a degree by
470 * generating a checksum of all of its pieces (in no particular order) and
471 * checking it against this field.
473 struct hammer_inode_data {
474 u_int16_t version; /* inode data version */
475 u_int16_t mode; /* basic unix permissions */
476 u_int32_t uflags; /* chflags */
477 u_int64_t reserved01;
478 u_int64_t reserved02;
479 u_int64_t state_sum; /* cumulative checksum */
/*
 * Version number of the hammer_inode_data layout.
 * NOTE(review): presumably the value written to
 * hammer_inode_data.version -- confirm.
 */
484 #define HAMMER_INODE_DATA_VERSION 1
487 * Function library support available to kernel and userland
/*
 * Prototypes only; the implementations are not in this header.
 *
 * hammer_alist_template() takes an A-list handle (the first parameter is
 * unnamed in this prototype), a block count and a maximum meta-element
 * count.
 * NOTE(review): presumably it fills in the in-memory hammer_alist template
 * for an allocator covering 'blocks' blocks -- confirm.
 *
 * hammer_alist_init() takes an A-list handle and a pointer to
 * hammer_almeta storage.
 * NOTE(review): presumably it initializes that meta storage for the given
 * template -- confirm.
 */
489 void hammer_alist_template(hammer_alist_t, int blocks, int maxmeta);
490 void hammer_alist_init(hammer_alist_t bl, hammer_almeta_t *meta);
491 int32_t hammer_alist_alloc(hammer_alist_t bl, hammer_almeta_t *meta,
493 int32_t hammer_alist_alloc_rev(hammer_alist_t bl, hammer_almeta_t *meta,
/*
 * Prototype only; the implementation is not in this header.
 * Takes the A-list handle, its meta storage, a block count, a starting
 * block number and a flags word, and returns an int32_t.
 * NOTE(review): presumably returns the first allocated block number, or
 * HAMMER_ALIST_BLOCK_NONE on failure, with 'flags' taking
 * HAMMER_ALIST_FORWARDS / HAMMER_ALIST_BACKWARDS -- confirm.
 */
496 int32_t hammer_alist_alloc_from(hammer_alist_t bl, hammer_almeta_t *meta,
497 int32_t count, int32_t start, int flags);
/*
 * Prototype only; the implementation is not in this header.
 * Takes the A-list handle, its meta storage, a block number and a count.
 * NOTE(review): presumably releases 'count' blocks starting at 'blkno'
 * -- confirm.
 */
499 void hammer_alist_free(hammer_alist_t bl, hammer_almeta_t *meta,
500 int32_t blkno, int32_t count);