Adjust the description of HAMMER's storage limitations. I have rearranged
[dragonfly.git] / sys / vfs / hammer / hammerfs.h
CommitLineData
8750964d
MD
/*
 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/Attic/hammerfs.h,v 1.1 2007/10/10 19:37:25 dillon Exp $
 */
36
/*
 * The on-disk structures are declared in terms of the fixed-width
 * u_intN_t / intN_t types and uuid_t.
 */
#ifndef _SYS_TYPES_H_
#include <sys/types.h>
#endif
#ifndef _SYS_UUID_H_
#include <sys/uuid.h>
#endif
40
/*
 * The structures below represent the on-disk format for a HAMMER
 * filesystem.  Note that all fields for on-disk structures are naturally
 * aligned.  The host endian format is used - compatibility is possible
 * if the implementation detects reversed endian and adjusts data accordingly.
 *
 * Most of HAMMER revolves around the concept of an object identifier.  An
 * obj_id is a 64 bit quantity which uniquely identifies a filesystem object
 * FOR THE ENTIRE LIFE OF THE FILESYSTEM.  This uniqueness allows backups
 * and mirrors to retain varying amounts of filesystem history by removing
 * any possibility of conflict through identifier reuse.
 *
 * A HAMMER filesystem may span multiple volumes.
 *
 * A HAMMER filesystem uses a 16K filesystem buffer size.  All filesystem
 * I/O is done in multiples of 16K.
 *
 * HAMMER_BUFMASK selects the byte offset within a buffer; it relies on
 * HAMMER_BUFSIZE being a power of 2.
 */
#define HAMMER_BUFSIZE	16384
#define HAMMER_BUFMASK	(HAMMER_BUFSIZE - 1)
60
/*
 * HAMMER transaction ids are 64 bit unsigned integers and are usually
 * synchronized with the time of day in nanoseconds.
 */
typedef u_int64_t hammer_tid_t;
66
/*
 * Storage allocations are managed in powers of 2 with a hinted radix tree
 * based in volume and cluster headers.  The tree is not necessarily
 * contained within the header and may recurse into other storage elements.
 *
 * The allocator's basic storage element is the hammer_almeta structure
 * which is laid out recursively in a buffer.  Allocations are driven using
 * a template called hammer_alist which is constructed in memory and NOT
 * stored in the filesystem.
 *
 * hammer_almeta is part of the on-disk format: two 32-bit fields, 8 bytes
 * total, which HAMMER_ALMETA_SIZE mirrors.
 */
struct hammer_almeta {
	u_int32_t	bm_bitmap;	/* presumably per-node bitmap - confirm in alist code */
	int32_t		bm_bighint;	/* presumably largest-free-run hint - confirm */
};

#define HAMMER_ALMETA_SIZE	8
83
/*
 * In-memory A-list template.  The comment introducing the allocator in
 * this file states that this structure is constructed in memory and is
 * NOT stored in the filesystem; the per-node state lives in arrays of
 * struct hammer_almeta.
 */
struct hammer_alist {
	int32_t	bl_blocks;	/* area of coverage */
	int32_t	bl_radix;	/* coverage radix */
	int32_t	bl_skip;	/* starting skip for linear layout */
	int32_t	bl_free;	/* number of free blocks */
	int32_t	bl_rootblks;	/* meta-blocks allocated for tree */
};
91
/*
 * Note the asymmetry: hammer_almeta_t names the structure itself (arrays
 * of it are embedded in on-disk headers), while hammer_alist_t is a
 * POINTER to the in-memory template.
 */
typedef struct hammer_almeta	hammer_almeta_t;
typedef struct hammer_alist	*hammer_alist_t;
94
/*
 * A-list constants.
 *
 * The two radix values are derived from sizeof() and therefore have type
 * size_t (unsigned); take care when mixing them with the signed int32_t
 * block numbers used by the allocator interface.
 *
 * HAMMER_ALIST_BLOCK_NONE is the int32_t value -1.
 * FORWARDS/BACKWARDS are distinct single-bit flags.
 */
#define HAMMER_ALIST_META_RADIX	(sizeof(u_int32_t) * 4)		/* 16 */
#define HAMMER_ALIST_BMAP_RADIX	(sizeof(u_int32_t) * 8)		/* 32 */
#define HAMMER_ALIST_BLOCK_NONE	((int32_t)-1)
#define HAMMER_ALIST_FORWARDS	0x0001
#define HAMMER_ALIST_BACKWARDS	0x0002
100
101/*
102 * Most HAMMER data structures are embedded in 16K filesystem buffers.
103 * All filesystem buffers except those designated as pure-data buffers
104 * contain this 128-byte header.
105 *
106 * This structure contains an embedded A-List used to manage space within
107 * the filesystem buffer. It is not used by volume or cluster header
108 * buffers, or by pure-data buffers. The granularity is variable and
109 * depends on the type of filesystem buffer. BLKSIZE is just a minimum.
110 */
111
112#define HAMMER_FSBUF_HEAD_SIZE 128
113#define HAMMER_FSBUF_MAXBLKS 256
114#define HAMMER_FSBUF_METAELMS 10 /* 10 elements needed for 256 blks */
115
116struct hammer_fsbuf_head {
117 u_int64_t buf_type;
118 u_int32_t buf_crc;
119 u_int32_t buf_reserved07;
120 u_int32_t reserved[8];
121 struct hammer_almeta buf_almeta[HAMMER_FSBUF_METAELMS];
122};
123
124typedef struct hammer_fsbuf_head *hammer_fsbuf_head_t;
125
/*
 * Buffer type tags stored in hammer_fsbuf_head.buf_type.
 *
 * Each tag is the 8-character ASCII string shown in its trailing comment,
 * most significant byte first, with bit 7 set on the first and fifth
 * characters (e.g. 'H' 0x48 -> 0xC8).
 *
 * HAMMER_FSBUF_VOLUME_REV is HAMMER_FSBUF_VOLUME with its eight bytes in
 * reverse order, i.e. what a reader of the opposite endianness sees in a
 * volume header.
 */
#define HAMMER_FSBUF_VOLUME	0xC8414D4DC5523031ULL	/* HAMMER01 */
#define HAMMER_FSBUF_CLUSTER	0xC8414D52C34C5553ULL	/* HAMRCLUS */
#define HAMMER_FSBUF_RECORDS	0xC8414D52D2454353ULL	/* HAMRRECS */
#define HAMMER_FSBUF_BTREE	0xC8414D52C2545245ULL	/* HAMRBTRE */
#define HAMMER_FSBUF_DATA	0xC8414D52C4415441ULL	/* HAMRDATA */

#define HAMMER_FSBUF_VOLUME_REV	0x313052C54D4D41C8ULL	/* (reverse endian) */
133
134/*
135 * The B-Tree structures need hammer_fsbuf_head.
136 */
137#include "hammer_btree.h"
138
139/*
140 * HAMMER Volume header
141 *
142 * A HAMMER filesystem is built from any number of block devices, Each block
143 * device contains a volume header followed by however many clusters
144 * fit in the volume. Clusters cannot be migrated but the data they contain
145 * can, so HAMMER can use a truncated cluster for any extra space at the
146 * end of the volume.
147 *
148 * The volume containing the root cluster is designated as the master volume.
149 * The root cluster designation can be moved to any volume.
150 *
151 * The volume header takes up an entire 16K filesystem buffer and includes
152 * an A-list to manage the clusters contained within the volume (up to 32768).
153 * With 512M clusters a volume will be limited to 16TB.
154 */
155#define HAMMER_VOL_MAXCLUSTERS 32768
156#define HAMMER_VOL_METAELMS 1094
157
158struct hammer_volume_ondisk {
159 struct hammer_fsbuf_head head;
160 int64_t vol_beg; /* byte offset of first cluster in volume */
161 int64_t vol_end; /* byte offset of volume EOF */
162 int64_t vol_locked; /* reserved clusters are >= this offset */
163
164 uuid_t vol_fsid; /* identify filesystem */
165 uuid_t vol_fstype; /* identify filesystem type */
166 char vol_name[64]; /* Name of volume */
167
168 int32_t vol_no; /* volume number within filesystem */
169 int32_t vol_count; /* number of volumes making up FS */
170
171 u_int32_t vol_version; /* version control information */
172 u_int32_t vol_segsize; /* cluster size power of 2, 512M max */
173 u_int32_t vol_flags; /* volume flags */
174 u_int32_t vol_rootvol; /* which volume is the root volume? */
175
176 int32_t vol_clsize; /* cluster size (same for all volumes) */
177 u_int32_t vol_reserved05;
178 u_int32_t vol_reserved06;
179 u_int32_t vol_reserved07;
180
181 /*
182 * These fields are initialized and space is reserved in every
183 * volume making up a HAMMER filesytem, but only the master volume
184 * contains valid data.
185 */
186 int32_t vol0_rootcluster; /* root cluster no (index) in rootvol */
187 u_int32_t vol0_reserved02;
188 u_int32_t vol0_reserved03;
189 hammer_tid_t vol0_nexttid; /* next TID */
190 u_int64_t vol0_recid; /* fs-wide record id allocator */
191
192 char reserved[1024];
193
194 hammer_almeta_t vol_almeta[HAMMER_VOL_METAELMS];
195 u_int32_t vol0_bitmap[1024];
196};
197
/*
 * Volume flag bits (distinct single-bit values; presumably stored in
 * hammer_volume_ondisk.vol_flags).
 */
#define HAMMER_VOLF_VALID	0x0001	/* valid entry */
#define HAMMER_VOLF_OPEN	0x0002	/* volume is open */
200
201/*
202 * HAMMER Cluster header
203 *
204 * The cluster header contains all the information required to identify a
205 * cluster, locate critical information areas within the cluster, and
206 * to manage space within the cluster.
207 *
208 * A Cluster contains pure data, incremental data, b-tree nodes, and records.
209 */
210#define HAMMER_CLU_MAXBUFFERS 32768
211#define HAMMER_CLU_METAELMS 1094
212
213struct hammer_cluster_ondisk {
214 struct hammer_fsbuf_head head;
215 uuid_t vol_fsid; /* identify filesystem - sanity check */
216 uuid_t vol_fstype; /* identify filesystem type - sanity check */
217
218 u_int64_t clu_gen; /* identify generation number of cluster */
219 u_int64_t clu_unused01;
220
221 hammer_tid_t clu_id; /* unique cluster self identification */
222 int32_t vol_no; /* cluster contained in volume (sanity) */
223 u_int32_t clu_flags; /* cluster flags */
224
225 int32_t clu_start; /* start of data (byte offset) */
226 int32_t clu_limit; /* end of data (byte offset) */
227 int32_t clu_no; /* cluster index in volume (sanity) */
228 u_int32_t clu_reserved03;
229
230 u_int32_t clu_reserved04;
231 u_int32_t clu_reserved05;
232 u_int32_t clu_reserved06;
233 u_int32_t clu_reserved07;
234
235 int32_t idx_data; /* data append point (byte offset) */
236 int32_t idx_index; /* index append point (byte offset) */
237 int32_t idx_record; /* record prepend point (byte offset) */
238 u_int32_t idx_reserved03;
239
240 /*
241 * Specify the range of information stored in this cluster. These
242 * structures match the B-Tree elements in our parent cluster
243 * (if any) that point to us. Note that clu_objend is
244 * range-inclusive, not range-exclusive so e.g. 0-1023 instead
245 * of 0-1024.
246 */
247 int64_t clu_parent; /* parent vol & cluster */
248 struct hammer_base_elm clu_objstart;
249 struct hammer_base_elm clu_objend;
250
251 /*
252 * The root node of the cluster's B-Tree is embedded in the
253 * cluster header. The node is 504 bytes.
254 */
255 struct hammer_btree_node clu_btree_root;
256
257 /*
258 * HAMMER needs a separate bitmap to indicate which buffers are
259 * managed (contain a hammer_fsbuf_head). Any buffers not so
260 * designated are either unused or contain pure data.
261 *
262 * synchronized_rec_id is the synchronization point for the
263 * cluster. Any records with a greater or equal rec_id found
264 * when recovering a cluster are likely incomplete and will be
265 * ignored.
266 */
267 u_int64_t synchronized_rec_id;
268 u_int32_t managed_buffers_bitmap[HAMMER_CLU_MAXBUFFERS/32];
269
270 char reserved[1024];
271 hammer_almeta_t clu_almeta[HAMMER_CLU_METAELMS];
272};
273
274/*
275 * HAMMER records are 96 byte entities encoded into 16K filesystem buffers.
276 * Each record has a 64 byte header and a 32 byte extension. 170 records
277 * fit into each buffer. Storage is managed by the buffer's A-List.
278 *
279 * Each record may have an explicit data reference to a block of data up
280 * to 2^31-1 bytes in size within the current cluster. Note that multiple
281 * records may share the same or overlapping data references.
282 */
283
284/*
285 * All HAMMER records have a common 64-byte base and a 32-byte extension.
286 *
287 * Many HAMMER record types reference out-of-band data within the cluster.
288 * This data can also be stored in-band in the record itself if it is small
289 * enough. Either way, (data_offset, data_len) points to it.
290 *
291 * Key comparison order: obj_id, rec_type, key, create_tid
292 */
293struct hammer_base_record {
294 int64_t obj_id; /* 00 object record is associated with */
295 int64_t key; /* 08 indexing key (offset or namekey) */
296
297 hammer_tid_t create_tid;/* 10 transaction id for record creation */
298 hammer_tid_t delete_tid;/* 18 transaction id for record update/delete */
299
300 u_int16_t rec_type; /* 20 type of record */
301 u_int16_t obj_type; /* 22 type of object (if inode) */
302 u_int32_t data_offset; /* 24 intra-cluster data reference */
303 /* An offset of 0 indicates zero-fill */
304 int32_t data_len; /* 28 size of data (remainder zero-fill) */
305 u_int32_t data_crc; /* 2C data sanity check */
306 u_int64_t rec_id; /* 30 record id (iterator for recovery) */
307 u_int64_t reserved07; /* 38 */
308 /* 40 */
309};
310
/*
 * Record types (hammer_base_record.rec_type).
 *
 * For the 0x10 and above groups the high nibble selects the record family
 * (0x1 data, 0x2 directory, 0x3 db, 0x4 extended attributes) and the low
 * nibble the operation (0 create, 1 zero-fill, 2 delete, 3 update).  Only
 * the data family defines a zero-fill type.
 */
#define HAMMER_RECTYPE_UNKNOWN		0
#define HAMMER_RECTYPE_INODE		1	/* inode in obj_id space */
#define HAMMER_RECTYPE_SLAVE		2	/* slave inode */
#define HAMMER_RECTYPE_OBJZONE		3	/* subdivide obj_id space */
#define HAMMER_RECTYPE_DATA_CREATE	0x10
#define HAMMER_RECTYPE_DATA_ZEROFILL	0x11
#define HAMMER_RECTYPE_DATA_DELETE	0x12
#define HAMMER_RECTYPE_DATA_UPDATE	0x13
#define HAMMER_RECTYPE_DIR_CREATE	0x20
#define HAMMER_RECTYPE_DIR_DELETE	0x22
#define HAMMER_RECTYPE_DIR_UPDATE	0x23
#define HAMMER_RECTYPE_DB_CREATE	0x30
#define HAMMER_RECTYPE_DB_DELETE	0x32
#define HAMMER_RECTYPE_DB_UPDATE	0x33
#define HAMMER_RECTYPE_EXT_CREATE	0x40	/* ext attributes */
#define HAMMER_RECTYPE_EXT_DELETE	0x42
#define HAMMER_RECTYPE_EXT_UPDATE	0x43
328
/*
 * Filesystem object types (hammer_base_record.obj_type, meaningful for
 * inode records).
 */
#define HAMMER_OBJTYPE_DIRECTORY	1
#define HAMMER_OBJTYPE_REGFILE		2
#define HAMMER_OBJTYPE_DBFILE		3
#define HAMMER_OBJTYPE_FIFO		4
#define HAMMER_OBJTYPE_DEVNODE		5
#define HAMMER_OBJTYPE_SOFTLINK		6
335
336/*
337 * Generic full-sized record
338 */
339struct hammer_generic_record {
340 struct hammer_base_record base;
341 char filler[32];
342};
343
344/*
345 * A HAMMER inode record.
346 *
347 * This forms the basis for a filesystem object. obj_id is the inode number,
348 * key1 represents the pseudo filesystem id for security partitioning
349 * (preventing cross-links and/or restricting a NFS export and specifying the
350 * security policy), and key2 represents the data retention policy id.
351 *
352 * Inode numbers are 64 bit quantities which uniquely identify a filesystem
353 * object for the ENTIRE life of the filesystem, even after the object has
354 * been deleted. For all intents and purposes inode numbers are simply
355 * allocated by incrementing a sequence space.
356 *
357 * There is an important distinction between the data stored in the inode
358 * record and the record's data reference. The record references a
359 * hammer_inode_data structure but the filesystem object size and hard link
360 * count is stored in the inode record itself. This allows multiple inodes
361 * to share the same hammer_inode_data structure. This is possible because
362 * any modifications will lay out new data. The HAMMER implementation need
363 * not use the data-sharing ability when laying down new records.
364 *
365 * A HAMMER inode is subject to the same historical storage requirements
366 * as any other record. In particular any change in filesystem or hard link
367 * count will lay down a new inode record when the filesystem is synced to
368 * disk. This can lead to a lot of junk records which get cleaned up by
369 * the data retention policy.
370 *
371 * The ino_atime and ino_mtime fields are a special case. Modifications to
372 * these fields do NOT lay down a new record by default, though the values
373 * are effectively frozen for snapshots which access historical versions
374 * of the inode record due to other operations. This means that atime will
375 * not necessarily be accurate in snapshots, backups, or mirrors. mtime
376 * will be accurate in backups and mirrors since it can be regenerated from
377 * the mirroring stream.
378 *
379 * Because nlinks is historically retained the hardlink count will be
380 * accurate when accessing a HAMMER filesystem snapshot.
381 */
382struct hammer_inode_record {
383 struct hammer_base_record base;
384 u_int64_t ino_atime; /* last access time (not historical) */
385 u_int64_t ino_mtime; /* last modified time (not historical) */
386 u_int64_t ino_size; /* filesystem object size */
387 u_int64_t ino_nlinks; /* hard links */
388};
389
390/*
391 * Data records specify the entire contents of a regular file object,
392 * including attributes. Small amounts of data can theoretically be
393 * embedded in the record itself but the use of this ability verses using
394 * an out-of-band data reference depends on the implementation.
395 */
396struct hammer_data_record {
397 struct hammer_base_record base;
398 char filler[32];
399};
400
401/*
402 * A directory entry specifies the HAMMER filesystem object id, a copy of
403 * the file type, and file name (either embedded or as out-of-band data).
404 * If the file name is short enough to fit into den_name[] (including a
405 * terminating nul) then it will be embedded in the record, otherwise it
406 * is stored out-of-band. The base record's data reference always points
407 * to the nul-terminated filename regardless.
408 *
409 * Directory entries are indexed with a 128 bit namekey rather then an
410 * offset. A portion of the namekey is an iterator or randomizer to deal
411 * with collisions.
412 */
413struct hammer_entry_record {
414 struct hammer_base_record base;
415 u_int64_t obj_id; /* object being referenced */
416 u_int64_t reserved01;
417 u_int8_t den_type; /* cached file type */
418 char den_name[15]; /* short file names fit in record */
419};
420
421/*
422 * Hammer rollup record
423 */
424union hammer_record {
425 struct hammer_base_record base;
426 struct hammer_generic_record generic;
427 struct hammer_inode_record inode;
428 struct hammer_data_record data;
429 struct hammer_entry_record entry;
430};
431
432typedef union hammer_record *hammer_record_t;
433
434/*
435 * Filesystem buffer for records
436 */
437#define HAMMER_RECORD_NODES \
438 ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \
439 sizeof(union hammer_record))
440
441struct hammer_fsbuf_recs {
442 struct hammer_fsbuf_head head;
443 char unused[32];
444 union hammer_record recs[HAMMER_RECORD_NODES];
445};
446
447/*
448 * Filesystem buffer for piecemeal data. Note that this does not apply
449 * to dedicated pure-data buffers as such buffers do not have a header.
450 */
451
452#define HAMMER_DATA_SIZE (HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head))
453#define HAMMER_DATA_BLKSIZE 64
454#define HAMMER_DATA_NODES (HAMMER_DATA_SIZE / HAMMER_DATA_BLKSIZE)
455
456struct hammer_fsbuf_data {
457 struct hammer_fsbuf_head head;
458 u_int8_t data[HAMMER_DATA_NODES][HAMMER_DATA_BLKSIZE];
459};
460
461
462/*
463 * HAMMER UNIX Attribute data
464 *
465 * The data reference in a HAMMER inode record points to this structure. Any
466 * modifications to the contents of this structure will result in a record
467 * replacement operation.
468 *
469 * state_sum allows a filesystem object to be validated to a degree by
470 * generating a checksum of all of its pieces (in no particular order) and
471 * checking it against this field.
472 */
473struct hammer_inode_data {
474 u_int16_t version; /* inode data version */
475 u_int16_t mode; /* basic unix permissions */
476 u_int32_t uflags; /* chflags */
477 u_int64_t reserved01;
478 u_int64_t reserved02;
479 u_int64_t state_sum; /* cumulative checksum */
480 uuid_t uid;
481 uuid_t gid;
482};
483
484#define HAMMER_INODE_DATA_VERSION 1
485
486/*
487 * Function library support available to kernel and userland
488 */
489void hammer_alist_template(hammer_alist_t, int blocks, int maxmeta);
490void hammer_alist_init(hammer_alist_t bl, hammer_almeta_t *meta);
491int32_t hammer_alist_alloc(hammer_alist_t bl, hammer_almeta_t *meta,
492 int32_t count);
493int32_t hammer_alist_alloc_rev(hammer_alist_t bl, hammer_almeta_t *meta,
494 int32_t count);
495#if 0
496int32_t hammer_alist_alloc_from(hammer_alist_t bl, hammer_almeta_t *meta,
497 int32_t count, int32_t start, int flags);
498#endif
499void hammer_alist_free(hammer_alist_t bl, hammer_almeta_t *meta,
500 int32_t blkno, int32_t count);
501