hammer2 - Refactor flush mechanics
[dragonfly.git] / sys / vfs / hammer2 / hammer2.h
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35
36 /*
37  * This header file contains structures used internally by the HAMMER2
38  * implementation.  See hammer2_disk.h for on-disk structures.
39  */
40
41 #ifndef _VFS_HAMMER2_HAMMER2_H_
42 #define _VFS_HAMMER2_HAMMER2_H_
43
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/kernel.h>
47 #include <sys/conf.h>
48 #include <sys/systm.h>
49 #include <sys/tree.h>
50 #include <sys/malloc.h>
51 #include <sys/mount.h>
52 #include <sys/vnode.h>
53 #include <sys/proc.h>
54 #include <sys/mountctl.h>
55 #include <sys/priv.h>
56 #include <sys/stat.h>
57 #include <sys/thread.h>
58 #include <sys/globaldata.h>
59 #include <sys/lockf.h>
60 #include <sys/buf.h>
61 #include <sys/queue.h>
62 #include <sys/limits.h>
63 #include <sys/buf2.h>
64 #include <sys/signal2.h>
65 #include <sys/dmsg.h>
66 #include <sys/mutex.h>
67 #include <sys/mutex2.h>
68
69 #include "hammer2_disk.h"
70 #include "hammer2_mount.h"
71 #include "hammer2_ioctl.h"
72 #include "hammer2_ccms.h"
73
74 struct hammer2_chain;
75 struct hammer2_inode;
76 struct hammer2_mount;
77 struct hammer2_pfsmount;
78 struct hammer2_span;
79 struct hammer2_state;
80 struct hammer2_msg;
81
82 /*
83  * The chain structure tracks a portion of the media topology from the
84  * root (volume) down.  Chains represent volumes, inodes, indirect blocks,
85  * data blocks, and freemap nodes and leafs.
86  *
87  * The chain structure can be multi-homed and its topological recursion
88  * (chain->core) can be shared amongst several chains.  Chain structures
89  * are topologically stable once placed in the in-memory topology (they
90  * don't move around).  Modifications which cross flush synchronization
91  * boundaries, renames, resizing, or any move of the chain to elsewhere
92  * in the topology is accomplished via the DELETE-DUPLICATE mechanism.
93  *
94  * Deletions and delete-duplicates:
95  *
96  *      Any movement of chains within the topology utilizes a delete-duplicate
97  *      operation instead of a simple rename.  That is, the chain must be
98  *      deleted from its original location and then duplicated to the new
99  *      location.  A new chain structure is allocated while the old is
100  *      deleted.  Deleted chains are removed from the above chain_core's
101  *      rbtree but remain linked via the shadow topology for flush
102  *      synchronization purposes.
103  *
104  *      delete_bmap is allocated and a bit set if the chain was originally
105  *      loaded via the blockmap.
106  *
107  * Flush synchronization:
108  *
109  *      Flushes must synchronize chains up through the root.  To do this
110  *      the in-memory topology would normally have to be frozen during the
111  *      flush.  To avoid freezing the topology and to allow concurrent
112  *      foreground / flush activity, any new modifications made while a
113  *      flush is in progress retain the original chain in a shadow topology
114  *      that is only visible to the flush code.  Only one flush can be
115  *      running at a time so the shadow hierarchy can be implemented with
116  *      just a few link fields in our in-memory data structures.
117  *
118  * Advantages:
119  *
120  *      (1) Fully coherent snapshots can be taken without requiring
121  *          a pre-flush, resulting in extremely fast (sub-millisecond)
122  *          snapshots.
123  *
124  *      (2) Multiple synchronization points can be in-flight at the same
125  *          time, representing multiple snapshots or flushes.
126  *
127  *      (3) The algorithms needed to keep track of everything are actually
128  *          not that complex.
129  *
130  * Special Considerations:
131  *
132  *      A chain is ref-counted on a per-chain basis, but the chain's lock
133  *      is associated with the shared chain_core and is not per-chain.
134  *
135  *      The power-of-2 nature of the media radix tree ensures that there
136  *      will be no overlaps which straddle edges.
137  */
138 RB_HEAD(hammer2_chain_tree, hammer2_chain);
139 TAILQ_HEAD(h2_flush_deferral_list, hammer2_chain);
140 TAILQ_HEAD(h2_core_list, hammer2_chain);
141
142 #define CHAIN_CORE_DELETE_BMAP_ENTRIES  \
143         (HAMMER2_PBUFSIZE / sizeof(hammer2_blockref_t) / sizeof(uint32_t))
144
145 struct hammer2_chain_core {     /* sub-topology state shared by the chains on ownerq */
146         int             good;   /* NOTE(review): meaning not shown in this header - confirm */
147         struct ccms_cst cst;    /* presumably the lock shared by all owner chains - confirm */
148         struct h2_core_list ownerq;       /* all chains sharing this core */
149         struct hammer2_chain_tree rbtree; /* live chains */
150         struct hammer2_chain_tree dbtree; /* bmapped deletions */
151         struct h2_core_list dbq;          /* other deletions */
152         int             live_zero;      /* blockref array opt */
153         u_int           sharecnt;       /* presumably # of chains sharing this core - confirm */
154         u_int           flags;          /* presumably HAMMER2_CORE_* bits - confirm */
155         u_int           live_count;     /* live (not deleted) chains in tree */
156         u_int           chain_count;    /* live + deleted chains under core */
157         int             generation;     /* generation number (inserts only) */
158 };
159
160 typedef struct hammer2_chain_core hammer2_chain_core_t;
161
162 #define HAMMER2_CORE_UNUSED0001         0x0001
163 #define HAMMER2_CORE_COUNTEDBREFS       0x0002
164
165 /*
166  * H2 is a copy-on-write filesystem.  In order to allow chains to allocate
167  * smaller blocks (down to 64-bytes), but improve performance and make
168  * clustered I/O possible using larger block sizes, the kernel buffer cache
169  * is abstracted via the hammer2_io structure.
170  */
171 RB_HEAD(hammer2_io_tree, hammer2_io);
172
173 struct hammer2_io {             /* buffer-cache abstraction used for chain I/O */
174         RB_ENTRY(hammer2_io) rbnode;    /* indexed by device offset */
175         struct spinlock spin;           /* NOTE(review): fields covered not shown here - confirm */
176         struct hammer2_mount *hmp;      /* device mount this I/O belongs to */
177         struct buf      *bp;            /* presumably the backing buffer cache buffer - confirm */
178         struct bio      *bio;           /* NOTE(review): relation to bp not shown here - confirm */
179         off_t           pbase;          /* presumably device base offset of the buffer - confirm */
180         int             psize;          /* presumably size of the buffer in bytes - confirm */
181         void            (*callback)(struct hammer2_io *dio,
182                                     struct hammer2_chain *chain,
183                                     void *arg1, off_t arg2);
                                        /* ^ presumably called with arg_c/arg_p/arg_o - confirm */
184         struct hammer2_chain *arg_c;            /* INPROG I/O only */
185         void            *arg_p;                 /* INPROG I/O only */
186         off_t           arg_o;                  /* INPROG I/O only */
187         int             refs;           /* reference count (exact encoding not shown here) */
188         int             act;                    /* activity */
189 };
190
191 typedef struct hammer2_io hammer2_io_t;
192
193 /*
194  * Primary chain structure keeps track of the topology in-memory.
195  */
196 struct hammer2_chain {          /* one in-memory element of the media topology */
197         TAILQ_ENTRY(hammer2_chain) core_entry;  /* contemporary chains */
198         RB_ENTRY(hammer2_chain) rbnode;         /* live chain(s) */
199         TAILQ_ENTRY(hammer2_chain) db_entry;    /* non bmapped deletions */
200         hammer2_blockref_t      bref;           /* presumably the media blockref for this chain - confirm */
201         hammer2_chain_core_t    *core;          /* topology below this chain; may be shared */
202         hammer2_chain_core_t    *above;         /* presumably the parent's core holding us - confirm */
203         struct hammer2_state    *state;         /* if active cache msg */
204         struct hammer2_mount    *hmp;           /* device mount the chain belongs to */
205         struct hammer2_pfsmount *pmp;           /* can be NULL */
206
207         hammer2_blockref_t      dsrc;                   /* DEBUG */
208         int                     ninserts;               /* DEBUG */
209         int                     nremoves;               /* DEBUG */
210         hammer2_tid_t           dsrc_dupfromat;         /* DEBUG */
211         uint32_t                dsrc_dupfromflags;      /* DEBUG */
212         int                     dsrc_reason;            /* DEBUG */
213         int                     dsrc_ninserts;          /* DEBUG */
214         uint32_t                dsrc_flags;             /* DEBUG */
215         hammer2_tid_t           dsrc_modify;            /* DEBUG */
216         hammer2_tid_t           dsrc_delete;            /* DEBUG */
217         hammer2_tid_t           dsrc_update_lo;         /* DEBUG */
218         struct hammer2_chain    *dsrc_original;         /* DEBUG */
219
220         hammer2_tid_t   modify_tid;             /* flush filter */
221         hammer2_tid_t   delete_tid;             /* flush filter */
222         hammer2_tid_t   update_lo;              /* flush propagation */
223         hammer2_tid_t   update_hi;              /* setsubmod propagation */
224         hammer2_key_t   data_count;             /* delta's to apply */
225         hammer2_key_t   inode_count;            /* delta's to apply */
226         hammer2_io_t    *dio;                   /* physical data buffer */
227         u_int           bytes;                  /* physical data size */
228         u_int           flags;                  /* HAMMER2_CHAIN_* bits */
229         u_int           refs;                   /* per-chain reference count */
230         u_int           lockcnt;                /* NOTE(review): lock accounting; semantics not shown here */
231         hammer2_media_data_t *data;             /* data pointer shortcut */
232         TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
233
234         int             inode_reason;           /* NOTE(review): looks like a debug aid - confirm */
235 };
236
237 typedef struct hammer2_chain hammer2_chain_t;
238
239 int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
240 RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);
241
242 /*
243  * Special notes on flags:
244  *
245  * INITIAL - This flag allows a chain to be created and for storage to
246  *           be allocated without having to immediately instantiate the
247  *           related buffer.  The data is assumed to be all-zeros.  It
248  *           is primarily used for indirect blocks.
249  *
250  * MODIFIED- The chain's media data has been modified.
251  */
252 #define HAMMER2_CHAIN_MODIFIED          0x00000001      /* dirty chain data */
253 #define HAMMER2_CHAIN_ALLOCATED         0x00000002      /* kmalloc'd chain */
254 #define HAMMER2_CHAIN_FLUSH_TEMPORARY   0x00000004
255 #define HAMMER2_CHAIN_FORCECOW          0x00000008      /* force copy-on-wr */
256 #define HAMMER2_CHAIN_DELETED           0x00000010      /* deleted chain */
257 #define HAMMER2_CHAIN_INITIAL           0x00000020      /* initial create */
258 #define HAMMER2_CHAIN_FLUSH_CREATE      0x00000040      /* needs flush blkadd */
259 #define HAMMER2_CHAIN_FLUSH_DELETE      0x00000080      /* needs flush blkdel */
260 #define HAMMER2_CHAIN_IOFLUSH           0x00000100      /* bawrite on put */
261 #define HAMMER2_CHAIN_DEFERRED          0x00000200      /* on a deferral list */
262 #define HAMMER2_CHAIN_UNLINKED          0x00000400      /* delete on reclaim */
263 #define HAMMER2_CHAIN_VOLUMESYNC        0x00000800      /* needs volume sync */
264 #define HAMMER2_CHAIN_ONDBQ             0x00001000      /* !bmapped deletes */
265 #define HAMMER2_CHAIN_MOUNTED           0x00002000      /* PFS is mounted */
266 #define HAMMER2_CHAIN_ONRBTREE          0x00004000      /* on parent RB tree */
267 #define HAMMER2_CHAIN_SNAPSHOT          0x00008000      /* snapshot special */
268 #define HAMMER2_CHAIN_EMBEDDED          0x00010000      /* embedded data */
269 #define HAMMER2_CHAIN_RELEASE           0x00020000      /* don't keep around */
270 #define HAMMER2_CHAIN_BMAPPED           0x00040000      /* in parent blkmap */
271 #define HAMMER2_CHAIN_ONDBTREE          0x00080000      /* bmapped deletes */
272 #define HAMMER2_CHAIN_DUPLICATED        0x00100000      /* fwd delete-dup */
273 #define HAMMER2_CHAIN_PFSROOT           0x00200000      /* in pfs->cluster */
274
275 /*
276  * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
277  *
278  * NOTE: MATCHIND allows an indirect block / freemap node to be returned
279  *       when the passed key range matches the radix.  Remember that key_end
280  *       is inclusive (e.g. {0x000,0xFFF}, not {0x000,0x1000}).
281  */
282 #define HAMMER2_LOOKUP_NOLOCK           0x00000001      /* ref only */
283 #define HAMMER2_LOOKUP_NODATA           0x00000002      /* data left NULL */
284 #define HAMMER2_LOOKUP_SHARED           0x00000100
285 #define HAMMER2_LOOKUP_MATCHIND         0x00000200      /* return all chains */
286 #define HAMMER2_LOOKUP_UNUSED0400       0x00000400
287 #define HAMMER2_LOOKUP_ALWAYS           0x00000800      /* resolve data */
288
289 /*
290  * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
291  *
292  * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
293  *       blocks in the INITIAL-create state.
294  */
295 #define HAMMER2_MODIFY_OPTDATA          0x00000002      /* data can be NULL */
296 #define HAMMER2_MODIFY_NO_MODIFY_TID    0x00000004
297 #define HAMMER2_MODIFY_ASSERTNOCOPY     0x00000008      /* assert no del-dup */
298 #define HAMMER2_MODIFY_NOREALLOC        0x00000010
299 #define HAMMER2_MODIFY_INPLACE          0x00000020      /* don't del-dup */
300
301 /*
302  * Flags passed to hammer2_chain_lock()
303  */
304 #define HAMMER2_RESOLVE_NEVER           1
305 #define HAMMER2_RESOLVE_MAYBE           2
306 #define HAMMER2_RESOLVE_ALWAYS          3
307 #define HAMMER2_RESOLVE_MASK            0x0F
308
309 #define HAMMER2_RESOLVE_SHARED          0x10    /* request shared lock */
310 #define HAMMER2_RESOLVE_NOREF           0x20    /* already ref'd on lock */
311
312 /*
313  * Flags passed to hammer2_chain_delete()
314  */
315 #define HAMMER2_DELETE_UNUSED0001       0x0001
316
317 /*
318  * Flags passed to hammer2_chain_delete_duplicate()
319  */
320 #define HAMMER2_DELDUP_RECORE           0x0001
321
322 /*
323  * Cluster different types of storage together for allocations
324  */
325 #define HAMMER2_FREECACHE_INODE         0
326 #define HAMMER2_FREECACHE_INDIR         1
327 #define HAMMER2_FREECACHE_DATA          2
328 #define HAMMER2_FREECACHE_UNUSED3       3
329 #define HAMMER2_FREECACHE_TYPES         4
330
331 /*
332  * hammer2_freemap_alloc() block preference
333  */
334 #define HAMMER2_OFF_NOPREF              ((hammer2_off_t)-1)
335
336 /*
337  * BMAP read-ahead maximum parameters
338  */
339 #define HAMMER2_BMAP_COUNT              16      /* max bmap read-ahead */
340 #define HAMMER2_BMAP_BYTES              (HAMMER2_PBUFSIZE * HAMMER2_BMAP_COUNT)
341
342 /*
343  * Misc
344  */
345 #define HAMMER2_FLUSH_DEPTH_LIMIT       10      /* stack recursion limit */
346
347 /*
348  * hammer2_freemap_adjust()
349  */
350 #define HAMMER2_FREEMAP_DORECOVER       1
351 #define HAMMER2_FREEMAP_DOMAYFREE       2
352 #define HAMMER2_FREEMAP_DOREALFREE      3
353
354 /*
355  * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
356  *
357  * There is an in-memory representation of all on-media data structures.
358  * Basically everything is represented by a hammer2_chain structure
359  * in-memory and other higher-level structures map to chains.
360  *
361  * A great deal of data is accessed simply via its buffer cache buffer,
362  * which is mapped for the duration of the chain's lock.  However, because
363  * chains may represent blocks smaller than the 16KB minimum we impose
364  * on buffer cache buffers, we cannot hold related buffer cache buffers
365  * locked for smaller blocks.  In these situations we kmalloc() a copy
366  * of the block.
367  *
368  * When modifications are made to a chain a new filesystem block must be
369  * allocated.  Multiple modifications do not necessarily allocate new
370  * blocks.  However, when a flush occurs a flush synchronization point
371  * is created and any new modifications made after this point will allocate
372  * a new block even if the chain is already in a modified state.
373  *
374  * The in-memory representation may remain cached (for example in order to
375  * placemark clustering locks) even after the related data has been
376  * detached.
377  *
378  *                              CORE SHARING
379  *
380  * In order to support concurrent flushes a flush synchronization point
381  * is created represented by a transaction id.  Among other things,
382  * operations may move filesystem objects from one part of the topology
383  * to another (for example, if you rename a file or when indirect blocks
384  * are created or destroyed, and a few other things).  When this occurs
385  * across a flush synchronization point the flusher needs to be able to
386  * recurse down BOTH the 'before' version of the topology and the 'after'
387  * version.
388  *
389  * To facilitate this, modifications to chains do what is called a
390  * DELETE-DUPLICATE operation.  Chains are not actually moved in-memory.
391  * Instead the chain we wish to move is deleted and a new chain is created
392  * at the target location in the topology.  ANY SUBCHAINS PLACED UNDER THE
393  * CHAIN BEING MOVED HAVE TO EXIST IN BOTH PLACES.  To make this work
394  * all sub-chains are managed by the hammer2_chain_core structure.  This
395  * structure can be multi-homed, meaning that it can have more than one
396  * chain as its parent.  When a chain is delete-duplicated the chain's core
397  * becomes shared under both the old and new chain.
398  *
399  *                              STALE CHAINS
400  *
401  * When a chain is delete-duplicated the old chain typically becomes stale.
402  * This is detected via the HAMMER2_CHAIN_DUPLICATED flag in chain->flags.
403  * To avoid executing live filesystem operations on stale chains, the inode
404  * locking code will follow stale chains via core->ownerq until it finds
405  * the live chain.  The lock prevents ripups by other threads.  Lookups
406  * must properly order locking operations to prevent other threads from
407  * racing the lookup operation and will also follow stale chains when
408  * required.
409  */
410
411 RB_HEAD(hammer2_inode_tree, hammer2_inode);
412
413 /*
414  * A hammer2 inode.
415  *
416  * NOTE: The inode's attribute CST which is also used to lock the inode
417  *       is embedded in the chain (chain.cst) and aliased w/ attr_cst.
418  */
419 struct hammer2_inode {          /* in-memory inode, tied to a PFS mount and a chain */
420         RB_ENTRY(hammer2_inode) rbnode;         /* inumber lookup (HL) */
421         ccms_cst_t              topo_cst;       /* directory topology cst */
422         struct hammer2_pfsmount *pmp;           /* PFS mount */
423         struct hammer2_inode    *pip;           /* parent inode */
424         struct vnode            *vp;            /* associated vnode (what ITOV() returns) */
425         hammer2_chain_t         *chain;         /* NOTE: rehomed on rename */
426         struct lockf            advlock;        /* presumably advisory (fcntl/flock) lock state - confirm */
427         hammer2_tid_t           inum;           /* inode number; presumably the rbnode key - confirm */
428         u_int                   flags;          /* presumably HAMMER2_INODE_* bits - confirm */
429         u_int                   refs;           /* +vpref, +flushref */
430         uint8_t                 comp_heuristic; /* NOTE(review): compression heuristic state - confirm */
431         hammer2_off_t           size;           /* presumably cached file size - confirm */
432         uint64_t                mtime;          /* presumably cached modification time - confirm */
433 };
434
435 typedef struct hammer2_inode hammer2_inode_t;
436
437 #define HAMMER2_INODE_MODIFIED          0x0001
438 #define HAMMER2_INODE_SROOT             0x0002  /* kmalloc special case */
439 #define HAMMER2_INODE_RENAME_INPROG     0x0004
440 #define HAMMER2_INODE_ONRBTREE          0x0008
441 #define HAMMER2_INODE_RESIZED           0x0010
442 #define HAMMER2_INODE_MTIME             0x0020
443
444 int hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2);
445 RB_PROTOTYPE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
446                 hammer2_tid_t);
447
448 /*
449  * A hammer2 transaction and flush sequencing structure.
450  *
451  * This global structure is tied into hammer2_mount and is used
452  * to sequence modifying operations and flushes.
453  *
454  * (a) Any modifying operations with sync_tid >= flush_tid will stall until
455  *     all modifying operations with sync_tid < flush_tid complete.
456  *
457  *     The flush related to flush_tid stalls until all modifying operations
458  *     with sync_tid < flush_tid complete.
459  *
460  * (b) Once unstalled, modifying operations with sync_tid > flush_tid are
461  *     allowed to run.  All modifications cause modify/duplicate operations
462  *     to occur on the related chains.  Note that most INDIRECT blocks will
463  *     be unaffected because the modifications just overload the RBTREE
464  *     structurally instead of actually modifying the indirect blocks.
465  *
466  * (c) The actual flush unstalls and RUNS CONCURRENTLY with (b), but only
467  *     utilizes the chain structures with sync_tid <= flush_tid.  The
468  *     flush will modify related indirect blocks and inodes in-place
469  *     (rather than duplicate) since the adjustments are compatible with
470  *     (b)'s RBTREE overloading
471  *
472  *     SPECIAL NOTE:  Inode modifications have to also propagate along any
473  *                    modify/duplicate chains.  File writes detect the flush
474  *                    and force out the conflicting buffer cache buffer(s)
475  *                    before reusing them.
476  *
477  * (d) Snapshots can be made instantly but must be flushed and disconnected
478  *     from their duplicative source before they can be mounted.  This is
479  *     because while H2's on-media structure supports forks, its in-memory
480  *     structure only supports very simple forking for background flushing
481  *     purposes.
482  *
483  * TODO: Flush merging.  When fsync() is called on multiple discrete files
484  *       concurrently there is no reason to stall the second fsync.
485  *       The final flush that reaches to root can cover both fsync()s.
486  *
487  *     The chains typically terminate as they fly onto the disk.  The flush
488  *     ultimately reaches the volume header.
489  */
490 struct hammer2_trans {          /* one transaction or flush being sequenced */
491         TAILQ_ENTRY(hammer2_trans) entry;       /* presumably links into hammer2_mount.transq - confirm */
492         struct hammer2_pfsmount *pmp;           /* might be NULL */
493         struct hammer2_mount    *hmp_single;    /* if single-targeted */
494         hammer2_tid_t           orig_tid;       /* NOTE(review): relation to sync_tid not shown here */
495         hammer2_tid_t           sync_tid;       /* effective transaction id */
496         hammer2_tid_t           inode_tid;      /* NOTE(review): presumably inode-number allocation - confirm */
497         thread_t                td;             /* pointer */
498         int                     flags;          /* presumably HAMMER2_TRANS_* bits - confirm */
499         int                     blocked;        /* NOTE(review): presumably set while stalled - confirm */
500         uint8_t                 inodes_created;
501         uint8_t                 dummy[7];       /* presumably padding - confirm */
502 };
503
504 typedef struct hammer2_trans hammer2_trans_t;
505
506 #define HAMMER2_TRANS_ISFLUSH           0x0001  /* formal flush */
507 #define HAMMER2_TRANS_CONCURRENT        0x0002  /* concurrent w/flush */
508 #define HAMMER2_TRANS_BUFCACHE          0x0004  /* from bioq strategy write */
509 #define HAMMER2_TRANS_NEWINODE          0x0008  /* caller allocating inode */
510 #define HAMMER2_TRANS_ISALLOCATING      0x0010  /* in allocator */
511
512 #define HAMMER2_FREEMAP_HEUR_NRADIX     4       /* pwr 2 PBUFRADIX-MINIORADIX */
513 #define HAMMER2_FREEMAP_HEUR_TYPES      8
514 #define HAMMER2_FREEMAP_HEUR            (HAMMER2_FREEMAP_HEUR_NRADIX * \
515                                          HAMMER2_FREEMAP_HEUR_TYPES)
516
517 /*
518  * Global (per device) mount structure for device (aka vp->v_mount->hmp)
519  */
520 TAILQ_HEAD(hammer2_trans_queue, hammer2_trans);
521
522 struct hammer2_mount {          /* per-device mount state */
523         struct vnode    *devvp;         /* device vnode */
524         int             ronly;          /* read-only mount */
525         int             pmp_count;      /* PFS mounts backed by us */
526         TAILQ_ENTRY(hammer2_mount) mntentry; /* hammer2_mntlist */
527
528         struct malloc_type *mchain;     /* presumably malloc type for chain allocations - confirm */
529         int             nipstacks;      /* NOTE(review): meaning not shown here - confirm */
530         int             maxipstacks;    /* NOTE(review): meaning not shown here - confirm */
531         struct spinlock io_spin;        /* iotree access */
532         struct hammer2_io_tree iotree;  /* RB tree of hammer2_io structures */
533         int             iofree_count;   /* NOTE(review): presumably count of idle hammer2_io - confirm */
534         hammer2_chain_t vchain;         /* anchor chain (topology) */
535         hammer2_chain_t fchain;         /* anchor chain (freemap) */
536         hammer2_inode_t *sroot;         /* super-root localized to media */
537         struct lock     alloclk;        /* lockmgr lock */
538         struct lock     voldatalk;      /* lockmgr lock */
539         struct hammer2_trans_queue transq; /* all in-progress transactions */
540         hammer2_off_t   heur_freemap[HAMMER2_FREEMAP_HEUR]; /* presumably allocator hints - confirm */
541         int             flushcnt;       /* #of flush trans on the list */
542
543         int             volhdrno;       /* last volhdrno written */
544         hammer2_volume_data_t voldata;  /* presumably the working volume header copy - confirm */
545         hammer2_volume_data_t volsync;  /* synchronized voldata */
546 };
547
548 typedef struct hammer2_mount hammer2_mount_t;
549
550 /*
551  * HAMMER2 cluster - a device/root associated with a PFS.
552  *
553  * A PFS may have several hammer2_cluster's associated with it.
554  */
555 #define HAMMER2_MAXCLUSTER      8
556
557 struct hammer2_cluster {        /* set of up to HAMMER2_MAXCLUSTER chains */
558         int                     nchains;        /* presumably # of entries used in chains[] - confirm */
559         int                     status;         /* NOTE(review): value set not shown here - confirm */
560         hammer2_chain_t         *chains[HAMMER2_MAXCLUSTER];
561 };
562
563 typedef struct hammer2_cluster hammer2_cluster_t;
564
565 /*
566  * HAMMER2 PFS mount point structure (aka vp->v_mount->mnt_data).
567  * This has a 1:1 correspondence to struct mount (note that the
568  * hammer2_mount structure has a N:1 correspondence).
569  *
570  * This structure represents a cluster mount and not necessarily a
571  * PFS under a specific device mount (HMP).  The distinction is important
572  * because the elements backing a cluster mount can change on the fly.
573  *
574  * Usually the first element under the cluster represents the original
575  * user-requested mount that bootstraps the whole mess.  In significant
576  * setups the original is usually just a read-only media image (or
577  * representative file) that simply contains a bootstrap volume header
578  * listing the configuration.
579  */
580 struct hammer2_pfsmount {       /* per-PFS (cluster) mount state; reached via mp->mnt_data */
581         struct mount            *mp;            /* the struct mount we correspond to (1:1) */
582         hammer2_cluster_t       cluster;        /* chains backing this PFS */
583         hammer2_inode_t         *iroot;         /* PFS root inode */
584         hammer2_inode_t         *ihidden;       /* PFS hidden directory */
585         struct lock             lock;           /* PFS lock for certain ops */
586         hammer2_off_t           inode_count;    /* copy of inode_count */
587         ccms_domain_t           ccms_dom;       /* NOTE(review): CCMS domain; usage not shown here */
588         struct netexport        export;         /* nfs export */
589         int                     ronly;          /* read-only mount */
590         struct malloc_type      *minode;        /* presumably malloc type for inodes - confirm */
591         struct malloc_type      *mmsg;          /* presumably malloc type for messages - confirm */
592         kdmsg_iocom_t           iocom;          /* NOTE(review): dmsg I/O state; usage not shown here */
593         struct spinlock         inum_spin;      /* inumber lookup */
594         struct hammer2_inode_tree inum_tree;    /* RB tree of in-memory inodes */
595         long                    inmem_inodes;   /* presumably count of in-memory inodes - confirm */
596         long                    inmem_dirty_chains; /* presumably uses HAMMER2_DIRTYCHAIN_* - confirm */
597         int                     count_lwinprog; /* logical write in prog */
598         thread_t                wthread_td;     /* write thread td */
599         struct bio_queue_head   wthread_bioq;   /* logical buffer bioq */
600         struct mtx              wthread_mtx;    /* interlock */
601         int                     wthread_destroy;/* termination sequencing */
602 };
603
604 typedef struct hammer2_pfsmount hammer2_pfsmount_t;
605
606 #define HAMMER2_DIRTYCHAIN_WAITING      0x80000000
607 #define HAMMER2_DIRTYCHAIN_MASK         0x7FFFFFFF
608
609 #define HAMMER2_LWINPROG_WAITING        0x80000000
610 #define HAMMER2_LWINPROG_MASK           0x7FFFFFFF
611
612 #if defined(_KERNEL)
613
614 MALLOC_DECLARE(M_HAMMER2);
615
616 #define VTOI(vp)        ((hammer2_inode_t *)(vp)->v_data)
617 #define ITOV(ip)        ((ip)->vp)
618
619 /*
620  * Currently locked chains retain the locked buffer cache buffer for
621  * indirect blocks, and indirect blocks can be one of two sizes.  The
622  * device buffer has to match the case to avoid deadlocking recursive
623  * chains that might otherwise try to access different offsets within
624  * the same device buffer.
625  */
626 static __inline
627 int
628 hammer2_devblkradix(int radix)
629 {
630         if (radix <= HAMMER2_LBUFRADIX) {
631                 return (HAMMER2_LBUFRADIX);
632         } else {
633                 return (HAMMER2_PBUFRADIX);
634         }
635 }
636
637 static __inline
638 size_t
639 hammer2_devblksize(size_t bytes)
640 {
641         if (bytes <= HAMMER2_LBUFSIZE) {
642                 return(HAMMER2_LBUFSIZE);
643         } else {
644                 KKASSERT(bytes <= HAMMER2_PBUFSIZE &&
645                          (bytes ^ (bytes - 1)) == ((bytes << 1) - 1));
646                 return (HAMMER2_PBUFSIZE);
647         }
648 }
649
650
651 static __inline
652 hammer2_pfsmount_t *
653 MPTOPMP(struct mount *mp)
654 {
655         return ((hammer2_pfsmount_t *)mp->mnt_data);
656 }
657
658 extern struct vop_ops hammer2_vnode_vops;
659 extern struct vop_ops hammer2_spec_vops;
660 extern struct vop_ops hammer2_fifo_vops;
661
662 extern int hammer2_debug;
663 extern int hammer2_cluster_enable;
664 extern int hammer2_hardlink_enable;
665 extern int hammer2_flush_pipe;
666 extern int hammer2_synchronous_flush;
667 extern long hammer2_limit_dirty_chains;
668 extern long hammer2_iod_file_read;
669 extern long hammer2_iod_meta_read;
670 extern long hammer2_iod_indr_read;
671 extern long hammer2_iod_fmap_read;
672 extern long hammer2_iod_volu_read;
673 extern long hammer2_iod_file_write;
674 extern long hammer2_iod_meta_write;
675 extern long hammer2_iod_indr_write;
676 extern long hammer2_iod_fmap_write;
677 extern long hammer2_iod_volu_write;
678 extern long hammer2_ioa_file_read;
679 extern long hammer2_ioa_meta_read;
680 extern long hammer2_ioa_indr_read;
681 extern long hammer2_ioa_fmap_read;
682 extern long hammer2_ioa_volu_read;
683 extern long hammer2_ioa_file_write;
684 extern long hammer2_ioa_meta_write;
685 extern long hammer2_ioa_indr_write;
686 extern long hammer2_ioa_fmap_write;
687 extern long hammer2_ioa_volu_write;
688
689 extern struct objcache *cache_buffer_read;
690 extern struct objcache *cache_buffer_write;
691
692 extern int destroy;
693 extern int write_thread_wakeup;
694
695 extern mtx_t thread_protect;
696
697 /*
698  * hammer2_subr.c
     *
     * CRC wrapper macros plus prototypes grouped under the hammer2_subr.c
     * heading.  Only signatures are visible here; consult the
     * implementation for the actual contracts.
     *
     * hammer2_icrc32() forwards to iscsi_crc32(); hammer2_icrc32c()
     * forwards to iscsi_crc32_ext() and passes an additional crc argument.
699  */
700 #define hammer2_icrc32(buf, size)       iscsi_crc32((buf), (size))
701 #define hammer2_icrc32c(buf, size, crc) iscsi_crc32_ext((buf), (size), (crc))
702
/*
 * Inode and volume-data locking.  The inode lock functions return a
 * hammer2_chain_t * and the matching unlock functions accept one.
 * lock_temp_release() and lock_upgrade() return a ccms_state_t that
 * lock_temp_restore() and lock_downgrade() accept as 'ostate'.
 * NOTE(review): _ex/_sh presumably mean exclusive/shared -- confirm.
 */
703 hammer2_chain_t *hammer2_inode_lock_ex(hammer2_inode_t *ip);
704 hammer2_chain_t *hammer2_inode_lock_sh(hammer2_inode_t *ip);
705 void hammer2_inode_unlock_ex(hammer2_inode_t *ip, hammer2_chain_t *chain);
706 void hammer2_inode_unlock_sh(hammer2_inode_t *ip, hammer2_chain_t *chain);
707 void hammer2_chain_refactor(hammer2_chain_t **chainp);
708 void hammer2_voldata_lock(hammer2_mount_t *hmp);
709 void hammer2_voldata_unlock(hammer2_mount_t *hmp, int modify);
710 ccms_state_t hammer2_inode_lock_temp_release(hammer2_inode_t *ip);
711 void hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, ccms_state_t ostate);
712 ccms_state_t hammer2_inode_lock_upgrade(hammer2_inode_t *ip);
713 void hammer2_inode_lock_downgrade(hammer2_inode_t *ip, ccms_state_t ostate);
714
/* Locking on a hammer2_mount_t; one unlock serves both lock variants. */
715 void hammer2_mount_exlock(hammer2_mount_t *hmp);
716 void hammer2_mount_shlock(hammer2_mount_t *hmp);
717 void hammer2_mount_unlock(hammer2_mount_t *hmp);
718
/* Type, time and uuid conversion helpers. */
719 int hammer2_get_dtype(hammer2_chain_t *chain);
720 int hammer2_get_vtype(hammer2_chain_t *chain);
721 u_int8_t hammer2_get_obj_type(enum vtype vtype);
722 void hammer2_time_to_timespec(u_int64_t xtime, struct timespec *ts);
723 u_int64_t hammer2_timespec_to_time(struct timespec *ts);
724 u_int32_t hammer2_to_unix_xid(uuid_t *uuid);
725 void hammer2_guid_to_uuid(uuid_t *uuid, u_int32_t guid);
726
727 hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
728 int hammer2_getradix(size_t bytes);
729
730 int hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
731                         hammer2_key_t *lbasep, hammer2_key_t *leofp);
732 int hammer2_calc_physical(hammer2_inode_t *ip, hammer2_key_t lbase);
733 void hammer2_update_time(uint64_t *timep);
734
735 /*
736  * hammer2_inode.c
     *
     * Prototypes for in-memory inode management (lookup, get, ref/drop,
     * repoint) and for the create/connect/unlink and hardlink operations,
     * which all take a hammer2_trans_t.
     *
     * NOTE(review): several functions take hammer2_chain_t ** arguments;
     * presumably they may replace the caller's chain pointer -- confirm
     * against the implementation before relying on the old value.
737  */
738 struct vnode *hammer2_igetv(hammer2_inode_t *ip, int *errorp);
739
740 void hammer2_inode_lock_nlinks(hammer2_inode_t *ip);
741 void hammer2_inode_unlock_nlinks(hammer2_inode_t *ip);
742 hammer2_inode_t *hammer2_inode_lookup(hammer2_pfsmount_t *pmp,
743                         hammer2_tid_t inum);
744 hammer2_inode_t *hammer2_inode_get(hammer2_pfsmount_t *pmp,
745                         hammer2_inode_t *dip, hammer2_chain_t *chain);
746 void hammer2_inode_free(hammer2_inode_t *ip);
747 void hammer2_inode_ref(hammer2_inode_t *ip);
748 void hammer2_inode_drop(hammer2_inode_t *ip);
749 void hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
750                         hammer2_chain_t *chain);
751
/* Operations performed under a transaction (first argument). */
752 hammer2_inode_t *hammer2_inode_create(hammer2_trans_t *trans,
753                         hammer2_inode_t *dip,
754                         struct vattr *vap, struct ucred *cred,
755                         const uint8_t *name, size_t name_len,
756                         hammer2_chain_t **chainp, int *errorp);
757 int hammer2_inode_connect(hammer2_trans_t *trans,
758                         hammer2_chain_t **chainp, int hlink,
759                         hammer2_inode_t *dip, hammer2_chain_t **dchainp,
760                         const uint8_t *name, size_t name_len,
761                         hammer2_key_t key);
762 hammer2_inode_t *hammer2_inode_common_parent(hammer2_inode_t *fdip,
763                         hammer2_inode_t *tdip);
764 void hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip,
765                         hammer2_chain_t **parentp);
766 int hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
767                         const uint8_t *name, size_t name_len, int isdir,
768                         int *hlinkp, struct nchandle *nch);
769 int hammer2_hardlink_consolidate(hammer2_trans_t *trans,
770                         hammer2_inode_t *ip, hammer2_chain_t **chainp,
771                         hammer2_inode_t *cdip, hammer2_chain_t **cdchainp,
772                         int nlinks);
773 int hammer2_hardlink_deconsolidate(hammer2_trans_t *trans, hammer2_inode_t *dip,
774                         hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
775 int hammer2_hardlink_find(hammer2_inode_t *dip,
776                         hammer2_chain_t **chainp, hammer2_chain_t **ochainp);
777 void hammer2_inode_install_hidden(hammer2_pfsmount_t *pmp);
778
779 /*
780  * hammer2_chain.c
     *
     * Prototypes for chain allocation, reference counting, locking,
     * modification, lookup/iteration, creation, duplication and deletion,
     * followed by the dirty-chain memory accounting hooks and the
     * hammer2_base_*() helpers, which all take a (base, count) pair of
     * hammer2_blockref_t * and int.
     *
     * NOTE(review): hammer2_flush() is listed under this heading; confirm
     * it is implemented in hammer2_chain.c and not in a separate flush
     * source file.
781  */
782 void hammer2_modify_volume(hammer2_mount_t *hmp);
783 hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
784                                 hammer2_pfsmount_t *pmp,
785                                 hammer2_trans_t *trans,
786                                 hammer2_blockref_t *bref);
787 void hammer2_chain_core_alloc(hammer2_trans_t *trans, hammer2_chain_t *nchain,
788                                 hammer2_chain_t *ochain);
789 void hammer2_chain_ref(hammer2_chain_t *chain);
790 void hammer2_chain_drop(hammer2_chain_t *chain);
791 int hammer2_chain_lock(hammer2_chain_t *chain, int how);
/*
 * The callback receives the dio, the chain, and the two opaque arguments
 * (arg_p, arg_o) supplied by the caller.  It has the same parameter list
 * as the callback taken by hammer2_io_breadcb().
 */
792 void hammer2_chain_load_async(hammer2_chain_t *chain,
793                                 void (*func)(hammer2_io_t *dio,
794                                              hammer2_chain_t *chain,
795                                              void *arg_p, off_t arg_o),
796                                 void *arg_p, off_t arg_o);
797 void hammer2_chain_moved(hammer2_chain_t *chain);
798 void hammer2_chain_modify(hammer2_trans_t *trans,
799                                 hammer2_chain_t **chainp, int flags);
800 hammer2_inode_data_t *hammer2_chain_modify_ip(hammer2_trans_t *trans,
801                                 hammer2_inode_t *ip, hammer2_chain_t **chainp,
802                                 int flags);
803 void hammer2_chain_resize(hammer2_trans_t *trans, hammer2_inode_t *ip,
804                                 hammer2_chain_t *parent,
805                                 hammer2_chain_t **chainp,
806                                 int nradix, int flags);
807 void hammer2_chain_unlock(hammer2_chain_t *chain);
808 void hammer2_chain_wait(hammer2_chain_t *chain);
809 hammer2_chain_t *hammer2_chain_get(hammer2_chain_t *parent, int generation,
810                                 hammer2_blockref_t *bref);
/*
 * Lookup and iteration.  lookup/next take the parent by reference and a
 * [key_beg, key_end] pair and report a follow-up key through key_nextp.
 * NOTE(review): whether key_end is inclusive is not visible here -- confirm.
 */
811 hammer2_chain_t *hammer2_chain_lookup_init(hammer2_chain_t *parent, int flags);
812 void hammer2_chain_lookup_done(hammer2_chain_t *parent);
813 hammer2_chain_t *hammer2_chain_lookup(hammer2_chain_t **parentp,
814                                 hammer2_key_t *key_nextp,
815                                 hammer2_key_t key_beg, hammer2_key_t key_end,
816                                 int *cache_indexp, int flags);
817 hammer2_chain_t *hammer2_chain_next(hammer2_chain_t **parentp,
818                                 hammer2_chain_t *chain,
819                                 hammer2_key_t *key_nextp,
820                                 hammer2_key_t key_beg, hammer2_key_t key_end,
821                                 int *cache_indexp, int flags);
822 hammer2_chain_t *hammer2_chain_scan(hammer2_chain_t *parent,
823                                 hammer2_chain_t *chain,
824                                 int *cache_indexp, int flags);
825
/* Creation, duplication, snapshot, deletion and flush; all transactional. */
826 int hammer2_chain_create(hammer2_trans_t *trans,
827                                 hammer2_chain_t **parentp,
828                                 hammer2_chain_t **chainp,
829                                 hammer2_key_t key, int keybits,
830                                 int type, size_t bytes);
831 void hammer2_chain_duplicate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
832                                 hammer2_chain_t **chainp,
833                                 hammer2_blockref_t *bref, int snapshot,
834                                 int duplicate_reason);
835 int hammer2_chain_snapshot(hammer2_trans_t *trans, hammer2_chain_t **chainp,
836                                 hammer2_ioc_pfs_t *pfs);
837 void hammer2_chain_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
838                                 int flags);
839 void hammer2_chain_delete_duplicate(hammer2_trans_t *trans,
840                                 hammer2_chain_t **chainp, int flags);
841 void hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t **chainp);
842 void hammer2_chain_commit(hammer2_trans_t *trans, hammer2_chain_t *chain);
843 void hammer2_chain_setsubmod(hammer2_trans_t *trans, hammer2_chain_t *chain);
844
/*
 * Per-PFS chain memory accounting.  NOTE(review): presumably tied to the
 * hammer2_limit_dirty_chains limit declared earlier in this header -- confirm.
 */
845 void hammer2_chain_memory_wait(hammer2_pfsmount_t *pmp);
846 void hammer2_chain_memory_inc(hammer2_pfsmount_t *pmp);
847 void hammer2_chain_memory_wakeup(hammer2_pfsmount_t *pmp);
848 void hammer2_chain_countbrefs(hammer2_chain_t *chain,
849                                 hammer2_blockref_t *base, int count);
850
/* Helpers operating on a blockref array given as (base, count). */
851 int hammer2_base_find(hammer2_chain_t *chain,
852                                 hammer2_blockref_t *base, int count,
853                                 int *cache_indexp, hammer2_key_t *key_nextp,
854                                 hammer2_key_t key_beg, hammer2_key_t key_end,
855                                 int delete_filter);
856 void hammer2_base_delete(hammer2_trans_t *trans, hammer2_chain_t *chain,
857                                 hammer2_blockref_t *base, int count,
858                                 int *cache_indexp, hammer2_chain_t *child);
859 void hammer2_base_insert(hammer2_trans_t *trans, hammer2_chain_t *chain,
860                                 hammer2_blockref_t *base, int count,
861                                 int *cache_indexp, hammer2_chain_t *child);
862
863 /*
864  * hammer2_trans.c
     *
     * Transaction setup and teardown.  hammer2_trans_init() fills in a
     * caller-supplied hammer2_trans_t for the given pmp/hmp and flags;
     * hammer2_trans_done() takes the same structure.
     * NOTE(review): presumably every init must be paired with a done --
     * confirm against the implementation.
865  */
866 void hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp,
867                                 hammer2_mount_t *hmp, int flags);
868 void hammer2_trans_clear_invfsync(hammer2_trans_t *trans);
869 void hammer2_trans_done(hammer2_trans_t *trans);
870
871 /*
872  * hammer2_ioctl.c
     *
     * Single ioctl entry point, called with the inode, the command code,
     * the argument buffer, file flags and credentials.  Returns an int;
     * NOTE(review): presumably 0 or an errno value -- confirm.
873  */
874 int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
875                                 int fflag, struct ucred *cred);
876
877 /*
878  * hammer2_io.c
     *
     * Prototypes for the hammer2_io_t (dio) layer.  Blocks are addressed
     * by (hmp, lbase, lsize).  The new/newnz/newq/bread functions hand
     * a dio back through 'diop' and return an int; the write and release
     * functions take the dio by reference (hammer2_io_t **).
     * NOTE(review): taking the dio by reference presumably lets these
     * functions clear the caller's pointer -- confirm.
879  */
880 hammer2_io_t *hammer2_io_getblk(hammer2_mount_t *hmp, off_t lbase,
881                                 int lsize, int *ownerp);
882 void hammer2_io_putblk(hammer2_io_t **diop);
883 void hammer2_io_cleanup(hammer2_mount_t *hmp, struct hammer2_io_tree *tree);
884 char *hammer2_io_data(hammer2_io_t *dio, off_t lbase);
885 int hammer2_io_new(hammer2_mount_t *hmp, off_t lbase, int lsize,
886                                 hammer2_io_t **diop);
887 int hammer2_io_newnz(hammer2_mount_t *hmp, off_t lbase, int lsize,
888                                 hammer2_io_t **diop);
889 int hammer2_io_newq(hammer2_mount_t *hmp, off_t lbase, int lsize,
890                                 hammer2_io_t **diop);
891 int hammer2_io_bread(hammer2_mount_t *hmp, off_t lbase, int lsize,
892                                 hammer2_io_t **diop);
/*
 * Callback variant: 'callback' receives the dio plus the three opaque
 * arguments (arg_c, arg_p, arg_o) passed in by the caller.
 */
893 void hammer2_io_breadcb(hammer2_mount_t *hmp, off_t lbase, int lsize,
894                                 void (*callback)(hammer2_io_t *dio,
895                                                  hammer2_chain_t *arg_c,
896                                                  void *arg_p, off_t arg_o),
897                                 hammer2_chain_t *arg_c,
898                                 void *arg_p, off_t arg_o);
899 void hammer2_io_bawrite(hammer2_io_t **diop);
900 void hammer2_io_bdwrite(hammer2_io_t **diop);
901 int hammer2_io_bwrite(hammer2_io_t **diop);
902 int hammer2_io_isdirty(hammer2_io_t *dio);
903 void hammer2_io_setdirty(hammer2_io_t *dio);
904 void hammer2_io_setinval(hammer2_io_t *dio, u_int bytes);
905 void hammer2_io_brelse(hammer2_io_t **diop);
906 void hammer2_io_bqrelse(hammer2_io_t **diop);
907
908 /*
909  * hammer2_msgops.c
     *
     * Message handlers; each takes a kdmsg_msg_t * and returns an int.
     * NOTE(review): return-value convention is not visible here -- confirm.
910  */
911 int hammer2_msg_dbg_rcvmsg(kdmsg_msg_t *msg);
912 int hammer2_msg_adhoc_input(kdmsg_msg_t *msg);
913
914 /*
915  * hammer2_vfsops.c
     *
     * Cluster control, debugging and sync entry points, plus the
     * hammer2_lwinprog_{ref,drop,wait}() trio operating on a PFS mount.
     * NOTE(review): "lwinprog" presumably counts logical writes in
     * progress -- confirm against the implementation.
916  */
917 void hammer2_clusterctl_wakeup(kdmsg_iocom_t *iocom);
918 void hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index);
919 void hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp);
920 void hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx);
921 void hammer2_bioq_sync(hammer2_pfsmount_t *pmp);
922 int hammer2_vfs_sync(struct mount *mp, int waitflags);
923 void hammer2_lwinprog_ref(hammer2_pfsmount_t *pmp);
924 void hammer2_lwinprog_drop(hammer2_pfsmount_t *pmp);
925 void hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp);
926
927 /*
928  * hammer2_freemap.c
     *
     * Freemap interface, both calls transactional.
     * hammer2_freemap_alloc() takes a chain and a byte count and returns
     * an int; hammer2_freemap_adjust() takes a blockref and a 'how' code.
     * NOTE(review): the valid 'how' values and the meaning of the return
     * value are not visible here -- confirm against the implementation.
929  */
930 int hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain,
931                                 size_t bytes);
932 void hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
933                                 hammer2_blockref_t *bref, int how);
934
935
936 #endif /* !_KERNEL */
937 #endif /* !_VFS_HAMMER2_HAMMER2_H_ */