hammer2 - Initial CCMS adaptation and code-up
[dragonfly.git] / sys / vfs / hammer2 / hammer2_ccms.h
CommitLineData
f03672ec
MD
1/*
2 * Copyright (c) 2006,2012 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34/*
1ad77ed9
MD
35 * This module is HAMMER2-independent.
36 *
f03672ec 37 * CCMS - Cache Coherency Management System. These structures are used
1ad77ed9
MD
38 * to manage cache coherency and locking for an object.
39 *
40 * ccms_inode
41 *
42 * Cache coherency is tied into a kernel or VFS structure, creating a
43 * directory/file topology and a keyspace on an inode-by-inode basis
44 * via the (ccms_inode) structure.
45 *
46 * Each CCMS inode contains a RB-Tree holding ccms_cst (CST) elements
47 * for its file range or directory key range, plus two independent embedded
48 * ccms_cst structures representing the inode attributes and the entire
49 * recursive sub-tree.
50 *
51 * The CST representing the entire sub-tree is inclusive of that inode's
52 * attribute state and data/key range state AND inclusive of the entire
53 * filesystem topology under that point, recursively.
54 *
55 * Two ccms_cst's are embedded in each cached inode via the ccms_inode
56 * structure to represent attribute and recursive topological cache state.
57 *
58 * ccms_cst
59 *
60 * The (ccms_cst) structure, called the CST, represents specific, persistent
61 * cache state. This structure is allocated and freed on the fly as needed
62 * (except for the two embedded in the ccms_inode).
63 *
64 * The persistence ties into network/cluster operations via the 'rstate'
65 * field. When cluster-maintained state is present then certain operations
66 * on the CST's local state (including when a vnode is reclaimed) will
67 * block while third-party synchronization occurs.
68 *
69 * The number of dynamically allocated CSTs is strictly limited, forcing
70 * a degree of aggregation when the limit is reached.
71 *
72 * ccms_lock
73 *
74 * The (ccms_lock) structure represents a live local lock for the duration of
75 * any given filesystem operation. A single ccms_lock can cover both
76 * attribute state AND a byte-range/key-range.
77 *
78 * This lock represents the exact lock being requested but the CST structure
79 * it points to can be a more general representation which covers the lock.
80 * The minimum granularity for the cst pointer in the ccms_lock will be to
81 * the ccms_inode's embedded topo_cst.
82 *
83 * Theoretically a single CST at the root can cover the entire filesystem,
84 * but this creates a great deal of SMP interaction.
85 *
86 * Management
87 *
88 * Because cache state is persistent the CCMS module may desire to limit the
89 * total number of CSTs under management. It does this by aggregating cache
90 * state which in turn may require callbacks to invalidate third-party
91 * (cluster-related) cache state.
92 *
93 * CCMS operations related to locks can stall on third-party state
94 * transitions. Because third-party state can also change independently
95 * due to foreign interactions (often with a userland program), no filesystem
96 * lock can be held while manipulating CST states. For this reason,
97 * HAMMER2 (or any VFS using CCMS) must provide roll-up functions to acquire
98 * CCMS lock state up-front prior to locking the VFS inode structure.
99 *
100 * vnode locks which are under the control of the filesystem can be more
101 * problematic and may require additional care.
f03672ec
MD
102 */
103
104#ifndef _SYS_CCMS_H_
105#define _SYS_CCMS_H_
106
107#ifndef _SYS_TYPES_H_
108#include <sys/types.h>
109#endif
110#ifndef _SYS_PARAM_H_
111#include <sys/param.h>
112#endif
113#ifndef _SYS_SERIALIZE_H_
114#include <sys/serialize.h>
115#endif
116#ifndef _SYS_SPINLOCK_H_
117#include <sys/spinlock.h>
118#endif
119#ifndef _SYS_TREE_H_
120#include <sys/tree.h>
121#endif
122
1ad77ed9
MD
123typedef uint64_t ccms_off_t;
124typedef uint8_t ccms_state_t;
125
f03672ec 126/*
1ad77ed9 127 * CCMS uses a red-black tree to organize CSTs.
f03672ec
MD
128 */
129RB_HEAD(ccms_rb_tree, ccms_cst);
1ad77ed9 130RB_PROTOTYPE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp, ccms_off_t);
f03672ec 131
1ad77ed9 132struct ccms_inode;
f03672ec 133struct ccms_cst;
1ad77ed9 134struct ccms_lock;
f03672ec
MD
135
136/*
1ad77ed9 137 * CCMS cache states
f03672ec 138 *
1ad77ed9
MD
139 * CCMS uses an extended MESI caching model. There are two extension states,
140 * MASTER and SLAVE, which represents dirty data which has not been
f03672ec
MD
141 * synchronized to backing store but which nevertheless is being shared
142 * between distinct caches. These states are designed to allow data
143 * to be shared between nodes in a cluster without having to wait for it
144 * to be synchronized with its backing store.
145 *
1ad77ed9
MD
146 * Each CST has lstate and rstate. lstate is the local cache state and rstate
147 * is the remotely-granted state. Changes to the lstate require a compatible
148 * rstate. If the rstate is not compatible a third-party transaction is
149 * required to obtain the proper rstate.
150 *
151 * INVALID - Cache state is unknown and must be acquired.
152 *
153 * ALLOWED - (topo_cst.rstate only). This is a granted state which
154 * allows cache state transactions underneath the current
155 * node (data, attribute, and recursively), but is not a proper
156 * grant for topo_cst itself. Someone specifically trying to
157 * acquire topo_cst still needs to do a third party transaction
158 * to get the cache into the proper state.
159 *
160 * SHARED - Indicates that the information is clean, shared, read-only.
161 *
162 * SLAVE - Indicates that the information is clean, shared, read-only.
163 * Indicates that local backing store is out of date but the
164 * in-memory cache is valid, meaning that we can only obtain
165 * the data from the MASTER (somewhere in the cluster), and
166 * that we may not be allowed to sync it to local backing
167 * store yet e.g. due to the quorum protocol not having
168 * completed.
169 *
170 * MASTER - Indicates that the information is dirty, but readonly
171 * because other nodes in the cluster are in a SLAVE state.
172 * This state is typically transitional and occurs while
173 * a quorum operation is in progress, allowing slaves to
174 * access the data without stalling.
175 *
176 * EXCLUSIVE - Indicates that the information is clean, read-only, and
177 * that nobody else can access the data while we are in this
178 * state. A local node can upgrade both rstate and lstate
179 * from EXCLUSIVE to MODIFIED without having to perform a
180 * third-party transaction.
181 *
182 * MODIFIED - Indicates that the information is dirty, read-write, and
183 * that nobody else can access the data while we are in this
184 * state.
185 *
186 * It is important to note that remote cache-state grants can be more
187 * general than what was requested, plus they can be persistent. So,
188 * for example, a remote can grant EXCLUSIVE access even if you just
189 * requested SHARED, which saves you from having to do another network
190 * transaction if you later need EXCLUSIVE.
f03672ec 191 */
f03672ec 192
1ad77ed9
MD
193#define CCMS_STATE_INVALID 0 /* unknown cache state */
194#define CCMS_STATE_ALLOWED 1 /* allow subsystem (topo only) */
195#define CCMS_STATE_SHARED 2 /* clean, shared, read-only */
196#define CCMS_STATE_SLAVE 3 /* live only, shared, read-only */
197#define CCMS_STATE_MASTER 4 /* dirty, shared, read-only */
198#define CCMS_STATE_EXCLUSIVE 5 /* clean, exclusive, read-only */
199#define CCMS_STATE_MODIFIED 6 /* dirty, exclusive, read-write */
f03672ec
MD
200
201/*
1ad77ed9
MD
202 * A CCMS locking element - represents a high level locking request,
203 * such as used by read, write, and attribute operations. Initialize
204 * the ccms_lock structure and call ccms_lock_get().
f03672ec
MD
205 *
206 * When a CCMS lock is established the cache state of the underlying elements
207 * is adjusted to meet the requirements of the lock. The cache state
1ad77ed9
MD
208 * requirements are infered by the lock type. CCMS locks can block on
209 * third party interactions if the underlying remote cache state is not
210 * compatible.
f03672ec 211 *
1ad77ed9
MD
212 * CCMS data locks imply a shared CCMS inode lock. A CCMS topology lock does
213 * not imply a data or inode lock but topology locks can have far-reaching
214 * effects and block on numerous CST state.
f03672ec
MD
215 */
216struct ccms_lock {
1ad77ed9
MD
217 ccms_state_t tstate;
218 ccms_state_t astate;
219 ccms_state_t dstate;
220 ccms_off_t beg_offset; /* applies to dstate */
221 ccms_off_t end_offset; /* applies to dstate */
222 struct ccms_cst *icst; /* points to topo_cst or attr_cst */
223 struct ccms_cst *dcst; /* points to left edge in rbtree */
224#ifdef CCMS_DEBUG
225 TAILQ_ENTRY(ccms_lock) entry;
226#endif
f03672ec
MD
227};
228
1ad77ed9
MD
229#ifdef CCMS_DEBUG
230
231TAILQ_HEAD(ccms_lock_head, ccms_lock);
232
233#endif
234
f03672ec
MD
235/*
236 * CCMS cache state tree element (CST) - represents the actual cache
237 * management state for a data space. The cache state tree is a
238 * non-overlaping red-black tree containing ranged ccms_cst structures
239 * which reflect the resolved state for all current high level locking
240 * requests. For example, two overlapping ccms_lock requests for shared
241 * access would typically be represented by three non-overlapping ccms_cst
242 * items in the CST. The CST item representing the overlapped portion of
243 * the ccms_lock requests would have ref count of 2 while the other CST
244 * items would have a ref count of 1.
245 *
246 * [lock request #01]
247 * [lock request #02]
248 * [--cst--][--cst--][--cst--]
249 *
250 * CSTs are partitioned so their edges line up to all current and pending
251 * ccms_lock requests. CSTs are re-merged whenever possible. A freshly
252 * initialized database typically has a single CST representing the default
253 * cache state for the host.
254 *
1ad77ed9
MD
255 * A CST keeps track of local cache state (lstate) AND remote cache state
256 * (rstate).
f03672ec
MD
257 *
258 * Any arbitrary data range within a dataspace can be locked shared or
259 * exclusive. Obtaining a lock has the side effect of potentially modifying
260 * the cache state. A positive sharecount in a CST indicates that a
261 * shared access lock is being held. A negative sharecount indicates an
262 * exclusive access lock is being held on the range. A MODIFYING lock
263 * type is just an exclusive lock but one which effects the cache state
264 * differently.
265 *
266 * The end offset is byte-inclusive, allowing the entire 64 bit data space
267 * to be represented without overflowing the edge case. For example, a
1ad77ed9
MD
268 * 64 byte area might be represented as (0,63). The offsets are UNSIGNED
269 * entities.
f03672ec
MD
270 */
271struct ccms_cst {
272 RB_ENTRY(ccms_cst) rbnode; /* stored in a red-black tree */
1ad77ed9
MD
273 struct ccms_cst *free_next; /* free cache linked list */
274 struct ccms_inode *cino; /* related ccms_inode */
275 ccms_off_t beg_offset; /* range (inclusive) */
276 ccms_off_t end_offset; /* range (inclusive) */
277 ccms_state_t lstate; /* local cache state */
278 ccms_state_t rstate; /* cache state granted by protocol */
279
280 int32_t flags;
281 int32_t count; /* shared/exclusive count */
282 int32_t blocked; /* indicates a blocked lock request */
283 int32_t xrefs; /* lock overlap references */
284 int32_t lrefs; /* left edge refs */
285 int32_t rrefs; /* right edge refs */
286#ifdef CCMS_DEBUG
287 struct ccms_lock_head list;
288#endif
f03672ec
MD
289};
290
1ad77ed9
MD
291#define CCMS_CST_DYNAMIC 0x00000001
292#define CCMS_CST_DELETING 0x00000002
293#define CCMS_CST_INSERTED 0x00000004
294#define CCMS_CST_INHERITED 0x00000008 /* rstate inherited from par */
295
296/*
297 * A CCMS inode is typically embedded in a VFS file or directory object.
298 *
299 * The subdirectory topology is accessible downward by indexing topo_cst's
300 * from the children in the parent's cst_tree.
301 *
302 * attr_cst is independent of data-range CSTs. However, adjustments to
303 * the topo_cst can have far-reaching effects to attr_cst, the CSTs in
304 * the tree, recursively both downward and upward.
305 */
306struct ccms_inode {
307 struct spinlock spin;
308 struct ccms_inode *parent;
309 struct ccms_rb_tree tree;
310 struct ccms_cst attr_cst;
311 struct ccms_cst topo_cst;
312 struct ccms_cst *free_cache; /* cst free cache */
313 struct ccms_domain *domain;
314 void *handle; /* VFS opaque */
315 int32_t flags;
316};
317
318#define CCMS_INODE_INSERTED 0x0001
319#define CCMS_INODE_DELETING 0x0002
320
321/*
322 * Domain management, contains a pseudo-root for the CCMS topology.
323 */
324struct ccms_domain {
325 struct malloc_type *mcst; /* malloc space for cst's */
326 struct ccms_inode root; /* dummy protocol root */
327 int cst_count; /* dynamic cst count */
328 int cst_limit; /* dynamic cst limit */
329};
330
331typedef struct ccms_lock ccms_lock_t;
332typedef struct ccms_cst ccms_cst_t;
333typedef struct ccms_inode ccms_inode_t;
334typedef struct ccms_domain ccms_domain_t;
f03672ec
MD
335
336/*
337 * Kernel API
338 */
339#ifdef _KERNEL
340
1ad77ed9
MD
341/*
342 * Helper inline to initialize primarily a dstate lock which shortcuts
343 * the more common locking operations. A dstate is specified and an
344 * astate is implied. tstate locks cannot be acquired with this inline.
345 */
f03672ec
MD
346static __inline
347void
1ad77ed9
MD
348ccms_lock_init(ccms_lock_t *lock, ccms_state_t dstate,
349 ccms_off_t beg_offset, ccms_off_t end_offset)
f03672ec 350{
1ad77ed9
MD
351 lock->beg_offset = beg_offset;
352 lock->end_offset = end_offset;
353 lock->tstate = 0;
354 lock->astate = 0;
355 lock->dstate = dstate;
f03672ec
MD
356}
357
1ad77ed9
MD
358void ccms_domain_init(ccms_domain_t *dom);
359void ccms_inode_init(ccms_domain_t *dom, ccms_inode_t *cino, void *handle);
360void ccms_inode_insert(ccms_inode_t *cpar, ccms_inode_t *cino);
361void ccms_inode_delete(ccms_inode_t *cino);
362void ccms_inode_uninit(ccms_inode_t *cino);
363
364int ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock);
365int ccms_lock_get_uio(ccms_inode_t *cino, ccms_lock_t *lock, struct uio *uio);
366int ccms_lock_get_attr(ccms_inode_t *cino, ccms_lock_t *lock, ccms_state_t st);
367int ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock);
f03672ec
MD
368
369#endif
370
371#endif