| Commit | Line | Data |
|---|---|---|
| f03672ec MD |
1 | /* |
| 2 | * Copyright (c) 2006,2012 The DragonFly Project. All rights reserved. | |
| 3 | * | |
| 4 | * This code is derived from software contributed to The DragonFly Project | |
| 5 | * by Matthew Dillon <dillon@backplane.com> | |
| 6 | * | |
| 7 | * Redistribution and use in source and binary forms, with or without | |
| 8 | * modification, are permitted provided that the following conditions | |
| 9 | * are met: | |
| 10 | * | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in | |
| 15 | * the documentation and/or other materials provided with the | |
| 16 | * distribution. | |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its | |
| 18 | * contributors may be used to endorse or promote products derived | |
| 19 | * from this software without specific, prior written permission. | |
| 20 | * | |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 32 | * SUCH DAMAGE. | |
| 33 | */ | |
| 34 | /* | |
| 1ad77ed9 MD |
35 | * This module is HAMMER2-independent. |
| 36 | * | |
| f03672ec | 37 | * CCMS - Cache Coherency Management System. These structures are used |
| 1ad77ed9 MD |
38 | * to manage cache coherency and locking for an object. |
| 39 | * | |
| 40 | * ccms_inode | |
| 41 | * | |
| 42 | * Cache coherency is tied into a kernel or VFS structure, creating a | |
| 43 | * directory/file topology and a keyspace on an inode-by-inode basis | |
| 44 | * via the (ccms_inode) structure. | |
| 45 | * | |
| 46 | * Each CCMS inode contains a RB-Tree holding ccms_cst (CST) elements | |
| 47 | * for its file range or directory key range, plus two independent embedded | |
| 48 | * ccms_cst structures representing the inode attributes and the entire | |
| 49 | * recursive sub-tree. | |
| 50 | * | |
| 51 | * The CST representing the entire sub-tree is inclusive of that inode's | |
| 52 | * attribute state and data/key range state AND inclusive of the entire | |
| 53 | * filesystem topology under that point, recursively. | |
| 54 | * | |
| 55 | * Two ccms_cst's are embedded in each cached inode via the ccms_inode | |
| 56 | * structure to represent attribute and recursive topological cache state. | |
| 57 | * | |
| 58 | * ccms_cst | |
| 59 | * | |
| 60 | * The (ccms_cst) structure, called the CST, represents specific, persistent | |
| 61 | * cache state. This structure is allocated and freed on the fly as needed | |
| 62 | * (except for the two embedded in the ccms_inode). | |
| 63 | * | |
| 64 | * The persistence ties into network/cluster operations via the 'rstate' | |
| 65 | * field. When cluster-maintained state is present then certain operations | |
| 66 | * on the CST's local state (including when a vnode is reclaimed) will | |
| 67 | * block while third-party synchronization occurs. | |
| 68 | * | |
| 69 | * The number of dynamically allocated CSTs is strictly limited, forcing | |
| 70 | * a degree of aggregation when the limit is reached. | |
| 71 | * | |
| 72 | * ccms_lock | |
| 73 | * | |
| 74 | * The (ccms_lock) structure represents a live local lock for the duration of | |
| 75 | * any given filesystem operation. A single ccms_lock can cover both | |
| 76 | * attribute state AND a byte-range/key-range. | |
| 77 | * | |
| 78 | * This lock represents the exact lock being requested but the CST structure | |
| 79 | * it points to can be a more general representation which covers the lock. | |
| 80 | * The minimum granularity for the cst pointer in the ccms_lock will be to | |
| 81 | * the ccms_inode's embedded topo_cst. | |
| 82 | * | |
| 83 | * Theoretically a single CST at the root can cover the entire filesystem, | |
| 84 | * but this creates a great deal of SMP interaction. | |
| 85 | * | |
| 86 | * Management | |
| 87 | * | |
| 88 | * Because cache state is persistent the CCMS module may desire to limit the | |
| 89 | * total number of CSTs under management. It does this by aggregating cache | |
| 90 | * state which in turn may require callbacks to invalidate third-party | |
| 91 | * (cluster-related) cache state. | |
| 92 | * | |
| 93 | * CCMS operations related to locks can stall on third-party state | |
| 94 | * transitions. Because third-party state can also change independently | |
| 95 | * due to foreign interactions (often with a userland program), no filesystem | |
| 96 | * lock can be held while manipulating CST states. For this reason, | |
| 97 | * HAMMER2 (or any VFS using CCMS) must provide roll-up functions to acquire | |
| 98 | * CCMS lock state up-front prior to locking the VFS inode structure. | |
| 99 | * | |
| 100 | * vnode locks which are under the control of the filesystem can be more | |
| 101 | * problematic and may require additional care. | |
| f03672ec MD |
102 | */ |
| 103 | ||
| 104 | #ifndef _SYS_CCMS_H_ | |
| 105 | #define _SYS_CCMS_H_ | |
| 106 | ||
| 107 | #ifndef _SYS_TYPES_H_ | |
| 108 | #include <sys/types.h> | |
| 109 | #endif | |
| 110 | #ifndef _SYS_PARAM_H_ | |
| 111 | #include <sys/param.h> | |
| 112 | #endif | |
| 113 | #ifndef _SYS_SERIALIZE_H_ | |
| 114 | #include <sys/serialize.h> | |
| 115 | #endif | |
| 116 | #ifndef _SYS_SPINLOCK_H_ | |
| 117 | #include <sys/spinlock.h> | |
| 118 | #endif | |
| 119 | #ifndef _SYS_TREE_H_ | |
| 120 | #include <sys/tree.h> | |
| 121 | #endif | |
| 122 | ||
| 1ad77ed9 MD |
123 | typedef uint64_t ccms_off_t; |
| 124 | typedef uint8_t ccms_state_t; | |
| 125 | ||
| f03672ec | 126 | /* |
| 1ad77ed9 | 127 | * CCMS uses a red-black tree to organize CSTs. |
| f03672ec MD |
128 | */ |
| 129 | RB_HEAD(ccms_rb_tree, ccms_cst); | |
| 1ad77ed9 | 130 | RB_PROTOTYPE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp, ccms_off_t); |
| f03672ec | 131 | |
| 1ad77ed9 | 132 | struct ccms_inode; |
| f03672ec | 133 | struct ccms_cst; |
| 1ad77ed9 | 134 | struct ccms_lock; |
| f03672ec MD |
135 | |
| 136 | /* | |
| 1ad77ed9 | 137 | * CCMS cache states |
| f03672ec | 138 | * |
| 1ad77ed9 MD |
139 | * CCMS uses an extended MESI caching model. There are two extension states, |
| 140 | * MASTER and SLAVE, which represents dirty data which has not been | |
| f03672ec MD |
141 | * synchronized to backing store but which nevertheless is being shared |
| 142 | * between distinct caches. These states are designed to allow data | |
| 143 | * to be shared between nodes in a cluster without having to wait for it | |
| 144 | * to be synchronized with its backing store. | |
| 145 | * | |
| 1ad77ed9 MD |
146 | * Each CST has lstate and rstate. lstate is the local cache state and rstate |
| 147 | * is the remotely-granted state. Changes to the lstate require a compatible | |
| 148 | * rstate. If the rstate is not compatible a third-party transaction is | |
| 149 | * required to obtain the proper rstate. | |
| 150 | * | |
| 151 | * INVALID - Cache state is unknown and must be acquired. | |
| 152 | * | |
| 153 | * ALLOWED - (topo_cst.rstate only). This is a granted state which | |
| 154 | * allows cache state transactions underneath the current | |
| 155 | * node (data, attribute, and recursively), but is not a proper | |
| 156 | * grant for topo_cst itself. Someone specifically trying to | |
| 157 | * acquire topo_cst still needs to do a third party transaction | |
| 158 | * to get the cache into the proper state. | |
| 159 | * | |
| 160 | * SHARED - Indicates that the information is clean, shared, read-only. | |
| 161 | * | |
| 162 | * SLAVE - Indicates that the information is clean, shared, read-only. | |
| 163 | * Indicates that local backing store is out of date but the | |
| 164 | * in-memory cache is valid, meaning that we can only obtain | |
| 165 | * the data from the MASTER (somewhere in the cluster), and | |
| 166 | * that we may not be allowed to sync it to local backing | |
| 167 | * store yet e.g. due to the quorum protocol not having | |
| 168 | * completed. | |
| 169 | * | |
| 170 | * MASTER - Indicates that the information is dirty, but readonly | |
| 171 | * because other nodes in the cluster are in a SLAVE state. | |
| 172 | * This state is typically transitional and occurs while | |
| 173 | * a quorum operation is in progress, allowing slaves to | |
| 174 | * access the data without stalling. | |
| 175 | * | |
| 176 | * EXCLUSIVE - Indicates that the information is clean, read-only, and | |
| 177 | * that nobody else can access the data while we are in this | |
| 178 | * state. A local node can upgrade both rstate and lstate | |
| 179 | * from EXCLUSIVE to MODIFIED without having to perform a | |
| 180 | * third-party transaction. | |
| 181 | * | |
| 182 | * MODIFIED - Indicates that the information is dirty, read-write, and | |
| 183 | * that nobody else can access the data while we are in this | |
| 184 | * state. | |
| 185 | * | |
| 186 | * It is important to note that remote cache-state grants can be more | |
| 187 | * general than what was requested, plus they can be persistent. So, | |
| 188 | * for example, a remote can grant EXCLUSIVE access even if you just | |
| 189 | * requested SHARED, which saves you from having to do another network | |
| 190 | * transaction if you later need EXCLUSIVE. | |
| f03672ec | 191 | */ |
| f03672ec | 192 | |
| 1ad77ed9 MD |
193 | #define CCMS_STATE_INVALID 0 /* unknown cache state */ |
| 194 | #define CCMS_STATE_ALLOWED 1 /* allow subsystem (topo only) */ | |
| 195 | #define CCMS_STATE_SHARED 2 /* clean, shared, read-only */ | |
| 196 | #define CCMS_STATE_SLAVE 3 /* live only, shared, read-only */ | |
| 197 | #define CCMS_STATE_MASTER 4 /* dirty, shared, read-only */ | |
| 198 | #define CCMS_STATE_EXCLUSIVE 5 /* clean, exclusive, read-only */ | |
| 199 | #define CCMS_STATE_MODIFIED 6 /* dirty, exclusive, read-write */ | |
| f03672ec MD |
200 | |
| 201 | /* | |
| 1ad77ed9 MD |
202 | * A CCMS locking element - represents a high level locking request, |
| 203 | * such as used by read, write, and attribute operations. Initialize | |
| 204 | * the ccms_lock structure and call ccms_lock_get(). | |
| f03672ec MD |
205 | * |
| 206 | * When a CCMS lock is established the cache state of the underlying elements | |
| 207 | * is adjusted to meet the requirements of the lock. The cache state | |
| 1ad77ed9 MD |
208 | * requirements are infered by the lock type. CCMS locks can block on |
| 209 | * third party interactions if the underlying remote cache state is not | |
| 210 | * compatible. | |
| f03672ec | 211 | * |
| 1ad77ed9 MD |
212 | * CCMS data locks imply a shared CCMS inode lock. A CCMS topology lock does |
| 213 | * not imply a data or inode lock but topology locks can have far-reaching | |
| 214 | * effects and block on numerous CST state. | |
| f03672ec MD |
215 | */ |
| 216 | struct ccms_lock { | |
| 1ad77ed9 MD |
217 | ccms_state_t tstate; |
| 218 | ccms_state_t astate; | |
| 219 | ccms_state_t dstate; | |
| 220 | ccms_off_t beg_offset; /* applies to dstate */ | |
| 221 | ccms_off_t end_offset; /* applies to dstate */ | |
| 222 | struct ccms_cst *icst; /* points to topo_cst or attr_cst */ | |
| 223 | struct ccms_cst *dcst; /* points to left edge in rbtree */ | |
| 224 | #ifdef CCMS_DEBUG | |
| 225 | TAILQ_ENTRY(ccms_lock) entry; | |
| 226 | #endif | |
| f03672ec MD |
227 | }; |
| 228 | ||
| 1ad77ed9 MD |
229 | #ifdef CCMS_DEBUG |
| 230 | ||
| 231 | TAILQ_HEAD(ccms_lock_head, ccms_lock); | |
| 232 | ||
| 233 | #endif | |
| 234 | ||
| f03672ec MD |
235 | /* |
| 236 | * CCMS cache state tree element (CST) - represents the actual cache | |
| 237 | * management state for a data space. The cache state tree is a | |
| 238 | * non-overlaping red-black tree containing ranged ccms_cst structures | |
| 239 | * which reflect the resolved state for all current high level locking | |
| 240 | * requests. For example, two overlapping ccms_lock requests for shared | |
| 241 | * access would typically be represented by three non-overlapping ccms_cst | |
| 242 | * items in the CST. The CST item representing the overlapped portion of | |
| 243 | * the ccms_lock requests would have ref count of 2 while the other CST | |
| 244 | * items would have a ref count of 1. | |
| 245 | * | |
| 246 | * [lock request #01] | |
| 247 | * [lock request #02] | |
| 248 | * [--cst--][--cst--][--cst--] | |
| 249 | * | |
| 250 | * CSTs are partitioned so their edges line up to all current and pending | |
| 251 | * ccms_lock requests. CSTs are re-merged whenever possible. A freshly | |
| 252 | * initialized database typically has a single CST representing the default | |
| 253 | * cache state for the host. | |
| 254 | * | |
| 1ad77ed9 MD |
255 | * A CST keeps track of local cache state (lstate) AND remote cache state |
| 256 | * (rstate). | |
| f03672ec MD |
257 | * |
| 258 | * Any arbitrary data range within a dataspace can be locked shared or | |
| 259 | * exclusive. Obtaining a lock has the side effect of potentially modifying | |
| 260 | * the cache state. A positive sharecount in a CST indicates that a | |
| 261 | * shared access lock is being held. A negative sharecount indicates an | |
| 262 | * exclusive access lock is being held on the range. A MODIFYING lock | |
| 263 | * type is just an exclusive lock but one which effects the cache state | |
| 264 | * differently. | |
| 265 | * | |
| 266 | * The end offset is byte-inclusive, allowing the entire 64 bit data space | |
| 267 | * to be represented without overflowing the edge case. For example, a | |
| 1ad77ed9 MD |
268 | * 64 byte area might be represented as (0,63). The offsets are UNSIGNED |
| 269 | * entities. | |
| f03672ec MD |
270 | */ |
| 271 | struct ccms_cst { | |
| 272 | RB_ENTRY(ccms_cst) rbnode; /* stored in a red-black tree */ | |
| 1ad77ed9 MD |
273 | struct ccms_cst *free_next; /* free cache linked list */ |
| 274 | struct ccms_inode *cino; /* related ccms_inode */ | |
| 275 | ccms_off_t beg_offset; /* range (inclusive) */ | |
| 276 | ccms_off_t end_offset; /* range (inclusive) */ | |
| 277 | ccms_state_t lstate; /* local cache state */ | |
| 278 | ccms_state_t rstate; /* cache state granted by protocol */ | |
| 279 | ||
| 280 | int32_t flags; | |
| 281 | int32_t count; /* shared/exclusive count */ | |
| 282 | int32_t blocked; /* indicates a blocked lock request */ | |
| 283 | int32_t xrefs; /* lock overlap references */ | |
| 284 | int32_t lrefs; /* left edge refs */ | |
| 285 | int32_t rrefs; /* right edge refs */ | |
| 286 | #ifdef CCMS_DEBUG | |
| 287 | struct ccms_lock_head list; | |
| 288 | #endif | |
| f03672ec MD |
289 | }; |
| 290 | ||
| 1ad77ed9 MD |
291 | #define CCMS_CST_DYNAMIC 0x00000001 |
| 292 | #define CCMS_CST_DELETING 0x00000002 | |
| 293 | #define CCMS_CST_INSERTED 0x00000004 | |
| 294 | #define CCMS_CST_INHERITED 0x00000008 /* rstate inherited from par */ | |
| 295 | ||
| 296 | /* | |
| 297 | * A CCMS inode is typically embedded in a VFS file or directory object. | |
| 298 | * | |
| 299 | * The subdirectory topology is accessible downward by indexing topo_cst's | |
| 300 | * from the children in the parent's cst_tree. | |
| 301 | * | |
| 302 | * attr_cst is independent of data-range CSTs. However, adjustments to | |
| 303 | * the topo_cst can have far-reaching effects to attr_cst, the CSTs in | |
| 304 | * the tree, recursively both downward and upward. | |
| 305 | */ | |
| 306 | struct ccms_inode { | |
| 307 | struct spinlock spin; | |
| 308 | struct ccms_inode *parent; | |
| 309 | struct ccms_rb_tree tree; | |
| 310 | struct ccms_cst attr_cst; | |
| 311 | struct ccms_cst topo_cst; | |
| 312 | struct ccms_cst *free_cache; /* cst free cache */ | |
| 313 | struct ccms_domain *domain; | |
| 314 | void *handle; /* VFS opaque */ | |
| 315 | int32_t flags; | |
| 316 | }; | |
| 317 | ||
| 318 | #define CCMS_INODE_INSERTED 0x0001 | |
| 319 | #define CCMS_INODE_DELETING 0x0002 | |
| 320 | ||
| 321 | /* | |
| 322 | * Domain management, contains a pseudo-root for the CCMS topology. | |
| 323 | */ | |
| 324 | struct ccms_domain { | |
| 325 | struct malloc_type *mcst; /* malloc space for cst's */ | |
| 326 | struct ccms_inode root; /* dummy protocol root */ | |
| 327 | int cst_count; /* dynamic cst count */ | |
| 328 | int cst_limit; /* dynamic cst limit */ | |
| 329 | }; | |
| 330 | ||
| 331 | typedef struct ccms_lock ccms_lock_t; | |
| 332 | typedef struct ccms_cst ccms_cst_t; | |
| 333 | typedef struct ccms_inode ccms_inode_t; | |
| 334 | typedef struct ccms_domain ccms_domain_t; | |
| f03672ec MD |
335 | |
| 336 | /* | |
| 337 | * Kernel API | |
| 338 | */ | |
| 339 | #ifdef _KERNEL | |
| 340 | ||
| 1ad77ed9 MD |
341 | /* |
| 342 | * Helper inline to initialize primarily a dstate lock which shortcuts | |
| 343 | * the more common locking operations. A dstate is specified and an | |
| 344 | * astate is implied. tstate locks cannot be acquired with this inline. | |
| 345 | */ | |
| f03672ec MD |
346 | static __inline |
| 347 | void | |
| 1ad77ed9 MD |
348 | ccms_lock_init(ccms_lock_t *lock, ccms_state_t dstate, |
| 349 | ccms_off_t beg_offset, ccms_off_t end_offset) | |
| f03672ec | 350 | { |
| 1ad77ed9 MD |
351 | lock->beg_offset = beg_offset; |
| 352 | lock->end_offset = end_offset; | |
| 353 | lock->tstate = 0; | |
| 354 | lock->astate = 0; | |
| 355 | lock->dstate = dstate; | |
| f03672ec MD |
356 | } |
| 357 | ||
| 1ad77ed9 MD |
358 | void ccms_domain_init(ccms_domain_t *dom); |
| 359 | void ccms_inode_init(ccms_domain_t *dom, ccms_inode_t *cino, void *handle); | |
| 360 | void ccms_inode_insert(ccms_inode_t *cpar, ccms_inode_t *cino); | |
| 361 | void ccms_inode_delete(ccms_inode_t *cino); | |
| 362 | void ccms_inode_uninit(ccms_inode_t *cino); | |
| 363 | ||
| 364 | int ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock); | |
| 365 | int ccms_lock_get_uio(ccms_inode_t *cino, ccms_lock_t *lock, struct uio *uio); | |
| 366 | int ccms_lock_get_attr(ccms_inode_t *cino, ccms_lock_t *lock, ccms_state_t st); | |
| 367 | int ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock); | |
| f03672ec MD |
368 | |
| 369 | #endif | |
| 370 | ||
| 371 | #endif |