From: Matthew Dillon Date: Sat, 2 Jun 2012 17:26:54 +0000 (-0700) Subject: hammer2 - Move CCMS code from kernel to hammer2 X-Git-Tag: v3.4.0rc~1085 X-Git-Url: http://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/f03672ec8417e4da82df0f957e6faa27bd9af19d hammer2 - Move CCMS code from kernel to hammer2 * Move the CCMS cache coherency module from the kernel to hammer2. It will now be hammer2-specific. --- diff --git a/sys/vfs/hammer2/hammer2_ccms.c b/sys/vfs/hammer2/hammer2_ccms.c new file mode 100644 index 0000000..e4cc014 --- /dev/null +++ b/sys/vfs/hammer2/hammer2_ccms.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2006,2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * The Cache Coherency Management System (CCMS) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct ccms_lock_scan_info { + ccms_dataspace_t ds; + ccms_lock_t lock; + ccms_cst_t cst1; + ccms_cst_t cst2; + ccms_cst_t coll_cst; +}; + +static int ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2); +static int ccms_lock_scan_cmp(ccms_cst_t b1, void *arg); +static int ccms_lock_undo_cmp(ccms_cst_t b1, void *arg); +static int ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg); +static int ccms_lock_get_match(struct ccms_cst *cst, void *arg); +static int ccms_lock_undo_match(struct ccms_cst *cst, void *arg); +static int ccms_lock_redo_match(struct ccms_cst *cst, void *arg); +static int ccms_lock_put_match(struct ccms_cst *cst, void *arg); + +RB_GENERATE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp, + off_t, beg_offset, end_offset); +static MALLOC_DEFINE(M_CCMS, "CCMS", "Cache Coherency Management System"); + +static int ccms_enable; +SYSCTL_INT(_kern, OID_AUTO, ccms_enable, CTLFLAG_RW, &ccms_enable, 0, ""); + +static struct objcache *ccms_oc; + +/* + * Initialize the CCMS subsystem + */ +static void +ccmsinit(void *dummy) +{ + ccms_oc = objcache_create_simple(M_CCMS, sizeof(struct ccms_cst)); +} +SYSINIT(ccms, SI_BOOT2_MACHDEP, SI_ORDER_ANY, ccmsinit, NULL); + +/* + * Initialize a new CCMS dataspace. Create a new RB tree with a single + * element covering the entire 64 bit offset range. This simplifies + * algorithms enormously by removing a number of special cases. + */ +void +ccms_dataspace_init(ccms_dataspace_t ds) +{ + ccms_cst_t cst; + + RB_INIT(&ds->tree); + ds->info = NULL; + ds->chain = NULL; + cst = objcache_get(ccms_oc, M_WAITOK); + bzero(cst, sizeof(*cst)); + cst->beg_offset = LLONG_MIN; + cst->end_offset = LLONG_MAX; + cst->state = CCMS_STATE_INVALID; + RB_INSERT(ccms_rb_tree, &ds->tree, cst); + spin_init(&ds->spin); +} + +/* + * Helper to destroy deleted cst's. + */ +static __inline +void +ccms_delayed_free(ccms_cst_t cstn) +{ + ccms_cst_t cst; + + while((cst = cstn) != NULL) { + cstn = cst->delayed_next; + objcache_put(ccms_oc, cst); + } +} + +/* + * Destroy a CCMS dataspace. + * + * MPSAFE + */ +void +ccms_dataspace_destroy(ccms_dataspace_t ds) +{ + ccms_cst_t cst; + + spin_lock(&ds->spin); + RB_SCAN(ccms_rb_tree, &ds->tree, NULL, + ccms_dataspace_destroy_match, ds); + cst = ds->delayed_free; + ds->delayed_free = NULL; + spin_unlock(&ds->spin); + ccms_delayed_free(cst); +} + +/* + * Helper routine to delete matches during a destroy. + * + * NOTE: called with spinlock held. + */ +static +int +ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg) +{ + ccms_dataspace_t ds = arg; + + RB_REMOVE(ccms_rb_tree, &ds->tree, cst); + cst->delayed_next = ds->delayed_free; + ds->delayed_free = cst; + return(0); +} + +/* + * Obtain a CCMS lock + * + * MPSAFE + */ +int +ccms_lock_get(ccms_dataspace_t ds, ccms_lock_t lock) +{ + struct ccms_lock_scan_info info; + ccms_cst_t cst; + + if (ccms_enable == 0) { + lock->ds = NULL; + return(0); + } + + /* + * Partition the CST space so the precise range is covered and + * attempt to obtain the requested local lock (ltype) at the same + * time. + */ + lock->ds = ds; + info.lock = lock; + info.ds = ds; + info.coll_cst = NULL; + info.cst1 = objcache_get(ccms_oc, M_WAITOK); + info.cst2 = objcache_get(ccms_oc, M_WAITOK); + + spin_lock(&ds->spin); + RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp, + ccms_lock_get_match, &info); + + /* + * If a collision occured, undo the fragments we were able to obtain, + * block, and try again. + */ + while (info.coll_cst != NULL) { + RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_undo_cmp, + ccms_lock_undo_match, &info); + info.coll_cst->blocked = 1; + ssleep(info.coll_cst, &ds->spin, 0, + ((lock->ltype == CCMS_LTYPE_SHARED) ? "rngsh" : "rngex"), + hz); + info.coll_cst = NULL; + RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp, + ccms_lock_redo_match, &info); + } + cst = ds->delayed_free; + ds->delayed_free = NULL; + spin_unlock(&ds->spin); + + /* + * Cleanup + */ + ccms_delayed_free(cst); + if (info.cst1) + objcache_put(ccms_oc, info.cst1); + if (info.cst2) + objcache_put(ccms_oc, info.cst2); + + return(0); +} + +/* + * Obtain a CCMS lock, initialize the lock structure from the uio. + * + * MPSAFE + */ +int +ccms_lock_get_uio(ccms_dataspace_t ds, ccms_lock_t lock, struct uio *uio) +{ + ccms_ltype_t ltype; + off_t eoff; + + if (uio->uio_rw == UIO_READ) + ltype = CCMS_LTYPE_SHARED; + else + ltype = CCMS_LTYPE_MODIFYING; + + /* + * Calculate the ending offset (byte inclusive), make sure a seek + * overflow does not blow us up. + */ + eoff = uio->uio_offset + uio->uio_resid - 1; + if (eoff < uio->uio_offset) + eoff = 0x7FFFFFFFFFFFFFFFLL; + ccms_lock_init(lock, uio->uio_offset, eoff, ltype); + return(ccms_lock_get(ds, lock)); +} + +/* + * Helper routine. + * + * NOTE: called with spinlock held. + */ +static +int +ccms_lock_get_match(ccms_cst_t cst, void *arg) +{ + struct ccms_lock_scan_info *info = arg; + ccms_lock_t lock = info->lock; + ccms_cst_t ncst; + + /* + * If the lock's left edge is within the CST we must split the CST + * into two pieces [cst][ncst]. lrefs must be bumped on the CST + * containing the left edge. + * + * NOTE! cst->beg_offset may not be modified. This allows us to avoid + * having to manipulate the cst's position in the tree. + */ + if (lock->beg_offset > cst->beg_offset) { + ncst = info->cst1; + info->cst1 = NULL; + KKASSERT(ncst != NULL); + *ncst = *cst; + cst->end_offset = lock->beg_offset - 1; + cst->rrefs = 0; + ncst->beg_offset = lock->beg_offset; + ncst->lrefs = 1; + RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst); + + /* + * ncst becomes our 'matching' cst. + */ + cst = ncst; + } else if (lock->beg_offset == cst->beg_offset) { + ++cst->lrefs; + } + + /* + * If the lock's right edge is within the CST we must split the CST + * into two pieces [cst][ncst]. rrefs must be bumped on the CST + * containing the right edge. + * + * NOTE! cst->beg_offset may not be modified. This allows us to avoid + * having to manipulate the cst's position in the tree. + */ + if (lock->end_offset < cst->end_offset) { + ncst = info->cst2; + info->cst2 = NULL; + KKASSERT(ncst != NULL); + *ncst = *cst; + cst->end_offset = lock->end_offset; + cst->rrefs = 1; + ncst->beg_offset = lock->end_offset + 1; + ncst->lrefs = 0; + RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst); + /* cst remains our 'matching' cst */ + } else if (lock->end_offset == cst->end_offset) { + ++cst->rrefs; + } + + /* + * The lock covers the CST, so increment the CST's coverage count. + * Then attempt to obtain the shared/exclusive ltype. + */ + ++cst->xrefs; + + if (info->coll_cst == NULL) { + switch(lock->ltype) { + case CCMS_LTYPE_SHARED: + if (cst->sharecount < 0) { + info->coll_cst = cst; + } else { + ++cst->sharecount; + if (ccms_enable >= 9) { + kprintf("CST SHARE %d %lld-%lld\n", + cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + } + break; + case CCMS_LTYPE_EXCLUSIVE: + if (cst->sharecount != 0) { + info->coll_cst = cst; + } else { + --cst->sharecount; + if (ccms_enable >= 9) { + kprintf("CST EXCLS %d %lld-%lld\n", + cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + } + break; + case CCMS_LTYPE_MODIFYING: + if (cst->sharecount != 0) { + info->coll_cst = cst; + } else { + --cst->sharecount; + ++cst->modifycount; + if (ccms_enable >= 9) { + kprintf("CST MODXL %d %lld-%lld\n", + cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + } + break; + } + } + return(0); +} + +/* + * Undo a partially resolved ccms_ltype rangelock. This is atomic with + * the scan/redo code so there should not be any blocked locks when + * transitioning to 0. + * + * NOTE: called with spinlock held. + */ +static +int +ccms_lock_undo_match(ccms_cst_t cst, void *arg) +{ + struct ccms_lock_scan_info *info = arg; + ccms_lock_t lock = info->lock; + + switch(lock->ltype) { + case CCMS_LTYPE_SHARED: + KKASSERT(cst->sharecount > 0); + --cst->sharecount; + KKASSERT(cst->sharecount || cst->blocked == 0); + break; + case CCMS_LTYPE_EXCLUSIVE: + KKASSERT(cst->sharecount < 0); + ++cst->sharecount; + KKASSERT(cst->sharecount || cst->blocked == 0); + break; + case CCMS_LTYPE_MODIFYING: + KKASSERT(cst->sharecount < 0 && cst->modifycount > 0); + ++cst->sharecount; + --cst->modifycount; + KKASSERT(cst->sharecount || cst->blocked == 0); + break; + } + return(0); +} + +/* + * Redo the local lock request for a range which has already been + * partitioned. + * + * NOTE: called with spinlock held. + */ +static +int +ccms_lock_redo_match(ccms_cst_t cst, void *arg) +{ + struct ccms_lock_scan_info *info = arg; + ccms_lock_t lock = info->lock; + + if (info->coll_cst == NULL) { + switch(lock->ltype) { + case CCMS_LTYPE_SHARED: + if (cst->sharecount < 0) { + info->coll_cst = cst; + } else { + if (ccms_enable >= 9) { + kprintf("CST SHARE %d %lld-%lld\n", + cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + ++cst->sharecount; + } + break; + case CCMS_LTYPE_EXCLUSIVE: + if (cst->sharecount != 0) { + info->coll_cst = cst; + } else { + --cst->sharecount; + if (ccms_enable >= 9) { + kprintf("CST EXCLS %d %lld-%lld\n", + cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + } + break; + case CCMS_LTYPE_MODIFYING: + if (cst->sharecount != 0) { + info->coll_cst = cst; + } else { + --cst->sharecount; + ++cst->modifycount; + if (ccms_enable >= 9) { + kprintf("CST MODXL %d %lld-%lld\n", + cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + } + break; + } + } + return(0); +} + +/* + * Release a CCMS lock + * + * MPSAFE + */ +int +ccms_lock_put(ccms_dataspace_t ds, ccms_lock_t lock) +{ + struct ccms_lock_scan_info info; + ccms_cst_t cst; + + if (lock->ds == NULL) + return(0); + + lock->ds = NULL; + info.lock = lock; + info.ds = ds; + info.cst1 = NULL; + info.cst2 = NULL; + + spin_lock(&ds->spin); + RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp, + ccms_lock_put_match, &info); + cst = ds->delayed_free; + ds->delayed_free = NULL; + spin_unlock(&ds->spin); + + ccms_delayed_free(cst); + if (info.cst1) + objcache_put(ccms_oc, info.cst1); + if (info.cst2) + objcache_put(ccms_oc, info.cst2); + return(0); +} + +/* + * NOTE: called with spinlock held. + */ +static +int +ccms_lock_put_match(ccms_cst_t cst, void *arg) +{ + struct ccms_lock_scan_info *info = arg; + ccms_lock_t lock = info->lock; + ccms_cst_t ocst; + + /* + * Undo the local shared/exclusive rangelock. + */ + switch(lock->ltype) { + case CCMS_LTYPE_SHARED: + KKASSERT(cst->sharecount > 0); + --cst->sharecount; + if (ccms_enable >= 9) { + kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset, + cst->blocked); + } + if (cst->blocked && cst->sharecount == 0) { + cst->blocked = 0; + wakeup(cst); + } + break; + case CCMS_LTYPE_EXCLUSIVE: + KKASSERT(cst->sharecount < 0); + ++cst->sharecount; + if (ccms_enable >= 9) { + kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset, + cst->blocked); + } + if (cst->blocked && cst->sharecount == 0) { + cst->blocked = 0; + wakeup(cst); + } + break; + case CCMS_LTYPE_MODIFYING: + KKASSERT(cst->sharecount < 0 && cst->modifycount > 0); + ++cst->sharecount; + --cst->modifycount; + if (ccms_enable >= 9) { + kprintf("CST UNMOD %d %lld-%lld (%d)\n", cst->sharecount, + (long long)cst->beg_offset, + (long long)cst->end_offset, + cst->blocked); + } + if (cst->blocked && cst->sharecount == 0) { + cst->blocked = 0; + wakeup(cst); + } + break; + } + + /* + * Decrement the lock coverage count on the CST. Decrement the left and + * right edge counts as appropriate. + * + * When lrefs or rrefs drops to zero we check the adjacent entry to + * determine whether a merge is possible. If the appropriate refs field + * (rrefs for the entry to our left, lrefs for the entry to our right) + * is 0, then all covering locks must cover both entries and the xrefs + * field must match. We can then merge the entries if they have + * compatible cache states. + * + * However, because we are cleaning up the shared/exclusive count at + * the same time, the sharecount field may be temporarily out of + * sync, so require that the sharecount field also match before doing + * a merge. + * + * When merging an element which is being blocked on, the blocking + * thread(s) will be woken up. + * + * If the dataspace has too many CSTs we may be able to merge the + * entries even if their cache states are not the same, by dropping + * both to a compatible (lower) cache state and performing the appropriate + * management operations. XXX + */ + --cst->xrefs; + if (lock->beg_offset == cst->beg_offset) { + --cst->lrefs; + if (cst->lrefs == 0) { + if ((ocst = RB_PREV(ccms_rb_tree, &info->ds->tree, cst)) != NULL && + ocst->rrefs == 0 && + ocst->state == cst->state && + ocst->sharecount == cst->sharecount + ) { + KKASSERT(ocst->xrefs == cst->xrefs); + KKASSERT(ocst->end_offset + 1 == cst->beg_offset); + RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst); + cst->beg_offset = ocst->beg_offset; + cst->lrefs = ocst->lrefs; + if (ccms_enable >= 9) { + kprintf("MERGELEFT %p %lld-%lld (%d)\n", + ocst, + (long long)cst->beg_offset, + (long long)cst->end_offset, + cst->blocked); + } + if (ocst->blocked) { + ocst->blocked = 0; + wakeup(ocst); + } + /*objcache_put(ccms_oc, ocst);*/ + ocst->delayed_next = info->ds->delayed_free; + info->ds->delayed_free = ocst; + } + } + } + if (lock->end_offset == cst->end_offset) { + --cst->rrefs; + if (cst->rrefs == 0) { + if ((ocst = RB_NEXT(ccms_rb_tree, &info->ds->tree, cst)) != NULL && + ocst->lrefs == 0 && + ocst->state == cst->state && + ocst->sharecount == cst->sharecount + ) { + KKASSERT(ocst->xrefs == cst->xrefs); + KKASSERT(cst->end_offset + 1 == ocst->beg_offset); + RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst); + cst->end_offset = ocst->end_offset; + cst->rrefs = ocst->rrefs; + if (ccms_enable >= 9) { + kprintf("MERGERIGHT %p %lld-%lld\n", + ocst, + (long long)cst->beg_offset, + (long long)cst->end_offset); + } + /*objcache_put(ccms_oc, ocst);*/ + ocst->delayed_next = info->ds->delayed_free; + info->ds->delayed_free = ocst; + } + } + } + return(0); +} + +/* + * RB tree compare function for insertions and deletions. This function + * compares two CSTs. + */ +static int +ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2) +{ + if (b1->end_offset < b2->beg_offset) + return(-1); + if (b1->beg_offset > b2->end_offset) + return(1); + return(0); +} + +/* + * RB tree scanning compare function. This function compares the CST + * from the tree against the supplied ccms_lock and returns the CST's + * placement relative to the lock. + */ +static int +ccms_lock_scan_cmp(ccms_cst_t cst, void *arg) +{ + struct ccms_lock_scan_info *info = arg; + ccms_lock_t lock = info->lock; + + if (cst->end_offset < lock->beg_offset) + return(-1); + if (cst->beg_offset > lock->end_offset) + return(1); + return(0); +} + +/* + * This function works like ccms_lock_scan_cmp but terminates at the + * collision point rather then at the lock's ending offset. Only + * the CSTs that were already partially resolved are returned by the scan. + */ +static int +ccms_lock_undo_cmp(ccms_cst_t cst, void *arg) +{ + struct ccms_lock_scan_info *info = arg; + ccms_lock_t lock = info->lock; + + if (cst->end_offset < lock->beg_offset) + return(-1); + if (cst->beg_offset >= info->coll_cst->beg_offset) + return(1); + return(0); +} diff --git a/sys/vfs/hammer2/hammer2_ccms.h b/sys/vfs/hammer2/hammer2_ccms.h new file mode 100644 index 0000000..00107b2 --- /dev/null +++ b/sys/vfs/hammer2/hammer2_ccms.h @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2006,2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * CCMS - Cache Coherency Management System. These structures are used + * to manage cache coherency and locking for an object. Cache Coherency is + * managed at byte granularity with 64 bit offset ranges. + * + * Management is broken into two distinct pieces: (1) Local shared/exclusive + * locks which essentially replace traditional vnode locks and (2) local + * cache state which interacts with other hosts and follows a MESI-like model. + * + * The core to the entire module is the 'CST' (Cache State Tree) structure + * which stores both pieces of information in a red-black tree styled data + * structure. CSTs are non-overlapping offset-ranged entities. Other + * higher level structures govern how CSTs in the red-black tree or cut up + * or merged. + */ + +#ifndef _SYS_CCMS_H_ +#define _SYS_CCMS_H_ + +#ifndef _SYS_TYPES_H_ +#include +#endif +#ifndef _SYS_PARAM_H_ +#include +#endif +#ifndef _SYS_SERIALIZE_H_ +#include +#endif +#ifndef _SYS_SPINLOCK_H_ +#include +#endif +#ifndef _SYS_TREE_H_ +#include +#endif + +/* + * CCMS uses a red-black tree to sort CSTs. + */ +RB_HEAD(ccms_rb_tree, ccms_cst); +RB_PROTOTYPE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp, off_t); + +struct ccms_lock; +struct ccms_cst; + +/* + * ccms_state_t - CCMS cache states + * + * CCMS uses an extended MESI caching model. There are two extension + * states, MASTER and SLAVE, which represents dirty data which has not been + * synchronized to backing store but which nevertheless is being shared + * between distinct caches. These states are designed to allow data + * to be shared between nodes in a cluster without having to wait for it + * to be synchronized with its backing store. + * + * SLAVE - A shared state where the master copy of the data is being + * held by a foreign cache rather then by backing store. + * This state implies that the backing store may contain stale + * data. + * + * MASTER - A shared state where the master copy of the data is being + * held locally. Zero or more foreign caches may be holding + * a copy of our data, so we cannot modify it without + * invalidating those caches. This state implies that the + * backing store may contain stale data. + * + * MASTER differs from MODIFIED in that the data is read-only + * due to the existance of foreign copies. However, even though + * the data is read-only, it is ALSO DIRTY because the backing + * store has not been synchronized + * + * NOTE! The cache state represents the worst case cache state for caching + * elements such as the buffer cache or VM page cache or the vnode attribute + * cache (or other things) within the specified range. It does NOT mean + * that the local machine actually has all of the requested data in-hand. + */ +typedef enum ccms_state { + CCMS_STATE_INVALID = 0, + CCMS_STATE_SHARED, /* clean, read-only, from backing store */ + CCMS_STATE_SLAVE, /* clean, read-only, from master */ + CCMS_STATE_MASTER, /* dirty, read-only, shared master copy */ + CCMS_STATE_EXCLUSIVE, /* clean, read-only, exclusive */ + CCMS_STATE_MODIFIED /* clean or dirty, read-write, exclusive */ +} ccms_state_t; + +/* + * ccms_ltype_t - local access control lock state + * + * Note: A MODIFYING lock is an exclusive lock where the caller intends to + * make a modification, such as issuing a WRITE. The difference between the + * two is in how the cache state is effected by the lock. The distinction + * exists because there are many situations where the governing structure + * on the local machine needs to be locked exclusively, but the underlying + * data cache does not. + * + * lock type cache state + * --------- --------- + * SHARED >= shared + * EXCLUSIVE >= shared + * MODIFYING >= exclusive + */ +typedef enum ccms_ltype { + CCMS_LTYPE_SHARED = 0, /* shared lock on the range */ + CCMS_LTYPE_EXCLUSIVE, /* exclusive lock on the range */ + CCMS_LTYPE_MODIFYING /* modifying lock on the range */ +} ccms_ltype_t; + +/* + * The CCMS ABI information structure. This structure contains ABI + * calls to resolve incompatible cache states. + */ +struct ccms_info { + int (*ccms_set_cache)(struct ccms_info *, struct ccms_lock *, ccms_state_t); + void *data; + /* XXX */ +}; + +/* + * A CCMS dataspace, typically stored in a vnode or VM object. The primary + * reference is to the ccms_dataspace representing the local machine. The + * chain field is used to link ccms_dataspace's representing other machines. + * These foreign representations typically only contain summary 'worst-case' + * CSTs. The chain only needs to be followed if a CST has a cache state + * that is incompatible with the request. + */ +struct ccms_dataspace { + struct ccms_rb_tree tree; + struct ccms_info *info; + struct ccms_dataspace *chain; + ccms_state_t defstate; + struct spinlock spin; + struct ccms_cst *delayed_free; /* delayed frees */ +}; + +/* + * The CCMS locking element - represents a high level locking request, + * such as used by read, write, and truncate operations. These requests + * are not organized into any tree but instead are shadowed by items in + * the actual cache state tree (ccms_cst). There are no direct links + * between a ccms_lock and the underlying CST items, only reference count + * fields in the CST item. + * + * When a CCMS lock is established the cache state of the underlying elements + * is adjusted to meet the requirements of the lock. The cache state + * requirements are infered by the lock type: + * + * NOTE: Ranges may include negative offsets. These are typically used to + * represent meta-data. + * + * local lock cache state + * ----------------- -------------------- + * SHARED - SHARED must not be invalid + * EXCLUSIVE - EXCLUSIVE must not be invalid + * MODIFYING - EXCLUSIVE must be EXCLUSIVE or MODIFIED + */ +struct ccms_lock { + struct ccms_dataspace *ds; + off_t beg_offset; + off_t end_offset; + ccms_ltype_t ltype; +}; + +/* + * CCMS cache state tree element (CST) - represents the actual cache + * management state for a data space. The cache state tree is a + * non-overlaping red-black tree containing ranged ccms_cst structures + * which reflect the resolved state for all current high level locking + * requests. For example, two overlapping ccms_lock requests for shared + * access would typically be represented by three non-overlapping ccms_cst + * items in the CST. The CST item representing the overlapped portion of + * the ccms_lock requests would have ref count of 2 while the other CST + * items would have a ref count of 1. + * + * [lock request #01] + * [lock request #02] + * [--cst--][--cst--][--cst--] + * + * CSTs are partitioned so their edges line up to all current and pending + * ccms_lock requests. CSTs are re-merged whenever possible. A freshly + * initialized database typically has a single CST representing the default + * cache state for the host. + * + * A CST represents *TWO* different things. First, it represents local + * locks held on data ranges. Second, it represents the best-case cache + * state for data cached on the local machine for local<->remote host + * interactions. + * + * Any arbitrary data range within a dataspace can be locked shared or + * exclusive. Obtaining a lock has the side effect of potentially modifying + * the cache state. A positive sharecount in a CST indicates that a + * shared access lock is being held. A negative sharecount indicates an + * exclusive access lock is being held on the range. A MODIFYING lock + * type is just an exclusive lock but one which effects the cache state + * differently. + * + * The end offset is byte-inclusive, allowing the entire 64 bit data space + * to be represented without overflowing the edge case. For example, a + * 64 byte area might be represented as (0,63). The offsets are SIGNED + * entities. Negative offsets are often used to represent meta-data + * such as ownership and permissions. The file size is typically cached as a + * side effect of file operations occuring at the file EOF rather then + * combined with ownership and permissions. + */ +struct ccms_cst { + RB_ENTRY(ccms_cst) rbnode; /* stored in a red-black tree */ + struct ccms_cst *delayed_next; /* linked list to free */ + off_t beg_offset; + off_t end_offset; + ccms_state_t state; /* local cache state */ + int sharecount; /* shared or exclusive lock count */ + int modifycount; /* number of modifying exclusive lks */ + int blocked; /* indicates a blocked lock request */ + int xrefs; /* lock overlap references */ + int lrefs; /* left edge refs */ + int rrefs; /* right edge refs */ +}; + +typedef struct ccms_info *ccms_info_t; +typedef struct ccms_dataspace *ccms_dataspace_t; +typedef struct ccms_lock *ccms_lock_t; +typedef struct ccms_cst *ccms_cst_t; + +/* + * Kernel API + */ +#ifdef _KERNEL + +static __inline +void +ccms_lock_init(ccms_lock_t lock, off_t beg_offset, off_t end_offset, + ccms_ltype_t ltype) +{ + lock->beg_offset = beg_offset; + lock->end_offset = end_offset; + lock->ltype = ltype; +} + +void ccms_dataspace_init(ccms_dataspace_t); +void ccms_dataspace_destroy(ccms_dataspace_t); +int ccms_lock_get(ccms_dataspace_t, ccms_lock_t); +int ccms_lock_get_uio(ccms_dataspace_t, ccms_lock_t, struct uio *); +int ccms_lock_put(ccms_dataspace_t, ccms_lock_t); + +#endif + +#endif