--- /dev/null
+/*
+ * Copyright (c) 2006,2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * The Cache Coherency Management System (CCMS)
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/objcache.h>
+#include <sys/ccms.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <machine/limits.h>
+
+#include <sys/spinlock2.h>
+
+struct ccms_lock_scan_info {
+ ccms_dataspace_t ds;
+ ccms_lock_t lock;
+ ccms_cst_t cst1;
+ ccms_cst_t cst2;
+ ccms_cst_t coll_cst;
+};
+
+static int ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2);
+static int ccms_lock_scan_cmp(ccms_cst_t b1, void *arg);
+static int ccms_lock_undo_cmp(ccms_cst_t b1, void *arg);
+static int ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg);
+static int ccms_lock_get_match(struct ccms_cst *cst, void *arg);
+static int ccms_lock_undo_match(struct ccms_cst *cst, void *arg);
+static int ccms_lock_redo_match(struct ccms_cst *cst, void *arg);
+static int ccms_lock_put_match(struct ccms_cst *cst, void *arg);
+
+RB_GENERATE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp,
+ off_t, beg_offset, end_offset);
+static MALLOC_DEFINE(M_CCMS, "CCMS", "Cache Coherency Management System");
+
+static int ccms_enable;
+SYSCTL_INT(_kern, OID_AUTO, ccms_enable, CTLFLAG_RW, &ccms_enable, 0, "");
+
+static struct objcache *ccms_oc;
+
+/*
+ * Initialize the CCMS subsystem
+ */
+static void
+ccmsinit(void *dummy)
+{
+ ccms_oc = objcache_create_simple(M_CCMS, sizeof(struct ccms_cst));
+}
+SYSINIT(ccms, SI_BOOT2_MACHDEP, SI_ORDER_ANY, ccmsinit, NULL);
+
+/*
+ * Initialize a new CCMS dataspace. Create a new RB tree with a single
+ * element covering the entire 64 bit offset range. This simplifies
+ * algorithms enormously by removing a number of special cases.
+ */
+void
+ccms_dataspace_init(ccms_dataspace_t ds)
+{
+ ccms_cst_t cst;
+
+ RB_INIT(&ds->tree);
+ ds->info = NULL;
+ ds->chain = NULL;
+ cst = objcache_get(ccms_oc, M_WAITOK);
+ bzero(cst, sizeof(*cst));
+ cst->beg_offset = LLONG_MIN;
+ cst->end_offset = LLONG_MAX;
+ cst->state = CCMS_STATE_INVALID;
+ RB_INSERT(ccms_rb_tree, &ds->tree, cst);
+ spin_init(&ds->spin);
+}
+
+/*
+ * Helper to destroy deleted cst's.
+ */
+static __inline
+void
+ccms_delayed_free(ccms_cst_t cstn)
+{
+ ccms_cst_t cst;
+
+ while((cst = cstn) != NULL) {
+ cstn = cst->delayed_next;
+ objcache_put(ccms_oc, cst);
+ }
+}
+
+/*
+ * Destroy a CCMS dataspace.
+ *
+ * MPSAFE
+ */
+void
+ccms_dataspace_destroy(ccms_dataspace_t ds)
+{
+ ccms_cst_t cst;
+
+ spin_lock(&ds->spin);
+ RB_SCAN(ccms_rb_tree, &ds->tree, NULL,
+ ccms_dataspace_destroy_match, ds);
+ cst = ds->delayed_free;
+ ds->delayed_free = NULL;
+ spin_unlock(&ds->spin);
+ ccms_delayed_free(cst);
+}
+
+/*
+ * Helper routine to delete matches during a destroy.
+ *
+ * NOTE: called with spinlock held.
+ */
+static
+int
+ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg)
+{
+ ccms_dataspace_t ds = arg;
+
+ RB_REMOVE(ccms_rb_tree, &ds->tree, cst);
+ cst->delayed_next = ds->delayed_free;
+ ds->delayed_free = cst;
+ return(0);
+}
+
+/*
+ * Obtain a CCMS lock
+ *
+ * MPSAFE
+ */
+int
+ccms_lock_get(ccms_dataspace_t ds, ccms_lock_t lock)
+{
+ struct ccms_lock_scan_info info;
+ ccms_cst_t cst;
+
+ if (ccms_enable == 0) {
+ lock->ds = NULL;
+ return(0);
+ }
+
+ /*
+ * Partition the CST space so the precise range is covered and
+ * attempt to obtain the requested local lock (ltype) at the same
+ * time.
+ */
+ lock->ds = ds;
+ info.lock = lock;
+ info.ds = ds;
+ info.coll_cst = NULL;
+ info.cst1 = objcache_get(ccms_oc, M_WAITOK);
+ info.cst2 = objcache_get(ccms_oc, M_WAITOK);
+
+ spin_lock(&ds->spin);
+ RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
+ ccms_lock_get_match, &info);
+
+ /*
+ * If a collision occured, undo the fragments we were able to obtain,
+ * block, and try again.
+ */
+ while (info.coll_cst != NULL) {
+ RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_undo_cmp,
+ ccms_lock_undo_match, &info);
+ info.coll_cst->blocked = 1;
+ ssleep(info.coll_cst, &ds->spin, 0,
+ ((lock->ltype == CCMS_LTYPE_SHARED) ? "rngsh" : "rngex"),
+ hz);
+ info.coll_cst = NULL;
+ RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
+ ccms_lock_redo_match, &info);
+ }
+ cst = ds->delayed_free;
+ ds->delayed_free = NULL;
+ spin_unlock(&ds->spin);
+
+ /*
+ * Cleanup
+ */
+ ccms_delayed_free(cst);
+ if (info.cst1)
+ objcache_put(ccms_oc, info.cst1);
+ if (info.cst2)
+ objcache_put(ccms_oc, info.cst2);
+
+ return(0);
+}
+
+/*
+ * Obtain a CCMS lock, initialize the lock structure from the uio.
+ *
+ * MPSAFE
+ */
+int
+ccms_lock_get_uio(ccms_dataspace_t ds, ccms_lock_t lock, struct uio *uio)
+{
+ ccms_ltype_t ltype;
+ off_t eoff;
+
+ if (uio->uio_rw == UIO_READ)
+ ltype = CCMS_LTYPE_SHARED;
+ else
+ ltype = CCMS_LTYPE_MODIFYING;
+
+ /*
+ * Calculate the ending offset (byte inclusive), make sure a seek
+ * overflow does not blow us up.
+ */
+ eoff = uio->uio_offset + uio->uio_resid - 1;
+ if (eoff < uio->uio_offset)
+ eoff = 0x7FFFFFFFFFFFFFFFLL;
+ ccms_lock_init(lock, uio->uio_offset, eoff, ltype);
+ return(ccms_lock_get(ds, lock));
+}
+
+/*
+ * Helper routine.
+ *
+ * NOTE: called with spinlock held.
+ */
+static
+int
+ccms_lock_get_match(ccms_cst_t cst, void *arg)
+{
+ struct ccms_lock_scan_info *info = arg;
+ ccms_lock_t lock = info->lock;
+ ccms_cst_t ncst;
+
+ /*
+ * If the lock's left edge is within the CST we must split the CST
+ * into two pieces [cst][ncst]. lrefs must be bumped on the CST
+ * containing the left edge.
+ *
+ * NOTE! cst->beg_offset may not be modified. This allows us to avoid
+ * having to manipulate the cst's position in the tree.
+ */
+ if (lock->beg_offset > cst->beg_offset) {
+ ncst = info->cst1;
+ info->cst1 = NULL;
+ KKASSERT(ncst != NULL);
+ *ncst = *cst;
+ cst->end_offset = lock->beg_offset - 1;
+ cst->rrefs = 0;
+ ncst->beg_offset = lock->beg_offset;
+ ncst->lrefs = 1;
+ RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst);
+
+ /*
+ * ncst becomes our 'matching' cst.
+ */
+ cst = ncst;
+ } else if (lock->beg_offset == cst->beg_offset) {
+ ++cst->lrefs;
+ }
+
+ /*
+ * If the lock's right edge is within the CST we must split the CST
+ * into two pieces [cst][ncst]. rrefs must be bumped on the CST
+ * containing the right edge.
+ *
+ * NOTE! cst->beg_offset may not be modified. This allows us to avoid
+ * having to manipulate the cst's position in the tree.
+ */
+ if (lock->end_offset < cst->end_offset) {
+ ncst = info->cst2;
+ info->cst2 = NULL;
+ KKASSERT(ncst != NULL);
+ *ncst = *cst;
+ cst->end_offset = lock->end_offset;
+ cst->rrefs = 1;
+ ncst->beg_offset = lock->end_offset + 1;
+ ncst->lrefs = 0;
+ RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst);
+ /* cst remains our 'matching' cst */
+ } else if (lock->end_offset == cst->end_offset) {
+ ++cst->rrefs;
+ }
+
+ /*
+ * The lock covers the CST, so increment the CST's coverage count.
+ * Then attempt to obtain the shared/exclusive ltype.
+ */
+ ++cst->xrefs;
+
+ if (info->coll_cst == NULL) {
+ switch(lock->ltype) {
+ case CCMS_LTYPE_SHARED:
+ if (cst->sharecount < 0) {
+ info->coll_cst = cst;
+ } else {
+ ++cst->sharecount;
+ if (ccms_enable >= 9) {
+ kprintf("CST SHARE %d %lld-%lld\n",
+ cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ }
+ break;
+ case CCMS_LTYPE_EXCLUSIVE:
+ if (cst->sharecount != 0) {
+ info->coll_cst = cst;
+ } else {
+ --cst->sharecount;
+ if (ccms_enable >= 9) {
+ kprintf("CST EXCLS %d %lld-%lld\n",
+ cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ }
+ break;
+ case CCMS_LTYPE_MODIFYING:
+ if (cst->sharecount != 0) {
+ info->coll_cst = cst;
+ } else {
+ --cst->sharecount;
+ ++cst->modifycount;
+ if (ccms_enable >= 9) {
+ kprintf("CST MODXL %d %lld-%lld\n",
+ cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ }
+ break;
+ }
+ }
+ return(0);
+}
+
+/*
+ * Undo a partially resolved ccms_ltype rangelock. This is atomic with
+ * the scan/redo code so there should not be any blocked locks when
+ * transitioning to 0.
+ *
+ * NOTE: called with spinlock held.
+ */
+static
+int
+ccms_lock_undo_match(ccms_cst_t cst, void *arg)
+{
+ struct ccms_lock_scan_info *info = arg;
+ ccms_lock_t lock = info->lock;
+
+ switch(lock->ltype) {
+ case CCMS_LTYPE_SHARED:
+ KKASSERT(cst->sharecount > 0);
+ --cst->sharecount;
+ KKASSERT(cst->sharecount || cst->blocked == 0);
+ break;
+ case CCMS_LTYPE_EXCLUSIVE:
+ KKASSERT(cst->sharecount < 0);
+ ++cst->sharecount;
+ KKASSERT(cst->sharecount || cst->blocked == 0);
+ break;
+ case CCMS_LTYPE_MODIFYING:
+ KKASSERT(cst->sharecount < 0 && cst->modifycount > 0);
+ ++cst->sharecount;
+ --cst->modifycount;
+ KKASSERT(cst->sharecount || cst->blocked == 0);
+ break;
+ }
+ return(0);
+}
+
+/*
+ * Redo the local lock request for a range which has already been
+ * partitioned.
+ *
+ * NOTE: called with spinlock held.
+ */
+static
+int
+ccms_lock_redo_match(ccms_cst_t cst, void *arg)
+{
+ struct ccms_lock_scan_info *info = arg;
+ ccms_lock_t lock = info->lock;
+
+ if (info->coll_cst == NULL) {
+ switch(lock->ltype) {
+ case CCMS_LTYPE_SHARED:
+ if (cst->sharecount < 0) {
+ info->coll_cst = cst;
+ } else {
+ if (ccms_enable >= 9) {
+ kprintf("CST SHARE %d %lld-%lld\n",
+ cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ ++cst->sharecount;
+ }
+ break;
+ case CCMS_LTYPE_EXCLUSIVE:
+ if (cst->sharecount != 0) {
+ info->coll_cst = cst;
+ } else {
+ --cst->sharecount;
+ if (ccms_enable >= 9) {
+ kprintf("CST EXCLS %d %lld-%lld\n",
+ cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ }
+ break;
+ case CCMS_LTYPE_MODIFYING:
+ if (cst->sharecount != 0) {
+ info->coll_cst = cst;
+ } else {
+ --cst->sharecount;
+ ++cst->modifycount;
+ if (ccms_enable >= 9) {
+ kprintf("CST MODXL %d %lld-%lld\n",
+ cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ }
+ break;
+ }
+ }
+ return(0);
+}
+
+/*
+ * Release a CCMS lock
+ *
+ * MPSAFE
+ */
+int
+ccms_lock_put(ccms_dataspace_t ds, ccms_lock_t lock)
+{
+ struct ccms_lock_scan_info info;
+ ccms_cst_t cst;
+
+ if (lock->ds == NULL)
+ return(0);
+
+ lock->ds = NULL;
+ info.lock = lock;
+ info.ds = ds;
+ info.cst1 = NULL;
+ info.cst2 = NULL;
+
+ spin_lock(&ds->spin);
+ RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
+ ccms_lock_put_match, &info);
+ cst = ds->delayed_free;
+ ds->delayed_free = NULL;
+ spin_unlock(&ds->spin);
+
+ ccms_delayed_free(cst);
+ if (info.cst1)
+ objcache_put(ccms_oc, info.cst1);
+ if (info.cst2)
+ objcache_put(ccms_oc, info.cst2);
+ return(0);
+}
+
+/*
+ * NOTE: called with spinlock held.
+ */
+static
+int
+ccms_lock_put_match(ccms_cst_t cst, void *arg)
+{
+ struct ccms_lock_scan_info *info = arg;
+ ccms_lock_t lock = info->lock;
+ ccms_cst_t ocst;
+
+ /*
+ * Undo the local shared/exclusive rangelock.
+ */
+ switch(lock->ltype) {
+ case CCMS_LTYPE_SHARED:
+ KKASSERT(cst->sharecount > 0);
+ --cst->sharecount;
+ if (ccms_enable >= 9) {
+ kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset,
+ cst->blocked);
+ }
+ if (cst->blocked && cst->sharecount == 0) {
+ cst->blocked = 0;
+ wakeup(cst);
+ }
+ break;
+ case CCMS_LTYPE_EXCLUSIVE:
+ KKASSERT(cst->sharecount < 0);
+ ++cst->sharecount;
+ if (ccms_enable >= 9) {
+ kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset,
+ cst->blocked);
+ }
+ if (cst->blocked && cst->sharecount == 0) {
+ cst->blocked = 0;
+ wakeup(cst);
+ }
+ break;
+ case CCMS_LTYPE_MODIFYING:
+ KKASSERT(cst->sharecount < 0 && cst->modifycount > 0);
+ ++cst->sharecount;
+ --cst->modifycount;
+ if (ccms_enable >= 9) {
+ kprintf("CST UNMOD %d %lld-%lld (%d)\n", cst->sharecount,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset,
+ cst->blocked);
+ }
+ if (cst->blocked && cst->sharecount == 0) {
+ cst->blocked = 0;
+ wakeup(cst);
+ }
+ break;
+ }
+
+ /*
+ * Decrement the lock coverage count on the CST. Decrement the left and
+ * right edge counts as appropriate.
+ *
+ * When lrefs or rrefs drops to zero we check the adjacent entry to
+ * determine whether a merge is possible. If the appropriate refs field
+ * (rrefs for the entry to our left, lrefs for the entry to our right)
+ * is 0, then all covering locks must cover both entries and the xrefs
+ * field must match. We can then merge the entries if they have
+ * compatible cache states.
+ *
+ * However, because we are cleaning up the shared/exclusive count at
+ * the same time, the sharecount field may be temporarily out of
+ * sync, so require that the sharecount field also match before doing
+ * a merge.
+ *
+ * When merging an element which is being blocked on, the blocking
+ * thread(s) will be woken up.
+ *
+ * If the dataspace has too many CSTs we may be able to merge the
+ * entries even if their cache states are not the same, by dropping
+ * both to a compatible (lower) cache state and performing the appropriate
+ * management operations. XXX
+ */
+ --cst->xrefs;
+ if (lock->beg_offset == cst->beg_offset) {
+ --cst->lrefs;
+ if (cst->lrefs == 0) {
+ if ((ocst = RB_PREV(ccms_rb_tree, &info->ds->tree, cst)) != NULL &&
+ ocst->rrefs == 0 &&
+ ocst->state == cst->state &&
+ ocst->sharecount == cst->sharecount
+ ) {
+ KKASSERT(ocst->xrefs == cst->xrefs);
+ KKASSERT(ocst->end_offset + 1 == cst->beg_offset);
+ RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst);
+ cst->beg_offset = ocst->beg_offset;
+ cst->lrefs = ocst->lrefs;
+ if (ccms_enable >= 9) {
+ kprintf("MERGELEFT %p %lld-%lld (%d)\n",
+ ocst,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset,
+ cst->blocked);
+ }
+ if (ocst->blocked) {
+ ocst->blocked = 0;
+ wakeup(ocst);
+ }
+ /*objcache_put(ccms_oc, ocst);*/
+ ocst->delayed_next = info->ds->delayed_free;
+ info->ds->delayed_free = ocst;
+ }
+ }
+ }
+ if (lock->end_offset == cst->end_offset) {
+ --cst->rrefs;
+ if (cst->rrefs == 0) {
+ if ((ocst = RB_NEXT(ccms_rb_tree, &info->ds->tree, cst)) != NULL &&
+ ocst->lrefs == 0 &&
+ ocst->state == cst->state &&
+ ocst->sharecount == cst->sharecount
+ ) {
+ KKASSERT(ocst->xrefs == cst->xrefs);
+ KKASSERT(cst->end_offset + 1 == ocst->beg_offset);
+ RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst);
+ cst->end_offset = ocst->end_offset;
+ cst->rrefs = ocst->rrefs;
+ if (ccms_enable >= 9) {
+ kprintf("MERGERIGHT %p %lld-%lld\n",
+ ocst,
+ (long long)cst->beg_offset,
+ (long long)cst->end_offset);
+ }
+ /*objcache_put(ccms_oc, ocst);*/
+ ocst->delayed_next = info->ds->delayed_free;
+ info->ds->delayed_free = ocst;
+ }
+ }
+ }
+ return(0);
+}
+
+/*
+ * RB tree compare function for insertions and deletions. This function
+ * compares two CSTs.
+ */
+static int
+ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2)
+{
+ if (b1->end_offset < b2->beg_offset)
+ return(-1);
+ if (b1->beg_offset > b2->end_offset)
+ return(1);
+ return(0);
+}
+
+/*
+ * RB tree scanning compare function. This function compares the CST
+ * from the tree against the supplied ccms_lock and returns the CST's
+ * placement relative to the lock.
+ */
+static int
+ccms_lock_scan_cmp(ccms_cst_t cst, void *arg)
+{
+ struct ccms_lock_scan_info *info = arg;
+ ccms_lock_t lock = info->lock;
+
+ if (cst->end_offset < lock->beg_offset)
+ return(-1);
+ if (cst->beg_offset > lock->end_offset)
+ return(1);
+ return(0);
+}
+
+/*
+ * This function works like ccms_lock_scan_cmp but terminates at the
+ * collision point rather then at the lock's ending offset. Only
+ * the CSTs that were already partially resolved are returned by the scan.
+ */
+static int
+ccms_lock_undo_cmp(ccms_cst_t cst, void *arg)
+{
+ struct ccms_lock_scan_info *info = arg;
+ ccms_lock_t lock = info->lock;
+
+ if (cst->end_offset < lock->beg_offset)
+ return(-1);
+ if (cst->beg_offset >= info->coll_cst->beg_offset)
+ return(1);
+ return(0);
+}
--- /dev/null
+/*
+ * Copyright (c) 2006,2012 The DragonFly Project. All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * CCMS - Cache Coherency Management System. These structures are used
+ * to manage cache coherency and locking for an object. Cache Coherency is
+ * managed at byte granularity with 64 bit offset ranges.
+ *
+ * Management is broken into two distinct pieces: (1) Local shared/exclusive
+ * locks which essentially replace traditional vnode locks and (2) local
+ * cache state which interacts with other hosts and follows a MESI-like model.
+ *
+ * The core to the entire module is the 'CST' (Cache State Tree) structure
+ * which stores both pieces of information in a red-black tree styled data
+ * structure. CSTs are non-overlapping offset-ranged entities. Other
+ * higher level structures govern how CSTs in the red-black tree or cut up
+ * or merged.
+ */
+
+#ifndef _SYS_CCMS_H_
+#define _SYS_CCMS_H_
+
+#ifndef _SYS_TYPES_H_
+#include <sys/types.h>
+#endif
+#ifndef _SYS_PARAM_H_
+#include <sys/param.h>
+#endif
+#ifndef _SYS_SERIALIZE_H_
+#include <sys/serialize.h>
+#endif
+#ifndef _SYS_SPINLOCK_H_
+#include <sys/spinlock.h>
+#endif
+#ifndef _SYS_TREE_H_
+#include <sys/tree.h>
+#endif
+
+/*
+ * CCMS uses a red-black tree to sort CSTs.
+ */
+RB_HEAD(ccms_rb_tree, ccms_cst);
+RB_PROTOTYPE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp, off_t);
+
+struct ccms_lock;
+struct ccms_cst;
+
+/*
+ * ccms_state_t - CCMS cache states
+ *
+ * CCMS uses an extended MESI caching model. There are two extension
+ * states, MASTER and SLAVE, which represents dirty data which has not been
+ * synchronized to backing store but which nevertheless is being shared
+ * between distinct caches. These states are designed to allow data
+ * to be shared between nodes in a cluster without having to wait for it
+ * to be synchronized with its backing store.
+ *
+ * SLAVE - A shared state where the master copy of the data is being
+ * held by a foreign cache rather then by backing store.
+ * This state implies that the backing store may contain stale
+ * data.
+ *
+ * MASTER - A shared state where the master copy of the data is being
+ * held locally. Zero or more foreign caches may be holding
+ * a copy of our data, so we cannot modify it without
+ * invalidating those caches. This state implies that the
+ * backing store may contain stale data.
+ *
+ * MASTER differs from MODIFIED in that the data is read-only
+ * due to the existance of foreign copies. However, even though
+ * the data is read-only, it is ALSO DIRTY because the backing
+ * store has not been synchronized
+ *
+ * NOTE! The cache state represents the worst case cache state for caching
+ * elements such as the buffer cache or VM page cache or the vnode attribute
+ * cache (or other things) within the specified range. It does NOT mean
+ * that the local machine actually has all of the requested data in-hand.
+ */
+typedef enum ccms_state {
+ CCMS_STATE_INVALID = 0,
+ CCMS_STATE_SHARED, /* clean, read-only, from backing store */
+ CCMS_STATE_SLAVE, /* clean, read-only, from master */
+ CCMS_STATE_MASTER, /* dirty, read-only, shared master copy */
+ CCMS_STATE_EXCLUSIVE, /* clean, read-only, exclusive */
+ CCMS_STATE_MODIFIED /* clean or dirty, read-write, exclusive */
+} ccms_state_t;
+
+/*
+ * ccms_ltype_t - local access control lock state
+ *
+ * Note: A MODIFYING lock is an exclusive lock where the caller intends to
+ * make a modification, such as issuing a WRITE. The difference between the
+ * two is in how the cache state is effected by the lock. The distinction
+ * exists because there are many situations where the governing structure
+ * on the local machine needs to be locked exclusively, but the underlying
+ * data cache does not.
+ *
+ * lock type cache state
+ * --------- ---------
+ * SHARED >= shared
+ * EXCLUSIVE >= shared
+ * MODIFYING >= exclusive
+ */
+typedef enum ccms_ltype {
+ CCMS_LTYPE_SHARED = 0, /* shared lock on the range */
+ CCMS_LTYPE_EXCLUSIVE, /* exclusive lock on the range */
+ CCMS_LTYPE_MODIFYING /* modifying lock on the range */
+} ccms_ltype_t;
+
+/*
+ * The CCMS ABI information structure. This structure contains ABI
+ * calls to resolve incompatible cache states.
+ */
+struct ccms_info {
+ int (*ccms_set_cache)(struct ccms_info *, struct ccms_lock *, ccms_state_t);
+ void *data;
+ /* XXX */
+};
+
+/*
+ * A CCMS dataspace, typically stored in a vnode or VM object. The primary
+ * reference is to the ccms_dataspace representing the local machine. The
+ * chain field is used to link ccms_dataspace's representing other machines.
+ * These foreign representations typically only contain summary 'worst-case'
+ * CSTs. The chain only needs to be followed if a CST has a cache state
+ * that is incompatible with the request.
+ */
+struct ccms_dataspace {
+ struct ccms_rb_tree tree;
+ struct ccms_info *info;
+ struct ccms_dataspace *chain;
+ ccms_state_t defstate;
+ struct spinlock spin;
+ struct ccms_cst *delayed_free; /* delayed frees */
+};
+
+/*
+ * The CCMS locking element - represents a high level locking request,
+ * such as used by read, write, and truncate operations. These requests
+ * are not organized into any tree but instead are shadowed by items in
+ * the actual cache state tree (ccms_cst). There are no direct links
+ * between a ccms_lock and the underlying CST items, only reference count
+ * fields in the CST item.
+ *
+ * When a CCMS lock is established the cache state of the underlying elements
+ * is adjusted to meet the requirements of the lock. The cache state
+ * requirements are infered by the lock type:
+ *
+ * NOTE: Ranges may include negative offsets. These are typically used to
+ * represent meta-data.
+ *
+ * local lock cache state
+ * ----------------- --------------------
+ * SHARED - SHARED must not be invalid
+ * EXCLUSIVE - EXCLUSIVE must not be invalid
+ * MODIFYING - EXCLUSIVE must be EXCLUSIVE or MODIFIED
+ */
+struct ccms_lock {
+ struct ccms_dataspace *ds;
+ off_t beg_offset;
+ off_t end_offset;
+ ccms_ltype_t ltype;
+};
+
+/*
+ * CCMS cache state tree element (CST) - represents the actual cache
+ * management state for a data space. The cache state tree is a
+ * non-overlaping red-black tree containing ranged ccms_cst structures
+ * which reflect the resolved state for all current high level locking
+ * requests. For example, two overlapping ccms_lock requests for shared
+ * access would typically be represented by three non-overlapping ccms_cst
+ * items in the CST. The CST item representing the overlapped portion of
+ * the ccms_lock requests would have ref count of 2 while the other CST
+ * items would have a ref count of 1.
+ *
+ * [lock request #01]
+ * [lock request #02]
+ * [--cst--][--cst--][--cst--]
+ *
+ * CSTs are partitioned so their edges line up to all current and pending
+ * ccms_lock requests. CSTs are re-merged whenever possible. A freshly
+ * initialized database typically has a single CST representing the default
+ * cache state for the host.
+ *
+ * A CST represents *TWO* different things. First, it represents local
+ * locks held on data ranges. Second, it represents the best-case cache
+ * state for data cached on the local machine for local<->remote host
+ * interactions.
+ *
+ * Any arbitrary data range within a dataspace can be locked shared or
+ * exclusive. Obtaining a lock has the side effect of potentially modifying
+ * the cache state. A positive sharecount in a CST indicates that a
+ * shared access lock is being held. A negative sharecount indicates an
+ * exclusive access lock is being held on the range. A MODIFYING lock
+ * type is just an exclusive lock but one which effects the cache state
+ * differently.
+ *
+ * The end offset is byte-inclusive, allowing the entire 64 bit data space
+ * to be represented without overflowing the edge case. For example, a
+ * 64 byte area might be represented as (0,63). The offsets are SIGNED
+ * entities. Negative offsets are often used to represent meta-data
+ * such as ownership and permissions. The file size is typically cached as a
+ * side effect of file operations occuring at the file EOF rather then
+ * combined with ownership and permissions.
+ */
+struct ccms_cst {
+ RB_ENTRY(ccms_cst) rbnode; /* stored in a red-black tree */
+ struct ccms_cst *delayed_next; /* linked list to free */
+ off_t beg_offset;
+ off_t end_offset;
+ ccms_state_t state; /* local cache state */
+ int sharecount; /* shared or exclusive lock count */
+ int modifycount; /* number of modifying exclusive lks */
+ int blocked; /* indicates a blocked lock request */
+ int xrefs; /* lock overlap references */
+ int lrefs; /* left edge refs */
+ int rrefs; /* right edge refs */
+};
+
+typedef struct ccms_info *ccms_info_t;
+typedef struct ccms_dataspace *ccms_dataspace_t;
+typedef struct ccms_lock *ccms_lock_t;
+typedef struct ccms_cst *ccms_cst_t;
+
+/*
+ * Kernel API
+ */
+#ifdef _KERNEL
+
+static __inline
+void
+ccms_lock_init(ccms_lock_t lock, off_t beg_offset, off_t end_offset,
+ ccms_ltype_t ltype)
+{
+ lock->beg_offset = beg_offset;
+ lock->end_offset = end_offset;
+ lock->ltype = ltype;
+}
+
+void ccms_dataspace_init(ccms_dataspace_t);
+void ccms_dataspace_destroy(ccms_dataspace_t);
+int ccms_lock_get(ccms_dataspace_t, ccms_lock_t);
+int ccms_lock_get_uio(ccms_dataspace_t, ccms_lock_t, struct uio *);
+int ccms_lock_put(ccms_dataspace_t, ccms_lock_t);
+
+#endif
+
+#endif