hammer2 - Initial CCMS locking tie-in
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 7 Jun 2012 05:43:14 +0000 (22:43 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 7 Jun 2012 05:43:14 +0000 (22:43 -0700)
This is a necessary precursor to integrating the cache state grants with
our chain locks.  Basically, we are replacing the hammer2 chain lockmgr
lock (hammer2_chain->lk) with a CCMS CST structure (hammer2_chain->cst).

This structure will become the attribute CST for hammer2 inodes.  The
topological CST is built into the hammer2_inode.  Data-space CSTs will
initially be the hammer2_chain->cst for indirect blocks, though we will
probably also need one or more in hammer2_inode to handle generic cases.
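As a concrete illustration of the change (a minimal sketch, not code from
this commit; the helper and its name are hypothetical, while chain->lk,
chain->cst and ccms_thread_lock()/ccms_thread_unlock() match the diff
below):

/*
 * Hypothetical helper showing the lock replacement.  Before this commit
 * a chain was locked via lockmgr on chain->lk; after it, the embedded
 * CCMS CST (chain->cst) is locked via ccms_thread_lock().
 */
static void
example_chain_lock(hammer2_chain_t *chain, int exclusive)
{
#if 0
	/* old: lockmgr lock embedded in the chain */
	lockmgr(&chain->lk, exclusive ? LK_EXCLUSIVE : LK_SHARED);
#endif
	/* new: CCMS CST embedded in the chain */
	ccms_thread_lock(&chain->cst,
			 exclusive ? CCMS_STATE_EXCLUSIVE : CCMS_STATE_SHARED);
}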

sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_ccms.c
sys/vfs/hammer2/hammer2_ccms.h
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_subr.c
sys/vfs/hammer2/hammer2_vfsops.c

index fadb555..f099fcf 100644
@@ -54,6 +54,7 @@
 #include <sys/mountctl.h>
 #include <sys/priv.h>
 #include <sys/stat.h>
+#include <sys/thread.h>
 #include <sys/globaldata.h>
 #include <sys/lockf.h>
 #include <sys/buf.h>
@@ -99,6 +100,7 @@ struct hammer2_pfsmount;
 SPLAY_HEAD(hammer2_chain_splay, hammer2_chain);
 
 struct hammer2_chain {
+       ccms_cst_t      cst;                    /* attr or data cst */
        struct hammer2_blockref bref;
        struct hammer2_blockref bref_flush;     /* synchronized w/MOVED bit */
        struct hammer2_chain *parent;           /* return chain to root */
@@ -115,7 +117,6 @@ struct hammer2_chain {
        struct buf      *bp;            /* buffer cache (ro) */
        hammer2_media_data_t *data;     /* modified copy of data (rw) */
        u_int           bytes;          /* physical size of data */
-       struct lock     lk;             /* lockmgr lock */
        int             index;          /* index in parent */
        u_int           refs;
        u_int           busy;           /* soft-busy */
@@ -221,13 +222,16 @@ SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 
 /*
  * A hammer2 inode.
+ *
+ * NOTE: The inode's attribute CST, which is also used to lock the inode,
+ *      is embedded in the chain (chain.cst) and aliased w/ attr_cst.
  */
 struct hammer2_inode {
        struct hammer2_mount    *hmp;           /* Global mount */
        struct hammer2_pfsmount *pmp;           /* PFS mount */
        struct hammer2_inode    *pip;           /* parent inode */
        struct vnode            *vp;
-       ccms_inode_t            *cino;          /* cluster cache state */
+       ccms_cst_t              topo_cst;       /* directory topology cst */
        hammer2_chain_t         chain;
        struct hammer2_inode_data ip_data;
        struct lockf            advlock;
@@ -238,6 +242,12 @@ struct hammer2_inode {
 
 typedef struct hammer2_inode hammer2_inode_t;
 
+#if defined(_KERNEL)
+
+#define attr_cst       chain.cst
+
+#endif
+
 /*
  * A hammer2 indirect block
  */
index 70b3631..4d3c8b4 100644
 
 #include "hammer2_ccms.h"
 
-struct ccms_lock_scan_info {
-       ccms_inode_t    *cino;
-       ccms_lock_t     *lock;
-       ccms_cst_t      *coll_cst;
-       int             rstate_upgrade_needed;
-};
-
-static int ccms_cst_cmp(ccms_cst_t *b1, ccms_cst_t *b2);
-static int ccms_lock_scan_cmp(ccms_cst_t *b1, void *arg);
-
-static int ccms_lock_get_match(ccms_cst_t *cst, void *arg);
-static int ccms_lock_undo_match(ccms_cst_t *cst, void *arg);
-static int ccms_lock_redo_match(ccms_cst_t *cst, void *arg);
-static int ccms_lock_upgrade_match(ccms_cst_t *cst, void *arg);
-static int ccms_lock_put_match(ccms_cst_t *cst, void *arg);
-
-static void ccms_lstate_get(ccms_cst_t *cst, ccms_state_t state);
-static void ccms_lstate_put(ccms_cst_t *cst);
-static void ccms_rstate_get(ccms_cst_t *cst, ccms_state_t state);
-static void ccms_rstate_put(ccms_cst_t *cst);
-
-struct ccms_rb_tree;
-RB_GENERATE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp,
-            ccms_off_t, beg_offset, end_offset);
-static MALLOC_DEFINE(M_CCMS, "CCMS", "Cache Coherency Management System");
-
-static int ccms_debug = 0;
-
-/*
- * These helpers are called to manage the CST cache so we can avoid
- * unnecessary kmalloc()'s and kfree()'s in hot paths.
- *
- * ccms_free_pass1() must be called with the spinlock held.
- * ccms_free_pass2() must be called with the spinlock not held.
- */
-static __inline
-ccms_cst_t *
-ccms_free_pass1(ccms_inode_t *cino, int keep)
-{
-       ccms_cst_t *cst;
-       ccms_cst_t **cstp;
-
-       cstp = &cino->free_cache;
-       while ((cst = *cstp) != NULL && keep) {
-               cstp = &cst->free_next;
-               --keep;
-       }
-       *cstp = NULL;
-       return (cst);
-}
-
-static __inline
-void
-ccms_free_pass2(ccms_cst_t *next)
-{
-       ccms_cst_t *cst;
-       ccms_domain_t *dom;
-
-       while ((cst = next) != NULL) {
-               next = cst->free_next;
-               cst->free_next = NULL;
-
-               dom = cst->cino->domain;
-               atomic_add_int(&dom->cst_count, -1);
-
-               kfree(cst, dom->mcst);
-       }
-}
+int ccms_debug = 0;
 
 /*
  * Initialize a new CCMS dataspace.  Create a new RB tree with a single
@@ -126,1098 +59,272 @@ void
 ccms_domain_init(ccms_domain_t *dom)
 {
        bzero(dom, sizeof(*dom));
-       kmalloc_create(&dom->mcst, "CCMS-cst");
+       /*kmalloc_create(&dom->mcst, "CCMS-cst");*/
        /*dom->root.domain = dom;*/
 }
 
 void
 ccms_domain_uninit(ccms_domain_t *dom)
 {
-       kmalloc_destroy(&dom->mcst);
+       /*kmalloc_destroy(&dom->mcst);*/
 }
 
-#if 0
-/*
- * Initialize a ccms_inode for use.  The inode will be initialized but
- * is not yet connected to the rest of the topology.  However, it can
- * still be used stand-alone if desired without being connected to the
- * topology.
- */
 void
-ccms_inode_init(ccms_domain_t *dom, ccms_inode_t *cino, void *handle)
+ccms_cst_init(ccms_cst_t *cst, void *handle)
 {
-       ccms_cst_t *cst;
-
-       bzero(cino, sizeof(*cino));
-
-       spin_init(&cino->spin);
-       RB_INIT(&cino->tree);
-       cino->domain = dom;
-       cino->handle = handle;
-       /* cino->attr_cst.cino = cino; no rbtree association */
-       cino->attr_cst.lstate = CCMS_STATE_INVALID;
-       cino->attr_cst.rstate = CCMS_STATE_INVALID;
-
-       /*
-        * The dataspace must be initialized w/cache-state set to INVALID
-        * for the entire range.
-        */
-       cst = kmalloc(sizeof(*cst), dom->mcst, M_WAITOK | M_ZERO);
-       cst->cino = cino;
-       cst->flags = CCMS_CST_DYNAMIC;
-       cst->beg_offset = 0;
-       cst->end_offset = 0xFFFFFFFFFFFFFFFFLLU;
-       cst->lstate = CCMS_STATE_INVALID;
-       cst->rstate = CCMS_STATE_INVALID;
-       RB_INSERT(ccms_rb_tree, &cino->tree, cst);
-       atomic_add_int(&dom->cst_count, 1);
+       bzero(cst, sizeof(*cst));
+       cst->handle = handle;
 }
 
-/*
- * Associate the topology CST with a CCMS inode.  The topology CST must
- * be held locked (typically SHARED) by the caller.  The caller is responsible
- * for interlocking a unique ccms_inode to prevent SMP races.
- */
 void
-ccms_inode_associate(ccms_inode_t *cino, ccms_cst_t *topo_cst)
+ccms_cst_uninit(ccms_cst_t *cst)
 {
-       KKASSERT(topo_cst->tag.cino == NULL);
-
-       spin_lock(&cino->spin);
-       topo_cst->tag.cino = cino;
-       topo_cst->flags |= CCMS_CST_INODE;
-
-       cino->topo_cst = topo_cst;
-       cino->parent = topo_cst->cino;
-       cino->flags |= CCMS_INODE_INSERTED;
-       spin_unlock(&cino->spin);
-}
-
-#if 0
-
-int
-ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock)
-
-       spin_lock(&cpar->spin);
-       spin_lock(&cino->spin);
-
-       KKASSERT((cino->flags & CCMS_INODE_INSERTED) == 0);
-       cino->topo_cst.beg_offset = key;
-       cino->topo_cst.end_offset = key;
-
-       if (RB_INSERT(ccms_rb_tree, &cpar->tree, &cino->topo_cst)) {
-               spin_unlock(&cino->spin);
-               spin_unlock(&cpar->spin);
-               panic("ccms_inode_insert: duplicate entry");
+       KKASSERT(cst->count == 0);
+       if (cst->state != CCMS_STATE_INVALID) {
+               /* XXX */
        }
-       cino->parent = cpar;
-       cino->flags |= CCMS_INODE_INSERTED;
-       spin_unlock(&cino->spin);
-       spin_unlock(&cpar->spin);
+       cst->handle = NULL;
 }
 
-#endif
-
+#if 0
 /*
- * Delete an inode from the topology.  The inode can remain in active use
- * after the deletion (e.g. when unlinking a file which still has open
- * descriptors) but it's topo_cst is removed from its parent.
+ * Acquire an operational CCMS lock on multiple CSTs.
  *
- * If the caller is destroying the ccms_inode the caller must call
- * ccms_inode_uninit() to invalidate the cache state (which can block).
+ * This code is in the critical path and highly streamlined.
  */
 void
-ccms_inode_disassociate(ccms_inode_t *cino)
+ccms_lock_get(ccms_lock_t *lock)
 {
-       ccms_inode_t *cpar;
-       ccms_cst_t *topo_cst;
-       int flags;
-
-       /*
-        * Interlock with the DELETING flag.
-        */
-       spin_lock(&cino->spin);
-       flags = cino->flags;
-       cino->flags |= CCMS_INODE_DELETING;
-       spin_unlock(&cino->spin);
-
-       if (flags & CCMS_INODE_DELETING)
-               return;
-       if ((flags & CCMS_INODE_INSERTED) == 0)
-               return;
-
-       /*
-        *
-        */
-       topo_cst = cino->topo_cst;
+       ccms_inode_t *cino = lock->cino;
 
-ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock)
+again:
+       lock->flags &= ~CCMS_LOCK_FAILED;
 
        /*
-        * We have the interlock, we are the only ones who can delete
-        * the inode now.
+        * Acquire all local locks first, then resolve them against the
+        * remote cache state.  Order is important here.
         */
-       cpar = cino->parent;
-       spin_lock(&cpar->spin);
-       spin_lock(&cino->spin);
-       KKASSERT(cpar == cino->parent);
-
-       cino->flags &= ~CCMS_INODE_INSERTED;
-       RB_REMOVE(ccms_rb_tree, &cpar->tree, &cino->topo_cst);
-
-       spin_unlock(&cino->spin);
-       spin_unlock(&cpar->spin);
-}
-
-/*
- * The caller has removed the inode from the topology and is now trying
- * to destroy the structure.  This routine flushes the cache state and
- * can block on third-party interactions.
- *
- * NOTE: Caller must have already destroyed any recursive inode state.
- */
-void
-ccms_inode_uninit(ccms_inode_t *cino)
-{
-       ccms_cst_t *scan;
-
-       KKASSERT((cino->flags & CCMS_INODE_INSERTED) == 0);
-       spin_lock(&cino->spin);
-
-       while ((scan = RB_ROOT(&cino->tree)) != NULL) {
-               KKASSERT(scan->flags & CCMS_CST_DYNAMIC);
-               KKASSERT((scan->flags & CCMS_CST_DELETING) == 0);
-               RB_REMOVE(ccms_rb_tree, &cino->tree, scan);
-               scan->flags |= CCMS_CST_DELETING;
-               scan->flags &= ~CCMS_CST_INSERTED;
-               spin_unlock(&cino->spin);
-
-               /*
-                * Inval can be called without the inode spinlock because
-                * we own the DELETING flag.
-                */
-               ccms_lstate_put(scan);
-               ccms_rstate_put(scan);
-               atomic_add_int(&cino->domain->cst_count, -1);
-
-               kfree(scan, cino->domain->mcst);
-               spin_lock(&cino->spin);
+       if (lock->req_t) {
+               KKASSERT(lock->req_d <= lock->req_t);
+               KKASSERT(lock->req_a <= lock->req_t);
+               ccms_thread_lock(&cino->topo_cst, lock->req_t);
        }
-       KKASSERT((cino->attr_cst.flags & CCMS_CST_DELETING) == 0);
-       cino->attr_cst.flags |= CCMS_CST_DELETING;
-       KKASSERT((cino->topo_cst.flags & CCMS_CST_DELETING) == 0);
-       cino->topo_cst.flags |= CCMS_CST_DELETING;
-       spin_unlock(&cino->spin);
-
-       /*
-        * Inval can be called without the inode spinlock because
-        * we own the DELETING flag.  Similarly we can clear cino->domain
-        * and cino->handle because we own the DELETING flag on the cino.
-        */
-       ccms_lstate_put(&cino->attr_cst);
-       ccms_rstate_put(&cino->attr_cst);
-       ccms_lstate_put(&cino->topo_cst);
-       ccms_rstate_put(&cino->topo_cst);
+       if (lock->req_a)
+               ccms_thread_lock(&cino->attr_cst, lock->req_a);
+       if (lock->req_d)
+               ccms_thread_lock(&cino->data_cst[0], lock->req_d);
 
        /*
-        * Clean out the ccms_inode free CST cache
+        * Once the local locks are established the CST grant state cannot
+        * be pulled out from under us.  However, it is entirely possible
+        * to deadlock on it, so when the CST grant state cannot be
+        * obtained trivially we have to unwind our local locks, then get
+        * the state, and then loop.
         */
-       spin_lock(&cino->spin);
-       scan = ccms_free_pass1(cino, 0);
-       spin_unlock(&cino->spin);
-       ccms_free_pass2(scan);
-
-       cino->domain = NULL;
-       cino->handle = NULL;
-}
-
-#endif
-
-/*
- * This is the core CCMS lock acquisition code and is typically called
- * by program-specific wrappers which initialize the lock structure.
- *
- * Three cache coherent domains can be obtained, the topological 't'
- * domain, the attribute 'a' domain, and a range in the data 'd' domain.
- *
- * A topological CCMS lock covers the entire attribute and data domain
- * plus recursively covers the entire directory sub-tree, so if a topo
- * lock is requested the other 'a' and 'd' locks currently assert if
- * specified in the same request.
- *
- * You can get both an 'a' and a 'd' lock at the same time and, in
- * particular, a VFS can use the 'a' lock to also lock the related
- * VFS inode structure if it desires to.  HAMMER2 utilizes this feature.
- *
- * Topo locks are typically needed for rename operations and topo CST
- * cache state on the backend can be used to limit the number of dynamic
- * CST allocations backing the live CCMS locks.
- */
-int
-ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock)
-{
-       struct ccms_lock_scan_info info;
-       ccms_cst_t *cst;
-       int use_redo = 0;
-       ccms_state_t highest_state;
-
-       /*
-        * Live local locks prevent remotes from downgrading the rstate,
-        * so we have to acquire a local lock before testing rstate.  If
-        *
-        * The local lock must be released if a remote upgrade is required
-        * to avoid a deadlock, and we retry in that situation.
-        */
-again:
-       if (lock->tstate) {
-               KKASSERT(lock->astate == 0 && lock->dstate == 0);
-               lock->icst = &cino->topo_cst;
-               ccms_lstate_get(lock->icst, lock->tstate);
-
-               if (cino->topo_cst.rstate < lock->tstate) {
-                       ccms_lstate_put(&cino->topo_cst);
-                       ccms_rstate_get(&cino->topo_cst, lock->tstate);
-                       goto again;
-               }
-       } else {
-               /*
-                * The topo rstate must be at least ALLOWED for us to be
-                * able to acquire any other cache state.  If the topo
-                * rstate is already higher than that then we may have
-                * to upgrade it further to cover the lstate's we are
-                * requesting.
-                */
-               highest_state = CCMS_STATE_ALLOWED;
-               if (cino->topo_cst.rstate > highest_state) {
-                       if (highest_state < lock->astate)
-                               highest_state = lock->astate;
-                       if (highest_state < lock->dstate)
-                               highest_state = lock->dstate;
-               }
-               if (cino->topo_cst.rstate < highest_state)
-                       ccms_rstate_get(&cino->topo_cst, highest_state);
-               /* no need to retry */
+       if (lock->req_t > cino->topo_cst.state) {
+               ccms_rstate_get(lock, &cino->topo_cst, lock->req_t);
+       } else if (cino->topo_cst.state == CCMS_STATE_INVALID) {
+               ccms_rstate_get(lock, &cino->topo_cst, CCMS_STATE_ALLOWED);
+       } else if (cino->topo_cst.state == CCMS_STATE_SHARED &&
+                   (lock->req_d > CCMS_STATE_SHARED ||
+                    lock->req_a > CCMS_STATE_SHARED)) {
+               ccms_rstate_get(lock, &cino->topo_cst, CCMS_STATE_ALLOWED);
        }
-       if (lock->astate) {
-               lock->icst = &cino->attr_cst;
-               ccms_lstate_get(lock->icst, lock->astate);
-
-               if (cino->attr_cst.rstate < lock->astate) {
-                       ccms_lstate_put(&cino->attr_cst);
-                       if (lock->tstate)
-                               ccms_lstate_put(&cino->topo_cst);
-                       ccms_rstate_get(&cino->attr_cst, lock->astate);
-                       goto again;
-               }
-       }
-
-       /*
-        * The data-lock is a range-lock and requires a bit more code.
-        * The CST space is partitioned so the precise range is covered.
-        *
-        * Multiple CST's may be involved and dcst points to the left hand
-        * edge.
-        */
-       if (lock->dstate) {
-               info.lock = lock;
-               info.cino = cino;
-               info.coll_cst = NULL;
+       /* else the rstate is compatible */
 
-               spin_lock(&cino->spin);
-
-               /*
-                * Make sure cino has enough free CSTs to cover the operation,
-                * so we can hold the spinlock through the scan later on.
-                */
-               while (cino->free_cache == NULL ||
-                      cino->free_cache->free_next == NULL) {
-                       spin_unlock(&cino->spin);
-                       cst = kmalloc(sizeof(*cst), cino->domain->mcst,
-                                     M_WAITOK | M_ZERO);
-                       atomic_add_int(&cino->domain->cst_count, 1);
-                       spin_lock(&cino->spin);
-                       cst->free_next = cino->free_cache;
-                       cino->free_cache = cst;
-               }
-
-               /*
-                * The partitioning code runs with the spinlock held.  If
-                * we've already partitioned due to having to do an rstate
-                * upgrade we run a redo instead of a get.
-                */
-               info.rstate_upgrade_needed = 0;
-               if (use_redo == 0) {
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_get_match, &info);
-               } else {
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_redo_match, &info);
-               }
-
-               /*
-                * If a collision occured, undo the fragments we were able
-                * to obtain, block, and try again.
-                */
-               while (info.coll_cst != NULL) {
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_undo_match, &info);
-                       info.coll_cst->blocked = 1;
-                       info.coll_cst = NULL;
-                       ssleep(info.coll_cst, &cino->spin, 0, "ccmsget", hz);
-                       info.rstate_upgrade_needed = 0;
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_redo_match, &info);
-               }
-
-               /*
-                * If the rstate needs to be upgraded we have to undo the
-                * local locks (but we retain the partitioning).
-                *
-                * Set use_redo to indicate that the partioning was retained
-                * (i.e. lrefs and rrefs remain intact).
-                */
-               if (info.rstate_upgrade_needed) {
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_undo_match, &info);
-                       spin_unlock(&cino->spin);
-                       if (lock->astate)
-                               ccms_lstate_put(&cino->attr_cst);
-                       if (lock->tstate)
-                               ccms_lstate_put(&cino->topo_cst);
-                       spin_lock(&cino->spin);
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_upgrade_match, &info);
-                       spin_unlock(&cino->spin);
-                       use_redo = 1;
-                       goto again;
-               }
+       if (lock->req_a > cino->attr_cst.state)
+               ccms_rstate_get(lock, &cino->attr_cst, lock->req_a);
 
-               /*
-                * Cleanup free CSTs beyond the 2 we wish to retain.
-                */
-               cst = ccms_free_pass1(cino, 2);
-               spin_unlock(&cino->spin);
-               ccms_free_pass2(cst);
-       }
+       if (lock->req_d > cino->data_cst[0].state)
+               ccms_rstate_get(lock, &cino->data_cst[0], lock->req_d);
 
        /*
-        * Ok, everything is in good shape EXCEPT we might not have
-        * sufficient topo_cst.rstate.  It could have gotten ripped
-        * out from under us.  Once we have the local locks it can
-        * no longer be downgraded so a check here suffices.
+        * If the ccms_rstate_get() code deadlocks (or even if it just
+        * blocks), it will release all local locks and set the FAILED
+        * bit.  The routine will still acquire the requested remote grants
+        * before returning but since the local locks are lost at that
+        * point the remote grants are no longer protected and we have to
+        * retry.
         */
-       highest_state = CCMS_STATE_ALLOWED;
-       if (highest_state < lock->tstate)
-               highest_state = lock->tstate;
-       if (highest_state < lock->astate)
-               highest_state = lock->astate;
-       if (highest_state < lock->dstate)
-               highest_state = lock->dstate;
-
-       if (cino->topo_cst.rstate < highest_state) {
-               if (lock->dstate) {
-                       spin_lock(&cino->spin);
-                       RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                               ccms_lock_put_match, &info);
-                       spin_unlock(&cino->spin);
-               }
-               if (lock->astate)
-                       ccms_lstate_put(&cino->attr_cst);
-               if (lock->tstate)
-                       ccms_lstate_put(&cino->topo_cst);
-               ccms_rstate_get(&cino->topo_cst, highest_state);
-               use_redo = 0;
+       if (lock->flags & CCMS_LOCK_FAILED) {
                goto again;
        }
-       return(0);
 }
 
 /*
- * Obtain a CCMS lock, initialize the lock structure based on the uio.
- *
- * Both the attribute AND a ranged-data lock is acquired.
+ * Release a previously acquired CCMS lock.
  */
-int
-ccms_lock_get_uio(ccms_inode_t *cino, ccms_lock_t *lock, struct uio *uio)
+void
+ccms_lock_put(ccms_lock_t *lock)
 {
-       ccms_state_t dstate;
-       ccms_off_t eoff;
-
-       if (uio->uio_rw == UIO_READ)
-               dstate = CCMS_STATE_SHARED;
-       else
-               dstate = CCMS_STATE_MODIFIED;
+       ccms_inode_t *cino = lock->cino;
 
-       /*
-        * Calculate the ending offset (byte inclusive), make sure a seek
-        * overflow does not blow us up.
-        */
-       eoff = uio->uio_offset + uio->uio_resid - 1;
-       if (eoff < uio->uio_offset)
-               eoff = 0x7FFFFFFFFFFFFFFFLL;
-       lock->beg_offset = uio->uio_offset;
-       lock->end_offset = eoff;
-       lock->tstate = 0;
-       lock->astate = dstate;
-       lock->dstate = dstate;
-       return (ccms_lock_get(cino, lock));
+       if (lock->req_d) {
+               ccms_thread_unlock(&cino->data_cst[0]);
+       }
+       if (lock->req_a) {
+               ccms_thread_unlock(&cino->attr_cst);
+       }
+       if (lock->req_t) {
+               ccms_thread_unlock(&cino->topo_cst);
+       }
 }
 
-/*
- * Obtain a CCMS lock.  Only the attribute lock is acquired.
- */
-int
-ccms_lock_get_attr(ccms_inode_t *cino, ccms_lock_t *lock, ccms_state_t astate)
-{
-       lock->tstate = 0;
-       lock->astate = astate;
-       lock->dstate = 0;
-       return (ccms_lock_get(cino, lock));
-}
+#endif
+
+/************************************************************************
+ *                         CST SUPPORT FUNCTIONS                       *
+ ************************************************************************/
 
 /*
- * Helper routine.
- *
- * NOTE: called with spinlock held.
+ * Acquire local cache state & lock.  If the current thread already holds
+ * the lock exclusively we bump the exclusive count, even if the thread is
+ * trying to get a shared lock.
  */
-static
-int
-ccms_lock_get_match(ccms_cst_t *cst, void *arg)
+void
+ccms_thread_lock(ccms_cst_t *cst, ccms_state_t state)
 {
-       struct ccms_lock_scan_info *info = arg;
-       ccms_lock_t *lock = info->lock;
-       ccms_cst_t *ncst;
-
-       /*
-        * If the lock's left edge is within the CST we must split the CST
-        * into two pieces [cst][ncst].  lrefs must be bumped on the CST
-        * containing the left edge.
-        *
-        * NOTE! cst->beg_offset may not be modified.  This allows us to
-        *       avoid having to manipulate the cst's position in the tree.
-        */
-       if (lock->beg_offset > cst->beg_offset) {
-               ncst = info->cino->free_cache;
-               info->cino->free_cache = ncst->free_next;
-               ncst->free_next = NULL;
-               KKASSERT(ncst != NULL);
-
-               *ncst = *cst;
-               cst->end_offset = lock->beg_offset - 1;
-               cst->rrefs = 0;
-               ncst->beg_offset = lock->beg_offset;
-               ncst->lrefs = 1;
-               RB_INSERT(ccms_rb_tree, &info->cino->tree, ncst);
-
-               /*
-                * ncst becomes our 'matching' cst.
-                */
-               cst = ncst;
-       } else if (lock->beg_offset == cst->beg_offset) {
-               ++cst->lrefs;
-       }
-
-       /*
-        * If the lock's right edge is within the CST we must split the CST
-        * into two pieces [cst][ncst].  rrefs must be bumped on the CST
-        * containing the right edge.
-        *
-        * NOTE! cst->beg_offset may not be modified.  This allows us to
-        * avoid having to manipulate the cst's position in the tree.
-        */
-       if (lock->end_offset < cst->end_offset) {
-               ncst = info->cino->free_cache;
-               info->cino->free_cache = ncst->free_next;
-               ncst->free_next = NULL;
-               KKASSERT(ncst != NULL);
-
-               *ncst = *cst;
-               cst->end_offset = lock->end_offset;
-               cst->rrefs = 1;
-               ncst->beg_offset = lock->end_offset + 1;
-               ncst->lrefs = 0;
-               RB_INSERT(ccms_rb_tree, &info->cino->tree, ncst);
-               /* cst remains our 'matching' cst */
-       } else if (lock->end_offset == cst->end_offset) {
-               ++cst->rrefs;
+       if (cst->count < 0 && cst->td == curthread) {
+               --cst->count;
+               return;
        }
 
-       /*
-        * The lock covers the CST, so increment the CST's coverage count.
-        * Then attempt to obtain the shared/exclusive lock.  The coverage
-        * count is maintained until the put operation.
-        */
-       ++cst->xrefs;
-       if (cst->lstate < lock->dstate)
-               cst->lstate = lock->dstate;
-
-       /*
-        * If we have already collided we make no more modifications
-        * to cst->count, but we must continue the scan to properly
-        * partition the cst.
-        */
-       if (info->coll_cst)
-               return(0);
-
-       switch(lock->dstate) {
-       case CCMS_STATE_INVALID:
-               break;
-       case CCMS_STATE_ALLOWED:
-       case CCMS_STATE_SHARED:
-       case CCMS_STATE_SLAVE:
-               if (cst->count < 0) {
-                       info->coll_cst = cst;
-               } else {
-                       ++cst->count;
-                       if (ccms_debug >= 9) {
-                               kprintf("CST SHARE %d %lld-%lld\n",
-                                       cst->count,
-                                       (long long)cst->beg_offset,
-                                       (long long)cst->end_offset);
-                       }
-               }
-               break;
-       case CCMS_STATE_MASTER:
-       case CCMS_STATE_EXCLUSIVE:
-               if (cst->count != 0) {
-                       info->coll_cst = cst;
-               } else {
-                       --cst->count;
-                       if (ccms_debug >= 9) {
-                               kprintf("CST EXCLS %d %lld-%lld\n",
-                                       cst->count,
-                                       (long long)cst->beg_offset,
-                                       (long long)cst->end_offset);
-                       }
+       spin_lock(&cst->spin);
+       if (state == CCMS_STATE_SHARED) {
+               while (cst->count < 0) {
+                       cst->blocked = 1;
+                       ssleep(cst, &cst->spin, 0, "ccmslck", hz);
                }
-               break;
-       case CCMS_STATE_MODIFIED:
-               if (cst->count != 0) {
-                       info->coll_cst = cst;
-               } else {
-                       --cst->count;
-                       if (cst->lstate <= CCMS_STATE_EXCLUSIVE)
-                               cst->lstate = CCMS_STATE_MODIFIED;
-                       if (ccms_debug >= 9) {
-                               kprintf("CST MODXL %d %lld-%lld\n",
-                                       cst->count,
-                                       (long long)cst->beg_offset,
-                                       (long long)cst->end_offset);
-                       }
+               ++cst->count;
+       } else if (state == CCMS_STATE_EXCLUSIVE) {
+               while (cst->count != 0) {
+                       cst->blocked = 1;
+                       ssleep(cst, &cst->spin, 0, "ccmslck", hz);
                }
-               break;
-       default:
-               panic("ccms_lock_get_match: bad state %d\n", lock->dstate);
-               break;
+               cst->count = -1;
+               cst->td = curthread;
+       } else {
+               spin_unlock(&cst->spin);
+               panic("ccms_thread_lock: bad state %d\n", state);
        }
-       return(0);
+       spin_unlock(&cst->spin);
 }
 
 /*
- * Undo a partially resolved ccms_ltype rangelock.  This is atomic with
- * the scan/redo code so there should not be any blocked locks when
- * transitioning to 0.  lrefs and rrefs are not touched in order to
- * retain the partitioning.
- *
- * If coll_cst is non-NULL we stop when we hit this element as locks on
- * no further elements were obtained.  This element might not represent
- * a left or right edge but coll_cst can only be non-NULL if the spinlock
- * was held throughout the get/redo and the undo.
- *
- * NOTE: called with spinlock held.
+ * Same as ccms_thread_lock() but acquires the lock without blocking.
+ * Returns 0 on success, EBUSY on failure.
  */
-static
 int
-ccms_lock_undo_match(ccms_cst_t *cst, void *arg)
+ccms_thread_lock_nonblock(ccms_cst_t *cst, ccms_state_t state)
 {
-       struct ccms_lock_scan_info *info = arg;
-       ccms_lock_t *lock = info->lock;
-
-       if (cst == info->coll_cst)
-               return(-1);
-
-       switch (lock->dstate) {
-       case CCMS_STATE_INVALID:
-               break;
-       case CCMS_STATE_ALLOWED:
-       case CCMS_STATE_SHARED:
-       case CCMS_STATE_SLAVE:
-               KKASSERT(cst->count > 0);
+       if (cst->count < 0 && cst->td == curthread) {
                --cst->count;
-               KKASSERT(cst->count || cst->blocked == 0);
-               break;
-       case CCMS_STATE_MASTER:
-       case CCMS_STATE_EXCLUSIVE:
-       case CCMS_STATE_MODIFIED:
-               KKASSERT(cst->count < 0);
-               ++cst->count;
-               KKASSERT(cst->count || cst->blocked == 0);
-               break;
-       default:
-               panic("ccms_lock_undo_match: bad state %d\n", lock->dstate);
-               break;
+               return(0);
        }
-       return(0);
-}
-
-/*
- * Redo the local lock request for a range which has already been
- * partitioned.
- *
- * NOTE: called with spinlock held.
- */
-static
-int
-ccms_lock_redo_match(ccms_cst_t *cst, void *arg)
-{
-       struct ccms_lock_scan_info *info = arg;
-       ccms_lock_t *lock = info->lock;
 
-       KKASSERT(info->coll_cst == NULL);
-
-       switch(lock->dstate) {
-       case CCMS_STATE_INVALID:
-               break;
-       case CCMS_STATE_ALLOWED:
-       case CCMS_STATE_SHARED:
-       case CCMS_STATE_SLAVE:
+       spin_lock(&cst->spin);
+       if (state == CCMS_STATE_SHARED) {
                if (cst->count < 0) {
-                       info->coll_cst = cst;
-               } else {
-                       if (ccms_debug >= 9) {
-                               kprintf("CST SHARE %d %lld-%lld\n",
-                                       cst->count,
-                                       (long long)cst->beg_offset,
-                                       (long long)cst->end_offset);
-                       }
-                       ++cst->count;
+                       spin_unlock(&cst->spin);
+                       return (EBUSY);
                }
-               break;
-       case CCMS_STATE_MASTER:
-       case CCMS_STATE_EXCLUSIVE:
-               if (cst->count != 0) {
-                       info->coll_cst = cst;
-               } else {
-                       --cst->count;
-                       if (ccms_debug >= 9) {
-                               kprintf("CST EXCLS %d %lld-%lld\n",
-                                       cst->count,
-                                       (long long)cst->beg_offset,
-                                       (long long)cst->end_offset);
-                       }
-               }
-               break;
-       case CCMS_STATE_MODIFIED:
+               ++cst->count;
+       } else if (state == CCMS_STATE_EXCLUSIVE) {
                if (cst->count != 0) {
-                       info->coll_cst = cst;
-               } else {
-                       --cst->count;
-                       if (ccms_debug >= 9) {
-                               kprintf("CST MODXL %d %lld-%lld\n",
-                                       cst->count,
-                                       (long long)cst->beg_offset,
-                                       (long long)cst->end_offset);
-                       }
+                       spin_unlock(&cst->spin);
+                       return (EBUSY);
                }
-               break;
-       default:
-               panic("ccms_lock_redo_match: bad state %d\n", lock->dstate);
-               break;
-       }
-
-       if (info->coll_cst)
-               return(-1);     /* stop the scan */
-       return(0);              /* continue the scan */
-}
-
-/*
- * Upgrade the rstate for the matching range.
- *
- * NOTE: Called with spinlock held.
- */
-static
-int
-ccms_lock_upgrade_match(ccms_cst_t *cst, void *arg)
-{
-       struct ccms_lock_scan_info *info = arg;
-       ccms_lock_t *lock = info->lock;
-
-       /*
-        * ccms_rstate_get() can block so we must release the spinlock.
-        * To prevent the cst from getting ripped out on us we temporarily
-        * bump both lrefs and rrefs.
-        */
-       if (cst->rstate < lock->dstate) {
-               ++cst->lrefs;
-               ++cst->rrefs;
-               spin_unlock(&info->cino->spin);
-               ccms_rstate_get(cst, lock->dstate);
-               spin_lock(&info->cino->spin);
-               --cst->lrefs;
-               --cst->rrefs;
-       }
-       return(0);
-}
-
-/*
- * Release a previously acquired CCMS lock.
- */
-int
-ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock)
-{
-       struct ccms_lock_scan_info info;
-       ccms_cst_t *scan;
-
-       if (lock->tstate) {
-               ccms_lstate_put(lock->icst);
-               lock->tstate = 0;
-               lock->icst = NULL;
-       } else if (lock->astate) {
-               ccms_lstate_put(lock->icst);
-               lock->astate = 0;
-               lock->icst = NULL;
-       }
-
-       if (lock->dstate) {
-               info.lock = lock;
-               info.cino = cino;
-               spin_lock(&cino->spin);
-               RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
-                       ccms_lock_put_match, &info);
-               scan = ccms_free_pass1(cino, 2);
-               spin_unlock(&cino->spin);
-               ccms_free_pass2(scan);
-               lock->dstate = 0;
-               lock->dcst = NULL;
+               cst->count = -1;
+               cst->td = curthread;
+       } else {
+               spin_unlock(&cst->spin);
+               panic("ccms_thread_lock_nonblock: bad state %d\n", state);
        }
-
+       spin_unlock(&cst->spin);
        return(0);
 }
 
 /*
- * Release a local lock.  The related CST's lstate is set to INVALID once
- * the coverage drops to 0 and adjacent compatible entries will be
- * recombined.
- *
- * NOTE: called with spinlock held.
+ * Release a local thread lock.
  */
-static
-int
-ccms_lock_put_match(ccms_cst_t *cst, void *arg)
+void
+ccms_thread_unlock(ccms_cst_t *cst)
 {
-       struct ccms_lock_scan_info *info = arg;
-       ccms_lock_t *lock = info->lock;
-       ccms_cst_t *ocst;
-
-       /*
-        * Undo the local shared/exclusive rangelock.
-        */
-       switch(lock->dstate) {
-       case CCMS_STATE_INVALID:
-               break;
-       case CCMS_STATE_ALLOWED:
-       case CCMS_STATE_SHARED:
-       case CCMS_STATE_SLAVE:
-               KKASSERT(cst->count > 0);
-               --cst->count;
-               if (ccms_debug >= 9) {
-                       kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->count,
-                               (long long)cst->beg_offset,
-                               (long long)cst->end_offset,
-                               cst->blocked);
+       if (cst->count < 0) {
+               if (cst->count < -1) {
+                       ++cst->count;
+                       return;
                }
-               if (cst->blocked && cst->count == 0) {
+               spin_lock(&cst->spin);
+               KKASSERT(cst->count == -1);
+               cst->count = 0;
+               cst->td = NULL;
+               if (cst->blocked) {
                        cst->blocked = 0;
+                       spin_unlock(&cst->spin);
                        wakeup(cst);
+                       return;
                }
-               break;
-       case CCMS_STATE_MASTER:
-       case CCMS_STATE_EXCLUSIVE:
-       case CCMS_STATE_MODIFIED:
-               KKASSERT(cst->count < 0);
-               ++cst->count;
-               if (ccms_debug >= 9) {
-                       kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->count,
-                               (long long)cst->beg_offset,
-                               (long long)cst->end_offset,
-                               cst->blocked);
-               }
-               if (cst->blocked && cst->count == 0) {
+               spin_unlock(&cst->spin);
+       } else if (cst->count > 0) {
+               spin_lock(&cst->spin);
+               if (--cst->count == 0 && cst->blocked) {
                        cst->blocked = 0;
+                       spin_unlock(&cst->spin);
                        wakeup(cst);
+                       return;
                }
-               break;
-       default:
-               panic("ccms_lock_put_match: bad state %d\n", lock->dstate);
-               break;
-       }
-
-       /*
-        * Decrement the lock coverage count on the CST.  Decrement the left
-        * and right edge counts as appropriate.
-        *
-        * When lrefs or rrefs drops to zero we check the adjacent entry to
-        * determine whether a merge is possible.  If the appropriate refs
-        * field (rrefs for the entry to our left, lrefs for the entry to
-        * our right) is 0, then all covering locks must cover both entries
-        * and the xrefs field must match.  We can then merge the entries
-        * if they have compatible cache states.
-        *
-        * However, because we are cleaning up the shared/exclusive count
-        * at the same time, the count field may be temporarily out of
-        * sync, so require that the count field also match before doing
-        * a merge.
-        *
-        * When merging an element which is being blocked on, the blocking
-        * thread(s) will be woken up.
-        *
-        * If the dataspace has too many CSTs we may be able to merge the
-        * entries even if their cache states are not the same, by dropping
-        * both to a compatible (lower) cache state and performing the
-        * appropriate management operations.  XXX
-        */
-       if (--cst->xrefs == 0)
-               cst->lstate = CCMS_STATE_INVALID;
-
-       if (lock->beg_offset == cst->beg_offset && --cst->lrefs == 0) {
-               if ((ocst = RB_PREV(ccms_rb_tree,
-                                   &info->cino->tree, cst)) != NULL &&
-                   ocst->rrefs == 0 &&
-                   ocst->lstate == cst->lstate &&
-                   ocst->rstate == cst->rstate &&
-                   ocst->count == cst->count
-               ) {
-                       KKASSERT(ocst->xrefs == cst->xrefs);
-                       KKASSERT(ocst->end_offset + 1 == cst->beg_offset);
-                       RB_REMOVE(ccms_rb_tree, &info->cino->tree, ocst);
-                       cst->beg_offset = ocst->beg_offset;
-                       cst->lrefs = ocst->lrefs;
-                       if (ccms_debug >= 9) {
-                               kprintf("MERGELEFT %p %lld-%lld (%d)\n",
-                                      ocst,
-                                      (long long)cst->beg_offset,
-                                      (long long)cst->end_offset,
-                                      cst->blocked);
-                       }
-                       if (ocst->blocked) {
-                               ocst->blocked = 0;
-                               wakeup(ocst);
-                       }
-                       ocst->free_next = info->cino->free_cache;
-                       info->cino->free_cache = ocst;
-               }
-       }
-       if (lock->end_offset == cst->end_offset && --cst->rrefs == 0) {
-               if ((ocst = RB_NEXT(ccms_rb_tree,
-                                   &info->cino->tree, cst)) != NULL &&
-                   ocst->lrefs == 0 &&
-                   ocst->lstate == cst->lstate &&
-                   ocst->rstate == cst->rstate &&
-                   ocst->count == cst->count
-               ) {
-                       KKASSERT(ocst->xrefs == cst->xrefs);
-                       KKASSERT(cst->end_offset + 1 == ocst->beg_offset);
-                       RB_REMOVE(ccms_rb_tree, &info->cino->tree, ocst);
-                       cst->end_offset = ocst->end_offset;
-                       cst->rrefs = ocst->rrefs;
-                       if (ccms_debug >= 9) {
-                               kprintf("MERGERIGHT %p %lld-%lld\n",
-                                      ocst,
-                                      (long long)cst->beg_offset,
-                                      (long long)cst->end_offset);
-                       }
-                       ocst->free_next = info->cino->free_cache;
-                       info->cino->free_cache = ocst;
-               }
+               spin_unlock(&cst->spin);
+       } else {
+               panic("ccms_thread_unlock: bad zero count\n");
        }
-       return(0);
 }
 
 /*
- * RB tree compare function for insertions and deletions.  This function
- * compares two CSTs.
- */
-static int
-ccms_cst_cmp(ccms_cst_t *b1, ccms_cst_t *b2)
-{
-       if (b1->end_offset < b2->beg_offset)
-               return(-1);
-       if (b1->beg_offset > b2->end_offset)
-               return(1);
-       return(0);
-}
-
-/*
- * RB tree scanning compare function.  This function compares the CST
- * from the tree against the supplied ccms_lock and returns the CST's
- * placement relative to the lock.
- */
-static int
-ccms_lock_scan_cmp(ccms_cst_t *cst, void *arg)
-{
-       struct ccms_lock_scan_info *info = arg;
-       ccms_lock_t *lock = info->lock;
-
-       if (cst->end_offset < lock->beg_offset)
-               return(-1);
-       if (cst->beg_offset > lock->end_offset)
-               return(1);
-       return(0);
-}
-
-/************************************************************************
- *             STANDALONE LSTATE AND RSTATE SUPPORT FUNCTIONS          *
- ************************************************************************
+ * Release a local thread lock with special handling of the last lock
+ * reference.
+ *
+ * On the last lock reference the lock, if shared, will be upgraded to
+ * an exclusive lock and we return 0 without unlocking it.
  *
- * These functions are used to perform work on the attr_cst and topo_cst
- * embedded in a ccms_inode, and to issue remote state operations.  These
- * functions are called without the ccms_inode spinlock held.
+ * If more than one reference remains we drop the reference and return
+ * non-zero.
  */
-
-static
-void
-ccms_lstate_get(ccms_cst_t *cst, ccms_state_t state)
-{
-       int blocked;
-
-       spin_lock(&cst->cino->spin);
-       ++cst->xrefs;
-
-       for (;;) {
-               blocked = 0;
-
-               switch(state) {
-               case CCMS_STATE_INVALID:
-                       break;
-               case CCMS_STATE_ALLOWED:
-               case CCMS_STATE_SHARED:
-               case CCMS_STATE_SLAVE:
-                       if (cst->count < 0) {
-                               blocked = 1;
-                       } else {
-                               ++cst->count;
-                               if (ccms_debug >= 9) {
-                                       kprintf("CST SHARE %d %lld-%lld\n",
-                                               cst->count,
-                                               (long long)cst->beg_offset,
-                                               (long long)cst->end_offset);
-                               }
-                       }
-                       break;
-               case CCMS_STATE_MASTER:
-               case CCMS_STATE_EXCLUSIVE:
-                       if (cst->count != 0) {
-                               blocked = 1;
-                       } else {
-                               --cst->count;
-                               if (ccms_debug >= 9) {
-                                       kprintf("CST EXCLS %d %lld-%lld\n",
-                                               cst->count,
-                                               (long long)cst->beg_offset,
-                                               (long long)cst->end_offset);
-                               }
-                       }
-                       break;
-               case CCMS_STATE_MODIFIED:
-                       if (cst->count != 0) {
-                               blocked = 1;
-                       } else {
-                               --cst->count;
-                               if (cst->lstate <= CCMS_STATE_EXCLUSIVE)
-                                       cst->lstate = CCMS_STATE_MODIFIED;
-                               if (ccms_debug >= 9) {
-                                       kprintf("CST MODXL %d %lld-%lld\n",
-                                               cst->count,
-                                               (long long)cst->beg_offset,
-                                               (long long)cst->end_offset);
-                               }
-                       }
-                       break;
-               default:
-                       panic("ccms_lock_get_match: bad state %d\n", state);
-                       break;
-               }
-               if (blocked == 0)
-                       break;
-               ssleep(cst, &cst->cino->spin, 0, "ccmslget", hz);
-       }
-       if (cst->lstate < state)
-               cst->lstate = state;
-       spin_unlock(&cst->cino->spin);
-}
-
-static
-void
-ccms_lstate_put(ccms_cst_t *cst)
+int
+ccms_thread_unlock_zero(ccms_cst_t *cst)
 {
-       spin_lock(&cst->cino->spin);
-
-       switch(cst->lstate) {
-       case CCMS_STATE_INVALID:
-               break;
-       case CCMS_STATE_ALLOWED:
-       case CCMS_STATE_SHARED:
-       case CCMS_STATE_SLAVE:
-               KKASSERT(cst->count > 0);
-               --cst->count;
-               if (ccms_debug >= 9) {
-                       kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->count,
-                               (long long)cst->beg_offset,
-                               (long long)cst->end_offset,
-                               cst->blocked);
-               }
-               if (cst->blocked && cst->count == 0) {
-                       cst->blocked = 0;
-                       wakeup(cst);
-               }
-               break;
-       case CCMS_STATE_MASTER:
-       case CCMS_STATE_EXCLUSIVE:
-       case CCMS_STATE_MODIFIED:
-               KKASSERT(cst->count < 0);
+       if (cst->count < 0) {
+               if (cst->count == -1)
+                       return(0);
                ++cst->count;
-               if (ccms_debug >= 9) {
-                       kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->count,
-                               (long long)cst->beg_offset,
-                               (long long)cst->end_offset,
-                               cst->blocked);
-               }
-               if (cst->blocked && cst->count == 0) {
-                       cst->blocked = 0;
-                       wakeup(cst);
+       } else {
+               KKASSERT(cst->count > 0);
+               spin_lock(&cst->spin);
+               if (cst->count == 1) {
+                       cst->count = -1;
+                       cst->td = curthread;
+                       spin_unlock(&cst->spin);
+                       return(0);
                }
-               break;
-       default:
-               panic("ccms_lock_put_match: bad state %d\n", cst->lstate);
-               break;
+               --cst->count;
+               spin_unlock(&cst->spin);
        }
-
-       if (--cst->xrefs == 0)
-               cst->lstate = CCMS_STATE_INVALID;
-       spin_unlock(&cst->cino->spin);
+       return(1);
 }
 
+#if 0
 /*
- * XXX third-party interaction & granularity
+ * Acquire remote grant state.  This routine can be used to upgrade or
+ * downgrade the state.  If it blocks it will release any local locks
+ * acquired via (lock) but then it will continue getting the requested
+ * remote grant.
  */
 static
 void
-ccms_rstate_get(ccms_cst_t *cst, ccms_state_t state)
+ccms_rstate_get(ccms_lock_t *lock, ccms_cst_t *cst, ccms_state_t state)
 {
-       spin_lock(&cst->cino->spin);
-       if (cst->rstate < state)
-               cst->rstate = state;
-       spin_unlock(&cst->cino->spin);
+       /* XXX */
+       cst->state = state;
 }
 
-/*
- * XXX third-party interaction & granularity
- */
-static
-void
-ccms_rstate_put(ccms_cst_t *cst)
-{
-       spin_lock(&cst->cino->spin);
-       cst->rstate = CCMS_STATE_INVALID;
-       spin_unlock(&cst->cino->spin);
-}
+#endif
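
A usage sketch for the CST support functions above (the caller is
hypothetical; the lock semantics follow the code in this commit): shared
holders count cst->count upward, an exclusive holder sets it to -1 and
records curthread in cst->td, and the exclusive owner may recurse even
when asking for a shared lock:

/*
 * Hypothetical caller of the CST support functions.
 */
static void
example_cst_usage(ccms_cst_t *cst)
{
	ccms_thread_lock(cst, CCMS_STATE_SHARED);	/* count 0 -> 1 */
	ccms_thread_unlock(cst);			/* count 1 -> 0 */

	ccms_thread_lock(cst, CCMS_STATE_EXCLUSIVE);	/* count 0 -> -1 */
	ccms_thread_lock(cst, CCMS_STATE_SHARED);	/* recursion: -1 -> -2 */
	ccms_thread_unlock(cst);			/* -2 -> -1 */

	if (ccms_thread_lock_nonblock(cst, CCMS_STATE_EXCLUSIVE) == 0)
		ccms_thread_unlock(cst);		/* -2 -> -1 */
	ccms_thread_unlock(cst);			/* -1 -> 0, may wakeup */
}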
index c677d36..510f13a 100644
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+
 /*
- * This module is HAMMER2-independent.
- *
- * CCMS - Cache Coherency Management System.  These structures are used
- * to manage cache coherency and locking for an object.
- *
- *                             ccms_inode
- *
- * Cache coherency is tied into a kernel or VFS structure, creating a
- * directory/file topology and a keyspace on an inode-by-inode basis
- * via the (ccms_inode) structure.
- *
- * Each CCMS inode contains a RB-Tree holding ccms_cst (CST) elements
- * for its file range or directory key range, plus two independent embedded
- * ccms_cst structures representing the inode attributes and the entire
- * recursive sub-tree.
- *
- * The CST representing the entire sub-tree is inclusive of that inode's
- * attribute state and data/key range state AND inclusive of the entire
- * filesystem topology under that point, recursively.
- *
- * Two ccms_cst's are embedded in each cached inode via the ccms_inode
- * structure to represent attribute and recursive topological cache state.
+ * CCMS - Cache Coherency Management System.
  *
- *                              ccms_cst
+ * This subsystem can be tied into a VFS in order to supply persistent
+ * cache management state for clustered or remote cache-coherent operations.
  *
- * The (ccms_cst) structure, called the CST, represents specific, persistent
- * cache state.  This structure is allocated and freed on the fly as needed
- * (except for the two embedded in the ccms_inode).
+ * Local and cluster/remote cache state is maintained in a cache-coherent
+ * fashion and is integrated into the VFS's inode locking subsystem
+ * (as a means of avoiding deadlocks).
  *
- * The persistence ties into network/cluster operations via the 'rstate'
- * field.  When cluster-maintained state is present then certain operations
- * on the CST's local state (including when a vnode is reclaimed) will
- * block while third-party synchronization occurs.
+ * To operate properly the VFS must maintain a complete directory topology
+ * leading to any given vnode/inode either open or cached by the system.
+ * The vnode/namecache subsystem does not have to implement this but the
+ * VFS (aka HAMMER2) does.
  *
- * The number of dynamically allocated CSTs is strictly limited, forcing
- * a degree of aggregation when the limit is reached.
+ * The filesystem embeds CCMS_CST structures in its internal inode
+ * representation as needed and implements callbacks to allow CCMS to
+ * do topological recursions.
  *
- *                              ccms_lock
+ * --
  *
- * The (ccms_lock) structure represents a live local lock for the duration of
- * any given filesystem operation.  A single ccms_lock can cover both
- * attribute state AND a byte-range/key-range.
+ * The CCMS_CST structures represent granted cache and local locking states.
+ * Grants can be recursively inherited, minimizing protocol overhead in
+ * situations where there are no conflicts of interest.
  *
- * This lock represents the exact lock being requested but the CST structure
- * it points to can be a more general representation which covers the lock.
- * The minimum granularity for the cst pointer in the ccms_lock will be to
- * the ccms_inode's embedded topo_cst.
+ * --
  *
- * Theoretically a single CST at the root can cover the entire filesystem,
- * but this creates a great deal of SMP interaction.
- *
- *                                Management
- *
- * Because cache state is persistent the CCMS module may desire to limit the
- * total number of CSTs under management.  It does this by aggregating cache
- * state which in turn may require callbacks to invalidate third-party
- * (cluster-related) cache state.
- *
- * CCMS operations related to locks can stall on third-party state
- * transitions.  Because third-party state can also change independently
- * due to foreign interactions (often with a userland program), no filesystem
- * lock can be held while manipulating CST states.  For this reason,
- * HAMMER2 (or any VFS using CCMS) must provide roll-up functions to acquire
- * CCMS lock state up-front prior to locking the VFS inode structure.
- *
- * vnode locks which are under the control of the filesystem can be more
- * problematic and may require additional care.
+ * CCMS supports active front-end 'locks' on data objects utilizing the
+ * ccms_inode, key, and desired cache state.  It can grant the lock based
+ * on inherited CST state and prevents downgrading of the CST by other
+ * parties or threads while the lock is held.  The CSTs are arranged
+ * within the embedded CCMS_INODE and the lock ref-counts the related CST.
  */
 
 #ifndef _SYS_CCMS_H_
 #ifndef _SYS_SPINLOCK_H_
 #include <sys/spinlock.h>
 #endif
-#ifndef _SYS_TREE_H_
-#include <sys/tree.h>
-#endif
 
-typedef uint64_t       ccms_off_t;
+typedef uint64_t       ccms_key_t;
+typedef uint64_t       ccms_tid_t;
 typedef uint8_t                ccms_state_t;
+typedef uint8_t                ccms_type_t;
 
-/*
- * CCMS uses a red-black tree to organize CSTs.
- */
-RB_HEAD(ccms_rb_tree, ccms_cst);
-RB_PROTOTYPE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp, ccms_off_t);
-
-struct ccms_inode;
 struct ccms_cst;
 struct ccms_lock;
 
 /*
- * CCMS cache states
+ * CCMS_STATE_T - CCMS cache states.
  *
- * CCMS uses an extended MESI caching model.  There are two extension states,
- * MASTER and SLAVE, which represents dirty data which has not been
- * synchronized to backing store but which nevertheless is being shared
- * between distinct caches.   These states are designed to allow data
- * to be shared between nodes in a cluster without having to wait for it
- * to be synchronized with its backing store.
+ * INVALID   - Cache state is unknown and must be acquired.
  *
- * Each CST has lstate and rstate.  lstate is the local cache state and rstate
- * is the remotely-granted state.  Changes to the lstate require a compatible
- * rstate.  If the rstate is not compatible a third-party transaction is
- * required to obtain the proper rstate.
+ * ALLOWED   -  Cache state allows any recursive state to be acquired.
  *
- * INVALID   - Cache state is unknown and must be acquired.
+ * SHARED    - Cache state allows shared access.  If this is a topo_cst,
+ *             only INVALID or SHARED recursive states are allowed.
  *
- * ALLOWED   -  (topo_cst.rstate only).  This is a granted state which
- *             allows cache state transactions underneath the current
- *             node (data, attribute, and recursively), but is not a proper
- *             grant for topo_cst itself.  Someone specifically trying to
- *             acquire topo_cst still needs to do a third party transaction
- *             to get the cache into the proper state.
+ * EXCLUSIVE -  Cache state allows exclusive access.  If this is a
+ *             topo_cst then INVALID, SHARED, or EXCLUSIVE recursive
+ *             state is allowed.
  *
- * SHARED    -  Indicates that the information is clean, shared, read-only.
+ * CCMS implements an extended MESI model.  The extensions are implemented
+ * as CCMS_TYPE_T flags.
+ */
+#define CCMS_STATE_INVALID     0       /* unknown cache state */
+#define CCMS_STATE_ALLOWED     1       /* allow subsystem (topo only) */
+#define CCMS_STATE_SHARED      2       /* clean, shared, read-only */
+#define CCMS_STATE_EXCLUSIVE   3       /* clean, exclusive, read-only */
+
+/*
+ * CCMS_TYPE_T FLAGS
  *
- * SLAVE     -  Indicates that the information is clean, shared, read-only.
- *             Indicates that local backing store is out of date but the
- *             in-memory cache is valid, meaning that we can only obtain
- *             the data from the MASTER (somewhere in the cluster), and
- *             that we may not be allowed to sync it to local backing
- *             store yet e.g. due to the quorum protocol not having
- *             completed.
+ * INHERITED -  Indicates the state field was inherited and was not directly
+ *             granted by the cluster controller.
  *
- * MASTER    -  Indicates that the information is dirty, but readonly
- *             because other nodes in the cluster are in a SLAVE state.
- *             This state is typically transitional and occurs while
- *             a quorum operation is in progress, allowing slaves to
- *             access the data without stalling.
+ * MODIFIED  -  This is a type-field flag associated with an EXCLUSIVE cache
+ *             state, indicating that the cached data is dirty (read-write).
  *
- * EXCLUSIVE - Indicates that the information is clean, read-only, and
- *             that nobody else can access the data while we are in this
- *             state.  A local node can upgrade both rstate and lstate
- *             from EXCLUSIVE to MODIFIED without having to perform a
- *             third-party transaction.
+ * MASTER    -  This is a type-field flag associated with an EXCLUSIVE+MODIFIED
+ *             cache state which indicates that slaves might be present
+ *             which are caching our unsynchronized state.
  *
- * MODIFIED  -  Indicates that the information is dirty, read-write, and
- *             that nobody else can access the data while we are in this
- *             state.
+ * SLAVE     -  This is a type-field flag associated with the SHARED cache
+ *             state which indicates that the data present in our memory
+ *             caches is being mastered elsewhere and has not been
+ *             synchronized (meaning no quorum protocol has been run to
+ *             sync the data yet).  Thus only the copy of the data in
+ *             our memory caches and at its originator is valid.
  *
- * It is important to note that remote cache-state grants can be more
- * general than what was requested, plus they can be persistent.  So,
- * for example, a remote can grant EXCLUSIVE access even if you just
- * requested SHARED, which saves you from having to do another network
- * transaction if you later need EXCLUSIVE.
+ * QSLAVE    -  This indicates that the slaved data is also present in the
+ *             memory caches of a quorum of master nodes.
  */
-
-#define CCMS_STATE_INVALID     0       /* unknown cache state */
-#define CCMS_STATE_ALLOWED     1       /* allow subsystem (topo only) */
-#define CCMS_STATE_SHARED      2       /* clean, shared, read-only */
-#define CCMS_STATE_SLAVE       3       /* live only, shared, read-only */
-#define CCMS_STATE_MASTER      4       /* dirty, shared, read-only */
-#define CCMS_STATE_EXCLUSIVE   5       /* clean, exclusive, read-only */
-#define CCMS_STATE_MODIFIED    6       /* dirty, exclusive, read-write */
+#define CCMS_TYPE_INHERITED    0x01
+#define CCMS_TYPE_MODIFIED     0x02
+#define CCMS_TYPE_MASTER       0x04
+#define CCMS_TYPE_SLAVE                0x08
+#define CCMS_TYPE_QSLAVE       0x10
+#define CCMS_TYPE_RECURSIVE    0x80
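
To illustrate how the state byte and the type flags are meant to compose,
a couple of hypothetical predicates (not in this patch; in practice they
would sit after the ccms_cst structure definition):

	/*
	 * EXCLUSIVE + MODIFIED means the local cache holds dirty data;
	 * adding MASTER means unsynchronized SLAVE copies may exist
	 * elsewhere in the cluster.
	 */
	static __inline int
	ccms_is_dirty(ccms_state_t state, ccms_type_t type)
	{
		return (state == CCMS_STATE_EXCLUSIVE &&
			(type & CCMS_TYPE_MODIFIED) != 0);
	}

	static __inline int
	ccms_is_slaved(ccms_state_t state, ccms_type_t type)
	{
		return (state == CCMS_STATE_SHARED &&
			(type & CCMS_TYPE_SLAVE) != 0);
	}
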
 
 /*
- * A CCMS locking element - represents a high level locking request,
- * such as used by read, write, and attribute operations.  Initialize
- * the ccms_lock structure and call ccms_lock_get().
+ * CCMS_LOCK - High level active lock
+ *
+ * This represents a high level locking request, such as used by
+ * read, write, and attribute operations.  Initialize the ccms_lock
+ * structure and call ccms_lock_get().
  *
  * When a CCMS lock is established the cache state of the underlying elements
  * is adjusted to meet the requirements of the lock.  The cache state
@@ -211,126 +157,62 @@ struct ccms_lock;
  *
  * CCMS data locks imply a shared CCMS inode lock.  A CCMS topology lock does
  * not imply a data or inode lock but topology locks can have far-reaching
- * effects and block on numerous CST state.
+ * effects such as blocking ccms_locks on multiple inodes.
  */
 struct ccms_lock {
-       ccms_state_t    tstate;
-       ccms_state_t    astate;
-       ccms_state_t    dstate;
-       ccms_off_t      beg_offset;     /* applies to dstate */
-       ccms_off_t      end_offset;     /* applies to dstate */
-       struct ccms_cst *icst;          /* points to topo_cst or attr_cst */
-       struct ccms_cst *dcst;          /* points to left edge in rbtree */
-#ifdef CCMS_DEBUG
        TAILQ_ENTRY(ccms_lock) entry;
-#endif
+       ccms_state_t    req_t;
+       ccms_state_t    req_a;
+       ccms_state_t    req_d;
+       uint8_t         flags;
+       struct ccms_cst *topo_cst;
+       struct ccms_cst *attr_cst;
+       struct ccms_cst *data_cst;
+       ccms_key_t      key_beg;        /* applies to dstate */
+       ccms_key_t      key_end;        /* applies to dstate */
 };
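
A hedged usage sketch (not part of this patch): the caller stacks a
ccms_lock, fills in the requested states and key range, and brackets the
operation with ccms_lock_get()/ccms_lock_put() declared at the end of this
header.  How the *_cst pointers are resolved is filesystem-specific and is
assumed here.

	static void
	myfs_read_range(ccms_cst_t *attr_cst, ccms_cst_t *data_cst,
			ccms_key_t beg, ccms_key_t end)
	{
		struct ccms_lock lock;

		bzero(&lock, sizeof(lock));	/* req_t 0 = no topo request */
		lock.req_a = CCMS_STATE_SHARED;	/* attribute state */
		lock.req_d = CCMS_STATE_SHARED;	/* data state */
		lock.key_beg = beg;		/* inclusive */
		lock.key_end = end;		/* inclusive */
		lock.attr_cst = attr_cst;	/* assumed pre-resolved */
		lock.data_cst = data_cst;

		ccms_lock_get(&lock);
		/* ... perform the read ... */
		ccms_lock_put(&lock);
	}
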
 
-#ifdef CCMS_DEBUG
-
-TAILQ_HEAD(ccms_lock_head, ccms_lock);
-
-#endif
+#define CCMS_LOCK_FAILED       0x01
 
 /*
- * CCMS cache state tree element (CST) - represents the actual cache
- * management state for a data space.  The cache state tree is a
- * non-overlaping red-black tree containing ranged ccms_cst structures
- * which reflect the resolved state for all current high level locking
- * requests.  For example, two overlapping ccms_lock requests for shared
- * access would typically be represented by three non-overlapping ccms_cst
- * items in the CST.  The CST item representing the overlapped portion of
- * the ccms_lock requests would have ref count of 2 while the other CST
- * items would have a ref count of 1.
- *
- *     [lock request #01]
- *              [lock request #02]
- *     [--cst--][--cst--][--cst--]
+ * CCMS_CST - Low level locking state, persistent cache state
  *
- * CSTs are partitioned so their edges line up to all current and pending
- * ccms_lock requests.  CSTs are re-merged whenever possible.  A freshly
- * initialized database typically has a single CST representing the default
- * cache state for the host.
- *
- * A CST keeps track of local cache state (lstate) AND remote cache state
- * (rstate).
- *
- * Any arbitrary data range within a dataspace can be locked shared or
- * exclusive.  Obtaining a lock has the side effect of potentially modifying
- * the cache state.  A positive sharecount in a CST indicates that a
- * shared access lock is being held.  A negative sharecount indicates an
- * exclusive access lock is being held on the range.  A MODIFYING lock
- * type is just an exclusive lock but one which effects the cache state
- * differently.
- *
- * The end offset is byte-inclusive, allowing the entire 64 bit data space
+ * Offset ranges are byte-inclusive, allowing the entire 64 bit data space
  * to be represented without overflowing the edge case.  For example, a
  * 64 byte area might be represented as (0,63).  The offsets are UNSIGNED
  * entities.
- */
-struct ccms_cst {
-       RB_ENTRY(ccms_cst) rbnode;      /* stored in a red-black tree */
-       struct ccms_cst *free_next;     /* free cache linked list */
-       struct ccms_inode *cino;        /* related ccms_inode */
-       ccms_off_t beg_offset;          /* range (inclusive) */
-       ccms_off_t end_offset;          /* range (inclusive) */
-       ccms_state_t lstate;            /* local cache state */
-       ccms_state_t rstate;            /* cache state granted by protocol */
-
-       int32_t flags;
-       int32_t count;                  /* shared/exclusive count */
-       int32_t blocked;                /* indicates a blocked lock request */
-       int32_t xrefs;                  /* lock overlap references */
-       int32_t lrefs;                  /* left edge refs */
-       int32_t rrefs;                  /* right edge refs */
-#ifdef CCMS_DEBUG
-       struct ccms_lock_head list;
-#endif
-};
-
-#define CCMS_CST_DYNAMIC       0x00000001
-#define CCMS_CST_DELETING      0x00000002
-#define CCMS_CST_INSERTED      0x00000004
-#define CCMS_CST_INHERITED     0x00000008      /* rstate inherited from par */
-
-/*
- * A CCMS inode is typically embedded in a VFS file or directory object.
- *
- * The subdirectory topology is accessible downward by indexing topo_cst's
- * from the children in the parent's cst_tree.
  *
- * attr_cst is independent of data-range CSTs.  However, adjustments to
- * the topo_cst can have far-reaching effects to attr_cst, the CSTs in
- * the tree, recursively both downward and upward.
+ * count - negative value indicates active exclusive lock, positive value
+ *        indicates active shared lock.
  */
-struct ccms_inode {
-       struct spinlock         spin;
-       struct ccms_inode       *parent;
-       struct ccms_rb_tree     tree;
-       struct ccms_cst         attr_cst;
-       struct ccms_cst         topo_cst;
-       struct ccms_cst         *free_cache;    /* cst free cache */
-       struct ccms_domain      *domain;
-       void                    *handle;        /* VFS opaque */
-       int32_t                 flags;
+struct ccms_cst {
+       struct spinlock spin;           /* thread spinlock */
+       void            *handle;        /* opaque VFS handle */
+       ccms_state_t    state;          /* granted or inherited state */
+       ccms_type_t     type;           /* CST type and flags */
+       uint8_t         unused02;
+       uint8_t         unused03;
+
+       ccms_tid_t      path_id;        /* rendezvous inode id */
+       ccms_tid_t      tid;            /* [meta]data versioning id */
+       ccms_key_t      key_beg;        /* key range (inclusive) */
+       ccms_key_t      key_end;        /* key range (inclusive) */
+
+       int32_t         count;          /* active shared/exclusive count */
+       int32_t         blocked;        /* wakeup blocked on release */
+       thread_t        td;             /* if excl lock (count < 0) */
 };
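
The count field encodes the active lock mode; a hypothetical helper
restating the convention (not in this patch):

	/*
	 * count <  0	exclusive lock held, cst->td is the owning thread
	 * count >  0	that many shared holders
	 * count == 0	unlocked
	 */
	static __inline int
	ccms_cst_excl_owned(ccms_cst_t *cst)
	{
		return (cst->count < 0 && cst->td == curthread);
	}
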
 
-#define CCMS_INODE_INSERTED    0x0001
-#define CCMS_INODE_DELETING    0x0002
-
 /*
  * Domain management, contains a pseudo-root for the CCMS topology.
  */
 struct ccms_domain {
-       struct malloc_type      *mcst;          /* malloc space for cst's */
-       struct ccms_inode       root;           /* dummy protocol root */
        int                     cst_count;      /* dynamic cst count */
        int                     cst_limit;      /* dynamic cst limit */
 };
 
 typedef struct ccms_lock       ccms_lock_t;
 typedef struct ccms_cst                ccms_cst_t;
-typedef struct ccms_inode      ccms_inode_t;
 typedef struct ccms_domain     ccms_domain_t;
 
 /*
@@ -338,33 +220,18 @@ typedef struct ccms_domain        ccms_domain_t;
  */
 #ifdef _KERNEL
 
-/*
- * Helper inline to initialize primarily a dstate lock which shortcuts
- * the more common locking operations.  A dstate is specified and an
- * astate is implied.  tstate locks cannot be acquired with this inline.
- */
-static __inline
-void
-ccms_lock_init(ccms_lock_t *lock, ccms_state_t dstate,
-              ccms_off_t beg_offset, ccms_off_t end_offset)
-{
-       lock->beg_offset = beg_offset;
-       lock->end_offset = end_offset;
-       lock->tstate = 0;
-       lock->astate = 0;
-       lock->dstate = dstate;
-}
-
 void ccms_domain_init(ccms_domain_t *dom);
-void ccms_inode_init(ccms_domain_t *dom, ccms_inode_t *cino, void *handle);
-void ccms_inode_insert(ccms_inode_t *cpar, ccms_inode_t *cino);
-void ccms_inode_delete(ccms_inode_t *cino);
-void ccms_inode_uninit(ccms_inode_t *cino);
+void ccms_domain_uninit(ccms_domain_t *dom);
+void ccms_cst_init(ccms_cst_t *cst, void *handle);
+void ccms_cst_uninit(ccms_cst_t *cst);
+
+void ccms_thread_lock(ccms_cst_t *cst, ccms_state_t state);
+int ccms_thread_lock_nonblock(ccms_cst_t *cst, ccms_state_t state);
+void ccms_thread_unlock(ccms_cst_t *cst);
+int ccms_thread_unlock_zero(ccms_cst_t *cst);
 
-int ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock);
-int ccms_lock_get_uio(ccms_inode_t *cino, ccms_lock_t *lock, struct uio *uio);
-int ccms_lock_get_attr(ccms_inode_t *cino, ccms_lock_t *lock, ccms_state_t st);
-int ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock);
+void ccms_lock_get(ccms_lock_t *lock);
+void ccms_lock_put(ccms_lock_t *lock);
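
ccms_thread_unlock_zero() exists to support the 1->0 disposal pattern used
by hammer2_chain_unlock() below; a sketch of the calling convention
(hypothetical wrapper):

	static void
	myfs_obj_unlock(ccms_cst_t *cst)
	{
		/*
		 * Non-zero: other lock references remain, nothing to do.
		 * Zero: the last reference is retained (and any shared
		 * lock upgraded to exclusive) so final disposition can
		 * occur before the real unlock.
		 */
		if (ccms_thread_unlock_zero(cst))
			return;
		/* ... final cleanup under the retained exclusive lock ... */
		ccms_thread_unlock(cst);
	}
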
 
 #endif
 
index 5921a78..7a193a1 100644 (file)
@@ -111,20 +111,17 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
                ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
                chain = &ip->chain;
                chain->u.ip = ip;
-               lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
                ip->hmp = hmp;
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
                np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
                chain = &np->chain;
                chain->u.np = np;
-               lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
                chain = &dp->chain;
                chain->u.dp = dp;
-               lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                chain = NULL;
@@ -146,7 +143,8 @@ hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
        chain->index = -1;              /* not yet assigned */
        chain->refs = 1;
        chain->bytes = bytes;
-       lockmgr(&chain->lk, LK_EXCLUSIVE);
+       ccms_cst_init(&chain->cst, chain);
+       ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
 
        return (chain);
 }
@@ -217,8 +215,10 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                if (refs == 1) {
                        KKASSERT(chain != &hmp->vchain);
                        parent = chain->parent;
-                       if (parent)
-                               lockmgr(&parent->lk, LK_EXCLUSIVE);
+                       if (parent) {
+                               ccms_thread_lock(&parent->cst,
+                                               CCMS_STATE_EXCLUSIVE);
+                       }
                        if (atomic_cmpset_int(&chain->refs, 1, 0)) {
                                /*
                                 * Succeeded, recurse and drop parent.
@@ -240,13 +240,6 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                                 */
                                if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
                                        /*
-                                        * Disconnect the CCMS inode if this
-                                        * was an inode.
-                                        */
-                                       if (ip && ip->cino)
-                                               ccms_inode_delete(ip->cino);
-
-                                       /*
                                         * Disconnect the chain and clear
                                         * pip if it was an inode.
                                         */
@@ -260,22 +253,20 @@ hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                                }
 
                                /*
-                                * Destroy the disconnected ccms_inode if
-                                * applicable.
+                                * When cleaning out a hammer2_inode we must
+                                * also uninitialize the embedded topo_cst.
                                 */
-                               if (ip && ip->cino) {
-                                       ccms_inode_destroy(ip->cino);
-                                       ip->cino = NULL;
-                               }
+                               if (ip)
+                                       ccms_cst_uninit(&ip->topo_cst);
                                chain->parent = NULL;
                                if (parent)
-                                       lockmgr(&parent->lk, LK_RELEASE);
+                                       ccms_thread_unlock(&parent->cst);
                                hammer2_chain_free(hmp, chain);
                                chain = parent;
                                /* recurse on parent */
                        } else {
                                if (parent)
-                                       lockmgr(&parent->lk, LK_RELEASE);
+                                       ccms_thread_unlock(&parent->cst);
                                /* retry the same chain */
                        }
                } else {
@@ -345,7 +336,7 @@ hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
         */
        KKASSERT(chain->refs > 0);
        atomic_add_int(&chain->refs, 1);
-       lockmgr(&chain->lk, LK_EXCLUSIVE);
+       ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
 
        /*
         * If we already have a valid data pointer no further action is
@@ -485,14 +476,15 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
        long *counterp;
 
        /*
-        * Undo a recursive lock
+        * Release the CST lock but with a special 1->0 transition case.
         *
-        * XXX shared locks not handled properly
+        * Returns non-zero if lock references remain.  When zero is
+        * returned the last lock reference is retained and any shared
+        * lock is upgraded to an exclusive lock for final disposition.
         */
-       if (lockcountnb(&chain->lk) > 1) {
+       if (ccms_thread_unlock_zero(&chain->cst)) {
                KKASSERT(chain->refs > 1);
                atomic_add_int(&chain->refs, -1);
-               lockmgr(&chain->lk, LK_RELEASE);
                return;
        }
 
@@ -506,7 +498,7 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
         */
        if (chain->bp == NULL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
-               lockmgr(&chain->lk, LK_RELEASE);
+               ccms_thread_unlock(&chain->cst);
                hammer2_chain_drop(hmp, chain);
                return;
        }
@@ -590,7 +582,7 @@ hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
                }
        }
        chain->bp = NULL;
-       lockmgr(&chain->lk, LK_RELEASE);
+       ccms_thread_unlock(&chain->cst);
        hammer2_chain_drop(hmp, chain);
 }
 
@@ -947,9 +939,9 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                  int index, int flags)
 {
        hammer2_blockref_t *bref;
+       hammer2_inode_t *ip;
        hammer2_chain_t *chain;
        hammer2_chain_t dummy;
-       ccms_cst_t *cst;
        int how;
 
        /*
@@ -962,11 +954,6 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                how = HAMMER2_RESOLVE_MAYBE;
 
        /*
-        * Resolve cache state XXX
-        */
-       cst = NULL;
-
-       /*
         * First see if we have a (possibly modified) chain element cached
         * for this (parent, index).  Acquire the data if necessary.
         *
@@ -1043,20 +1030,18 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * Additional linkage for inodes.  Reuse the parent pointer to
         * find the parent directory.
         *
-        * The CCMS for the pfs-root is initialized from the mount code,
-        * this chain_get, or chain_create, when the pmp is assigned and
-        * non-NULL.  No CCMS is initialized here for the super-root and
-        * the CCMS for the PFS root is initialized in the mount code.
+        * The inode's topo_cst is initialized from its parent directory.
+        * The chain of topo_cst's is seeded by the mount code.
         */
        if (bref->type == HAMMER2_BREF_TYPE_INODE) {
+               ip = chain->u.ip;
                while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
                        parent = parent->parent;
                if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
-                       chain->u.ip->pip = parent->u.ip;
-                       chain->u.ip->pmp = parent->u.ip->pmp;
-                       chain->u.ip->depth = parent->u.ip->depth + 1;
-                       if (cst)
-                               chain->u.ip->cino = cst->tag.cino;
+                       ip->pip = parent->u.ip;
+                       ip->pmp = parent->u.ip->pmp;
+                       ip->depth = parent->u.ip->depth + 1;
+                       ccms_cst_init(&ip->topo_cst, &ip->chain);
                }
        }
 
@@ -1071,7 +1056,7 @@ hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
                hammer2_chain_lock(hmp, chain, how);    /* recursive lock */
                hammer2_chain_drop(hmp, chain);         /* excess ref */
        }
-       lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
+       ccms_thread_unlock(&chain->cst);                        /* from alloc */
 
        return (chain);
 }
@@ -1463,12 +1448,6 @@ hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        int allocated = 0;
        int count;
        int i;
-       ccms_cst_t *cst;
-
-       /*
-        * Resolve cache state
-        */
-       cst = NULL;
 
        if (chain == NULL) {
                /*
@@ -1627,10 +1606,8 @@ again:
         * Cumulative adjustments are inherited on [re]attach and will
         * propagate up the tree on the next flush.
         *
-        * The CCMS for the pfs-root is initialized from the mount code,
-        * this chain_get, or chain_create, when the pmp is assigned and
-        * non-NULL.  No CCMS is initialized here for the super-root and
-        * the CCMS for the PFS root is initialized in the mount code.
+        * The inode's topo_cst is initialized from its parent directory.
+        * The chain of topo_cst's is seeded by the mount code.
         */
        if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                hammer2_chain_t *scan = parent;
@@ -1645,9 +1622,7 @@ again:
                        ip->pip->delta_icount += ip->ip_data.inode_count;
                        ip->pip->delta_dcount += ip->ip_data.data_count;
                        ++ip->pip->delta_icount;
-
-                       if (cst)
-                               ip->cino = cst->tag.cino;
+                       ccms_cst_init(&ip->topo_cst, &ip->chain);
                }
        }
 
@@ -2146,11 +2121,6 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
         * Cumulative adjustments must be propagated to the parent inode
         * when deleting and synchronized to ip.
         *
-        * The CCMS is deleted when pip is NULL'd out, here and also in
-        * chain_drop().  The CCMS is uninitialized when the pmp is NULL'd
-        * out (if it was non-NULL).  This is interlocked by the
-        * HAMMER2_CHAIN_DELETED flag to prevent reentrancy.
-        *
         * NOTE:  We do not propagate ip->delta_*count to the parent because
         *        these represent adjustments that have not yet been
         *        propagated upward, so we don't need to remove them from
@@ -2161,8 +2131,6 @@ hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
        if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                ip = chain->u.ip;
                if (ip->pip) {
-                       ccms_inode_delete(ip->cino);
-
                        ip->pip->delta_icount -= ip->ip_data.inode_count;
                        ip->pip->delta_dcount -= ip->ip_data.data_count;
                        ip->ip_data.inode_count += ip->delta_icount;
@@ -2801,11 +2769,10 @@ hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
        }
 
        /*
-        * We are locking backwards so allow the lock to fail
+        * We are locking backwards so allow the lock to fail.
         */
-       if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+       if (ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE))
                return;
-       }
 
        /*
         * We are updating brefs but we have to call chain_modify()
@@ -2871,7 +2838,6 @@ hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
                   sizeof(chain->bref)) != 0) {
                panic("hammer2: unflagged bref update(2)");
        }
-
-       lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
+       ccms_thread_unlock(&parent->cst);               /* release manual op */
        hammer2_chain_unlock(hmp, parent);
 }
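
In sum, the conversion in this file maps the old lockmgr idioms onto the
CST API roughly as follows (summary sketch, not part of the patch):

	/*
	 * lockinit(&chain->lk, ...)          -> ccms_cst_init(&chain->cst, chain)
	 * lockmgr(&chain->lk, LK_EXCLUSIVE)  -> ccms_thread_lock(&chain->cst,
	 *                                              CCMS_STATE_EXCLUSIVE)
	 * lockmgr(&chain->lk, LK_SHARED)     -> ccms_thread_lock(&chain->cst,
	 *                                              CCMS_STATE_SHARED)
	 * lockmgr(..., LK_EXCLUSIVE|LK_NOWAIT) -> ccms_thread_lock_nonblock(...)
	 * lockmgr(&chain->lk, LK_RELEASE)    -> ccms_thread_unlock(&chain->cst)
	 */
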
index c356bb5..af88da3 100644 (file)
@@ -77,13 +77,13 @@ void
 hammer2_inode_lock_sh(hammer2_inode_t *ip)
 {
        KKASSERT(ip->chain.refs > 0);
-       lockmgr(&ip->chain.lk, LK_SHARED);
+       ccms_thread_lock(&ip->chain.cst, CCMS_STATE_SHARED);
 }
 
 void
 hammer2_inode_unlock_sh(hammer2_inode_t *ip)
 {
-       lockmgr(&ip->chain.lk, LK_RELEASE);
+       ccms_thread_unlock(&ip->chain.cst);
 }
 
 /*
@@ -113,19 +113,19 @@ hammer2_inode_unbusy(hammer2_inode_t *ip)
 void
 hammer2_mount_exlock(hammer2_mount_t *hmp)
 {
-       lockmgr(&hmp->vchain.lk, LK_EXCLUSIVE);
+       ccms_thread_lock(&hmp->vchain.cst, CCMS_STATE_EXCLUSIVE);
 }
 
 void
 hammer2_mount_shlock(hammer2_mount_t *hmp)
 {
-       lockmgr(&hmp->vchain.lk, LK_SHARED);
+       ccms_thread_lock(&hmp->vchain.cst, CCMS_STATE_SHARED);
 }
 
 void
 hammer2_mount_unlock(hammer2_mount_t *hmp)
 {
-       lockmgr(&hmp->vchain.lk, LK_RELEASE);
+       ccms_thread_unlock(&hmp->vchain.cst);
 }
 
 void
index bc17dbe..3d6147f 100644 (file)
@@ -371,8 +371,8 @@ hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref_flush = hmp->vchain.bref;
+               ccms_cst_init(&hmp->vchain.cst, NULL);
                /* hmp->vchain.u.xxx is left NULL */
-               lockinit(&hmp->vchain.lk, "volume", 0, LK_CANRECURSE);
                lockinit(&hmp->alloclk, "h2alloc", 0, 0);
                lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);