Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 7 Jun 2012 05:42:56 +0000 (22:42 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 7 Jun 2012 05:42:56 +0000 (22:42 -0700)
sys/vfs/hammer2/hammer2.h
sys/vfs/hammer2/hammer2_ccms.c
sys/vfs/hammer2/hammer2_chain.c
sys/vfs/hammer2/hammer2_vfsops.c

index 183616a,0000000..fadb555
mode 100644,000000..100644
--- /dev/null
@@@ -1,469 -1,0 +1,471 @@@
 +/*
 + * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
 + *
 + * This code is derived from software contributed to The DragonFly Project
 + * by Matthew Dillon <dillon@dragonflybsd.org>
 + * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + *
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in
 + *    the documentation and/or other materials provided with the
 + *    distribution.
 + * 3. Neither the name of The DragonFly Project nor the names of its
 + *    contributors may be used to endorse or promote products derived
 + *    from this software without specific, prior written permission.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +
 +/*
 + * This header file contains structures used internally by the HAMMER2
 + * implementation.  See hammer2_disk.h for on-disk structures.
 + */
 +
 +#ifndef _VFS_HAMMER2_HAMMER2_H_
 +#define _VFS_HAMMER2_HAMMER2_H_
 +
 +#include <sys/param.h>
 +#include <sys/types.h>
 +#include <sys/kernel.h>
 +#include <sys/conf.h>
 +#include <sys/systm.h>
 +#include <sys/tree.h>
 +#include <sys/malloc.h>
 +#include <sys/mount.h>
 +#include <sys/vnode.h>
 +#include <sys/proc.h>
 +#include <sys/mountctl.h>
 +#include <sys/priv.h>
 +#include <sys/stat.h>
 +#include <sys/globaldata.h>
 +#include <sys/lockf.h>
 +#include <sys/buf.h>
 +#include <sys/queue.h>
 +#include <sys/limits.h>
 +#include <sys/buf2.h>
 +#include <sys/signal2.h>
 +#include <sys/tree.h>
 +
 +#include "hammer2_disk.h"
 +#include "hammer2_mount.h"
 +#include "hammer2_ioctl.h"
 +#include "hammer2_ccms.h"
 +
 +struct hammer2_chain;
 +struct hammer2_inode;
 +struct hammer2_mount;
 +struct hammer2_pfsmount;
 +
 +/*
 + * The chain structure tracks blockref recursions all the way to
 + * the root volume.  These consist of indirect blocks, inodes,
 + * and eventually the volume header.
 + *
 + * The chain structure is embedded in the hammer2_mount, hammer2_inode,
 + * and other system memory structures.  The chain structure typically
 + * implements the reference count and busy flag for the larger structure.
 + *
 + * It is always possible to track a chain element all the way back to the
 + * root by following the (parent) links.  (index) is a type-dependent index
 + * in the parent indicating where in the parent the chain element resides.
 + *
 + * When a blockref is added or deleted the related chain element is marked
 + * modified and all of its parents are marked SUBMODIFIED (the parent
 + * recursion can stop once we hit a node that is already marked SUBMODIFIED).
 + * A deleted chain element must remain intact until synchronized against
 + * its parent.
 + *
 + * The blockref at (parent, index) is not adjusted until the modified chain
 + * element is flushed and unmarked.  Until then the child's blockref may
 + * not match the blockref at (parent, index).
 + */
 +SPLAY_HEAD(hammer2_chain_splay, hammer2_chain);
 +
 +struct hammer2_chain {
 +      struct hammer2_blockref bref;
 +      struct hammer2_blockref bref_flush;     /* synchronized w/MOVED bit */
 +      struct hammer2_chain *parent;           /* return chain to root */
 +      struct hammer2_chain_splay shead;
 +      SPLAY_ENTRY(hammer2_chain) snode;
 +      TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
 +      union {
 +              struct hammer2_inode *ip;
 +              struct hammer2_indblock *np;
 +              struct hammer2_data *dp;
 +              void *mem;
 +      } u;
 +
 +      struct buf      *bp;            /* buffer cache (ro) */
 +      hammer2_media_data_t *data;     /* modified copy of data (rw) */
 +      u_int           bytes;          /* physical size of data */
 +      struct lock     lk;             /* lockmgr lock */
 +      int             index;          /* index in parent */
 +      u_int           refs;
 +      u_int           busy;           /* soft-busy */
 +      u_int           flags;
 +};
 +
 +typedef struct hammer2_chain hammer2_chain_t;
 +
 +int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
 +SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 +
 +/*
 + * MOVED - This bit is set during the flush when the MODIFIED bit is cleared,
 + *       indicating that the parent's blocktable must inherit a change to
 + *       the bref (typically a block reallocation)
 + *
 + *       It must also be set in situations where a chain is not MODIFIED
 + *       but whose bref has changed (typically due to fields other than
 + *       a block reallocation).
 + */
 +#define HAMMER2_CHAIN_MODIFIED                0x00000001      /* active mods */
 +#define HAMMER2_CHAIN_DIRTYEMBED      0x00000002      /* inode embedded */
 +#define HAMMER2_CHAIN_DIRTYBP         0x00000004      /* dirty on unlock */
 +#define HAMMER2_CHAIN_SUBMODIFIED     0x00000008      /* 1+ subs modified */
 +#define HAMMER2_CHAIN_DELETED         0x00000010
 +#define HAMMER2_CHAIN_INITIAL         0x00000020      /* initial create */
 +#define HAMMER2_CHAIN_FLUSHED         0x00000040      /* flush on unlock */
 +#define HAMMER2_CHAIN_MOVED           0x00000080      /* bref changed */
 +#define HAMMER2_CHAIN_IOFLUSH         0x00000100      /* bawrite on put */
 +#define HAMMER2_CHAIN_DEFERRED                0x00000200      /* on a deferral list*/
 +#define HAMMER2_CHAIN_DESTROYED               0x00000400      /* destroying */
 +#define HAMMER2_CHAIN_MODIFIED_AUX    0x00000800      /* hmp->vchain only */
 +#define HAMMER2_CHAIN_MODIFY_TID      0x00001000      /* mod updates field */
 +#define HAMMER2_CHAIN_MOUNTED         0x00002000      /* PFS is mounted */
 +
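
Editorial aside: the MODIFIED/SUBMODIFIED convention described above can be sketched in a few lines. This is a hypothetical helper, not part of this commit, and the real code performs the equivalent work under the appropriate chain locks.

static void
example_mark_modified(hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/* mark the element itself */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

	/*
	 * Propagate SUBMODIFIED toward the root, stopping at the first
	 * parent which already carries the flag (the structure comment
	 * notes the recursion can stop there).
	 */
	for (parent = chain->parent; parent != NULL; parent = parent->parent) {
		if (parent->flags & HAMMER2_CHAIN_SUBMODIFIED)
			break;
		atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
	}
}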
 +/*
 + * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
 + */
 +#define HAMMER2_LOOKUP_NOLOCK         0x00000001      /* ref only */
 +#define HAMMER2_LOOKUP_NODATA         0x00000002      /* data left NULL */
 +
 +/*
 + * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
 + *
 + * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
 + *     blocks in the INITIAL-create state.
 + *
 + * NOTE: NO_MODIFY_TID tells the function to not set HAMMER2_CHAIN_MODIFY_TID
 + *     when marking the chain modified (used when a sub-chain modification
 + *     propagates upward).
 + */
 +#define HAMMER2_MODIFY_NOSUB          0x00000001      /* do not set SUBMOD */
 +#define HAMMER2_MODIFY_OPTDATA                0x00000002      /* data can be NULL */
 +#define HAMMER2_MODIFY_NO_MODIFY_TID  0x00000004
 +
 +/*
 + * Flags passed to hammer2_chain_lock()
 + */
 +#define HAMMER2_RESOLVE_NEVER         1
 +#define HAMMER2_RESOLVE_MAYBE         2
 +#define HAMMER2_RESOLVE_ALWAYS                3
 +
 +/*
 + * Cluster different types of storage together for allocations
 + */
 +#define HAMMER2_FREECACHE_INODE               0
 +#define HAMMER2_FREECACHE_INDIR               1
 +#define HAMMER2_FREECACHE_DATA                2
 +#define HAMMER2_FREECACHE_UNUSED3     3
 +#define HAMMER2_FREECACHE_TYPES               4
 +
 +/*
 + * BMAP read-ahead maximum parameters
 + */
 +#define HAMMER2_BMAP_COUNT            16      /* max bmap read-ahead */
 +#define HAMMER2_BMAP_BYTES            (HAMMER2_PBUFSIZE * HAMMER2_BMAP_COUNT)
 +
 +/*
 + * Misc
 + */
 +#define HAMMER2_FLUSH_DEPTH_LIMIT     40      /* stack recursion limit */
 +
 +/*
 + * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
 + *
 + * There is an in-memory representation of all on-media data structures.
 + *
 + * When accessed read-only the data will be mapped to the related buffer
 + * cache buffer.
 + *
 + * When accessed read-write (marked modified) a kmalloc()'d copy of the
 + * data is created which can then be modified.  The copy is destroyed when a
 + * filesystem block is allocated to replace it.
 + *
 + * Active inodes (those with vnodes attached) will maintain the kmalloc()'d
 + * copy for both the read-only and the read-write case.  The combination of
 + * (bp) and (data) determines whether (data) was allocated or not.
 + *
 + * The in-memory representation may remain cached (for example in order to
 + * placemark clustering locks) even after the related data has been
 + * detached.
 + */
 +
 +/*
 + * A hammer2 inode.
 + */
 +struct hammer2_inode {
 +      struct hammer2_mount    *hmp;           /* Global mount */
 +      struct hammer2_pfsmount *pmp;           /* PFS mount */
 +      struct hammer2_inode    *pip;           /* parent inode */
 +      struct vnode            *vp;
++      ccms_inode_t            *cino;          /* cluster cache state */
 +      hammer2_chain_t         chain;
 +      struct hammer2_inode_data ip_data;
 +      struct lockf            advlock;
 +      u_int                   depth;          /* directory depth */
 +      hammer2_off_t           delta_dcount;   /* adjust data_count */
 +      hammer2_off_t           delta_icount;   /* adjust inode_count */
 +};
 +
 +typedef struct hammer2_inode hammer2_inode_t;
 +
 +/*
 + * A hammer2 indirect block
 + */
 +struct hammer2_indblock {
 +      hammer2_chain_t         chain;
 +};
 +
 +typedef struct hammer2_indblock hammer2_indblock_t;
 +
 +/*
 + * A hammer2 data block
 + */
 +struct hammer2_data {
 +      hammer2_chain_t         chain;
 +};
 +
 +typedef struct hammer2_data hammer2_data_t;
 +
 +struct hammer2_freecache {
 +      hammer2_off_t   bulk;
 +      hammer2_off_t   single;
 +};
 +
 +typedef struct hammer2_freecache hammer2_freecache_t;
 +
 +/*
 + * Global (per device) mount structure for device (aka vp->v_mount->hmp)
 + */
 +struct hammer2_mount {
 +      struct vnode    *devvp;         /* device vnode */
 +      int             ronly;          /* read-only mount */
 +      int             pmp_count;      /* PFS mounts backed by us */
 +      TAILQ_ENTRY(hammer2_mount) mntentry; /* hammer2_mntlist */
 +
 +      struct malloc_type *minode;
 +      int             ninodes;
 +      int             maxinodes;
 +
 +      struct malloc_type *mchain;
 +      int             nipstacks;
 +      int             maxipstacks;
 +      hammer2_chain_t vchain;         /* anchor chain */
 +      hammer2_chain_t *schain;        /* super-root */
 +      struct lock     alloclk;        /* lockmgr lock */
 +      struct lock     voldatalk;      /* lockmgr lock */
 +
 +      hammer2_volume_data_t voldata;
 +      hammer2_freecache_t freecache[HAMMER2_FREECACHE_TYPES][HAMMER2_MAX_RADIX+1];
 +};
 +
 +typedef struct hammer2_mount hammer2_mount_t;
 +
 +/*
 + * Per-PFS mount structure for device (aka vp->v_mount)
 + */
 +struct hammer2_pfsmount {
 +      struct mount            *mp;            /* kernel mount */
 +      struct hammer2_mount    *hmp;           /* device global mount */
 +      hammer2_chain_t         *rchain;        /* PFS root chain */
 +      hammer2_inode_t         *iroot;         /* PFS root inode */
++      ccms_domain_t           ccms_dom;
 +      struct netexport        export;         /* nfs export */
 +      int                     ronly;          /* read-only mount */
 +};
 +
 +typedef struct hammer2_pfsmount hammer2_pfsmount_t;
 +
 +#if defined(_KERNEL)
 +
 +MALLOC_DECLARE(M_HAMMER2);
 +
 +#define VTOI(vp)      ((hammer2_inode_t *)(vp)->v_data)
 +#define ITOV(ip)      ((ip)->vp)
 +
 +static __inline
 +hammer2_pfsmount_t *
 +MPTOPMP(struct mount *mp)
 +{
 +      return ((hammer2_pfsmount_t *)mp->mnt_data);
 +}
 +
 +static __inline
 +hammer2_mount_t *
 +MPTOHMP(struct mount *mp)
 +{
 +      return (((hammer2_pfsmount_t *)mp->mnt_data)->hmp);
 +}
 +
 +extern struct vop_ops hammer2_vnode_vops;
 +extern struct vop_ops hammer2_spec_vops;
 +extern struct vop_ops hammer2_fifo_vops;
 +
 +extern int hammer2_debug;
 +extern int hammer2_cluster_enable;
 +extern int hammer2_hardlink_enable;
 +extern long hammer2_iod_file_read;
 +extern long hammer2_iod_meta_read;
 +extern long hammer2_iod_indr_read;
 +extern long hammer2_iod_file_write;
 +extern long hammer2_iod_meta_write;
 +extern long hammer2_iod_indr_write;
 +extern long hammer2_iod_volu_write;
 +extern long hammer2_ioa_file_read;
 +extern long hammer2_ioa_meta_read;
 +extern long hammer2_ioa_indr_read;
 +extern long hammer2_ioa_file_write;
 +extern long hammer2_ioa_meta_write;
 +extern long hammer2_ioa_indr_write;
 +extern long hammer2_ioa_volu_write;
 +
 +/*
 + * hammer2_subr.c
 + */
 +void hammer2_inode_lock_ex(hammer2_inode_t *ip);
 +void hammer2_inode_unlock_ex(hammer2_inode_t *ip);
 +void hammer2_inode_lock_sh(hammer2_inode_t *ip);
 +void hammer2_inode_unlock_sh(hammer2_inode_t *ip);
 +void hammer2_inode_busy(hammer2_inode_t *ip);
 +void hammer2_inode_unbusy(hammer2_inode_t *ip);
 +void hammer2_voldata_lock(hammer2_mount_t *hmp);
 +void hammer2_voldata_unlock(hammer2_mount_t *hmp);
 +
 +void hammer2_mount_exlock(hammer2_mount_t *hmp);
 +void hammer2_mount_shlock(hammer2_mount_t *hmp);
 +void hammer2_mount_unlock(hammer2_mount_t *hmp);
 +
 +int hammer2_get_dtype(hammer2_inode_t *ip);
 +int hammer2_get_vtype(hammer2_inode_t *ip);
 +u_int8_t hammer2_get_obj_type(enum vtype vtype);
 +void hammer2_time_to_timespec(u_int64_t xtime, struct timespec *ts);
 +u_int64_t hammer2_timespec_to_time(struct timespec *ts);
 +u_int32_t hammer2_to_unix_xid(uuid_t *uuid);
 +void hammer2_guid_to_uuid(uuid_t *uuid, u_int32_t guid);
 +
 +hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
 +int hammer2_bytes_to_radix(size_t bytes);
 +
 +int hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
 +                       hammer2_key_t *lbasep, hammer2_key_t *leofp);
 +void hammer2_update_time(uint64_t *timep);
 +
 +/*
 + * hammer2_inode.c
 + */
 +struct vnode *hammer2_igetv(hammer2_inode_t *ip, int *errorp);
 +
 +void hammer2_inode_lock_nlinks(hammer2_inode_t *ip);
 +void hammer2_inode_unlock_nlinks(hammer2_inode_t *ip);
 +hammer2_inode_t *hammer2_inode_alloc(hammer2_pfsmount_t *pmp, void *data);
 +void hammer2_inode_free(hammer2_inode_t *ip);
 +void hammer2_inode_ref(hammer2_inode_t *ip);
 +void hammer2_inode_drop(hammer2_inode_t *ip);
 +int hammer2_inode_calc_alloc(hammer2_key_t filesize);
 +
 +int hammer2_inode_create(hammer2_inode_t *dip,
 +                      struct vattr *vap, struct ucred *cred,
 +                      const uint8_t *name, size_t name_len,
 +                      hammer2_inode_t **nipp);
 +
 +int hammer2_inode_duplicate(hammer2_inode_t *dip,
 +                      hammer2_inode_t *oip, hammer2_inode_t **nipp,
 +                      const uint8_t *name, size_t name_len);
 +int hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
 +                      const uint8_t *name, size_t name_len);
 +
 +int hammer2_unlink_file(hammer2_inode_t *dip,
 +                      const uint8_t *name, size_t name_len,
 +                      int isdir, hammer2_inode_t *retain_ip);
 +int hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip);
 +int hammer2_hardlink_deconsolidate(hammer2_inode_t *dip,
 +                      hammer2_chain_t **chainp, hammer2_inode_t **ipp);
 +int hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
 +                      hammer2_inode_t **ipp);
 +
 +/*
 + * hammer2_chain.c
 + */
 +void hammer2_modify_volume(hammer2_mount_t *hmp);
 +hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
 +                              hammer2_blockref_t *bref);
 +void hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 +void hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 +void hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 +int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how);
 +void hammer2_chain_moved(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 +void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
 +                              int flags);
 +void hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
 +                              int nradix, int flags);
 +void hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 +hammer2_chain_t *hammer2_chain_find(hammer2_mount_t *hmp,
 +                              hammer2_chain_t *parent, int index);
 +hammer2_chain_t *hammer2_chain_get(hammer2_mount_t *hmp,
 +                              hammer2_chain_t *parent,
 +                              int index, int flags);
 +hammer2_chain_t *hammer2_chain_lookup(hammer2_mount_t *hmp,
 +                              hammer2_chain_t **parentp,
 +                              hammer2_key_t key_beg, hammer2_key_t key_end,
 +                              int flags);
 +hammer2_chain_t *hammer2_chain_next(hammer2_mount_t *hmp,
 +                              hammer2_chain_t **parentp,
 +                              hammer2_chain_t *chain,
 +                              hammer2_key_t key_beg, hammer2_key_t key_end,
 +                              int flags);
 +hammer2_chain_t *hammer2_chain_create(hammer2_mount_t *hmp,
 +                              hammer2_chain_t *parent,
 +                              hammer2_chain_t *chain,
 +                              hammer2_key_t key, int keybits,
 +                              int type, size_t bytes);
 +void hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 +                              hammer2_chain_t *chain, int retain);
 +void hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
 +                              hammer2_tid_t modify_tid);
 +void hammer2_chain_commit(hammer2_mount_t *hmp, hammer2_chain_t *chain);
 +
 +/*
 + * hammer2_ioctl.c
 + */
 +int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
 +                              int fflag, struct ucred *cred);
 +
 +/*
 + * hammer2_freemap.c
 + */
 +hammer2_off_t hammer2_freemap_alloc(hammer2_mount_t *hmp,
 +                              int type, size_t bytes);
 +void hammer2_freemap_free(hammer2_mount_t *hmp, hammer2_off_t data_off,
 +                              int type);
 +
 +#endif /* !_KERNEL */
 +#endif /* !_VFS_HAMMER2_HAMMER2_H_ */
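
Editorial note: the lookup/next prototypes above form an iteration API. Below is a minimal usage sketch under assumed conventions (the caller passes a locked parent by reference, each call returns the next locked chain and consumes the previous one, and the caller unlocks the final parent); the function name is hypothetical and not part of this commit.

static void
example_scan_range(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		   hammer2_key_t key_beg, hammer2_key_t key_end)
{
	hammer2_chain_t *chain;

	chain = hammer2_chain_lookup(hmp, &parent, key_beg, key_end, 0);
	while (chain != NULL) {
		/* ... examine chain->bref / chain->data here ... */
		chain = hammer2_chain_next(hmp, &parent, chain,
					   key_beg, key_end, 0);
	}
	hammer2_chain_unlock(hmp, parent);
}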
index 0c08947,0000000..70b3631
mode 100644,000000..100644
--- /dev/null
@@@ -1,1181 -1,0 +1,1223 @@@
-       ccms_inode_init(dom, &dom->root, NULL);
-       dom->root.domain = dom;
 +/*
 + * Copyright (c) 2006,2012 The DragonFly Project.  All rights reserved.
 + *
 + * This code is derived from software contributed to The DragonFly Project
 + * by Matthew Dillon <dillon@backplane.com>
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + *
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in
 + *    the documentation and/or other materials provided with the
 + *    distribution.
 + * 3. Neither the name of The DragonFly Project nor the names of its
 + *    contributors may be used to endorse or promote products derived
 + *    from this software without specific, prior written permission.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +/*
 + * The Cache Coherency Management System (CCMS)
 + */
 +
 +#include <sys/param.h>
 +#include <sys/systm.h>
 +#include <sys/kernel.h>
 +#include <sys/malloc.h>
 +#include <sys/objcache.h>
 +#include <sys/sysctl.h>
 +#include <sys/uio.h>
 +#include <machine/limits.h>
 +
 +#include <sys/spinlock2.h>
 +
 +#include "hammer2_ccms.h"
 +
 +struct ccms_lock_scan_info {
 +      ccms_inode_t    *cino;
 +      ccms_lock_t     *lock;
 +      ccms_cst_t      *coll_cst;
 +      int             rstate_upgrade_needed;
 +};
 +
 +static int ccms_cst_cmp(ccms_cst_t *b1, ccms_cst_t *b2);
 +static int ccms_lock_scan_cmp(ccms_cst_t *b1, void *arg);
 +
 +static int ccms_lock_get_match(ccms_cst_t *cst, void *arg);
 +static int ccms_lock_undo_match(ccms_cst_t *cst, void *arg);
 +static int ccms_lock_redo_match(ccms_cst_t *cst, void *arg);
 +static int ccms_lock_upgrade_match(ccms_cst_t *cst, void *arg);
 +static int ccms_lock_put_match(ccms_cst_t *cst, void *arg);
 +
 +static void ccms_lstate_get(ccms_cst_t *cst, ccms_state_t state);
 +static void ccms_lstate_put(ccms_cst_t *cst);
 +static void ccms_rstate_get(ccms_cst_t *cst, ccms_state_t state);
 +static void ccms_rstate_put(ccms_cst_t *cst);
 +
 +struct ccms_rb_tree;
 +RB_GENERATE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp,
 +           ccms_off_t, beg_offset, end_offset);
 +static MALLOC_DEFINE(M_CCMS, "CCMS", "Cache Coherency Management System");
 +
 +static int ccms_debug = 0;
 +
 +/*
 + * These helpers are called to manage the CST cache so we can avoid
 + * unnecessary kmalloc()'s and kfree()'s in hot paths.
 + *
 + * ccms_free_pass1() must be called with the spinlock held.
 + * ccms_free_pass2() must be called with the spinlock not held.
 + */
 +static __inline
 +ccms_cst_t *
 +ccms_free_pass1(ccms_inode_t *cino, int keep)
 +{
 +      ccms_cst_t *cst;
 +      ccms_cst_t **cstp;
 +
 +      cstp = &cino->free_cache;
 +      while ((cst = *cstp) != NULL && keep) {
 +              cstp = &cst->free_next;
 +              --keep;
 +      }
 +      *cstp = NULL;
 +      return (cst);
 +}
 +
 +static __inline
 +void
 +ccms_free_pass2(ccms_cst_t *next)
 +{
 +      ccms_cst_t *cst;
 +      ccms_domain_t *dom;
 +
 +      while ((cst = next) != NULL) {
 +              next = cst->free_next;
 +              cst->free_next = NULL;
 +
 +              dom = cst->cino->domain;
 +              atomic_add_int(&dom->cst_count, -1);
 +
 +              kfree(cst, dom->mcst);
 +      }
 +}
 +
 +/*
 + * Initialize a new CCMS dataspace.  Create a new RB tree with a single
 + * element covering the entire 64 bit offset range.  This simplifies
 + * algorithms enormously by removing a number of special cases.
 + */
 +void
 +ccms_domain_init(ccms_domain_t *dom)
 +{
 +      bzero(dom, sizeof(*dom));
 +      kmalloc_create(&dom->mcst, "CCMS-cst");
-       cino->attr_cst.cino = cino;
++      /*dom->root.domain = dom;*/
 +}
 +
++void
++ccms_domain_uninit(ccms_domain_t *dom)
++{
++      kmalloc_destroy(&dom->mcst);
++}
++
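
Editorial sketch of how the init/uninit pair is presumably driven from the PFS mount code, based on the ccms_dom field added to hammer2_pfsmount; the corresponding hammer2_vfsops.c hunks are not shown here and the helper names are hypothetical.

static void
example_pfs_ccms_setup(hammer2_pfsmount_t *pmp)
{
	/* at mount time, before the PFS is exposed (assumed usage) */
	ccms_domain_init(&pmp->ccms_dom);
}

static void
example_pfs_ccms_teardown(hammer2_pfsmount_t *pmp)
{
	/* at unmount time, after all ccms_inodes are gone (assumed usage) */
	ccms_domain_uninit(&pmp->ccms_dom);
}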
++#if 0
 +/*
 + * Initialize a ccms_inode for use.  The inode will be initialized but
 + * is not yet connected to the rest of the topology.  However, it can
 + * still be used stand-alone if desired without being connected to the
 + * topology.
 + */
 +void
 +ccms_inode_init(ccms_domain_t *dom, ccms_inode_t *cino, void *handle)
 +{
 +      ccms_cst_t *cst;
 +
 +      bzero(cino, sizeof(*cino));
 +
 +      spin_init(&cino->spin);
 +      RB_INIT(&cino->tree);
 +      cino->domain = dom;
 +      cino->handle = handle;
-       cino->topo_cst.cino = cino;
-       cino->topo_cst.lstate = CCMS_STATE_INVALID;
-       cino->topo_cst.rstate = CCMS_STATE_INVALID;
++      /* cino->attr_cst.cino = cino; no rbtree association */
 +      cino->attr_cst.lstate = CCMS_STATE_INVALID;
 +      cino->attr_cst.rstate = CCMS_STATE_INVALID;
-  * Insert an inode into the topology.  The inode has already been
-  * initialized and could even be in active.
 +
 +      /*
 +       * The dataspace must be initialized w/cache-state set to INVALID
 +       * for the entire range.
 +       */
 +      cst = kmalloc(sizeof(*cst), dom->mcst, M_WAITOK | M_ZERO);
 +      cst->cino = cino;
 +      cst->flags = CCMS_CST_DYNAMIC;
 +      cst->beg_offset = 0;
 +      cst->end_offset = 0xFFFFFFFFFFFFFFFFLLU;
 +      cst->lstate = CCMS_STATE_INVALID;
 +      cst->rstate = CCMS_STATE_INVALID;
 +      RB_INSERT(ccms_rb_tree, &cino->tree, cst);
 +      atomic_add_int(&dom->cst_count, 1);
 +}
 +
 +/*
- ccms_inode_insert(ccms_inode_t *cpar, ccms_inode_t *cino)
++ * Associate the topology CST with a CCMS inode.  The topology CST must
++ * be held locked (typically SHARED) by the caller.  The caller is responsible
++ * for interlocking a unique ccms_inode to prevent SMP races.
 + */
 +void
-  * descriptors).
++ccms_inode_associate(ccms_inode_t *cino, ccms_cst_t *topo_cst)
 +{
++      KKASSERT(topo_cst->tag.cino == NULL);
++
++      spin_lock(&cino->spin);
++      topo_cst->tag.cino = cino;
++      topo_cst->flags |= CCMS_CST_INODE;
++
++      cino->topo_cst = topo_cst;
++      cino->parent = topo_cst->cino;
++      cino->flags |= CCMS_INODE_INSERTED;
++      spin_unlock(&cino->spin);
++}
++
++#if 0
++
++int
++ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock)
++
 +      spin_lock(&cpar->spin);
 +      spin_lock(&cino->spin);
++
 +      KKASSERT((cino->flags & CCMS_INODE_INSERTED) == 0);
++      cino->topo_cst.beg_offset = key;
++      cino->topo_cst.end_offset = key;
++
 +      if (RB_INSERT(ccms_rb_tree, &cpar->tree, &cino->topo_cst)) {
 +              spin_unlock(&cino->spin);
 +              spin_unlock(&cpar->spin);
 +              panic("ccms_inode_insert: duplicate entry");
 +      }
 +      cino->parent = cpar;
 +      cino->flags |= CCMS_INODE_INSERTED;
 +      spin_unlock(&cino->spin);
 +      spin_unlock(&cpar->spin);
 +}
 +
++#endif
++
 +/*
 + * Delete an inode from the topology.  The inode can remain in active use
 + * after the deletion (e.g. when unlinking a file which still has open
- ccms_inode_delete(ccms_inode_t *cino)
++ * descriptors) but its topo_cst is removed from its parent.
 + *
 + * If the caller is destroying the ccms_inode the caller must call
 + * ccms_inode_uninit() to invalidate the cache state (which can block).
 + */
 +void
-       KKASSERT(cino->flags & CCMS_INODE_DELETING);
++ccms_inode_disassociate(ccms_inode_t *cino)
 +{
 +      ccms_inode_t *cpar;
++      ccms_cst_t *topo_cst;
 +      int flags;
 +
 +      /*
 +       * Interlock with the DELETING flag.
 +       */
 +      spin_lock(&cino->spin);
 +      flags = cino->flags;
 +      cino->flags |= CCMS_INODE_DELETING;
 +      spin_unlock(&cino->spin);
 +
 +      if (flags & CCMS_INODE_DELETING)
 +              return;
 +      if ((flags & CCMS_INODE_INSERTED) == 0)
 +              return;
 +
++      /*
++       *
++       */
++      topo_cst = cino->topo_cst;
++
++ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock)
++
 +      /*
 +       * We have the interlock, we are the only ones who can delete
 +       * the inode now.
 +       */
 +      cpar = cino->parent;
 +      spin_lock(&cpar->spin);
 +      spin_lock(&cino->spin);
 +      KKASSERT(cpar == cino->parent);
 +
 +      cino->flags &= ~CCMS_INODE_INSERTED;
 +      RB_REMOVE(ccms_rb_tree, &cpar->tree, &cino->topo_cst);
 +
 +      spin_unlock(&cino->spin);
 +      spin_unlock(&cpar->spin);
 +}
 +
 +/*
 + * The caller has removed the inode from the topology and is now trying
 + * to destroy the structure.  This routine flushes the cache state and
 + * can block on third-party interactions.
 + *
 + * NOTE: Caller must have already destroyed any recursive inode state.
 + */
 +void
 +ccms_inode_uninit(ccms_inode_t *cino)
 +{
 +      ccms_cst_t *scan;
 +
-       scan = ccms_free_pass1(cino, 0);
++      KKASSERT((cino->flags & CCMS_INODE_INSERTED) == 0);
 +      spin_lock(&cino->spin);
 +
 +      while ((scan = RB_ROOT(&cino->tree)) != NULL) {
 +              KKASSERT(scan->flags & CCMS_CST_DYNAMIC);
 +              KKASSERT((scan->flags & CCMS_CST_DELETING) == 0);
 +              RB_REMOVE(ccms_rb_tree, &cino->tree, scan);
 +              scan->flags |= CCMS_CST_DELETING;
 +              scan->flags &= ~CCMS_CST_INSERTED;
 +              spin_unlock(&cino->spin);
 +
 +              /*
 +               * Inval can be called without the inode spinlock because
 +               * we own the DELETING flag.
 +               */
 +              ccms_lstate_put(scan);
 +              ccms_rstate_put(scan);
 +              atomic_add_int(&cino->domain->cst_count, -1);
 +
 +              kfree(scan, cino->domain->mcst);
 +              spin_lock(&cino->spin);
 +      }
 +      KKASSERT((cino->attr_cst.flags & CCMS_CST_DELETING) == 0);
 +      cino->attr_cst.flags |= CCMS_CST_DELETING;
 +      KKASSERT((cino->topo_cst.flags & CCMS_CST_DELETING) == 0);
 +      cino->topo_cst.flags |= CCMS_CST_DELETING;
 +      spin_unlock(&cino->spin);
 +
 +      /*
 +       * Inval can be called without the inode spinlock because
 +       * we own the DELETING flag.  Similarly we can clear cino->domain
 +       * and cino->handle because we own the DELETING flag on the cino.
 +       */
 +      ccms_lstate_put(&cino->attr_cst);
 +      ccms_rstate_put(&cino->attr_cst);
 +      ccms_lstate_put(&cino->topo_cst);
 +      ccms_rstate_put(&cino->topo_cst);
 +
++      /*
++       * Clean out the ccms_inode free CST cache
++       */
++      spin_lock(&cino->spin);
++      scan = ccms_free_pass1(cino, 0);
++      spin_unlock(&cino->spin);
 +      ccms_free_pass2(scan);
 +
 +      cino->domain = NULL;
 +      cino->handle = NULL;
 +}
 +
++#endif
++
 +/*
 + * This is the core CCMS lock acquisition code and is typically called
 + * by program-specific wrappers which initialize the lock structure.
 + *
 + * Three cache coherent domains can be obtained, the topological 't'
 + * domain, the attribute 'a' domain, and a range in the data 'd' domain.
 + *
 + * A topological CCMS lock covers the entire attribute and data domain
 + * plus recursively covers the entire directory sub-tree, so if a topo
 + * lock is requested, the code currently asserts if the 'a' or 'd' locks
 + * are also specified in the same request.
 + *
 + * You can get both an 'a' and a 'd' lock at the same time and, in
 + * particular, a VFS can use the 'a' lock to also lock the related
 + * VFS inode structure if it desires to.  HAMMER2 utilizes this feature.
 + *
 + * Topo locks are typically needed for rename operations and topo CST
 + * cache state on the backend can be used to limit the number of dynamic
 + * CST allocations backing the live CCMS locks.
 + */
 +int
 +ccms_lock_get(ccms_inode_t *cino, ccms_lock_t *lock)
 +{
 +      struct ccms_lock_scan_info info;
 +      ccms_cst_t *cst;
 +      int use_redo = 0;
 +      ccms_state_t highest_state;
 +
 +      /*
 +       * Live local locks prevent remotes from downgrading the rstate,
 +       * so we have to acquire a local lock before testing rstate.
 +       *
 +       * The local lock must be released if a remote upgrade is required
 +       * to avoid a deadlock, and we retry in that situation.
 +       */
 +again:
 +      if (lock->tstate) {
 +              KKASSERT(lock->astate == 0 && lock->dstate == 0);
 +              lock->icst = &cino->topo_cst;
 +              ccms_lstate_get(lock->icst, lock->tstate);
 +
 +              if (cino->topo_cst.rstate < lock->tstate) {
 +                      ccms_lstate_put(&cino->topo_cst);
 +                      ccms_rstate_get(&cino->topo_cst, lock->tstate);
 +                      goto again;
 +              }
 +      } else {
 +              /*
 +               * The topo rstate must be at least ALLOWED for us to be
 +               * able to acquire any other cache state.  If the topo
 +               * rstate is already higher than that then we may have
 +               * to upgrade it further to cover the lstate's we are
 +               * requesting.
 +               */
 +              highest_state = CCMS_STATE_ALLOWED;
 +              if (cino->topo_cst.rstate > highest_state) {
 +                      if (highest_state < lock->astate)
 +                              highest_state = lock->astate;
 +                      if (highest_state < lock->dstate)
 +                              highest_state = lock->dstate;
 +              }
 +              if (cino->topo_cst.rstate < highest_state)
 +                      ccms_rstate_get(&cino->topo_cst, highest_state);
 +              /* no need to retry */
 +      }
 +      if (lock->astate) {
 +              lock->icst = &cino->attr_cst;
 +              ccms_lstate_get(lock->icst, lock->astate);
 +
 +              if (cino->attr_cst.rstate < lock->astate) {
 +                      ccms_lstate_put(&cino->attr_cst);
 +                      if (lock->tstate)
 +                              ccms_lstate_put(&cino->topo_cst);
 +                      ccms_rstate_get(&cino->attr_cst, lock->astate);
 +                      goto again;
 +              }
 +      }
 +
 +      /*
 +       * The data-lock is a range-lock and requires a bit more code.
 +       * The CST space is partitioned so the precise range is covered.
 +       *
 +       * Multiple CST's may be involved and dcst points to the left hand
 +       * edge.
 +       */
 +      if (lock->dstate) {
 +              info.lock = lock;
 +              info.cino = cino;
 +              info.coll_cst = NULL;
 +
 +              spin_lock(&cino->spin);
 +
 +              /*
 +               * Make sure cino has enough free CSTs to cover the operation,
 +               * so we can hold the spinlock through the scan later on.
 +               */
 +              while (cino->free_cache == NULL ||
 +                     cino->free_cache->free_next == NULL) {
 +                      spin_unlock(&cino->spin);
 +                      cst = kmalloc(sizeof(*cst), cino->domain->mcst,
 +                                    M_WAITOK | M_ZERO);
 +                      atomic_add_int(&cino->domain->cst_count, 1);
 +                      spin_lock(&cino->spin);
 +                      cst->free_next = cino->free_cache;
 +                      cino->free_cache = cst;
 +              }
 +
 +              /*
 +               * The partitioning code runs with the spinlock held.  If
 +               * we've already partitioned due to having to do an rstate
 +               * upgrade we run a redo instead of a get.
 +               */
 +              info.rstate_upgrade_needed = 0;
 +              if (use_redo == 0) {
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_get_match, &info);
 +              } else {
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_redo_match, &info);
 +              }
 +
 +              /*
 +               * If a collision occurred, undo the fragments we were able
 +               * to obtain, block, and try again.
 +               */
 +              while (info.coll_cst != NULL) {
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_undo_match, &info);
 +                      info.coll_cst->blocked = 1;
 +                      ssleep(info.coll_cst, &cino->spin, 0, "ccmsget", hz);
 +                      info.coll_cst = NULL;
 +                      info.rstate_upgrade_needed = 0;
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_redo_match, &info);
 +              }
 +
 +              /*
 +               * If the rstate needs to be upgraded we have to undo the
 +               * local locks (but we retain the partitioning).
 +               *
 +               * Set use_redo to indicate that the partitioning was retained
 +               * (i.e. lrefs and rrefs remain intact).
 +               */
 +              if (info.rstate_upgrade_needed) {
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_undo_match, &info);
 +                      spin_unlock(&cino->spin);
 +                      if (lock->astate)
 +                              ccms_lstate_put(&cino->attr_cst);
 +                      if (lock->tstate)
 +                              ccms_lstate_put(&cino->topo_cst);
 +                      spin_lock(&cino->spin);
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_upgrade_match, &info);
 +                      spin_unlock(&cino->spin);
 +                      use_redo = 1;
 +                      goto again;
 +              }
 +
 +              /*
 +               * Cleanup free CSTs beyond the 2 we wish to retain.
 +               */
 +              cst = ccms_free_pass1(cino, 2);
 +              spin_unlock(&cino->spin);
 +              ccms_free_pass2(cst);
 +      }
 +
 +      /*
 +       * Ok, everything is in good shape EXCEPT we might not have
 +       * sufficient topo_cst.rstate.  It could have gotten ripped
 +       * out from under us.  Once we have the local locks it can
 +       * no longer be downgraded so a check here suffices.
 +       */
 +      highest_state = CCMS_STATE_ALLOWED;
 +      if (highest_state < lock->tstate)
 +              highest_state = lock->tstate;
 +      if (highest_state < lock->astate)
 +              highest_state = lock->astate;
 +      if (highest_state < lock->dstate)
 +              highest_state = lock->dstate;
 +
 +      if (cino->topo_cst.rstate < highest_state) {
 +              if (lock->dstate) {
 +                      spin_lock(&cino->spin);
 +                      RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                              ccms_lock_put_match, &info);
 +                      spin_unlock(&cino->spin);
 +              }
 +              if (lock->astate)
 +                      ccms_lstate_put(&cino->attr_cst);
 +              if (lock->tstate)
 +                      ccms_lstate_put(&cino->topo_cst);
 +              ccms_rstate_get(&cino->topo_cst, highest_state);
 +              use_redo = 0;
 +              goto again;
 +      }
 +      return(0);
 +}
 +
 +/*
 + * Obtain a CCMS lock, initialize the lock structure based on the uio.
 + *
 + * Both the attribute AND a ranged-data lock is acquired.
 + */
 +int
 +ccms_lock_get_uio(ccms_inode_t *cino, ccms_lock_t *lock, struct uio *uio)
 +{
 +      ccms_state_t dstate;
 +      ccms_off_t eoff;
 +
 +      if (uio->uio_rw == UIO_READ)
 +              dstate = CCMS_STATE_SHARED;
 +      else
 +              dstate = CCMS_STATE_MODIFIED;
 +
 +      /*
 +       * Calculate the ending offset (byte inclusive), making sure a seek
 +       * overflow does not blow us up.
 +       */
 +      eoff = uio->uio_offset + uio->uio_resid - 1;
 +      if (eoff < uio->uio_offset)
 +              eoff = 0x7FFFFFFFFFFFFFFFLL;
 +      lock->beg_offset = uio->uio_offset;
 +      lock->end_offset = eoff;
 +      lock->tstate = 0;
 +      lock->astate = dstate;
 +      lock->dstate = dstate;
 +      return (ccms_lock_get(cino, lock));
 +}
 +
 +/*
 + * Obtain a CCMS lock.  Only the attribute lock is acquired.
 + */
 +int
 +ccms_lock_get_attr(ccms_inode_t *cino, ccms_lock_t *lock, ccms_state_t astate)
 +{
 +      lock->tstate = 0;
 +      lock->astate = astate;
 +      lock->dstate = 0;
 +      return (ccms_lock_get(cino, lock));
 +}
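
Editorial sketch of the caller side of the attribute path above; the function name is hypothetical and the EXCLUSIVE state is just an example of a valid ccms_state_t.

static void
example_attr_update(ccms_inode_t *cino)
{
	ccms_lock_t lock;

	/* acquire the attribute domain exclusively */
	ccms_lock_get_attr(cino, &lock, CCMS_STATE_EXCLUSIVE);

	/* ... update the inode's attribute state here ... */

	/* release; the put keys off the tstate/astate/dstate set by the get */
	ccms_lock_put(cino, &lock);
}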
 +
 +/*
 + * Helper routine.
 + *
 + * NOTE: called with spinlock held.
 + */
 +static
 +int
 +ccms_lock_get_match(ccms_cst_t *cst, void *arg)
 +{
 +      struct ccms_lock_scan_info *info = arg;
 +      ccms_lock_t *lock = info->lock;
 +      ccms_cst_t *ncst;
 +
 +      /*
 +       * If the lock's left edge is within the CST we must split the CST
 +       * into two pieces [cst][ncst].  lrefs must be bumped on the CST
 +       * containing the left edge.
 +       *
 +       * NOTE! cst->beg_offset may not be modified.  This allows us to
 +       *       avoid having to manipulate the cst's position in the tree.
 +       */
 +      if (lock->beg_offset > cst->beg_offset) {
 +              ncst = info->cino->free_cache;
 +              KKASSERT(ncst != NULL);
 +              info->cino->free_cache = ncst->free_next;
 +              ncst->free_next = NULL;
 +
 +              *ncst = *cst;
 +              cst->end_offset = lock->beg_offset - 1;
 +              cst->rrefs = 0;
 +              ncst->beg_offset = lock->beg_offset;
 +              ncst->lrefs = 1;
 +              RB_INSERT(ccms_rb_tree, &info->cino->tree, ncst);
 +
 +              /*
 +               * ncst becomes our 'matching' cst.
 +               */
 +              cst = ncst;
 +      } else if (lock->beg_offset == cst->beg_offset) {
 +              ++cst->lrefs;
 +      }
 +
 +      /*
 +       * If the lock's right edge is within the CST we must split the CST
 +       * into two pieces [cst][ncst].  rrefs must be bumped on the CST
 +       * containing the right edge.
 +       *
 +       * NOTE! cst->beg_offset may not be modified.  This allows us to
 +       * avoid having to manipulate the cst's position in the tree.
 +       */
 +      if (lock->end_offset < cst->end_offset) {
 +              ncst = info->cino->free_cache;
 +              KKASSERT(ncst != NULL);
 +              info->cino->free_cache = ncst->free_next;
 +              ncst->free_next = NULL;
 +
 +              *ncst = *cst;
 +              cst->end_offset = lock->end_offset;
 +              cst->rrefs = 1;
 +              ncst->beg_offset = lock->end_offset + 1;
 +              ncst->lrefs = 0;
 +              RB_INSERT(ccms_rb_tree, &info->cino->tree, ncst);
 +              /* cst remains our 'matching' cst */
 +      } else if (lock->end_offset == cst->end_offset) {
 +              ++cst->rrefs;
 +      }
 +
 +      /*
 +       * The lock covers the CST, so increment the CST's coverage count.
 +       * Then attempt to obtain the shared/exclusive lock.  The coverage
 +       * count is maintained until the put operation.
 +       */
 +      ++cst->xrefs;
 +      if (cst->lstate < lock->dstate)
 +              cst->lstate = lock->dstate;
 +
 +      /*
 +       * If we have already collided we make no more modifications
 +       * to cst->count, but we must continue the scan to properly
 +       * partition the cst.
 +       */
 +      if (info->coll_cst)
 +              return(0);
 +
 +      switch(lock->dstate) {
 +      case CCMS_STATE_INVALID:
 +              break;
 +      case CCMS_STATE_ALLOWED:
 +      case CCMS_STATE_SHARED:
 +      case CCMS_STATE_SLAVE:
 +              if (cst->count < 0) {
 +                      info->coll_cst = cst;
 +              } else {
 +                      ++cst->count;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("CST SHARE %d %lld-%lld\n",
 +                                      cst->count,
 +                                      (long long)cst->beg_offset,
 +                                      (long long)cst->end_offset);
 +                      }
 +              }
 +              break;
 +      case CCMS_STATE_MASTER:
 +      case CCMS_STATE_EXCLUSIVE:
 +              if (cst->count != 0) {
 +                      info->coll_cst = cst;
 +              } else {
 +                      --cst->count;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("CST EXCLS %d %lld-%lld\n",
 +                                      cst->count,
 +                                      (long long)cst->beg_offset,
 +                                      (long long)cst->end_offset);
 +                      }
 +              }
 +              break;
 +      case CCMS_STATE_MODIFIED:
 +              if (cst->count != 0) {
 +                      info->coll_cst = cst;
 +              } else {
 +                      --cst->count;
 +                      if (cst->lstate <= CCMS_STATE_EXCLUSIVE)
 +                              cst->lstate = CCMS_STATE_MODIFIED;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("CST MODXL %d %lld-%lld\n",
 +                                      cst->count,
 +                                      (long long)cst->beg_offset,
 +                                      (long long)cst->end_offset);
 +                      }
 +              }
 +              break;
 +      default:
 +              panic("ccms_lock_get_match: bad state %d\n", lock->dstate);
 +              break;
 +      }
 +      return(0);
 +}
 +
 +/*
 + * Undo a partially resolved ccms_ltype rangelock.  This is atomic with
 + * the scan/redo code so there should not be any blocked locks when
 + * transitioning to 0.  lrefs and rrefs are not touched in order to
 + * retain the partitioning.
 + *
 + * If coll_cst is non-NULL we stop when we hit this element as locks on
 + * no further elements were obtained.  This element might not represent
 + * a left or right edge but coll_cst can only be non-NULL if the spinlock
 + * was held throughout the get/redo and the undo.
 + *
 + * NOTE: called with spinlock held.
 + */
 +static
 +int
 +ccms_lock_undo_match(ccms_cst_t *cst, void *arg)
 +{
 +      struct ccms_lock_scan_info *info = arg;
 +      ccms_lock_t *lock = info->lock;
 +
 +      if (cst == info->coll_cst)
 +              return(-1);
 +
 +      switch (lock->dstate) {
 +      case CCMS_STATE_INVALID:
 +              break;
 +      case CCMS_STATE_ALLOWED:
 +      case CCMS_STATE_SHARED:
 +      case CCMS_STATE_SLAVE:
 +              KKASSERT(cst->count > 0);
 +              --cst->count;
 +              KKASSERT(cst->count || cst->blocked == 0);
 +              break;
 +      case CCMS_STATE_MASTER:
 +      case CCMS_STATE_EXCLUSIVE:
 +      case CCMS_STATE_MODIFIED:
 +              KKASSERT(cst->count < 0);
 +              ++cst->count;
 +              KKASSERT(cst->count || cst->blocked == 0);
 +              break;
 +      default:
 +              panic("ccms_lock_undo_match: bad state %d\n", lock->dstate);
 +              break;
 +      }
 +      return(0);
 +}
 +
 +/*
 + * Redo the local lock request for a range which has already been
 + * partitioned.
 + *
 + * NOTE: called with spinlock held.
 + */
 +static
 +int
 +ccms_lock_redo_match(ccms_cst_t *cst, void *arg)
 +{
 +      struct ccms_lock_scan_info *info = arg;
 +      ccms_lock_t *lock = info->lock;
 +
 +      KKASSERT(info->coll_cst == NULL);
 +
 +      switch(lock->dstate) {
 +      case CCMS_STATE_INVALID:
 +              break;
 +      case CCMS_STATE_ALLOWED:
 +      case CCMS_STATE_SHARED:
 +      case CCMS_STATE_SLAVE:
 +              if (cst->count < 0) {
 +                      info->coll_cst = cst;
 +              } else {
 +                      if (ccms_debug >= 9) {
 +                              kprintf("CST SHARE %d %lld-%lld\n",
 +                                      cst->count,
 +                                      (long long)cst->beg_offset,
 +                                      (long long)cst->end_offset);
 +                      }
 +                      ++cst->count;
 +              }
 +              break;
 +      case CCMS_STATE_MASTER:
 +      case CCMS_STATE_EXCLUSIVE:
 +              if (cst->count != 0) {
 +                      info->coll_cst = cst;
 +              } else {
 +                      --cst->count;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("CST EXCLS %d %lld-%lld\n",
 +                                      cst->count,
 +                                      (long long)cst->beg_offset,
 +                                      (long long)cst->end_offset);
 +                      }
 +              }
 +              break;
 +      case CCMS_STATE_MODIFIED:
 +              if (cst->count != 0) {
 +                      info->coll_cst = cst;
 +              } else {
 +                      --cst->count;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("CST MODXL %d %lld-%lld\n",
 +                                      cst->count,
 +                                      (long long)cst->beg_offset,
 +                                      (long long)cst->end_offset);
 +                      }
 +              }
 +              break;
 +      default:
 +              panic("ccms_lock_redo_match: bad state %d\n", lock->dstate);
 +              break;
 +      }
 +
 +      if (info->coll_cst)
 +              return(-1);     /* stop the scan */
 +      return(0);              /* continue the scan */
 +}
 +
 +/*
 + * Upgrade the rstate for the matching range.
 + *
 + * NOTE: Called with spinlock held.
 + */
 +static
 +int
 +ccms_lock_upgrade_match(ccms_cst_t *cst, void *arg)
 +{
 +      struct ccms_lock_scan_info *info = arg;
 +      ccms_lock_t *lock = info->lock;
 +
 +      /*
 +       * ccms_rstate_get() can block so we must release the spinlock.
 +       * To prevent the cst from getting ripped out on us we temporarily
 +       * bump both lrefs and rrefs.
 +       */
 +      if (cst->rstate < lock->dstate) {
 +              ++cst->lrefs;
 +              ++cst->rrefs;
 +              spin_unlock(&info->cino->spin);
 +              ccms_rstate_get(cst, lock->dstate);
 +              spin_lock(&info->cino->spin);
 +              --cst->lrefs;
 +              --cst->rrefs;
 +      }
 +      return(0);
 +}
 +
 +/*
 + * Release a previously acquired CCMS lock.
 + */
 +int
 +ccms_lock_put(ccms_inode_t *cino, ccms_lock_t *lock)
 +{
 +      struct ccms_lock_scan_info info;
 +      ccms_cst_t *scan;
 +
 +      if (lock->tstate) {
 +              ccms_lstate_put(lock->icst);
 +              lock->tstate = 0;
 +              lock->icst = NULL;
 +      } else if (lock->astate) {
 +              ccms_lstate_put(lock->icst);
 +              lock->astate = 0;
 +              lock->icst = NULL;
 +      }
 +
 +      if (lock->dstate) {
 +              info.lock = lock;
 +              info.cino = cino;
 +              spin_lock(&cino->spin);
 +              RB_SCAN(ccms_rb_tree, &cino->tree, ccms_lock_scan_cmp,
 +                      ccms_lock_put_match, &info);
 +              scan = ccms_free_pass1(cino, 2);
 +              spin_unlock(&cino->spin);
 +              ccms_free_pass2(scan);
 +              lock->dstate = 0;
 +              lock->dcst = NULL;
 +      }
 +
 +      return(0);
 +}
 +
 +/*
 + * Release a local lock.  The related CST's lstate is set to INVALID once
 + * the coverage drops to 0 and adjacent compatible entries will be
 + * recombined.
 + *
 + * NOTE: called with spinlock held.
 + */
 +static
 +int
 +ccms_lock_put_match(ccms_cst_t *cst, void *arg)
 +{
 +      struct ccms_lock_scan_info *info = arg;
 +      ccms_lock_t *lock = info->lock;
 +      ccms_cst_t *ocst;
 +
 +      /*
 +       * Undo the local shared/exclusive rangelock.
 +       */
 +      switch(lock->dstate) {
 +      case CCMS_STATE_INVALID:
 +              break;
 +      case CCMS_STATE_ALLOWED:
 +      case CCMS_STATE_SHARED:
 +      case CCMS_STATE_SLAVE:
 +              KKASSERT(cst->count > 0);
 +              --cst->count;
 +              if (ccms_debug >= 9) {
 +                      kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->count,
 +                              (long long)cst->beg_offset,
 +                              (long long)cst->end_offset,
 +                              cst->blocked);
 +              }
 +              if (cst->blocked && cst->count == 0) {
 +                      cst->blocked = 0;
 +                      wakeup(cst);
 +              }
 +              break;
 +      case CCMS_STATE_MASTER:
 +      case CCMS_STATE_EXCLUSIVE:
 +      case CCMS_STATE_MODIFIED:
 +              KKASSERT(cst->count < 0);
 +              ++cst->count;
 +              if (ccms_debug >= 9) {
 +                      kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->count,
 +                              (long long)cst->beg_offset,
 +                              (long long)cst->end_offset,
 +                              cst->blocked);
 +              }
 +              if (cst->blocked && cst->count == 0) {
 +                      cst->blocked = 0;
 +                      wakeup(cst);
 +              }
 +              break;
 +      default:
 +              panic("ccms_lock_put_match: bad state %d\n", lock->dstate);
 +              break;
 +      }
 +
 +      /*
 +       * Decrement the lock coverage count on the CST.  Decrement the left
 +       * and right edge counts as appropriate.
 +       *
 +       * When lrefs or rrefs drops to zero we check the adjacent entry to
 +       * determine whether a merge is possible.  If the appropriate refs
 +       * field (rrefs for the entry to our left, lrefs for the entry to
 +       * our right) is 0, then all covering locks must cover both entries
 +       * and the xrefs field must match.  We can then merge the entries
 +       * if they have compatible cache states.
 +       *
 +       * However, because we are cleaning up the shared/exclusive count
 +       * at the same time, the count field may be temporarily out of
 +       * sync, so require that the count field also match before doing
 +       * a merge.
 +       *
 +       * When merging an element which is being blocked on, the blocking
 +       * thread(s) will be woken up.
 +       *
 +       * If the dataspace has too many CSTs we may be able to merge the
 +       * entries even if their cache states are not the same, by dropping
 +       * both to a compatible (lower) cache state and performing the
 +       * appropriate management operations.  XXX
 +       */
 +      if (--cst->xrefs == 0)
 +              cst->lstate = CCMS_STATE_INVALID;
 +
 +      if (lock->beg_offset == cst->beg_offset && --cst->lrefs == 0) {
 +              if ((ocst = RB_PREV(ccms_rb_tree,
 +                                  &info->cino->tree, cst)) != NULL &&
 +                  ocst->rrefs == 0 &&
 +                  ocst->lstate == cst->lstate &&
 +                  ocst->rstate == cst->rstate &&
 +                  ocst->count == cst->count
 +              ) {
 +                      KKASSERT(ocst->xrefs == cst->xrefs);
 +                      KKASSERT(ocst->end_offset + 1 == cst->beg_offset);
 +                      RB_REMOVE(ccms_rb_tree, &info->cino->tree, ocst);
 +                      cst->beg_offset = ocst->beg_offset;
 +                      cst->lrefs = ocst->lrefs;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("MERGELEFT %p %lld-%lld (%d)\n",
 +                                     ocst,
 +                                     (long long)cst->beg_offset,
 +                                     (long long)cst->end_offset,
 +                                     cst->blocked);
 +                      }
 +                      if (ocst->blocked) {
 +                              ocst->blocked = 0;
 +                              wakeup(ocst);
 +                      }
 +                      ocst->free_next = info->cino->free_cache;
 +                      info->cino->free_cache = ocst;
 +              }
 +      }
 +      if (lock->end_offset == cst->end_offset && --cst->rrefs == 0) {
 +              if ((ocst = RB_NEXT(ccms_rb_tree,
 +                                  &info->cino->tree, cst)) != NULL &&
 +                  ocst->lrefs == 0 &&
 +                  ocst->lstate == cst->lstate &&
 +                  ocst->rstate == cst->rstate &&
 +                  ocst->count == cst->count
 +              ) {
 +                      KKASSERT(ocst->xrefs == cst->xrefs);
 +                      KKASSERT(cst->end_offset + 1 == ocst->beg_offset);
 +                      RB_REMOVE(ccms_rb_tree, &info->cino->tree, ocst);
 +                      cst->end_offset = ocst->end_offset;
 +                      cst->rrefs = ocst->rrefs;
 +                      if (ccms_debug >= 9) {
 +                              kprintf("MERGERIGHT %p %lld-%lld\n",
 +                                     ocst,
 +                                     (long long)cst->beg_offset,
 +                                     (long long)cst->end_offset);
 +                      }
 +                      ocst->free_next = info->cino->free_cache;
 +                      info->cino->free_cache = ocst;
 +              }
 +      }
 +      return(0);
 +}
 +
 +/*
 + * RB tree compare function for insertions and deletions.  This function
 + * compares two CSTs.
 + */
 +static int
 +ccms_cst_cmp(ccms_cst_t *b1, ccms_cst_t *b2)
 +{
 +      if (b1->end_offset < b2->beg_offset)
 +              return(-1);
 +      if (b1->beg_offset > b2->end_offset)
 +              return(1);
 +      return(0);
 +}
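 +
 +/*
 + * A short worked example (hypothetical values): the compare returns 0
 + * for any overlap, so a probe CST such as {beg_offset = 100, end_offset
 + * = 100} matches every element whose range covers offset 100.  Given
 + * a = {0, 99} and b = {100, 199}:
 + *
 + *    ccms_cst_cmp(&a, &b) == -1      (a lies entirely to the left)
 + *    ccms_cst_cmp(&b, &a) == 1       (b lies entirely to the right)
 + *    ccms_cst_cmp(&a, &a) == 0       (any overlap compares equal)
 + */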
 +
 +/*
 + * RB tree scanning compare function.  This function compares the CST
 + * from the tree against the supplied ccms_lock and returns the CST's
 + * placement relative to the lock.
 + */
 +static int
 +ccms_lock_scan_cmp(ccms_cst_t *cst, void *arg)
 +{
 +      struct ccms_lock_scan_info *info = arg;
 +      ccms_lock_t *lock = info->lock;
 +
 +      if (cst->end_offset < lock->beg_offset)
 +              return(-1);
 +      if (cst->beg_offset > lock->end_offset)
 +              return(1);
 +      return(0);
 +}
 +
 +/************************************************************************
 + *            STANDALONE LSTATE AND RSTATE SUPPORT FUNCTIONS          *
 + ************************************************************************
 + *
 + * These functions are used to perform work on the attr_cst and topo_cst
 + * embedded in a ccms_inode, and to issue remote state operations.  These
 + * functions are called without the ccms_inode spinlock held.
 + */
 +
 +static
 +void
 +ccms_lstate_get(ccms_cst_t *cst, ccms_state_t state)
 +{
 +      int blocked;
 +
 +      spin_lock(&cst->cino->spin);
 +      ++cst->xrefs;
 +
 +      for (;;) {
 +              blocked = 0;
 +
 +              switch(state) {
 +              case CCMS_STATE_INVALID:
 +                      break;
 +              case CCMS_STATE_ALLOWED:
 +              case CCMS_STATE_SHARED:
 +              case CCMS_STATE_SLAVE:
 +                      if (cst->count < 0) {
 +                              blocked = 1;
 +                      } else {
 +                              ++cst->count;
 +                              if (ccms_debug >= 9) {
 +                                      kprintf("CST SHARE %d %lld-%lld\n",
 +                                              cst->count,
 +                                              (long long)cst->beg_offset,
 +                                              (long long)cst->end_offset);
 +                              }
 +                      }
 +                      break;
 +              case CCMS_STATE_MASTER:
 +              case CCMS_STATE_EXCLUSIVE:
 +                      if (cst->count != 0) {
 +                              blocked = 1;
 +                      } else {
 +                              --cst->count;
 +                              if (ccms_debug >= 9) {
 +                                      kprintf("CST EXCLS %d %lld-%lld\n",
 +                                              cst->count,
 +                                              (long long)cst->beg_offset,
 +                                              (long long)cst->end_offset);
 +                              }
 +                      }
 +                      break;
 +              case CCMS_STATE_MODIFIED:
 +                      if (cst->count != 0) {
 +                              blocked = 1;
 +                      } else {
 +                              --cst->count;
 +                              if (cst->lstate <= CCMS_STATE_EXCLUSIVE)
 +                                      cst->lstate = CCMS_STATE_MODIFIED;
 +                              if (ccms_debug >= 9) {
 +                                      kprintf("CST MODXL %d %lld-%lld\n",
 +                                              cst->count,
 +                                              (long long)cst->beg_offset,
 +                                              (long long)cst->end_offset);
 +                              }
 +                      }
 +                      break;
 +              default:
 +                      panic("ccms_lstate_get: bad state %d\n", state);
 +                      break;
 +              }
 +              if (blocked == 0)
 +                      break;
 +              ssleep(cst, &cst->cino->spin, 0, "ccmslget", hz);
 +      }
 +      if (cst->lstate < state)
 +              cst->lstate = state;
 +      spin_unlock(&cst->cino->spin);
 +}
 +
 +static
 +void
 +ccms_lstate_put(ccms_cst_t *cst)
 +{
 +      spin_lock(&cst->cino->spin);
 +
 +      switch(cst->lstate) {
 +      case CCMS_STATE_INVALID:
 +              break;
 +      case CCMS_STATE_ALLOWED:
 +      case CCMS_STATE_SHARED:
 +      case CCMS_STATE_SLAVE:
 +              KKASSERT(cst->count > 0);
 +              --cst->count;
 +              if (ccms_debug >= 9) {
 +                      kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->count,
 +                              (long long)cst->beg_offset,
 +                              (long long)cst->end_offset,
 +                              cst->blocked);
 +              }
 +              if (cst->blocked && cst->count == 0) {
 +                      cst->blocked = 0;
 +                      wakeup(cst);
 +              }
 +              break;
 +      case CCMS_STATE_MASTER:
 +      case CCMS_STATE_EXCLUSIVE:
 +      case CCMS_STATE_MODIFIED:
 +              KKASSERT(cst->count < 0);
 +              ++cst->count;
 +              if (ccms_debug >= 9) {
 +                      kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->count,
 +                              (long long)cst->beg_offset,
 +                              (long long)cst->end_offset,
 +                              cst->blocked);
 +              }
 +              if (cst->blocked && cst->count == 0) {
 +                      cst->blocked = 0;
 +                      wakeup(cst);
 +              }
 +              break;
 +      default:
 +              panic("ccms_lstate_put: bad state %d\n", cst->lstate);
 +              break;
 +      }
 +
 +      if (--cst->xrefs == 0)
 +              cst->lstate = CCMS_STATE_INVALID;
 +      spin_unlock(&cst->cino->spin);
 +}
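 +
 +/*
 + * A minimal usage sketch (hypothetical caller): cst->count is positive
 + * while shared/allowed/slave grants are held and negative while a
 + * master/exclusive/modified grant is held.  A caller working on the
 + * attr_cst embedded in a ccms_inode would bracket its access with a
 + * matched get/put pair:
 + *
 + *    ccms_lstate_get(&cino->attr_cst, CCMS_STATE_EXCLUSIVE);
 + *    ... modify the attribute data covered by attr_cst ...
 + *    ccms_lstate_put(&cino->attr_cst);
 + *
 + * ccms_lstate_get() sleeps until any conflicting count drains.
 + */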
 +
 +/*
 + * XXX third-party interaction & granularity
 + */
 +static
 +void
 +ccms_rstate_get(ccms_cst_t *cst, ccms_state_t state)
 +{
 +      spin_lock(&cst->cino->spin);
 +      if (cst->rstate < state)
 +              cst->rstate = state;
 +      spin_unlock(&cst->cino->spin);
 +}
 +
 +/*
 + * XXX third-party interaction & granularity
 + */
 +static
 +void
 +ccms_rstate_put(ccms_cst_t *cst)
 +{
 +      spin_lock(&cst->cino->spin);
 +      cst->rstate = CCMS_STATE_INVALID;
 +      spin_unlock(&cst->cino->spin);
 +}
index 2f8cb21,0000000..5921a78
mode 100644,000000..100644
--- /dev/null
@@@ -1,2811 -1,0 +1,2877 @@@
-        * the get function must always succeed, panic if there's no
 +/*
 + * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
 + *
 + * This code is derived from software contributed to The DragonFly Project
 + * by Matthew Dillon <dillon@dragonflybsd.org>
 + * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + *
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in
 + *    the documentation and/or other materials provided with the
 + *    distribution.
 + * 3. Neither the name of The DragonFly Project nor the names of its
 + *    contributors may be used to endorse or promote products derived
 + *    from this software without specific, prior written permission.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +/*
 + * This subsystem handles direct and indirect block searches, recursions,
 + * creation, and deletion.  Chains of blockrefs are tracked and modifications
 + * are flagged for propagation... eventually all the way back to the volume
 + * header.
 + */
 +
 +#include <sys/cdefs.h>
 +#include <sys/param.h>
 +#include <sys/systm.h>
 +#include <sys/types.h>
 +#include <sys/lock.h>
 +#include <sys/uuid.h>
 +
 +#include "hammer2.h"
 +
 +static int hammer2_indirect_optimize; /* XXX SYSCTL */
 +
 +static hammer2_chain_t *hammer2_chain_create_indirect(
 +                      hammer2_mount_t *hmp, hammer2_chain_t *parent,
 +                      hammer2_key_t key, int keybits);
 +
 +/*
 + * Splay tree
 + */
 +SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
 +
 +int
 +hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
 +{
 +      return(chain2->index - chain1->index);
 +}
 +
 +/*
 + * Recursively mark the parent chain elements so flushes can find
 + * modified elements.  Stop when we hit a chain already flagged
 + * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
 + * in chain itself.
 + *
 + * SUBMODIFIED is not set on the chain passed in.
 + *
 + * XXX rename of parent can create a SMP race
 + */
 +static void
 +hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 +{
 +      hammer2_chain_t *parent;
 +
 +      parent = chain->parent;
 +      while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
 +              atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
 +              parent = parent->parent;
 +      }
 +}
 +
 +/*
 + * Allocate a new disconnected chain element representing the specified
 + * bref.  The chain element is locked exclusively and refs is set to 1.
 + *
 + * This essentially allocates a system memory structure representing one
 + * of the media structure types, including inodes.
 + */
 +hammer2_chain_t *
 +hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
 +{
 +      hammer2_chain_t *chain;
 +      hammer2_inode_t *ip;
 +      hammer2_indblock_t *np;
 +      hammer2_data_t *dp;
 +      u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
 +
 +      /*
 +       * Construct the appropriate system structure.
 +       */
 +      switch(bref->type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
 +              chain = &ip->chain;
 +              chain->u.ip = ip;
 +              lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
 +              ip->hmp = hmp;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
 +              chain = &np->chain;
 +              chain->u.np = np;
 +              lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
 +              break;
 +      case HAMMER2_BREF_TYPE_DATA:
 +              dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
 +              chain = &dp->chain;
 +              chain->u.dp = dp;
 +              lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              chain = NULL;
 +              panic("hammer2_chain_alloc volume type illegal for op");
 +      default:
 +              chain = NULL;
 +              panic("hammer2_chain_alloc: unrecognized blockref type: %d",
 +                    bref->type);
 +      }
 +
 +      /*
 +       * Only set bref_flush if the bref has a real media offset, otherwise
 +       * the caller has to wait for the chain to be modified/block-allocated
 +       * before a blockref can be synchronized with its (future) parent.
 +       */
 +      chain->bref = *bref;
 +      if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
 +              chain->bref_flush = *bref;
 +      chain->index = -1;              /* not yet assigned */
 +      chain->refs = 1;
 +      chain->bytes = bytes;
 +      lockmgr(&chain->lk, LK_EXCLUSIVE);
 +
 +      return (chain);
 +}
 +
 +/*
 + * Free a disconnected chain element
 + */
 +void
 +hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 +{
 +      void *mem;
 +
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
 +          chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
 +              chain->data = NULL;
 +      }
 +
 +      KKASSERT(chain->bp == NULL);
 +      KKASSERT(chain->data == NULL);
 +      KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
 +               chain->u.ip->vp == NULL);
 +
 +      if ((mem = chain->u.mem) != NULL) {
 +              chain->u.mem = NULL;
 +              if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
 +                      kfree(mem, hmp->minode);
 +              else
 +                      kfree(mem, hmp->mchain);
 +      }
 +}
 +
 +/*
 + * Add a reference to a chain element (for shared access).  The chain
 + * element must already have at least 1 ref controlled by the caller.
 + */
 +void
 +hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 +{
 +      KKASSERT(chain->refs > 0);
 +      atomic_add_int(&chain->refs, 1);
 +}
 +
 +/*
 + * Drop the callers reference to the chain element.  If the ref count
 + * reaches zero the chain element and its related structure (typically an
 + * inode or indirect block) will be freed and the parent will be
 + * recursively dropped.
 + *
 + * MOVED and MODIFIED elements hold additional references so it should not
 + * be possible for the count on a modified element to drop to 0.
 + *
 + * The chain element must NOT be locked by the caller.
 + *
 + * The parent might or might not be locked by the caller but if so it
 + * will also be referenced so we shouldn't recurse upward.
 + */
 +void
 +hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 +{
 +      hammer2_chain_t *parent;
++      hammer2_inode_t *ip;
 +      u_int refs;
 +
 +      while (chain) {
 +              refs = chain->refs;
 +              cpu_ccfence();
 +              KKASSERT(refs > 0);
 +              if (refs == 1) {
 +                      KKASSERT(chain != &hmp->vchain);
 +                      parent = chain->parent;
 +                      if (parent)
 +                              lockmgr(&parent->lk, LK_EXCLUSIVE);
 +                      if (atomic_cmpset_int(&chain->refs, 1, 0)) {
 +                              /*
 +                               * Succeeded, recurse and drop parent.
 +                               * These chain elements should be synchronized
 +                               * so no delta data or inode count updates
 +                               * should be needed.
 +                               */
 +                              KKASSERT((chain->flags &
 +                                        (HAMMER2_CHAIN_MOVED |
 +                                         HAMMER2_CHAIN_MODIFIED)) == 0);
++
++                              if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
++                                      ip = chain->u.ip;
++                              else
++                                      ip = NULL;
++
++                              /*
++                               * Delete interlock
++                               */
 +                              if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
++                                      /*
++                                       * Disconnect the CCMS inode if this
++                                       * was an inode.
++                                       */
++                                      if (ip && ip->cino)
++                                              ccms_inode_delete(ip->cino);
++
++                                      /*
++                                       * Disconnect the chain and clear
++                                       * pip if it was an inode.
++                                       */
 +                                      SPLAY_REMOVE(hammer2_chain_splay,
 +                                                   &parent->shead, chain);
 +                                      atomic_set_int(&chain->flags,
 +                                                     HAMMER2_CHAIN_DELETED);
++                                      if (ip)
++                                              ip->pip = NULL;
 +                                      /* parent refs dropped via recursion */
 +                              }
++
++                              /*
++                               * Destroy the disconnected ccms_inode if
++                               * applicable.
++                               */
++                              if (ip && ip->cino) {
++                                      ccms_inode_destroy(ip->cino);
++                                      ip->cino = NULL;
++                              }
 +                              chain->parent = NULL;
 +                              if (parent)
 +                                      lockmgr(&parent->lk, LK_RELEASE);
 +                              hammer2_chain_free(hmp, chain);
 +                              chain = parent;
 +                              /* recurse on parent */
 +                      } else {
 +                              if (parent)
 +                                      lockmgr(&parent->lk, LK_RELEASE);
 +                              /* retry the same chain */
 +                      }
 +              } else {
 +                      if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
 +                              /*
 +                               * Succeeded, count did not reach zero so
 +                               * cut out of the loop.
 +                               */
 +                              break;
 +                      }
 +                      /* retry the same chain */
 +              }
 +      }
 +}
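 +
 +/*
 + * A brief ref/drop sketch (hypothetical caller): every reference handed
 + * out must eventually be returned via hammer2_chain_drop(), and the
 + * chain must not be locked by the caller at drop time:
 + *
 + *    hammer2_chain_ref(hmp, chain);          extra ref for later use
 + *    ...
 + *    hammer2_chain_drop(hmp, chain);         may free the chain and
 + *                                            recursively drop its parent
 + */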
 +
 +/*
 + * Ref and lock a chain element, acquiring its data with I/O if necessary,
 + * and specify how you would like the data to be resolved.
 + *
 + * Returns 0 on success or an error code if the data could not be acquired.
 + * The chain element is locked either way.
 + *
 + * The lock is allowed to recurse, multiple locking ops will aggregate
 + * the requested resolve types.  Once data is assigned it will not be
 + * removed until the last unlock.
 + *
 + * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 + *                       (typically used to avoid device/logical buffer
 + *                        aliasing for data)
 + *
 + * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 + *                       the INITIAL-create state (indirect blocks only).
 + *
 + *                       Do not resolve data elements for DATA chains.
 + *                       (typically used to avoid device/logical buffer
 + *                        aliasing for data)
 + *
 + * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 + *
 + *
 + * NOTE: Embedded elements (volume header, inodes) are always resolved
 + *     regardless.
 + *
 + * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 + *     element will instantiate and zero its buffer, and flush it on
 + *     release.
 + *
 + * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 + *     so as not to instantiate a device buffer, which could alias against
 + *     a logical file buffer.  However, if ALWAYS is specified the
 + *     device buffer will be instantiated anyway.
 + */
 +int
 +hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
 +{
 +      hammer2_blockref_t *bref;
 +      hammer2_off_t pbase;
 +      hammer2_off_t peof;
 +      size_t boff;
 +      size_t bbytes;
 +      int error;
 +      char *bdata;
 +
 +      /*
 +       * Lock the element.  Under certain conditions this might end up
 +       * being a recursive lock.
 +       */
 +      KKASSERT(chain->refs > 0);
 +      atomic_add_int(&chain->refs, 1);
 +      lockmgr(&chain->lk, LK_EXCLUSIVE);
 +
 +      /*
 +       * If we already have a valid data pointer no further action is
 +       * necessary.
 +       */
 +      if (chain->data)
 +              return (0);
 +
 +      /*
 +       * Do we have to resolve the data?
 +       */
 +      switch(how) {
 +      case HAMMER2_RESOLVE_NEVER:
 +              return(0);
 +      case HAMMER2_RESOLVE_MAYBE:
 +              if (chain->flags & HAMMER2_CHAIN_INITIAL)
 +                      return(0);
 +              if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
 +                      return(0);
 +              /* fall through */
 +      case HAMMER2_RESOLVE_ALWAYS:
 +              break;
 +      }
 +
 +      /*
 +       * We must resolve to a device buffer, either by issuing I/O or
 +       * by creating a zero-fill element.  We do not mark the buffer
 +       * dirty when creating a zero-fill element (the hammer2_chain_modify()
 +       * API must still be used to do that).
 +       *
 +       * The device buffer is variable-sized in powers of 2 down
 +       * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
 +       * chunk always contains buffers of the same size. (XXX)
 +       *
 +       * The minimum physical IO size may be larger than the variable
 +       * block size.
 +       */
 +      bref = &chain->bref;
 +
 +      if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
 +              bbytes = HAMMER2_MINIOSIZE;
 +      pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
 +      peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
 +      boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
 +      KKASSERT(pbase != 0);
 +
 +      /*
 +       * The getblk() optimization can only be used on newly created
 +       * elements if the physical block size matches the request.
 +       */
 +      if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
 +          chain->bytes == bbytes) {
 +              chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
 +              error = 0;
 +      } else if (hammer2_cluster_enable) {
 +              error = cluster_read(hmp->devvp, peof, pbase, bbytes,
 +                                   HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
 +                                   &chain->bp);
 +      } else {
 +              error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
 +      }
 +
 +      if (error) {
 +              kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
 +                      (intmax_t)pbase, error);
 +              bqrelse(chain->bp);
 +              chain->bp = NULL;
 +              return (error);
 +      }
 +
 +      /*
 +       * Zero the data area if the chain is in the INITIAL-create state.
 +       * Mark the buffer for bdwrite().
 +       */
 +      bdata = (char *)chain->bp->b_data + boff;
 +      if (chain->flags & HAMMER2_CHAIN_INITIAL) {
 +              bzero(bdata, chain->bytes);
 +              atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
 +      }
 +
 +      /*
 +       * Setup the data pointer, either pointing it to an embedded data
 +       * structure and copying the data from the buffer, or pointing it
 +       * into the buffer.
 +       *
 +       * The buffer is not retained when copying to an embedded data
 +       * structure in order to avoid potential deadlocks or recursions
 +       * on the same physical buffer.
 +       */
 +      switch (bref->type) {
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              /*
 +               * Copy data from bp to embedded buffer
 +               */
 +              panic("hammer2_chain_lock: called on unresolved volume header");
 +#if 0
 +              /* NOT YET */
 +              KKASSERT(pbase == 0);
 +              KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
 +              bcopy(bdata, &hmp->voldata, chain->bytes);
 +              chain->data = (void *)&hmp->voldata;
 +              bqrelse(chain->bp);
 +              chain->bp = NULL;
 +#endif
 +              break;
 +      case HAMMER2_BREF_TYPE_INODE:
 +              /*
 +               * Copy data from bp to embedded buffer, do not retain the
 +               * device buffer.
 +               */
 +              bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
 +              chain->data = (void *)&chain->u.ip->ip_data;
 +              bqrelse(chain->bp);
 +              chain->bp = NULL;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +      case HAMMER2_BREF_TYPE_DATA:
 +      default:
 +              /*
 +               * Point data at the device buffer and leave bp intact.
 +               */
 +              chain->data = (void *)bdata;
 +              break;
 +      }
 +      return (0);
 +}
 +
 +/*
 + * Unlock and deref a chain element.
 + *
 + * On the last lock release any non-embedded data (chain->bp) will be
 + * retired.
 + */
 +void
 +hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 +{
 +      long *counterp;
 +
 +      /*
 +       * Undo a recursive lock
 +       *
 +       * XXX shared locks not handled properly
 +       */
 +      if (lockcountnb(&chain->lk) > 1) {
 +              KKASSERT(chain->refs > 1);
 +              atomic_add_int(&chain->refs, -1);
 +              lockmgr(&chain->lk, LK_RELEASE);
 +              return;
 +      }
 +
 +      /*
 +       * Shortcut the case if the data is embedded or not resolved.
 +       *
 +       * Do NOT null-out pointers to embedded data (e.g. inode).
 +       *
 +       * The DIRTYBP flag is non-applicable in this situation and can
 +       * be cleared to keep the flags state clean.
 +       */
 +      if (chain->bp == NULL) {
 +              atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
 +              lockmgr(&chain->lk, LK_RELEASE);
 +              hammer2_chain_drop(hmp, chain);
 +              return;
 +      }
 +
 +      /*
 +       * Statistics
 +       */
 +      if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
 +              ;
 +      } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
 +              switch(chain->bref.type) {
 +              case HAMMER2_BREF_TYPE_DATA:
 +                      counterp = &hammer2_ioa_file_write;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INODE:
 +                      counterp = &hammer2_ioa_meta_write;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INDIRECT:
 +                      counterp = &hammer2_ioa_indr_write;
 +                      break;
 +              default:
 +                      counterp = &hammer2_ioa_volu_write;
 +                      break;
 +              }
 +              ++*counterp;
 +      } else {
 +              switch(chain->bref.type) {
 +              case HAMMER2_BREF_TYPE_DATA:
 +                      counterp = &hammer2_iod_file_write;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INODE:
 +                      counterp = &hammer2_iod_meta_write;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INDIRECT:
 +                      counterp = &hammer2_iod_indr_write;
 +                      break;
 +              default:
 +                      counterp = &hammer2_iod_volu_write;
 +                      break;
 +              }
 +              ++*counterp;
 +      }
 +
 +      /*
 +       * Clean out the bp.
 +       *
 +       * If a device buffer was used for data be sure to destroy the
 +       * buffer when we are done to avoid aliases (XXX what about the
 +       * underlying VM pages?).
 +       */
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
 +              chain->bp->b_flags |= B_RELBUF;
 +
 +      /*
 +       * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
 +       * or not.  The flag will get re-set when chain_modify() is called,
 +       * even if MODIFIED is already set, allowing the OS to retire the
 +       * buffer independently of a hammer2 flush.
 +       */
 +      chain->data = NULL;
 +      if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
 +              atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
 +              if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
 +                      atomic_clear_int(&chain->flags,
 +                                       HAMMER2_CHAIN_IOFLUSH);
 +                      chain->bp->b_flags |= B_RELBUF;
 +                      cluster_awrite(chain->bp);
 +              } else {
 +                      chain->bp->b_flags |= B_CLUSTEROK;
 +                      bdwrite(chain->bp);
 +              }
 +      } else {
 +              if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
 +                      atomic_clear_int(&chain->flags,
 +                                       HAMMER2_CHAIN_IOFLUSH);
 +                      chain->bp->b_flags |= B_RELBUF;
 +                      brelse(chain->bp);
 +              } else {
 +                      /* bp might still be dirty */
 +                      bqrelse(chain->bp);
 +              }
 +      }
 +      chain->bp = NULL;
 +      lockmgr(&chain->lk, LK_RELEASE);
 +      hammer2_chain_drop(hmp, chain);
 +}
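 +
 +/*
 + * A usage sketch for the resolve modes described above (hypothetical
 + * caller).  hammer2_chain_lock() adds a ref which the matching unlock
 + * drops, so lock/unlock calls must balance:
 + *
 + *    hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
 + *    ... chain->data is valid here, I/O was issued if necessary ...
 + *    hammer2_chain_unlock(hmp, chain);
 + *
 + *    hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_NEVER);
 + *    ... chain->data may be NULL, avoids device buffer aliasing ...
 + *    hammer2_chain_unlock(hmp, chain);
 + */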
 +
 +/*
 + * Resize the chain's physical storage allocation.  Chains can be resized
 + * smaller without reallocating the storage.  Resizing larger will reallocate
 + * the storage.
 + *
 + * Must be passed a locked chain.
 + *
 + * If you want the resize code to copy the data to the new block then the
 + * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
 + *
 + * If the caller already holds a logical buffer containing the data and
 + * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
 + * operation will then not copy the data.
 + *
 + * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 + * to avoid instantiating a device buffer that conflicts with the vnode
 + * data buffer.
 + *
 + * XXX flags currently ignored, uses chain->bp to detect data/no-data.
 + */
 +void
 +hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
 +                   int nradix, int flags)
 +{
 +      hammer2_mount_t *hmp = ip->hmp;
 +      struct buf *nbp;
 +      hammer2_off_t pbase;
 +      size_t obytes;
 +      size_t nbytes;
 +      size_t bbytes;
 +      int boff;
 +      char *bdata;
 +      int error;
 +
 +      /*
 +       * Only data and indirect blocks can be resized for now
 +       */
 +      KKASSERT(chain != &hmp->vchain);
 +      KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
 +               chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
 +
 +      /*
 +       * Nothing to do if the element is already the proper size
 +       */
 +      obytes = chain->bytes;
 +      nbytes = 1U << nradix;
 +      if (obytes == nbytes)
 +              return;
 +
 +      /*
 +       * Set MODIFIED and add a chain ref to prevent destruction.  Both
 +       * modified flags share the same ref.
 +       *
 +       * If the chain is already marked MODIFIED then we can safely
 +       * return the previous allocation to the pool without having to
 +       * worry about snapshots.
 +       */
 +      if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
 +              atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
 +                                            HAMMER2_CHAIN_MODIFY_TID);
 +              hammer2_chain_ref(hmp, chain);
 +      } else {
 +              hammer2_freemap_free(hmp, chain->bref.data_off,
 +                                   chain->bref.type);
 +      }
 +
 +      /*
 +       * Relocate the block, even if making it smaller (because different
 +       * block sizes may be in different regions).
 +       */
 +      chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
 +                                                   nbytes);
 +      chain->bytes = nbytes;
 +      ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
 +
 +      /*
 +       * The device buffer may be larger than the allocation size.
 +       */
 +      if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
 +              bbytes = HAMMER2_MINIOSIZE;
 +      pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
 +      boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
 +
 +      /*
 +       * Only copy the data if resolved, otherwise the caller is
 +       * responsible.
 +       */
 +      if (chain->bp) {
 +              KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
 +                       chain->bref.type == HAMMER2_BREF_TYPE_DATA);
 +              KKASSERT(chain != &hmp->vchain);        /* safety */
 +
 +              /*
 +               * The getblk() optimization can only be used if the
 +               * physical block size matches the request.
 +               */
 +              if (nbytes == bbytes) {
 +                      nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
 +                      error = 0;
 +              } else {
 +                      error = bread(hmp->devvp, pbase, bbytes, &nbp);
 +                      KKASSERT(error == 0);
 +              }
 +              bdata = (char *)nbp->b_data + boff;
 +
 +              if (nbytes < obytes) {
 +                      bcopy(chain->data, bdata, nbytes);
 +              } else {
 +                      bcopy(chain->data, bdata, obytes);
 +                      bzero(bdata + obytes, nbytes - obytes);
 +              }
 +
 +              /*
 +               * NOTE: The INITIAL state of the chain is left intact.
 +               *       We depend on hammer2_chain_modify() to do the
 +               *       right thing.
 +               *
 +               * NOTE: We set B_NOCACHE to throw away the previous bp and
 +               *       any VM backing store, even if it was dirty.
 +               *       Otherwise we run the risk of a logical/device
 +               *       conflict on reallocation.
 +               */
 +              chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
 +              brelse(chain->bp);
 +              chain->bp = nbp;
 +              chain->data = (void *)bdata;
 +              hammer2_chain_modify(hmp, chain, 0);
 +      }
 +
 +      /*
 +       * Make sure the chain is marked MOVED and SUBMOD is set in the
 +       * parent(s) so the adjustments are picked up by flush.
 +       */
 +      if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
 +              hammer2_chain_ref(hmp, chain);
 +              atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
 +      }
 +      hammer2_chain_parent_setsubmod(hmp, chain);
 +}
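 +
 +/*
 + * For example (hypothetical values): growing a 16KB block to 64KB passes
 + * nradix = 16, since nbytes = 1U << nradix.  A new block is allocated
 + * even when shrinking because different block sizes may live in
 + * different regions, and the old allocation is only returned to the
 + * freemap if the chain was already marked MODIFIED.
 + */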
 +
 +/*
 + * Convert a locked chain that was retrieved read-only to read-write.
 + *
 + * If not already marked modified a new physical block will be allocated
 + * and assigned to the bref.
 + *
 + * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
 + *                 level or the COW operation will not work.
 + *
 + * Data blocks           - The chain is usually locked RESOLVE_NEVER so as not to
 + *                 run the data through the device buffers.
 + */
 +void
 +hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
 +{
 +      struct buf *nbp;
 +      int error;
 +      hammer2_off_t pbase;
 +      size_t bbytes;
 +      size_t boff;
 +      void *bdata;
 +
 +      /*
 +       * Tell the flush code that modify_tid must be updated; otherwise
 +       * only mirror_tid is updated.  Updating modify_tid is the default.
 +       */
 +      if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
 +              atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
 +
 +      /*
 +       * If the chain is already marked MODIFIED we can just return.
 +       *
 +       * However, it is possible that a prior lock/modify sequence
 +       * retired the buffer.  During this lock/modify sequence MODIFIED
 +       * may still be set but the buffer could wind up clean.  Since
 +       * the caller is going to modify the buffer further we have to
 +       * be sure that DIRTYBP is set again.
 +       */
 +      if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
 +              if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
 +                  chain->bp == NULL) {
 +                      goto skip1;
 +              }
 +              atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
 +              return;
 +      }
 +
 +      /*
 +       * Set MODIFIED and add a chain ref to prevent destruction.  Both
 +       * modified flags share the same ref.
 +       */
 +      atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
 +      hammer2_chain_ref(hmp, chain);
 +
 +      /*
 +       * We must allocate the copy-on-write block.
 +       *
 +       * If the data is embedded no other action is required.
 +       *
 +       * If the data is not embedded we acquire and clear the
 +       * new block.  If chain->data is not NULL we then do the
 +       * copy-on-write.  chain->data will then be repointed to the new
 +       * buffer and the old buffer will be released.
 +       *
 +       * For newly created elements with no prior allocation we go
 +       * through the copy-on-write steps except without the copying part.
 +       */
 +      if (chain != &hmp->vchain) {
 +              if ((hammer2_debug & 0x0001) &&
 +                  (chain->bref.data_off & HAMMER2_OFF_MASK)) {
 +                      kprintf("Replace %d\n", chain->bytes);
 +              }
 +              chain->bref.data_off =
 +                      hammer2_freemap_alloc(hmp, chain->bref.type,
 +                                            chain->bytes);
 +              /* XXX failed allocation */
 +      }
 +
 +      /*
 +       * If data instantiation is optional and the chain has no current
 +       * data association (typical for DATA and newly-created INDIRECT
 +       * elements), don't instantiate the buffer now.
 +       */
 +      if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
 +              goto skip2;
 +
 +skip1:
 +      /*
 +       * Setting the DIRTYBP flag will cause the buffer to be dirtied or
 +       * written-out on unlock.  This bit is independent of the MODIFIED
 +       * bit because the chain may still need meta-data adjustments done
 +       * by virtue of MODIFIED for its parent, and the buffer can be
 +       * flushed out (possibly multiple times) by the OS before that.
 +       *
 +       * Clearing the INITIAL flag (for indirect blocks) indicates that
 +       * a zero-fill buffer has been instantiated.
 +       */
 +      atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
 +      atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
 +
 +      /*
 +       * We currently should never instantiate a device buffer for a
 +       * data chain.
 +       */
 +      KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
 +
 +      /*
 +       * Execute COW operation
 +       */
 +      switch(chain->bref.type) {
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +      case HAMMER2_BREF_TYPE_INODE:
 +              /*
 +               * The data is embedded, no copy-on-write operation is
 +               * needed.
 +               */
 +              KKASSERT(chain->bp == NULL);
 +              break;
 +      case HAMMER2_BREF_TYPE_DATA:
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              /*
 +               * Perform the copy-on-write operation
 +               */
 +              KKASSERT(chain != &hmp->vchain);        /* safety */
 +              /*
 +               * The device buffer may be larger than the allocation size.
 +               */
 +              if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
 +                      bbytes = HAMMER2_MINIOSIZE;
 +              pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
 +              boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
 +
 +              /*
 +               * The getblk() optimization can only be used if the
 +               * physical block size matches the request.
 +               */
 +              if (chain->bytes == bbytes) {
 +                      nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
 +                      error = 0;
 +              } else {
 +                      error = bread(hmp->devvp, pbase, bbytes, &nbp);
 +                      KKASSERT(error == 0);
 +              }
 +              bdata = (char *)nbp->b_data + boff;
 +
 +              /*
 +               * Copy or zero-fill on write depending on whether
 +               * chain->data exists or not.
 +               */
 +              if (chain->data) {
 +                      bcopy(chain->data, bdata, chain->bytes);
 +                      KKASSERT(chain->bp != NULL);
 +              } else {
 +                      bzero(bdata, chain->bytes);
 +              }
 +              if (chain->bp) {
 +                      chain->bp->b_flags |= B_RELBUF;
 +                      brelse(chain->bp);
 +              }
 +              chain->bp = nbp;
 +              chain->data = bdata;
 +              break;
 +      default:
 +              panic("hammer2_chain_modify: illegal non-embedded type %d",
 +                    chain->bref.type);
 +              break;
 +
 +      }
 +skip2:
 +      if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
 +              hammer2_chain_parent_setsubmod(hmp, chain);
 +}
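 +
 +/*
 + * A minimal sketch of the modify flags (hypothetical caller):
 + *
 + *    hammer2_chain_modify(hmp, chain, 0);
 + *            COW the backing block, set MODIFIED/DIRTYBP, request a
 + *            modify_tid update, and propagate SUBMODIFIED upward.
 + *
 + *    hammer2_chain_modify(hmp, chain, HAMMER2_MODIFY_OPTDATA |
 + *                                     HAMMER2_MODIFY_NO_MODIFY_TID);
 + *            skip buffer instantiation when no data is currently
 + *            associated and update only mirror_tid on flush.
 + */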
 +
 +/*
 + * Mark the volume as having been modified.  This short-cut version
 + * does not have to lock the volume's chain, which allows the ioctl
 + * code to make adjustments to connections without deadlocking.
 + */
 +void
 +hammer2_modify_volume(hammer2_mount_t *hmp)
 +{
 +      hammer2_voldata_lock(hmp);
 +      atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
 +      hammer2_voldata_unlock(hmp);
 +}
 +
 +/*
 + * Locate an in-memory chain.  The parent must be locked.  The in-memory
 + * chain is returned or NULL if no in-memory chain is present.
 + *
 + * NOTE: A chain on-media might exist for this index when NULL is returned.
 + */
 +hammer2_chain_t *
 +hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
 +{
 +      hammer2_chain_t dummy;
 +      hammer2_chain_t *chain;
 +
 +      dummy.index = index;
 +      chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
 +      return (chain);
 +}
 +
 +/*
 + * Return a locked chain structure with all associated data acquired.
 + *
 + * Caller must lock the parent on call, the returned child will be locked.
 + */
 +hammer2_chain_t *
 +hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 +                int index, int flags)
 +{
 +      hammer2_blockref_t *bref;
 +      hammer2_chain_t *chain;
 +      hammer2_chain_t dummy;
++      ccms_cst_t *cst;
 +      int how;
 +
 +      /*
 +       * Figure out how to lock.  MAYBE can be used to optimize
 +       * the initial-create state for indirect blocks.
 +       */
 +      if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
 +              how = HAMMER2_RESOLVE_NEVER;
 +      else
 +              how = HAMMER2_RESOLVE_MAYBE;
 +
++      /*
++       * Resolve cache state XXX
++       */
++      cst = NULL;
++
 +      /*
 +       * First see if we have a (possibly modified) chain element cached
 +       * for this (parent, index).  Acquire the data if necessary.
 +       *
 +       * If chain->data is non-NULL the chain should already be marked
 +       * modified.
 +       */
 +      dummy.index = index;
 +      chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
 +      if (chain) {
 +              if (flags & HAMMER2_LOOKUP_NOLOCK)
 +                      hammer2_chain_ref(hmp, chain);
 +              else
 +                      hammer2_chain_lock(hmp, chain, how);
 +              return(chain);
 +      }
 +
 +      /*
++       * The get function must always succeed, panic if there's no
 +       * data to index.
 +       */
 +      if (parent->flags & HAMMER2_CHAIN_INITIAL) {
 +              panic("hammer2_chain_get: Missing bref(1)");
 +              /* NOT REACHED */
 +      }
 +
 +      /*
 +       * Otherwise lookup the bref and issue I/O (switch on the parent)
 +       */
 +      switch(parent->bref.type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
 +              bref = &parent->data->ipdata.u.blockset.blockref[index];
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              KKASSERT(parent->data != NULL);
 +              KKASSERT(index >= 0 &&
 +                       index < parent->bytes / sizeof(hammer2_blockref_t));
 +              bref = &parent->data->npdata.blockref[index];
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
 +              bref = &hmp->voldata.sroot_blockset.blockref[index];
 +              break;
 +      default:
 +              bref = NULL;
 +              panic("hammer2_chain_get: unrecognized blockref type: %d",
 +                    parent->bref.type);
 +      }
 +      if (bref->type == 0) {
 +              panic("hammer2_chain_get: Missing bref(2)");
 +              /* NOT REACHED */
 +      }
 +
 +      /*
 +       * Allocate a chain structure representing the existing media
 +       * entry.
 +       *
 +       * The locking operation we do later will issue I/O to read it.
 +       */
 +      chain = hammer2_chain_alloc(hmp, bref);
 +
 +      /*
 +       * Link the chain into its parent.  Caller is expected to hold an
 +       * exclusive lock on the parent.
 +       */
 +      chain->parent = parent;
 +      chain->index = index;
 +      if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
 +              panic("hammer2_chain_link: collision");
 +      KKASSERT(parent->refs > 0);
 +      atomic_add_int(&parent->refs, 1);       /* for splay entry */
 +
 +      /*
 +       * Additional linkage for inodes.  Reuse the parent pointer to
 +       * find the parent directory.
++       *
++       * The CCMS for the pfs-root is initialized from the mount code,
++       * this chain_get, or chain_create, when the pmp is assigned and
++       * non-NULL.  No CCMS is initialized here for the super-root; the
++       * CCMS for the PFS root is initialized in the mount code.
 +       */
 +      if (bref->type == HAMMER2_BREF_TYPE_INODE) {
 +              while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
 +                      parent = parent->parent;
 +              if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
 +                      chain->u.ip->pip = parent->u.ip;
 +                      chain->u.ip->pmp = parent->u.ip->pmp;
 +                      chain->u.ip->depth = parent->u.ip->depth + 1;
++                      if (cst)
++                              chain->u.ip->cino = cst->tag.cino;
 +              }
 +      }
 +
 +      /*
 +       * Our new chain structure has already been referenced and locked
 +       * but the lock code handles the I/O so call it to resolve the data.
 +       * Then release one of our two exclusive locks.
 +       *
 +       * If NOLOCK is set the release will release the one-and-only lock.
 +       */
 +      if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
 +              hammer2_chain_lock(hmp, chain, how);    /* recursive lock */
 +              hammer2_chain_drop(hmp, chain);         /* excess ref */
 +      }
 +      lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
 +
 +      return (chain);
 +}
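 +
 +/*
 + * A brief usage sketch (hypothetical caller): with the parent held
 + * locked, fetch the chain at a given blockref index and release it
 + * when done:
 + *
 + *    chain = hammer2_chain_get(hmp, parent, i, 0);
 + *    ... chain is returned locked, its data resolved as appropriate ...
 + *    hammer2_chain_unlock(hmp, chain);
 + */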
 +
 +/*
 + * Locate any key between key_beg and key_end inclusive.  (*parentp)
 + * typically points to an inode but can also point to a related indirect
 + * block and this function will recurse upwards and find the inode again.
 + *
 + * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
 + *         WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
 + *         WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
 + *
 + * (*parentp) must be exclusively locked and referenced and can be an inode
 + * or an existing indirect block within the inode.
 + *
 + * On return (*parentp) will be modified to point at the deepest parent chain
 + * element encountered during the search, as a helper for an insertion or
 + * deletion.   The new (*parentp) will be locked and referenced and the old
 + * will be unlocked and dereferenced (no change if they are both the same).
 + *
 + * The matching chain will be returned exclusively locked and referenced.
 + *
 + * NULL is returned if no match was found, but (*parentp) will still
 + * potentially be adjusted.
 + *
 + * This function will also recurse up the chain if the key is not within the
 + * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 + * can simply allow (*parentp) to float inside the loop.
 + */
 +hammer2_chain_t *
 +hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
 +                   hammer2_key_t key_beg, hammer2_key_t key_end,
 +                   int flags)
 +{
 +      hammer2_chain_t *parent;
 +      hammer2_chain_t *chain;
 +      hammer2_chain_t *tmp;
 +      hammer2_blockref_t *base;
 +      hammer2_blockref_t *bref;
 +      hammer2_key_t scan_beg;
 +      hammer2_key_t scan_end;
 +      int count = 0;
 +      int i;
 +
 +      /*
 +       * Recurse (*parentp) upward if necessary until the parent completely
 +       * encloses the key range or we hit the inode.
 +       */
 +      parent = *parentp;
 +      while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
 +              scan_beg = parent->bref.key;
 +              scan_end = scan_beg +
 +                         ((hammer2_key_t)1 << parent->bref.keybits) - 1;
 +              if (key_beg >= scan_beg && key_end <= scan_end)
 +                      break;
 +              hammer2_chain_ref(hmp, parent);         /* ref old parent */
 +              hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
 +              parent = parent->parent;
 +                                                      /* lock new parent */
 +              hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
 +              hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
 +              *parentp = parent;                      /* new parent */
 +      }
 +
 +again:
 +      /*
 +       * Locate the blockref array.  Currently we do a fully associative
 +       * search through the array.
 +       */
 +      switch(parent->bref.type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              /*
 +               * Special shortcut for embedded data returns the inode
 +               * itself.  Callers must detect this condition and access
 +               * the embedded data (the strategy code does this for us).
 +               *
 +               * This is only applicable to regular files and softlinks.
 +               */
 +              if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
 +                      if (flags & HAMMER2_LOOKUP_NOLOCK)
 +                              hammer2_chain_ref(hmp, parent);
 +                      else
 +                              hammer2_chain_lock(hmp, parent,
 +                                                 HAMMER2_RESOLVE_ALWAYS);
 +                      return (parent);
 +              }
 +              base = &parent->data->ipdata.u.blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              /*
 +               * Optimize indirect blocks in the INITIAL state to avoid
 +               * I/O.
 +               */
 +              if (parent->flags & HAMMER2_CHAIN_INITIAL) {
 +                      base = NULL;
 +              } else {
 +                      if (parent->data == NULL)
 +                              panic("parent->data is NULL");
 +                      base = &parent->data->npdata.blockref[0];
 +              }
 +              count = parent->bytes / sizeof(hammer2_blockref_t);
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              base = &hmp->voldata.sroot_blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      default:
 +              panic("hammer2_chain_lookup: unrecognized blockref type: %d",
 +                    parent->bref.type);
 +              base = NULL;    /* safety */
 +              count = 0;      /* safety */
 +      }
 +
 +      /*
 +       * If the element and key overlap we use the element.
 +       */
 +      bref = NULL;
 +      for (i = 0; i < count; ++i) {
 +              tmp = hammer2_chain_find(hmp, parent, i);
 +              if (tmp) {
 +                      bref = &tmp->bref;
 +                      KKASSERT(bref->type != 0);
 +              } else if (base == NULL || base[i].type == 0) {
 +                      continue;
 +              } else {
 +                      bref = &base[i];
 +              }
 +              scan_beg = bref->key;
 +              scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
 +              if (key_beg <= scan_end && key_end >= scan_beg)
 +                      break;
 +      }
 +      if (i == count) {
 +              if (key_beg == key_end)
 +                      return (NULL);
 +              return (hammer2_chain_next(hmp, parentp, NULL,
 +                                         key_beg, key_end, flags));
 +      }
 +
 +      /*
 +       * Acquire the new chain element.  If the chain element is an
 +       * indirect block we must search recursively.
 +       */
 +      chain = hammer2_chain_get(hmp, parent, i, flags);
 +      if (chain == NULL)
 +              return (NULL);
 +
 +      /*
 +       * If the chain element is an indirect block it becomes the new
 +       * parent and we loop on it.
 +       *
 +       * The parent always has to be locked with at least RESOLVE_MAYBE,
 +       * so it might need a fixup if the caller passed incompatible flags.
 +       */
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
 +              hammer2_chain_unlock(hmp, parent);
 +              *parentp = parent = chain;
 +              if (flags & HAMMER2_LOOKUP_NOLOCK) {
 +                      hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
 +                      hammer2_chain_drop(hmp, chain); /* excess ref */
 +              } else if (flags & HAMMER2_LOOKUP_NODATA) {
 +                      hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
 +                      hammer2_chain_unlock(hmp, chain);
 +              }
 +              goto again;
 +      }
 +
 +      /*
 +       * All done, return chain
 +       */
 +      return (chain);
 +}
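 +
 +/*
 + * A typical lookup/next iteration sketch (hypothetical caller): the
 + * parent floats to the deepest enclosing element during the scan and
 + * the caller releases whatever parent it ends up holding:
 + *
 + *    chain = hammer2_chain_lookup(hmp, &parent, key_beg, key_end, 0);
 + *    while (chain) {
 + *            ... use chain, returned locked ...
 + *            chain = hammer2_chain_next(hmp, &parent, chain,
 + *                                       key_beg, key_end, 0);
 + *    }
 + *    hammer2_chain_unlock(hmp, parent);
 + */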
 +
 +/*
 + * After having issued a lookup we can iterate all matching keys.
 + *
 + * If chain is non-NULL we continue the iteration from just after its index.
 + *
 + * If chain is NULL we assume the parent was exhausted and continue the
 + * iteration at the next parent.
 + *
 + * parent must be locked on entry and remains locked throughout.  chain's
 + * lock status must match flags.
 + */
 +hammer2_chain_t *
 +hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
 +                 hammer2_chain_t *chain,
 +                 hammer2_key_t key_beg, hammer2_key_t key_end,
 +                 int flags)
 +{
 +      hammer2_chain_t *parent;
 +      hammer2_chain_t *tmp;
 +      hammer2_blockref_t *base;
 +      hammer2_blockref_t *bref;
 +      hammer2_key_t scan_beg;
 +      hammer2_key_t scan_end;
 +      int i;
 +      int count;
 +
 +      parent = *parentp;
 +
 +again:
 +      /*
 +       * Calculate the next index and recalculate the parent if necessary.
 +       */
 +      if (chain) {
 +              /*
 +               * Continue iteration within current parent.  If not NULL
 +               * the passed-in chain may or may not be locked, based on
 +               * the LOOKUP_NOLOCK flag (passed in as returned from lookup
 +               * or a prior next).
 +               */
 +              i = chain->index + 1;
 +              if (flags & HAMMER2_LOOKUP_NOLOCK)
 +                      hammer2_chain_drop(hmp, chain);
 +              else
 +                      hammer2_chain_unlock(hmp, chain);
 +
 +              /*
 +               * Any scan where the lookup returned degenerate data embedded
 +               * in the inode has an invalid index and must terminate.
 +               */
 +              if (chain == parent)
 +                      return(NULL);
 +              chain = NULL;
 +      } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
 +              /*
 +               * We reached the end of the iteration.
 +               */
 +              return (NULL);
 +      } else {
 +              /*
 +               * Continue iteration with next parent unless the current
 +               * parent covers the range.
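 +               *
 +               * If this parent's keyspace fully covers the requested
 +               * range then no sibling at a higher level can match and
 +               * the iteration terminates.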
 +               */
 +              hammer2_chain_t *nparent;
 +
 +              scan_beg = parent->bref.key;
 +              scan_end = scan_beg +
 +                          ((hammer2_key_t)1 << parent->bref.keybits) - 1;
 +              if (key_beg >= scan_beg && key_end <= scan_end)
 +                      return (NULL);
 +
 +              i = parent->index + 1;
 +              nparent = parent->parent;
 +              hammer2_chain_ref(hmp, nparent);        /* ref new parent */
 +              hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
 +                                                      /* lock new parent */
 +              hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
 +              hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
 +              *parentp = parent = nparent;
 +      }
 +
 +again2:
 +      /*
 +       * Locate the blockref array.  Currently we do a fully associative
 +       * search through the array.
 +       */
 +      switch(parent->bref.type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              base = &parent->data->ipdata.u.blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              if (parent->flags & HAMMER2_CHAIN_INITIAL) {
 +                      base = NULL;
 +              } else {
 +                      KKASSERT(parent->data != NULL);
 +                      base = &parent->data->npdata.blockref[0];
 +              }
 +              count = parent->bytes / sizeof(hammer2_blockref_t);
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              base = &hmp->voldata.sroot_blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      default:
 +              panic("hammer2_chain_next: unrecognized blockref type: %d",
 +                    parent->bref.type);
 +              base = NULL;    /* safety */
 +              count = 0;      /* safety */
 +              break;
 +      }
 +      KKASSERT(i <= count);
 +
 +      /*
 +       * Look for the next key at or after index i which overlaps the
 +       * requested range.  If no match is found in this parent we
 +       * recurse up a level (see below).
 +       */
 +      bref = NULL;
 +      while (i < count) {
 +              tmp = hammer2_chain_find(hmp, parent, i);
 +              if (tmp) {
 +                      bref = &tmp->bref;
 +              } else if (base == NULL || base[i].type == 0) {
 +                      ++i;
 +                      continue;
 +              } else {
 +                      bref = &base[i];
 +              }
 +              scan_beg = bref->key;
 +              scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
 +              if (key_beg <= scan_end && key_end >= scan_beg)
 +                      break;
 +              ++i;
 +      }
 +
 +      /*
 +       * If we couldn't find a match recurse up a parent to continue the
 +       * search.
 +       */
 +      if (i == count)
 +              goto again;
 +
 +      /*
 +       * Acquire the new chain element.  If the chain element is an
 +       * indirect block we must search recursively.
 +       */
 +      chain = hammer2_chain_get(hmp, parent, i, flags);
 +      if (chain == NULL)
 +              return (NULL);
 +
 +      /*
 +       * If the chain element is an indirect block it becomes the new
 +       * parent and we loop on it.
 +       *
 +       * The parent always has to be locked with at least RESOLVE_MAYBE,
 +       * so it might need a fixup if the caller passed incompatible flags.
 +       */
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
 +              hammer2_chain_unlock(hmp, parent);
 +              *parentp = parent = chain;
 +              chain = NULL;
 +              if (flags & HAMMER2_LOOKUP_NOLOCK) {
 +                      hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
 +                      hammer2_chain_drop(hmp, parent);        /* excess ref */
 +              } else if (flags & HAMMER2_LOOKUP_NODATA) {
 +                      hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
 +                      hammer2_chain_unlock(hmp, parent);
 +              }
 +              i = 0;
 +              goto again2;
 +      }
 +
 +      /*
 +       * All done, return chain
 +       */
 +      return (chain);
 +}
 +
 +/*
 + * Create and return a new hammer2 system memory structure of the specified
 + * key, type and size and insert it RELATIVE TO (PARENT).
 + *
 + * (parent) is typically either an inode or an indirect block, acquired
 + * as a side effect of issuing a prior failed lookup.  parent
 + * must be locked and held.  Do not pass the inode chain to this function
 + * unless that is the chain returned by the failed lookup.
 + *
 + * Non-indirect types will automatically allocate indirect blocks as required
 + * if the new item does not fit in the current (parent).
 + *
 + * Indirect types will move a portion of the existing blockref array in
 + * (parent) into the new indirect type and then use one of the free slots
 + * to emplace the new indirect type.
 + *
 + * A new locked, referenced chain element is returned of the specified type.
 + * The element may or may not have a data area associated with it:
 + *
 + *    VOLUME          not allowed here
 + *    INODE           embedded data area will be set up
 + *    INDIRECT        not allowed here
 + *    DATA            no data area will be set-up (caller is expected
 + *                    to have logical buffers, we don't want to alias
 + *                    the data onto device buffers!).
 + */
 +hammer2_chain_t *
 +hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 +                   hammer2_chain_t *chain,
 +                   hammer2_key_t key, int keybits, int type, size_t bytes)
 +{
 +      hammer2_blockref_t dummy;
 +      hammer2_blockref_t *base;
 +      hammer2_chain_t dummy_chain;
 +      int unlock_parent = 0;
 +      int allocated = 0;
 +      int count;
 +      int i;
++      ccms_cst_t *cst;
++
++      /*
++       * Resolve cache state
++       */
++      cst = NULL;
 +
 +      if (chain == NULL) {
 +              /*
 +               * First allocate media space and construct the dummy bref,
 +               * then allocate the in-memory chain structure.
 +               */
 +              bzero(&dummy, sizeof(dummy));
 +              dummy.type = type;
 +              dummy.key = key;
 +              dummy.keybits = keybits;
 +              dummy.data_off = hammer2_bytes_to_radix(bytes);
 +              chain = hammer2_chain_alloc(hmp, &dummy);
 +              allocated = 1;
 +
 +              /*
 +               * We do NOT set INITIAL here (yet).  INITIAL is only
 +               * used for indirect blocks.
 +               *
 +               * Recalculate bytes to reflect the actual media block
 +               * allocation.
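 +               *
 +               * The low bits of bref.data_off encode the allocation
 +               * radix, so the actual allocation is (1 << radix) bytes.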
 +               */
 +              bytes = (hammer2_off_t)1 <<
 +                      (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
 +              chain->bytes = bytes;
 +
 +              switch(type) {
 +              case HAMMER2_BREF_TYPE_VOLUME:
 +                      panic("hammer2_chain_create: called with volume type");
 +                      break;
 +              case HAMMER2_BREF_TYPE_INODE:
 +                      KKASSERT(bytes == HAMMER2_INODE_BYTES);
 +                      chain->data = (void *)&chain->u.ip->ip_data;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INDIRECT:
 +                      panic("hammer2_chain_create: cannot be used to "
 +                            "create indirect block");
 +                      break;
 +              case HAMMER2_BREF_TYPE_DATA:
 +              default:
 +                      /* leave chain->data NULL */
 +                      KKASSERT(chain->data == NULL);
 +                      break;
 +              }
 +      } else {
 +              /*
 +               * Potentially update the chain's key/keybits.
 +               */
 +              chain->bref.key = key;
 +              chain->bref.keybits = keybits;
 +      }
 +
 +again:
 +      /*
 +       * Locate a free blockref in the parent's array
 +       */
 +      switch(parent->bref.type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              KKASSERT((parent->u.ip->ip_data.op_flags &
 +                        HAMMER2_OPFLAG_DIRECTDATA) == 0);
 +              KKASSERT(parent->data != NULL);
 +              base = &parent->data->ipdata.u.blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              if (parent->flags & HAMMER2_CHAIN_INITIAL) {
 +                      base = NULL;
 +              } else {
 +                      KKASSERT(parent->data != NULL);
 +                      base = &parent->data->npdata.blockref[0];
 +              }
 +              count = parent->bytes / sizeof(hammer2_blockref_t);
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              KKASSERT(parent->data != NULL);
 +              base = &hmp->voldata.sroot_blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      default:
 +              panic("hammer2_chain_create: unrecognized blockref type: %d",
 +                    parent->bref.type);
 +              count = 0;
 +              break;
 +      }
 +
 +      /*
 +       * Scan for an unallocated bref, also skipping any slots occupied
 +       * by in-memory chain elements that may not yet have been updated
 +       * in the parent's bref array.
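 +       *
 +       * A slot is usable only if it is empty on-media (or the parent
 +       * is still in its INITIAL state with no media array) and no
 +       * in-memory chain already occupies that index in the SPLAY tree.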
 +       */
 +      bzero(&dummy_chain, sizeof(dummy_chain));
 +      for (i = 0; i < count; ++i) {
 +              if (base == NULL) {
 +                      dummy_chain.index = i;
 +                      if (SPLAY_FIND(hammer2_chain_splay,
 +                                     &parent->shead, &dummy_chain) == NULL) {
 +                              break;
 +                      }
 +              } else if (base[i].type == 0) {
 +                      dummy_chain.index = i;
 +                      if (SPLAY_FIND(hammer2_chain_splay,
 +                                     &parent->shead, &dummy_chain) == NULL) {
 +                              break;
 +                      }
 +              }
 +      }
 +
 +      /*
 +       * If no free blockref could be found we must create an indirect
 +       * block and move a number of blockrefs into it.  With the parent
 +       * locked we can safely lock each child in order to move it without
 +       * causing a deadlock.
 +       *
 +       * This may return the new indirect block or the old parent depending
 +       * on where the key falls.
 +       */
 +      if (i == count) {
 +              hammer2_chain_t *nparent;
 +
 +              nparent = hammer2_chain_create_indirect(hmp, parent,
 +                                                      key, keybits);
 +              if (nparent == NULL) {
 +                      if (allocated)
 +                              hammer2_chain_free(hmp, chain);
 +                      chain = NULL;
 +                      goto done;
 +              }
 +              if (parent != nparent) {
 +                      if (unlock_parent)
 +                              hammer2_chain_unlock(hmp, parent);
 +                      parent = nparent;
 +                      unlock_parent = 1;
 +              }
 +              goto again;
 +      }
 +
 +      /*
 +       * Link the chain into its parent.  Later on we will have to set
 +       * the MOVED bit in situations where we don't mark the new chain
 +       * as being modified.
 +       */
 +      if (chain->parent != NULL)
 +              panic("hammer2: hammer2_chain_create: chain already connected");
 +      KKASSERT(chain->parent == NULL);
 +      chain->parent = parent;
 +      chain->index = i;
 +      if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
 +              panic("hammer2_chain_link: collision");
 +      atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
 +      KKASSERT(parent->refs > 0);
 +      atomic_add_int(&parent->refs, 1);
 +
 +      /*
 +       * Additional linkage for inodes.  Reuse the parent pointer to
 +       * find the parent directory.
 +       *
 +       * Cumulative adjustments are inherited on [re]attach and will
 +       * propagate up the tree on the next flush.
++       *
++       * The CCMS for a PFS root is initialized by the mount code, by
++       * chain_get, or by this chain_create when the pmp is assigned
++       * and non-NULL.  No CCMS is initialized here for the super-root.
 +       */
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
 +              hammer2_chain_t *scan = parent;
 +              hammer2_inode_t *ip = chain->u.ip;
 +
 +              while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
 +                      scan = scan->parent;
 +              if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
 +                      ip->pip = scan->u.ip;
 +                      ip->pmp = scan->u.ip->pmp;
 +                      ip->depth = scan->u.ip->depth + 1;
 +                      ip->pip->delta_icount += ip->ip_data.inode_count;
 +                      ip->pip->delta_dcount += ip->ip_data.data_count;
 +                      ++ip->pip->delta_icount;
++
++                      if (cst)
++                              ip->cino = cst->tag.cino;
 +              }
 +      }
 +
 +      /*
 +       * (allocated) indicates that this is a newly-created chain element
 +       * rather than a renamed chain element.  In this situation we want
 +       * to place the chain element in the MODIFIED state.
 +       *
 +       * The data area will be set up as follows:
 +       *
 +       *      VOLUME          not allowed here.
 +       *
 +       *      INODE           embedded data area will be set up.
 +       *
 +       *      INDIRECT        not allowed here.
 +       *
 +       *      DATA            no data area will be set-up (caller is expected
 +       *                      to have logical buffers, we don't want to alias
 +       *                      the data onto device buffers!).
 +       */
 +      if (allocated) {
 +              if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
 +                      hammer2_chain_modify(hmp, chain,
 +                                           HAMMER2_MODIFY_OPTDATA);
 +              } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
 +                      /* not supported in this function */
 +                      panic("hammer2_chain_create: bad type");
 +                      atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
 +                      hammer2_chain_modify(hmp, chain,
 +                                           HAMMER2_MODIFY_OPTDATA);
 +              } else {
 +                      hammer2_chain_modify(hmp, chain, 0);
 +              }
 +      } else {
 +              /*
 +               * When reconnecting inodes we have to call setsubmod()
 +               * to ensure that its state propagates up the newly
 +               * connected parent.
 +               *
 +               * Make sure MOVED is set but do not update bref_flush.  If
 +               * the chain is undergoing modification bref_flush will be
 +               * updated when it gets flushed.  If it is not then the
 +               * bref may not have been flushed yet and we do not want to
 +               * set MODIFIED here as this could result in unnecessary
 +               * reallocations.
 +               */
 +              if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
 +                      hammer2_chain_ref(hmp, chain);
 +                      atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
 +              }
 +              hammer2_chain_parent_setsubmod(hmp, chain);
 +      }
 +
 +done:
 +      if (unlock_parent)
 +              hammer2_chain_unlock(hmp, parent);
 +      return (chain);
 +}
 +
 +/*
 + * Create an indirect block that covers one or more of the elements in the
 + * current parent.  Either returns the existing parent with no locking or
 + * ref changes or returns the new indirect block locked and referenced
 + * and leaving the original parent lock/ref intact as well.
 + *
 + * The returned chain depends on where the specified key falls.
 + *
 + * The key/keybits for the indirect mode only needs to follow four rules:
 + *
 + * (1) That all elements underneath it fit within its key space and
 + *
 + * (2) That all elements outside it are outside its key space.
 + *
 + * (3) When creating the new indirect block any elements in the current
 + *     parent that fit within the new indirect block's keyspace must be
 + *     moved into the new indirect block.
 + *
 + * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 + *     keyspace than the current parent, but lookup/iteration rules will
 + *     ensure (and must ensure) that rule (2) for all parents leading up
 + *     to the nearest inode or the root volume header is adhered to.  This
 + *     is accomplished by always recursing through matching keyspaces in
 + *     the hammer2_chain_lookup() and hammer2_chain_next() API.
 + *
 + * The current implementation calculates the current worst-case keyspace by
 + * iterating the current parent and then divides it into two halves, choosing
 + * whichever half has the most elements (not necessarily the half containing
 + * the requested key).
 + *
 + * We can also opt to use the half with the least number of elements.  This
 + * causes lower-numbered keys (aka logical file offsets) to recurse through
 + * fewer indirect blocks and higher-numbered keys to recurse through more.
 + * This also has the risk of not moving enough elements to the new indirect
 + * block and being forced to create several indirect blocks before the element
 + * can be inserted.
 + */
 +static
 +hammer2_chain_t *
 +hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 +                            hammer2_key_t create_key, int create_bits)
 +{
 +      hammer2_blockref_t *base;
 +      hammer2_blockref_t *bref;
 +      hammer2_chain_t *chain;
 +      hammer2_chain_t *ichain;
 +      hammer2_chain_t dummy;
 +      hammer2_key_t key = create_key;
 +      int keybits = create_bits;
 +      int locount = 0;
 +      int hicount = 0;
 +      int count;
 +      int nbytes;
 +      int i;
 +
 +      /*
 +       * Calculate the base blockref pointer or NULL if the chain
 +       * is known to be empty.  We need to calculate the array count
 +       * for SPLAY lookups either way.
 +       */
 +      hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
 +      if (parent->flags & HAMMER2_CHAIN_INITIAL) {
 +              base = NULL;
 +
 +              switch(parent->bref.type) {
 +              case HAMMER2_BREF_TYPE_INODE:
 +                      count = HAMMER2_SET_COUNT;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INDIRECT:
 +                      count = parent->bytes / sizeof(hammer2_blockref_t);
 +                      break;
 +              case HAMMER2_BREF_TYPE_VOLUME:
 +                      count = HAMMER2_SET_COUNT;
 +                      break;
 +              default:
 +                      panic("hammer2_chain_create_indirect: "
 +                            "unrecognized blockref type: %d",
 +                            parent->bref.type);
 +                      count = 0;
 +                      break;
 +              }
 +      } else {
 +              switch(parent->bref.type) {
 +              case HAMMER2_BREF_TYPE_INODE:
 +                      base = &parent->data->ipdata.u.blockset.blockref[0];
 +                      count = HAMMER2_SET_COUNT;
 +                      break;
 +              case HAMMER2_BREF_TYPE_INDIRECT:
 +                      base = &parent->data->npdata.blockref[0];
 +                      count = parent->bytes / sizeof(hammer2_blockref_t);
 +                      break;
 +              case HAMMER2_BREF_TYPE_VOLUME:
 +                      base = &hmp->voldata.sroot_blockset.blockref[0];
 +                      count = HAMMER2_SET_COUNT;
 +                      break;
 +              default:
 +                      panic("hammer2_chain_create_indirect: "
 +                            "unrecognized blockref type: %d",
 +                            parent->bref.type);
 +                      count = 0;
 +                      break;
 +              }
 +      }
 +
 +      /*
 +       * Scan for an unallocated bref, also skipping any slots occupied
 +       * by in-memory chain elements which may not yet have been updated
 +       * in the parent's bref array.
 +       */
 +      bzero(&dummy, sizeof(dummy));
 +      for (i = 0; i < count; ++i) {
 +              int nkeybits;
 +
 +              dummy.index = i;
 +              chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
 +              if (chain) {
 +                      bref = &chain->bref;
 +              } else if (base && base[i].type) {
 +                      bref = &base[i];
 +              } else {
 +                      continue;
 +              }
 +
 +              /*
 +               * Expand our calculated key range (key, keybits) to fit
 +               * the scanned key.  nkeybits represents the full range
 +               * that we will later cut in half (two halves @ nkeybits - 1).
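 +               *
 +               * The loop below grows nkeybits until key and bref->key
 +               * agree in every bit at or above position nkeybits, i.e.
 +               * until both keys fall within the same 2^nkeybits aligned
 +               * keyspace.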
 +               */
 +              nkeybits = keybits;
 +              if (nkeybits < bref->keybits)
 +                      nkeybits = bref->keybits;
 +              while (nkeybits < 64 &&
 +                     (~(((hammer2_key_t)1 << nkeybits) - 1) &
 +                      (key ^ bref->key)) != 0) {
 +                      ++nkeybits;
 +              }
 +
 +              /*
 +               * If the new key range is larger we have to determine
 +               * which side of the new key range the existing keys fall
 +               * under by checking the high bit, then collapsing the
 +               * locount into the hicount or vice-versa.
 +               */
 +              if (keybits != nkeybits) {
 +                      if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
 +                              hicount += locount;
 +                              locount = 0;
 +                      } else {
 +                              locount += hicount;
 +                              hicount = 0;
 +                      }
 +                      keybits = nkeybits;
 +              }
 +
 +              /*
 +               * The newly scanned key will be in the lower half or the
 +               * higher half of the (new) key range.
 +               */
 +              if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
 +                      ++hicount;
 +              else
 +                      ++locount;
 +      }
 +
 +      /*
 +       * Adjust keybits to represent half of the full range calculated
 +       * above (radix 63 max)
 +       */
 +      --keybits;
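 +
 +      /*
 +       * For example, a computed covering range of keybits=12 (4096 keys)
 +       * yields two keybits=11 halves of 2048 keys each; the new indirect
 +       * block will cover whichever half is selected below.
 +       */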
 +
 +      /*
 +       * Select whichever half contains the most elements.  Theoretically
 +       * we can select either side as long as it contains at least one
 +       * element (in order to ensure that a free slot is present to hold
 +       * the indirect block).
 +       */
 +      key &= ~(((hammer2_key_t)1 << keybits) - 1);
 +      if (hammer2_indirect_optimize) {
 +              /*
 +               * Insert node for the least number of keys; this will arrange
 +               * the first few blocks of a large file or the first few
 +               * inodes in a directory with fewer indirect blocks when
 +               * created linearly.
 +               */
 +              if (hicount < locount && hicount != 0)
 +                      key |= (hammer2_key_t)1 << keybits;
 +              else
 +                      key &= ~(hammer2_key_t)1 << keybits;
 +      } else {
 +              /*
 +               * Insert node for the most number of keys; best for heavily
 +               * fragmented files.
 +               */
 +              if (hicount > locount)
 +                      key |= (hammer2_key_t)1 << keybits;
 +              else
 +                      key &= ~(hammer2_key_t)1 << keybits;
 +      }
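 +
 +      /*
 +       * (key, keybits) now describe the keyspace of the half that the
 +       * new indirect block will cover.
 +       */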
 +
 +      /*
 +       * How big should our new indirect block be?  It has to be at least
 +       * as large as its parent.
 +       */
 +      if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
 +              nbytes = HAMMER2_IND_BYTES_MIN;
 +      else
 +              nbytes = HAMMER2_IND_BYTES_MAX;
 +      if (nbytes < count * sizeof(hammer2_blockref_t))
 +              nbytes = count * sizeof(hammer2_blockref_t);
 +
 +      /*
 +       * Ok, create our new indirect block
 +       */
 +      dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
 +      dummy.bref.key = key;
 +      dummy.bref.keybits = keybits;
 +      dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
 +      ichain = hammer2_chain_alloc(hmp, &dummy.bref);
 +      atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
 +
 +      /*
 +       * Iterate the original parent and move the matching brefs into
 +       * the new indirect block.
 +       */
 +      for (i = 0; i < count; ++i) {
 +              /*
 +               * For keying purposes access the bref from the media or
 +               * from our in-memory cache.  In cases where the in-memory
 +               * cache overrides the media the keyrefs will be the same
 +               * anyway so we can avoid checking the cache when the media
 +               * has a key.
 +               */
 +              dummy.index = i;
 +              chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
 +              if (chain) {
 +                      bref = &chain->bref;
 +              } else if (base && base[i].type) {
 +                      bref = &base[i];
 +              } else {
 +                      if (ichain->index < 0)
 +                              ichain->index = i;
 +                      continue;
 +              }
 +
 +              /*
 +               * Skip keys not in the chosen half (low or high), only bit
 +               * (keybits - 1) needs to be compared but for safety we
 +               * will compare all msb bits plus that bit again.
 +               */
 +              if ((~(((hammer2_key_t)1 << keybits) - 1) &
 +                  (key ^ bref->key)) != 0) {
 +                      continue;
 +              }
 +
 +              /*
 +               * This element is being moved from the parent, its slot
 +               * is available for our new indirect block.
 +               */
 +              if (ichain->index < 0)
 +                      ichain->index = i;
 +
 +              /*
 +               * Load the new indirect block by acquiring or allocating
 +               * the related chain entries, then simply move them to the
 +               * new parent (ichain).
 +               *
 +               * When adjusting the parent/child relationship we must
 +               * set the MOVED bit but we do NOT update bref_flush
 +               * because otherwise we might synchronize a bref that has
 +               * not yet been flushed.  We depend on chain's bref_flush
 +               * either being correct or the chain being in a MODIFIED
 +               * state.
 +               *
 +               * We do not want to set MODIFIED here as this would result
 +               * in unnecessary reallocations.
 +               *
 +               * We must still set SUBMODIFIED in the parent but we do
 +               * that after the loop.
 +               *
 +               * XXX we really need a lock here but we don't need the
 +               *     data.  NODATA feature needed.
 +               */
 +              chain = hammer2_chain_get(hmp, parent, i,
 +                                        HAMMER2_LOOKUP_NODATA);
 +              SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
 +              if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
 +                      panic("hammer2_chain_create_indirect: collision");
 +              chain->parent = ichain;
 +              if (base)
 +                      bzero(&base[i], sizeof(base[i]));
 +              atomic_add_int(&parent->refs, -1);
 +              atomic_add_int(&ichain->refs, 1);
 +              if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
 +                      hammer2_chain_ref(hmp, chain);
 +                      atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
 +              }
 +              hammer2_chain_unlock(hmp, chain);
 +              KKASSERT(parent->refs > 0);
 +              chain = NULL;
 +      }
 +
 +      /*
 +       * Insert the new indirect block into the parent now that we've
 +       * cleared out some entries in the parent.  We calculated a good
 +       * insertion index in the loop above (ichain->index).
 +       *
 +       * We don't have to set MOVED here because we mark ichain modified
 +       * down below (so the normal modified -> flush -> set-moved sequence
 +       * applies).
 +       */
 +      KKASSERT(ichain->index >= 0);
 +      if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
 +              panic("hammer2_chain_create_indirect: ichain insertion");
 +      ichain->parent = parent;
 +      atomic_add_int(&parent->refs, 1);
 +
 +      /*
 +       * Mark the new indirect block modified after insertion, which
 +       * will propagate up through parent all the way to the root and
 +       * also allocate the physical block in ichain for our caller,
 +       * and assign ichain->data to a pre-zero'd space (because there
 +       * is no prior data to copy into it).
 +       *
 +       * We have to set SUBMODIFIED in ichain's flags manually so the
 +       * flusher knows it has to recurse through it to get to all of
 +       * our moved blocks, then call setsubmod() to set the bit
 +       * recursively.
 +       */
 +      hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
 +      hammer2_chain_parent_setsubmod(hmp, ichain);
 +      atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
 +
 +      /*
 +       * Figure out what to return.
 +       */
 +      if (create_bits > keybits) {
 +              /*
 +               * Key being created is way outside the key range,
 +               * return the original parent.
 +               */
 +              hammer2_chain_unlock(hmp, ichain);
 +      } else if (~(((hammer2_key_t)1 << keybits) - 1) &
 +                 (create_key ^ key)) {
 +              /*
 +               * Key being created is outside the key range,
 +               * return the original parent.
 +               */
 +              hammer2_chain_unlock(hmp, ichain);
 +      } else {
 +              /*
 +               * Otherwise it's in the range, return the new parent.
 +               * (leave both the new and old parent locked).
 +               */
 +              parent = ichain;
 +      }
 +
 +      return(parent);
 +}
 +
 +/*
 + * Physically delete the specified chain element.  Note that inodes with
 + * open descriptors should not be deleted (as with other filesystems) until
 + * the last open descriptor is closed.
 + *
 + * This routine will remove the chain element from its parent and potentially
 + * also recurse upward and delete indirect blocks which become empty as a
 + * side effect.
 + *
 + * The caller must pass a pointer to the chain's parent, also locked and
 + * referenced.  (*parentp) will be modified in a manner similar to a lookup
 + * or iteration when indirect blocks are also deleted as a side effect.
 + *
 + * XXX This currently does not adhere to the MOVED flag protocol in that
 + *     the removal is immediately indicated in the parent's blockref[]
 + *     array.
 + */
 +void
 +hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
 +                   hammer2_chain_t *chain, int retain)
 +{
 +      hammer2_blockref_t *base;
 +      hammer2_inode_t *ip;
 +      int count;
 +
 +      if (chain->parent != parent)
 +              panic("hammer2_chain_delete: parent mismatch");
 +
 +      /*
 +       * Mark the parent modified so our base[] pointer remains valid
 +       * while we move entries.  For the optimized indirect block
 +       * case mark the parent moved instead.
 +       *
 +       * Calculate the blockref reference in the parent
 +       */
 +      switch(parent->bref.type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
 +              base = &parent->data->ipdata.u.blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
 +                                                HAMMER2_MODIFY_NO_MODIFY_TID);
 +              if (parent->flags & HAMMER2_CHAIN_INITIAL)
 +                      base = NULL;
 +              else
 +                      base = &parent->data->npdata.blockref[0];
 +              count = parent->bytes / sizeof(hammer2_blockref_t);
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
 +              base = &hmp->voldata.sroot_blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      default:
 +              panic("hammer2_chain_delete: unrecognized blockref type: %d",
 +                    parent->bref.type);
 +              count = 0;
 +              break;
 +      }
 +
 +      /*
 +       * Disconnect the bref in the parent, remove the chain, and
 +       * disconnect in-memory fields from the parent.
 +       */
 +      KKASSERT(chain->index >= 0 && chain->index < count);
 +      if (base)
 +              bzero(&base[chain->index], sizeof(*base));
 +
 +      SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
 +      atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
 +      atomic_add_int(&parent->refs, -1);      /* for splay entry */
 +      chain->index = -1;
 +      chain->parent = NULL;
 +
 +      /*
 +       * Cumulative adjustments must be propagated to the parent inode
 +       * when deleting and synchronized to ip.
 +       *
++       * The CCMS is deleted when pip is NULL'd out, here and also in
++       * chain_drop().  The CCMS is uninitialized when the pmp is NULL'd
++       * out (if it was non-NULL).  This is interlocked by the
++       * HAMMER2_CHAIN_DELETED flag to prevent reentrancy.
++       *
 +       * NOTE:  We do not propagate ip->delta_*count to the parent because
 +       *        these represent adjustments that have not yet been
 +       *        propagated upward, so we don't need to remove them from
 +       *        the parent.
 +       *
 +       * Clear the pointer to the parent inode.
 +       */
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
 +              ip = chain->u.ip;
 +              if (ip->pip) {
++                      ccms_inode_delete(ip->cino);
++
 +                      ip->pip->delta_icount -= ip->ip_data.inode_count;
 +                      ip->pip->delta_dcount -= ip->ip_data.data_count;
 +                      ip->ip_data.inode_count += ip->delta_icount;
 +                      ip->ip_data.data_count += ip->delta_dcount;
 +                      ip->delta_icount = 0;
 +                      ip->delta_dcount = 0;
 +                      --ip->pip->delta_icount;
 +                      ip->pip = NULL;
 +              }
 +              chain->u.ip->depth = 0;
 +      }
 +
 +      /*
 +       * If retain is 0 the deletion is permanent.  Because the chain is
 +       * no longer connected to the topology a flush will have no
 +       * visibility into it.  We must dispose of the references related
 +       * to the MODIFIED and MOVED flags, otherwise the ref count will
 +       * never transition to 0.
 +       *
 +       * If retain is non-zero the deleted element is likely an inode
 +       * which the vnops frontend will mark DESTROYED and flush.  In that
 +       * situation we must retain the flags for any open file descriptors
 +       * on the (removed) inode.  The final close will destroy the
 +       * disconnected chain.
 +       */
 +      if (retain == 0) {
 +              if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
 +                      atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
 +                      hammer2_chain_drop(hmp, chain);
 +              }
 +              if (chain->flags & HAMMER2_CHAIN_MOVED) {
 +                      atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
 +                      hammer2_chain_drop(hmp, chain);
 +              }
 +      }
 +
 +      /*
 +       * The chain is still likely referenced, possibly even by a vnode
 +       * (if an inode), so defer further action until the chain gets
 +       * dropped.
 +       */
 +}
 +
 +/*
 + * Recursively flush the specified chain.  The chain is locked and
 + * referenced by the caller and will remain so on return.  The chain
 + * will remain referenced throughout but can temporarily lose its
 + * lock during the recursion to avoid unnecessarily stalling user
 + * processes.
 + *
 + */
 +TAILQ_HEAD(flush_deferral_list, hammer2_chain);
 +
 +struct hammer2_flush_info {
 +      struct flush_deferral_list flush_list;
 +      int             depth;
 +      hammer2_tid_t   modify_tid;
 +};
 +
 +typedef struct hammer2_flush_info hammer2_flush_info_t;
 +
 +static void
 +hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
 +                        hammer2_flush_info_t *info)
 +{
 +      hammer2_blockref_t *bref;
 +      hammer2_off_t pbase;
 +      size_t bbytes;
 +      size_t boff;
 +      char *bdata;
 +      struct buf *bp;
 +      int error;
 +      int wasmodified;
 +
 +      /*
 +       * If we hit the stack recursion depth limit defer the operation.
 +       * The controller of the info structure will execute the deferral
 +       * list and then retry.
 +       *
 +       * This is only applicable if SUBMODIFIED is set.  After a reflush
 +       * SUBMODIFIED will probably be cleared and we want to drop through
 +       * to finish processing the current element so our direct parent
 +       * can process the results.
 +       */
 +      if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
 +          (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
 +              if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
 +                      hammer2_chain_ref(hmp, chain);
 +                      TAILQ_INSERT_TAIL(&info->flush_list,
 +                                        chain, flush_node);
 +                      atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
 +              }
 +              return;
 +      }
 +
 +      if (hammer2_debug & 0x0008)
 +              kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
 +                      info->depth, info->depth, "",
 +                      chain->bref.type, chain->bref.data_off,
 +                      chain, chain->refs, chain->flags);
 +
 +      /*
 +       * If SUBMODIFIED is set we recurse the flush and adjust the
 +       * blockrefs accordingly.
 +       *
 +       * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
 +       *       finishing in the face of filesystem activity.
 +       */
 +      if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
 +              hammer2_chain_t *child;
 +              hammer2_chain_t *next;
 +              hammer2_blockref_t *base;
 +              int count;
 +
 +              /*
 +               * Clear SUBMODIFIED to catch races.  Note that if any
 +               * child has to be flushed SUBMODIFIED will wind up being
 +               * set again (for next time), but this does not stop us from
 +               * synchronizing block updates which occurred.
 +               *
 +               * We don't want to set our chain to MODIFIED gratuitously.
 +               */
 +              /* XXX SUBMODIFIED not interlocked, can race */
 +              atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
 +
 +              /*
 +               * Flush the children and update the blockrefs in the chain.
 +               * Be careful of ripouts during the loop.
 +               */
 +              next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
 +              if (next)
 +                      hammer2_chain_ref(hmp, next);
 +              while ((child = next) != NULL) {
 +                      next = SPLAY_NEXT(hammer2_chain_splay,
 +                                        &chain->shead, child);
 +                      if (next)
 +                              hammer2_chain_ref(hmp, next);
 +                      /*
 +                       * We only recurse if SUBMODIFIED (internal node)
 +                       * or MODIFIED (internal node or leaf) is set.
 +                       * However, we must still track whether any MOVED
 +                       * entries are present to determine if the chain's
 +                       * blockref's need updating or not.
 +                       */
 +                      if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
 +                                           HAMMER2_CHAIN_MODIFIED |
 +                                          HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
 +                              hammer2_chain_drop(hmp, child);
 +                              continue;
 +                      }
 +                      hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
 +                      hammer2_chain_drop(hmp, child);
 +                      if (child->parent != chain ||
 +                          (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
 +                                           HAMMER2_CHAIN_MODIFIED |
 +                                          HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
 +                              hammer2_chain_unlock(hmp, child);
 +                              continue;
 +                      }
 +
 +                      /*
 +                       * Propagate the DESTROYED flag if found set, then
 +                       * recurse the flush.
 +                       */
 +                      if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
 +                          (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
 +                              atomic_set_int(&child->flags,
 +                                             HAMMER2_CHAIN_DESTROYED |
 +                                             HAMMER2_CHAIN_SUBMODIFIED);
 +                      }
 +                      ++info->depth;
 +                      hammer2_chain_flush_pass1(hmp, child, info);
 +                      --info->depth;
 +                      hammer2_chain_unlock(hmp, child);
 +              }
 +
 +              /*
 +               * Now synchronize any block updates.
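 +               *
 +               * Each child with MOVED set has its bref_flush copied into
 +               * this chain's blockref array and its mirror_tid propagated
 +               * upward, after which MOVED is cleared and the ref
 +               * associated with it is dropped.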
 +               */
 +              next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
 +              if (next)
 +                      hammer2_chain_ref(hmp, next);
 +              while ((child = next) != NULL) {
 +                      next = SPLAY_NEXT(hammer2_chain_splay,
 +                                        &chain->shead, child);
 +                      if (next)
 +                              hammer2_chain_ref(hmp, next);
 +                      if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
 +                              hammer2_chain_drop(hmp, child);
 +                              continue;
 +                      }
 +                      hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
 +                      hammer2_chain_drop(hmp, child);
 +                      if (child->parent != chain ||
 +                          (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
 +                              hammer2_chain_unlock(hmp, child);
 +                              continue;
 +                      }
 +
 +                      hammer2_chain_modify(hmp, chain,
 +                                           HAMMER2_MODIFY_NO_MODIFY_TID);
 +
 +                      switch(chain->bref.type) {
 +                      case HAMMER2_BREF_TYPE_INODE:
 +                              KKASSERT((chain->data->ipdata.op_flags &
 +                                        HAMMER2_OPFLAG_DIRECTDATA) == 0);
 +                              base = &chain->data->ipdata.u.blockset.
 +                                      blockref[0];
 +                              count = HAMMER2_SET_COUNT;
 +                              break;
 +                      case HAMMER2_BREF_TYPE_INDIRECT:
 +                              base = &chain->data->npdata.blockref[0];
 +                              count = chain->bytes /
 +                                      sizeof(hammer2_blockref_t);
 +                              break;
 +                      case HAMMER2_BREF_TYPE_VOLUME:
 +                              base = &hmp->voldata.sroot_blockset.blockref[0];
 +                              count = HAMMER2_SET_COUNT;
 +                              break;
 +                      default:
 +                              base = NULL;
 +                              panic("hammer2_chain_flush_pass1: "
 +                                    "unrecognized blockref type: %d",
 +                                    chain->bref.type);
 +                      }
 +
 +                      KKASSERT(child->index >= 0);
 +                      base[child->index] = child->bref_flush;
 +
 +                      if (chain->bref.mirror_tid <
 +                          child->bref_flush.mirror_tid) {
 +                              chain->bref.mirror_tid =
 +                                      child->bref_flush.mirror_tid;
 +                      }
 +
 +                      if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
 +                          hmp->voldata.mirror_tid <
 +                          child->bref_flush.mirror_tid) {
 +                              hmp->voldata.mirror_tid =
 +                                      child->bref_flush.mirror_tid;
 +                      }
 +                      atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
 +                      hammer2_chain_drop(hmp, child); /* MOVED flag */
 +                      hammer2_chain_unlock(hmp, child);
 +              }
 +      }
 +
 +      /*
 +       * If destroying the object we unconditionally clear the MODIFIED
 +       * and MOVED bits, and we destroy the buffer without writing it
 +       * out.
 +       *
 +       * We don't bother updating the hash/crc or the chain bref.
 +       *
 +       * NOTE: The destroy'd object's bref has already been updated,
 +       *       so we can clear MOVED without propagating mirror_tid
 +       *       or modify_tid upward.
 +       *
 +       * XXX allocations for unflushed data can be returned to the
 +       *     free pool.
 +       */
 +      if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
 +              if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
 +                      if (chain->bp) {
 +                              chain->bp->b_flags |= B_INVAL|B_RELBUF;
 +                      }
 +                      atomic_clear_int(&chain->flags,
 +                                       HAMMER2_CHAIN_MODIFIED |
 +                                       HAMMER2_CHAIN_MODIFY_TID);
 +                      hammer2_chain_drop(hmp, chain);
 +              }
 +              if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
 +                      atomic_clear_int(&chain->flags,
 +                                       HAMMER2_CHAIN_MODIFIED_AUX);
 +              }
 +              if (chain->flags & HAMMER2_CHAIN_MOVED) {
 +                      atomic_clear_int(&chain->flags,
 +                                       HAMMER2_CHAIN_MOVED);
 +                      hammer2_chain_drop(hmp, chain);
 +              }
 +              return;
 +      }
 +
 +      /*
 +       * Flush this chain entry only if it is marked modified.
 +       */
 +      if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
 +                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
 +              goto done;
 +      }
 +
 +      /*
 +       * Synchronize cumulative data and inode count adjustments to
 +       * the inode and propagate the deltas upward to the parent.
 +       */
 +      if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
 +              hammer2_inode_t *ip;
 +
 +              ip = chain->u.ip;
 +              ip->ip_data.inode_count += ip->delta_icount;
 +              ip->ip_data.data_count += ip->delta_dcount;
 +              if (ip->pip) {
 +                      ip->pip->delta_icount += ip->delta_icount;
 +                      ip->pip->delta_dcount += ip->delta_dcount;
 +              }
 +              ip->delta_icount = 0;
 +              ip->delta_dcount = 0;
 +      }
 +
 +      /*
 +       * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
 +       * used by the volume header (&hmp->vchain).
 +       */
 +      if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
 +                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
 +              goto done;
 +      }
 +      atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
 +
 +      /*
 +       * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
 +       * will re-test the MOVED bit.  We must also update the mirror_tid
 +       * and modify_tid fields as appropriate.
 +       *
 +       * The MODIFIED bit owns a single chain ref and the MOVED bit owns
 +       * its own chain ref.
 +       */
 +      chain->bref.mirror_tid = info->modify_tid;
 +      if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
 +              chain->bref.modify_tid = info->modify_tid;
 +      wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
 +      atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
 +                                      HAMMER2_CHAIN_MODIFY_TID);
 +
 +      if (chain->flags & HAMMER2_CHAIN_MOVED) {
 +              /*
 +               * Drop the ref from the MODIFIED bit we cleared.
 +               */
 +              if (wasmodified)
 +                      hammer2_chain_drop(hmp, chain);
 +      } else {
 +              /*
 +               * If we were MODIFIED we inherit the ref from clearing
 +               * that bit, otherwise we need another ref.
 +               */
 +              if (wasmodified == 0)
 +                      hammer2_chain_ref(hmp, chain);
 +              atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
 +      }
 +      chain->bref_flush = chain->bref;
 +
 +      /*
 +       * If this is part of a recursive flush we can go ahead and write
 +       * out the buffer cache buffer and pass a new bref back up the chain.
 +       *
 +       * This will never be a volume header.
 +       */
 +      switch(chain->bref.type) {
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              /*
 +               * The volume header is flushed manually by the syncer, not
 +               * here.
 +               */
 +              break;
 +      case HAMMER2_BREF_TYPE_DATA:
 +              /*
 +               * Data elements have already been flushed via the logical
 +               * file buffer cache.  Their hash was set in the bref by
 +               * the vop_write code.
 +               *
 +               * Make sure the buffer(s) have been flushed out here.
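 +               *
 +               * GETBLK_NOWAIT avoids stalling on a buffer held by
 +               * another thread.  A cached, dirty buffer is written
 +               * asynchronously; anything else is simply released.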
 +               */
 +              bbytes = chain->bytes;
 +              pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
 +              boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
 +
 +              bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
 +              if (bp) {
 +                      if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
 +                          (B_CACHE | B_DIRTY)) {
 +                              kprintf("x");
 +                              cluster_awrite(bp);
 +                      } else {
 +                              bp->b_flags |= B_RELBUF;
 +                              brelse(bp);
 +                      }
 +              }
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              /*
 +               * Indirect blocks may be in an INITIAL state.  Use the
 +               * chain_lock() call to ensure that the buffer has been
 +               * instantiated (even though it is already locked the buffer
 +               * might not have been instantiated).
 +               *
 +               * Only write the buffer out if it is dirty, it is possible
 +               * the operating system had already written out the buffer.
 +               */
 +              hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
 +              KKASSERT(chain->bp != NULL);
 +
 +              bp = chain->bp;
 +              if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
 +                  (bp->b_flags & B_DIRTY)) {
 +                      bawrite(chain->bp);
 +              } else {
 +                      brelse(chain->bp);
 +              }
 +              chain->bp = NULL;
 +              chain->data = NULL;
 +              hammer2_chain_unlock(hmp, chain);
 +              break;
 +      default:
 +              /*
 +               * Embedded elements have to be flushed out.
 +               */
 +              KKASSERT(chain->data != NULL);
 +              KKASSERT(chain->bp == NULL);
 +              bref = &chain->bref;
 +
 +              KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
 +
 +              if (chain->bp == NULL) {
 +                      /*
 +                       * The data is embedded, we have to acquire the
 +                       * buffer cache buffer and copy the data into it.
 +                       */
 +                      if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
 +                              bbytes = HAMMER2_MINIOSIZE;
 +                      pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
 +                      boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
 +
 +                      /*
 +                       * The getblk() optimization can only be used if the
 +                       * physical block size matches the request.
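 +                       * When the sizes match the entire physical block
 +                       * is overwritten so no read from the media is
 +                       * required; otherwise bread() must first pull in
 +                       * the surrounding data.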
 +                       */
 +                      if (chain->bytes == bbytes) {
 +                              bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
 +                              error = 0;
 +                      } else {
 +                              error = bread(hmp->devvp, pbase, bbytes, &bp);
 +                              KKASSERT(error == 0);
 +                      }
 +                      bdata = (char *)bp->b_data + boff;
 +
 +                      /*
 +                       * Copy the data to the buffer, mark the buffer
 +                       * dirty, and convert the chain to unmodified.
 +                       *
 +                       * We expect we might have to make adjustments to
 +                       * non-data delayed-write buffers when doing an
 +                       * actual flush so use bawrite() instead of
 +                       * cluster_awrite() here.
 +                       */
 +                      bcopy(chain->data, bdata, chain->bytes);
 +                      bp->b_flags |= B_CLUSTEROK;
 +                      bawrite(bp);
 +                      bp = NULL;
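 +                      /*
 +                       * Record the new check code in the blockref and
 +                       * account for the write in the inode/indirect
 +                       * metadata statistics.
 +                       */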
 +                      chain->bref.check.iscsi32.value =
 +                              hammer2_icrc32(chain->data, chain->bytes);
 +                      if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
 +                              ++hammer2_iod_meta_write;
 +                      else
 +                              ++hammer2_iod_indr_write;
 +              } else {
 +                      chain->bref.check.iscsi32.value =
 +                              hammer2_icrc32(chain->data, chain->bytes);
 +              }
 +      }
 +
 +      /*
 +       * Adjustments to the bref.  The caller will use this to adjust
 +       * our chain's pointer to this chain element.
 +       */
 +      bref = &chain->bref;
 +
 +      switch(bref->type) {
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              KKASSERT(chain->data != NULL);
 +              KKASSERT(chain->bp == NULL);
 +
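 +              /*
 +               * Recompute the volume header check codes (both section
 +               * CRCs and the whole-header CRC) so the in-memory copy
 +               * is consistent when it is later written to media.
 +               */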
 +              hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
 +                      hammer2_icrc32(
 +                              (char *)&hmp->voldata +
 +                               HAMMER2_VOLUME_ICRC1_OFF,
 +                              HAMMER2_VOLUME_ICRC1_SIZE);
 +              hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
 +                      hammer2_icrc32(
 +                              (char *)&hmp->voldata +
 +                               HAMMER2_VOLUME_ICRC0_OFF,
 +                              HAMMER2_VOLUME_ICRC0_SIZE);
 +              hmp->voldata.icrc_volheader =
 +                      hammer2_icrc32(
 +                              (char *)&hmp->voldata +
 +                               HAMMER2_VOLUME_ICRCVH_OFF,
 +                              HAMMER2_VOLUME_ICRCVH_SIZE);
 +              break;
 +      default:
 +              break;
 +
 +      }
 +done:
 +      if (hammer2_debug & 0x0008) {
 +              kprintf("%*.*s} %p/%d %04x ",
 +                      info->depth, info->depth, "",
 +                      chain, chain->refs, chain->flags);
 +      }
 +}
 +
 +#if 0
 +/*
 + * PASS2 - not yet implemented (should be called only with the root chain?)
 + */
 +static void
 +hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
 +{
 +}
 +#endif
 +
 +/*
 + * Stand-alone flush.  If the chain is unable to completely flush we have
 + * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
 + * clear the MOVED bit after flushing in this situation or our desynchronized
 + * bref will not properly update in the parent.
 + *
 + * This routine can be called from several places but the most important
 + * is from the hammer2_vop_reclaim() function.  We want to try to completely
 + * clean out the inode structure to prevent disconnected inodes from
 + * building up and blowing out the kmalloc pool.
 + *
 + * If modify_tid is 0 (usual case), a new modify_tid is allocated and
 + * applied to the flush.  The depth-limit handling code is the only
 + * code which passes a non-zero modify_tid to hammer2_chain_flush().
 + */
 +void
 +hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
 +                  hammer2_tid_t modify_tid)
 +{
 +      hammer2_chain_t *parent;
 +      hammer2_chain_t *scan;
 +      hammer2_blockref_t *base;
 +      hammer2_flush_info_t info;
 +      int count;
 +      int reflush;
 +
 +      /*
 +       * Execute the recursive flush and handle deferrals.
 +       *
 +       * Chains can be ridiculously long (thousands deep), so to
 +       * avoid blowing out the kernel stack the recursive flush has a
 +       * depth limit.  Elements at the limit are placed on a list
 +       * for re-execution after the stack has been popped.
 +       */
 +      bzero(&info, sizeof(info));
 +      TAILQ_INIT(&info.flush_list);
 +
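 +      /*
 +       * If the caller did not supply a modify_tid, allocate a fresh one
 +       * from the volume header.  Bumping alloc_tid dirties the volume
 +       * data, so flag MODIFIED_AUX on the volume chain.
 +       */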
 +      if (modify_tid == 0) {
 +              hammer2_voldata_lock(hmp);
 +              info.modify_tid = hmp->voldata.alloc_tid++;
 +              atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
 +              hammer2_voldata_unlock(hmp);
 +      } else {
 +              info.modify_tid = modify_tid;
 +      }
 +      reflush = 1;
 +
 +      while (reflush) {
 +              /*
 +               * Primary recursion
 +               */
 +              hammer2_chain_flush_pass1(hmp, chain, &info);
 +              reflush = 0;
 +
 +              while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
 +                      /*
 +                       * Secondary recursion.  Note that a reference is
 +                       * retained from the element's presence on the
 +                       * deferral list.
 +                       */
 +                      KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
 +                      TAILQ_REMOVE(&info.flush_list, scan, flush_node);
 +                      atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
 +
 +                      /*
 +                       * Now that we've popped back up we can do a secondary
 +                       * recursion on the deferred elements.
 +                       */
 +                      if (hammer2_debug & 0x0040)
 +                              kprintf("deferred flush %p\n", scan);
 +                      hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
 +                      hammer2_chain_flush(hmp, scan, info.modify_tid);
 +                      hammer2_chain_unlock(hmp, scan);
 +
 +                      /*
 +                       * Only flag a reflush if SUBMODIFIED is no longer
 +                       * set.  If SUBMODIFIED is set the element will just
 +                       * wind up on our flush_list again.
 +                       */
 +                      if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
 +                                          HAMMER2_CHAIN_MODIFIED |
 +                                          HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
 +                              reflush = 1;
 +                      }
 +                      hammer2_chain_drop(hmp, scan);
 +              }
 +              if ((hammer2_debug & 0x0040) && reflush)
 +                      kprintf("reflush %p\n", chain);
 +      }
 +
 +      /*
 +       * The SUBMODIFIED bit must propagate upward if the chain could not
 +       * be completely flushed.
 +       */
 +      if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
 +                          HAMMER2_CHAIN_MODIFIED |
 +                          HAMMER2_CHAIN_MODIFIED_AUX |
 +                          HAMMER2_CHAIN_MOVED)) {
 +              hammer2_chain_parent_setsubmod(hmp, chain);
 +      }
 +
 +      /*
 +       * If the only thing left is a simple bref update, try to
 +       * pro-actively update the parent; otherwise return early.
 +       */
 +      parent = chain->parent;
 +      if (parent == NULL) {
 +              return;
 +      }
 +      if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
 +          (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
 +                           HAMMER2_CHAIN_MODIFIED |
 +                           HAMMER2_CHAIN_MODIFIED_AUX |
 +                           HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
 +              return;
 +      }
 +
 +      /*
 +       * We are locking backwards so allow the lock to fail
 +       */
 +      if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 +              return;
 +      }
 +
 +      /*
 +       * We are updating brefs but we have to call chain_modify()
 +       * because our caller is not being run from a recursive flush.
 +       *
 +       * This will also chain up the parent list and set the SUBMODIFIED
 +       * flag.
 +       *
 +       * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
 +       * modification is only related to updating a bref in the parent.
 +       *
 +       * When updating the blockset embedded in the volume header we must
 +       * also update voldata.mirror_tid.
 +       */
 +      hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
 +      hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
 +
 +      switch(parent->bref.type) {
 +      case HAMMER2_BREF_TYPE_INODE:
 +              base = &parent->data->ipdata.u.blockset.
 +                      blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              break;
 +      case HAMMER2_BREF_TYPE_INDIRECT:
 +              base = &parent->data->npdata.blockref[0];
 +              count = parent->bytes /
 +                      sizeof(hammer2_blockref_t);
 +              break;
 +      case HAMMER2_BREF_TYPE_VOLUME:
 +              base = &hmp->voldata.sroot_blockset.blockref[0];
 +              count = HAMMER2_SET_COUNT;
 +              if (chain->flags & HAMMER2_CHAIN_MOVED) {
 +                      if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
 +                              hmp->voldata.mirror_tid =
 +                                      chain->bref.mirror_tid;
 +                      }
 +              }
 +              break;
 +      default:
 +              base = NULL;
 +              panic("hammer2_chain_flush: "
 +                    "unrecognized blockref type: %d",
 +                    parent->bref.type);
 +      }
 +
 +      /*
 +       * Update the blockref in the parent.  We do not have to set
 +       * MOVED in the parent because the parent has been marked modified,
 +       * so the flush sequence will pick up the bref change.
 +       *
 +       * We do have to propagate mirror_tid upward.
 +       */
 +      KKASSERT(chain->index >= 0 &&
 +               chain->index < count);
 +      KKASSERT(chain->parent == parent);
 +      if (chain->flags & HAMMER2_CHAIN_MOVED) {
 +              base[chain->index] = chain->bref_flush;
 +              if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
 +                      parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
 +              atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
 +              hammer2_chain_drop(hmp, chain);
 +      } else if (bcmp(&base[chain->index], &chain->bref_flush,
 +                 sizeof(chain->bref)) != 0) {
 +              panic("hammer2: unflagged bref update(2)");
 +      }
 +
 +      lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
 +      hammer2_chain_unlock(hmp, parent);
 +}
index 0760f18,0000000..bc17dbe
mode 100644,000000..100644
--- /dev/null
@@@ -1,938 -1,0 +1,941 @@@
 +/*-
 + * Copyright (c) 2011, 2012 The DragonFly Project.  All rights reserved.
 + *
 + * This code is derived from software contributed to The DragonFly Project
 + * by Matthew Dillon <dillon@backplane.com>
 + *
 + * Redistribution and use in source and binary forms, with or without
 + * modification, are permitted provided that the following conditions
 + * are met:
 + *
 + * 1. Redistributions of source code must retain the above copyright
 + *    notice, this list of conditions and the following disclaimer.
 + * 2. Redistributions in binary form must reproduce the above copyright
 + *    notice, this list of conditions and the following disclaimer in
 + *    the documentation and/or other materials provided with the
 + *    distribution.
 + * 3. Neither the name of The DragonFly Project nor the names of its
 + *    contributors may be used to endorse or promote products derived
 + *    from this software without specific, prior written permission.
 + *
 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 + * SUCH DAMAGE.
 + */
 +#include <sys/param.h>
 +#include <sys/systm.h>
 +#include <sys/kernel.h>
 +#include <sys/nlookup.h>
 +#include <sys/vnode.h>
 +#include <sys/mount.h>
 +#include <sys/fcntl.h>
 +#include <sys/buf.h>
 +#include <sys/uuid.h>
 +#include <sys/vfsops.h>
 +#include <sys/sysctl.h>
 +
 +#include "hammer2.h"
 +#include "hammer2_disk.h"
 +#include "hammer2_mount.h"
 +
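 +/*
 + * State shared between hammer2_vfs_sync() and the vnode scan callbacks
 + * (hammer2_sync_scan1 and hammer2_sync_scan2).
 + */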
 +struct hammer2_sync_info {
 +      int error;
 +      int waitfor;
 +};
 +
 +TAILQ_HEAD(hammer2_mntlist, hammer2_mount);
 +static struct hammer2_mntlist hammer2_mntlist;
 +static struct lock hammer2_mntlk;
 +
 +int hammer2_debug;
 +int hammer2_cluster_enable = 1;
 +int hammer2_hardlink_enable = 1;
 +long hammer2_iod_file_read;
 +long hammer2_iod_meta_read;
 +long hammer2_iod_indr_read;
 +long hammer2_iod_file_write;
 +long hammer2_iod_meta_write;
 +long hammer2_iod_indr_write;
 +long hammer2_iod_volu_write;
 +long hammer2_ioa_file_read;
 +long hammer2_ioa_meta_read;
 +long hammer2_ioa_indr_read;
 +long hammer2_ioa_file_write;
 +long hammer2_ioa_meta_write;
 +long hammer2_ioa_indr_write;
 +long hammer2_ioa_volu_write;
 +
 +SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
 +
 +SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
 +         &hammer2_debug, 0, "");
 +SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
 +         &hammer2_cluster_enable, 0, "");
 +SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
 +         &hammer2_hardlink_enable, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
 +         &hammer2_iod_file_read, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
 +         &hammer2_iod_meta_read, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
 +         &hammer2_iod_indr_read, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
 +         &hammer2_iod_file_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
 +         &hammer2_iod_meta_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
 +         &hammer2_iod_indr_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
 +         &hammer2_iod_volu_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
 +         &hammer2_ioa_file_read, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
 +         &hammer2_ioa_meta_read, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
 +         &hammer2_ioa_indr_read, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
 +         &hammer2_ioa_file_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
 +         &hammer2_ioa_meta_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
 +         &hammer2_ioa_indr_write, 0, "");
 +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
 +         &hammer2_ioa_volu_write, 0, "");
 +
 +static int hammer2_vfs_init(struct vfsconf *conf);
 +static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 +                              struct ucred *cred);
 +static int hammer2_remount(struct mount *, char *, struct vnode *,
 +                              struct ucred *);
 +static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
 +static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
 +static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
 +                              struct ucred *cred);
 +static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
 +                              struct ucred *cred);
 +static int hammer2_vfs_sync(struct mount *mp, int waitfor);
 +static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
 +                              ino_t ino, struct vnode **vpp);
 +static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
 +                              struct fid *fhp, struct vnode **vpp);
 +static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
 +static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
 +                              int *exflagsp, struct ucred **credanonp);
 +
 +static int hammer2_install_volume_header(hammer2_mount_t *hmp);
 +static int hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
 +static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
 +
 +/*
 + * HAMMER2 vfs operations.
 + */
 +static struct vfsops hammer2_vfsops = {
 +      .vfs_init       = hammer2_vfs_init,
 +      .vfs_sync       = hammer2_vfs_sync,
 +      .vfs_mount      = hammer2_vfs_mount,
 +      .vfs_unmount    = hammer2_vfs_unmount,
 +      .vfs_root       = hammer2_vfs_root,
 +      .vfs_statfs     = hammer2_vfs_statfs,
 +      .vfs_statvfs    = hammer2_vfs_statvfs,
 +      .vfs_vget       = hammer2_vfs_vget,
 +      .vfs_vptofh     = hammer2_vfs_vptofh,
 +      .vfs_fhtovp     = hammer2_vfs_fhtovp,
 +      .vfs_checkexp   = hammer2_vfs_checkexp
 +};
 +
 +MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
 +
 +VFS_SET(hammer2_vfsops, hammer2, 0);
 +MODULE_VERSION(hammer2, 1);
 +
 +static
 +int
 +hammer2_vfs_init(struct vfsconf *conf)
 +{
 +      int error;
 +
 +      error = 0;
 +
 +      if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
 +              error = EINVAL;
 +      if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
 +              error = EINVAL;
 +      if (HAMMER2_ALLOCREF_BYTES != sizeof(struct hammer2_allocref))
 +              error = EINVAL;
 +      if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
 +              error = EINVAL;
 +
 +      if (error)
 +              kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
 +
 +      lockinit(&hammer2_mntlk, "mntlk", 0, 0);
 +      TAILQ_INIT(&hammer2_mntlist);
 +
 +      return (error);
 +}
 +
 +/*
 + * Mount or remount HAMMER2 filesystem from physical media
 + *
 + *    mountroot
 + *            mp              mount point structure
 + *            path            NULL
 + *            data            <unused>
 + *            cred            <unused>
 + *
 + *    mount
 + *            mp              mount point structure
 + *            path            path to mount point
 + *            data            pointer to argument structure in user space
 + *                    volume  volume path (device@LABEL form)
 + *                    hflags  user mount flags
 + *            cred            user credentials
 + *
 + * RETURNS:   0       Success
 + *            !0      error number
 + */
 +static
 +int
 +hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 +            struct ucred *cred)
 +{
 +      struct hammer2_mount_info info;
 +      hammer2_pfsmount_t *pmp;
 +      hammer2_mount_t *hmp;
 +      hammer2_key_t lhc;
 +      struct vnode *devvp;
 +      struct nlookupdata nd;
 +      hammer2_chain_t *parent;
 +      hammer2_chain_t *schain;
 +      hammer2_chain_t *rchain;
 +      char devstr[MNAMELEN];
 +      size_t size;
 +      size_t done;
 +      char *dev;
 +      char *label;
 +      int ronly = 1;
 +      int create_hmp;
 +      int error;
 +
 +      hmp = NULL;
 +      pmp = NULL;
 +      dev = NULL;
 +      label = NULL;
 +      devvp = NULL;
 +
 +      kprintf("hammer2_mount\n");
 +
 +      if (path == NULL) {
 +              /*
 +               * Root mount
 +               */
 +              return (EOPNOTSUPP);
 +      } else {
 +              /*
 +               * Non-root mount or updating a mount
 +               */
 +              error = copyin(data, &info, sizeof(info));
 +              if (error)
 +                      return (error);
 +
 +              error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
 +              if (error)
 +                      return (error);
 +
 +              /* Extract device and label */
 +              dev = devstr;
 +              label = strchr(devstr, '@');
 +              if (label == NULL ||
 +                  ((label + 1) - dev) > done) {
 +                      return (EINVAL);
 +              }
 +              *label = '\0';
 +              label++;
 +              if (*label == '\0')
 +                      return (EINVAL);
 +
 +              if (mp->mnt_flag & MNT_UPDATE) {
 +                      /* Update mount */
 +                      /* HAMMER2 implements NFS export via mountctl */
 +                      hmp = MPTOHMP(mp);
 +                      devvp = hmp->devvp;
 +                      error = hammer2_remount(mp, path, devvp, cred);
 +                      return error;
 +              }
 +      }
 +
 +      /*
 +       * New non-root mount
 +       */
 +      /* Lookup name and verify it refers to a block device */
 +      error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
 +      if (error == 0)
 +              error = nlookup(&nd);
 +      if (error == 0)
 +              error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
 +      nlookup_done(&nd);
 +
 +      if (error == 0) {
 +              if (vn_isdisk(devvp, &error))
 +                      error = vfs_mountedon(devvp);
 +      }
 +
 +      /*
 +       * Determine if the device has already been mounted.  After this
 +       * check hmp will be non-NULL if we are doing a second or subsequent
 +       * hammer2 mount from the same device.
 +       */
 +      lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
 +      TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
 +              if (hmp->devvp == devvp)
 +                      break;
 +      }
 +
 +      /*
 +       * Open the device if this isn't a secondary mount
 +       */
 +      if (hmp) {
 +              create_hmp = 0;
 +      } else {
 +              create_hmp = 1;
 +              if (error == 0 && vcount(devvp) > 0)
 +                      error = EBUSY;
 +
 +              /*
 +               * Now open the device
 +               */
 +              if (error == 0) {
 +                      ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
 +                      vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 +                      error = vinvalbuf(devvp, V_SAVE, 0, 0);
 +                      if (error == 0) {
 +                              error = VOP_OPEN(devvp,
 +                                               ronly ? FREAD : FREAD | FWRITE,
 +                                               FSCRED, NULL);
 +                      }
 +                      vn_unlock(devvp);
 +              }
 +              if (error && devvp) {
 +                      vrele(devvp);
 +                      devvp = NULL;
 +              }
 +              if (error) {
 +                      lockmgr(&hammer2_mntlk, LK_RELEASE);
 +                      return error;
 +              }
 +      }
 +
 +      /*
 +       * Block device opened successfully, finish initializing the
 +       * mount structure.
 +       *
 +       * From this point on we have to call hammer2_unmount() on failure.
 +       */
 +      pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
 +      mp->mnt_data = (qaddr_t)pmp;
 +      pmp->mp = mp;
 +
 +      if (create_hmp) {
 +              hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
 +              hmp->ronly = ronly;
 +              hmp->devvp = devvp;
 +              kmalloc_create(&hmp->minode, "HAMMER2-inodes");
 +              kmalloc_create(&hmp->mchain, "HAMMER2-chains");
 +              TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
 +      }
++      ccms_domain_init(&pmp->ccms_dom);
 +      pmp->hmp = hmp;
 +      ++hmp->pmp_count;
 +      lockmgr(&hammer2_mntlk, LK_RELEASE);
 +      kprintf("hammer2_mount hmp=%p pmpcnt=%d\n", hmp, hmp->pmp_count);
 +      
 +      mp->mnt_flag = MNT_LOCAL;
 +      mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
 +
 +      if (create_hmp) {
 +              /*
 +               * vchain setup.  vchain.data points at the embedded
 +               * volume data (hmp->voldata).
 +               * vchain.refs is initialized and will never drop to 0.
 +               */
 +              hmp->vchain.refs = 1;
 +              hmp->vchain.data = (void *)&hmp->voldata;
 +              hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
 +              hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
 +              hmp->vchain.bref_flush = hmp->vchain.bref;
 +              /* hmp->vchain.u.xxx is left NULL */
 +              lockinit(&hmp->vchain.lk, "volume", 0, LK_CANRECURSE);
 +              lockinit(&hmp->alloclk, "h2alloc", 0, 0);
 +              lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
 +
 +              /*
 +               * Install the volume header
 +               */
 +              error = hammer2_install_volume_header(hmp);
 +              if (error) {
 +                      hammer2_vfs_unmount(mp, MNT_FORCE);
 +                      return error;
 +              }
 +      }
 +
 +      /*
 +       * required mount structure initializations
 +       */
 +      mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
 +      mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
 +
 +      mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
 +      mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
 +
 +      /*
 +       * Optional fields
 +       */
 +      mp->mnt_iosize_max = MAXPHYS;
 +
 +      /*
 +       * First locate the super-root inode, which is key 0 relative to the
 +       * volume header's blockset.
 +       *
 +       * Then locate the root inode by scanning the directory keyspace
 +       * represented by the label.
 +       */
 +      if (create_hmp) {
 +              parent = &hmp->vchain;
 +              hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
 +              schain = hammer2_chain_lookup(hmp, &parent,
 +                                    HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 0);
 +              hammer2_chain_unlock(hmp, parent);
 +              if (schain == NULL) {
 +                      kprintf("hammer2_mount: invalid super-root\n");
 +                      hammer2_vfs_unmount(mp, MNT_FORCE);
 +                      return EINVAL;
 +              }
 +              hammer2_chain_ref(hmp, schain); /* for hmp->schain */
 +              hmp->schain = schain;           /* left locked */
 +      } else {
 +              schain = hmp->schain;
 +              hammer2_chain_lock(hmp, schain, HAMMER2_RESOLVE_ALWAYS);
 +      }
 +
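 +      /*
 +       * Locate the root inode for the requested label.  The directory
 +       * hash only encodes a prefix of the name, so scan any collisions
 +       * within the hash range until an inode whose filename matches the
 +       * label exactly is found.
 +       */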
 +      parent = schain;
 +      lhc = hammer2_dirhash(label, strlen(label));
 +      rchain = hammer2_chain_lookup(hmp, &parent,
 +                                    lhc, lhc + HAMMER2_DIRHASH_LOMASK,
 +                                    0);
 +      while (rchain) {
 +              if (rchain->bref.type == HAMMER2_BREF_TYPE_INODE &&
 +                  rchain->u.ip &&
 +                  strcmp(label, rchain->data->ipdata.filename) == 0) {
 +                      break;
 +              }
 +              rchain = hammer2_chain_next(hmp, &parent, rchain,
 +                                          lhc, lhc + HAMMER2_DIRHASH_LOMASK,
 +                                          0);
 +      }
 +      hammer2_chain_unlock(hmp, parent);
 +      if (rchain == NULL) {
 +              kprintf("hammer2_mount: PFS label not found\n");
 +              hammer2_vfs_unmount(mp, MNT_FORCE);
 +              return EINVAL;
 +      }
 +      if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
 +              hammer2_chain_unlock(hmp, rchain);
 +              kprintf("hammer2_mount: PFS label already mounted!\n");
 +              hammer2_vfs_unmount(mp, MNT_FORCE);
 +              return EBUSY;
 +      }
 +      atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
 +
 +      hammer2_chain_ref(hmp, rchain); /* for pmp->rchain */
 +      hammer2_chain_unlock(hmp, rchain);
 +      pmp->rchain = rchain;           /* left held & unlocked */
 +      pmp->iroot = rchain->u.ip;      /* implied hold from rchain */
 +      pmp->iroot->pmp = pmp;
++
 +      kprintf("iroot %p\n", pmp->iroot);
 +
 +      vfs_getnewfsid(mp);
 +      vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
 +      vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
 +      vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
 +
 +      copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
 +      bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
 +      bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
 +      copyinstr(path, mp->mnt_stat.f_mntonname,
 +                sizeof(mp->mnt_stat.f_mntonname) - 1,
 +                &size);
 +
 +      hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
 +
 +      return 0;
 +}
 +
 +static
 +int
 +hammer2_remount(struct mount *mp, char *path, struct vnode *devvp,
 +                struct ucred *cred)
 +{
 +      return (0);
 +}
 +
 +static
 +int
 +hammer2_vfs_unmount(struct mount *mp, int mntflags)
 +{
 +      hammer2_pfsmount_t *pmp;
 +      hammer2_mount_t *hmp;
 +      int flags;
 +      int error = 0;
 +      int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
 +      struct vnode *devvp;
 +
 +      pmp = MPTOPMP(mp);
 +      hmp = pmp->hmp;
 +      flags = 0;
 +
 +      if (mntflags & MNT_FORCE)
 +              flags |= FORCECLOSE;
 +
 +      hammer2_mount_exlock(hmp);
 +
 +      /*
 +       * If mount initialization proceeded far enough we must flush
 +       * its vnodes.
 +       */
 +      if (pmp->iroot)
 +              error = vflush(mp, 0, flags);
 +
 +      if (error)
 +              return error;
 +
 +      lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
 +      --hmp->pmp_count;
 +      kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n", hmp, hmp->pmp_count);
 +
 +      /*
 +       * Flush any leftover chains.  The voldata lock is only used
 +       * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
 +       */
 +      hammer2_voldata_lock(hmp);
 +      if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
 +                               HAMMER2_CHAIN_MODIFIED_AUX |
 +                               HAMMER2_CHAIN_SUBMODIFIED)) {
 +              hammer2_voldata_unlock(hmp);
 +              hammer2_vfs_sync(mp, MNT_WAIT);
 +      } else {
 +              hammer2_voldata_unlock(hmp);
 +      }
 +      if (hmp->pmp_count == 0) {
 +              if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
 +                                       HAMMER2_CHAIN_MODIFIED_AUX |
 +                                       HAMMER2_CHAIN_SUBMODIFIED)) {
 +                      kprintf("hammer2_unmount: chains left over after "
 +                              "final sync\n");
 +                      if (hammer2_debug & 0x0010)
 +                              Debugger("entered debugger");
 +              }
 +      }
 +
 +      /*
 +       * Cleanup the root and super-root chain elements (which should be
 +       * clean).
 +       */
 +      pmp->iroot = NULL;
 +      if (pmp->rchain) {
 +              atomic_clear_int(&pmp->rchain->flags, HAMMER2_CHAIN_MOUNTED);
 +              KKASSERT(pmp->rchain->refs == 1);
 +              hammer2_chain_drop(hmp, pmp->rchain);
 +              pmp->rchain = NULL;
 +      }
++      ccms_domain_uninit(&pmp->ccms_dom);
 +      if (hmp->pmp_count == 0) {
 +              if (hmp->schain) {
 +                      KKASSERT(hmp->schain->refs == 1);
 +                      hammer2_chain_drop(hmp, hmp->schain);
 +                      hmp->schain = NULL;
 +              }
 +
 +              /*
 +               * Finish up with the device vnode
 +               */
 +              if ((devvp = hmp->devvp) != NULL) {
 +                      vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
 +                      hmp->devvp = NULL;
 +                      VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE));
 +                      vrele(devvp);
 +                      devvp = NULL;
 +              }
 +      }
 +      hammer2_mount_unlock(hmp);
 +
 +      pmp->mp = NULL;
 +      pmp->hmp = NULL;
 +      mp->mnt_data = NULL;
 +
 +      kfree(pmp, M_HAMMER2);
 +      if (hmp->pmp_count == 0) {
 +              TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
 +              kmalloc_destroy(&hmp->minode);
 +              kmalloc_destroy(&hmp->mchain);
 +              kfree(hmp, M_HAMMER2);
 +      }
 +      lockmgr(&hammer2_mntlk, LK_RELEASE);
 +      return (error);
 +}
 +
 +static
 +int
 +hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
 +           ino_t ino, struct vnode **vpp)
 +{
 +      kprintf("hammer2_vget\n");
 +      return (EOPNOTSUPP);
 +}
 +
 +static
 +int
 +hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
 +{
 +      hammer2_pfsmount_t *pmp;
 +      int error;
 +      struct vnode *vp;
 +
 +      pmp = MPTOPMP(mp);
 +      hammer2_mount_exlock(pmp->hmp);
 +      if (pmp->iroot == NULL) {
 +              *vpp = NULL;
 +              error = EINVAL;
 +      } else {
 +              vp = hammer2_igetv(pmp->iroot, &error);
 +              *vpp = vp;
 +              if (vp == NULL)
 +                      kprintf("vnodefail\n");
 +      }
 +      hammer2_mount_unlock(pmp->hmp);
 +
 +      return (error);
 +}
 +
 +/*
 + * Filesystem status
 + *
 + * XXX incorporate pmp->iroot->ip_data.inode_quota and data_quota
 + */
 +static
 +int
 +hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
 +{
 +      hammer2_pfsmount_t *pmp;
 +      hammer2_mount_t *hmp;
 +
 +      pmp = MPTOPMP(mp);
 +      hmp = MPTOHMP(mp);
 +
 +      mp->mnt_stat.f_files = pmp->iroot->ip_data.inode_count +
 +                             pmp->iroot->delta_icount;
 +      mp->mnt_stat.f_ffree = 0;
 +      mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
 +      mp->mnt_stat.f_bfree = (hmp->voldata.allocator_size -
 +                              hmp->voldata.allocator_beg) / HAMMER2_PBUFSIZE;
 +      mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
 +
 +      *sbp = mp->mnt_stat;
 +      return (0);
 +}
 +
 +static
 +int
 +hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
 +{
 +      hammer2_pfsmount_t *pmp;
 +      hammer2_mount_t *hmp;
 +
 +      pmp = MPTOPMP(mp);
 +      hmp = MPTOHMP(mp);
 +
 +      mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
 +      mp->mnt_vstat.f_files = pmp->iroot->ip_data.inode_count +
 +                              pmp->iroot->delta_icount;
 +      mp->mnt_vstat.f_ffree = 0;
 +      mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
 +      mp->mnt_vstat.f_bfree = (hmp->voldata.allocator_size -
 +                               hmp->voldata.allocator_beg) / HAMMER2_PBUFSIZE;
 +      mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
 +
 +      *sbp = mp->mnt_vstat;
 +      return (0);
 +}
 +
 +/*
 + * Sync the entire filesystem; this is called from the filesystem syncer
 + * process periodically and whenever a user calls sync(1) on the hammer2
 + * mountpoint.
 + *
 + * Currently it is actually called from the syncer! \o/
 + *
 + * This task will have to snapshot the state of the dirty inode chain.
 + * From that, it will have to make sure all of the inodes on the dirty
 + * chain have I/O initiated.  We make sure that I/O is initiated for the
 + * root block.
 + *
 + * If waitfor is set, we wait for media to acknowledge the new rootblock.
 + *
 + * THINKS: side A vs side B, to have sync not stall all I/O?
 + */
 +static
 +int
 +hammer2_vfs_sync(struct mount *mp, int waitfor)
 +{
 +      struct hammer2_sync_info info;
 +      hammer2_mount_t *hmp;
 +      int flags;
 +      int error;
 +      int haswork;
 +
 +      hmp = MPTOHMP(mp);
 +
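 +      /*
 +       * Scan all vnodes, acquiring each one as we go.  A lazy sync
 +       * (MNT_LAZY) makes only a single pass over the vnode list.
 +       */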
 +      flags = VMSC_GETVP;
 +      if (waitfor & MNT_LAZY)
 +              flags |= VMSC_ONEPASS;
 +
 +      info.error = 0;
 +      info.waitfor = MNT_NOWAIT;
 +      vmntvnodescan(mp, flags | VMSC_NOWAIT,
 +                    hammer2_sync_scan1,
 +                    hammer2_sync_scan2, &info);
 +      if (info.error == 0 && (waitfor & MNT_WAIT)) {
 +              info.waitfor = waitfor;
 +              vmntvnodescan(mp, flags,
 +                            hammer2_sync_scan1,
 +                            hammer2_sync_scan2, &info);
 +      }
 +#if 0
 +      if (waitfor == MNT_WAIT) {
 +              /* XXX */
 +      } else {
 +              /* XXX */
 +      }
 +#endif
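 +      /*
 +       * If the top-level volume chain has any dirty state, flush it and
 +       * remember that the volume header itself must also be written out
 +       * below.
 +       */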
 +      hammer2_chain_lock(hmp, &hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
 +      if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
 +                               HAMMER2_CHAIN_MODIFIED_AUX |
 +                               HAMMER2_CHAIN_SUBMODIFIED)) {
 +              hammer2_chain_flush(hmp, &hmp->vchain, 0);
 +              haswork = 1;
 +      } else {
 +              haswork = 0;
 +      }
 +      hammer2_chain_unlock(hmp, &hmp->vchain);
 +
 +      error = 0;
 +
 +      if ((waitfor & MNT_LAZY) == 0) {
 +              waitfor = MNT_NOWAIT;
 +              vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
 +              error = VOP_FSYNC(hmp->devvp, waitfor, 0);
 +              vn_unlock(hmp->devvp);
 +      }
 +
 +      if (error == 0 && haswork) {
 +              struct buf *bp;
 +
 +              /*
 +               * Synchronize the disk before flushing the volume
 +               * header.
 +               */
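 +              /*
 +               * Issue a synchronous device flush (BUF_CMD_FLUSH) and
 +               * wait for it to complete before updating the volume
 +               * header on-media.
 +               */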
 +              bp = getpbuf(NULL);
 +              bp->b_bio1.bio_offset = 0;
 +              bp->b_bufsize = 0;
 +              bp->b_bcount = 0;
 +              bp->b_cmd = BUF_CMD_FLUSH;
 +              bp->b_bio1.bio_done = biodone_sync;
 +              bp->b_bio1.bio_flags |= BIO_SYNC;
 +              vn_strategy(hmp->devvp, &bp->b_bio1);
 +              biowait(&bp->b_bio1, "h2vol");
 +              relpbuf(bp, NULL);
 +
 +              /*
 +               * Then we can safely flush the volume header.  Volume
 +               * data is locked separately to prevent ioctl functions
 +               * from deadlocking due to a configuration issue.
 +               */
 +              bp = getblk(hmp->devvp, 0, HAMMER2_PBUFSIZE, 0, 0);
 +              hammer2_voldata_lock(hmp);
 +              bcopy(&hmp->voldata, bp->b_data, HAMMER2_PBUFSIZE);
 +              hammer2_voldata_unlock(hmp);
 +              bawrite(bp);
 +      }
 +      return (error);
 +}
 +
 +/*
 + * Sync passes.
 + *
 + * NOTE: We don't test SUBMODIFIED or MOVED here because the fsync code
 + *     won't flush on those flags.  The syncer code above will do a
 + *     general meta-data flush globally that will catch these flags.
 + */
 +static int
 +hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
 +{
 +      hammer2_inode_t *ip;
 +
 +      ip = VTOI(vp);
 +      if (vp->v_type == VNON || ip == NULL ||
 +          ((ip->chain.flags & (HAMMER2_CHAIN_MODIFIED |
 +                               HAMMER2_CHAIN_DIRTYEMBED)) == 0 &&
 +           RB_EMPTY(&vp->v_rbdirty_tree))) {
 +              return(-1);
 +      }
 +      return(0);
 +}
 +
 +static int
 +hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
 +{
 +      struct hammer2_sync_info *info = data;
 +      hammer2_inode_t *ip;
 +      int error;
 +
 +      ip = VTOI(vp);
 +      if (vp->v_type == VNON || vp->v_type == VBAD ||
 +          ((ip->chain.flags & (HAMMER2_CHAIN_MODIFIED |
 +                               HAMMER2_CHAIN_DIRTYEMBED)) == 0 &&
 +          RB_EMPTY(&vp->v_rbdirty_tree))) {
 +              return(0);
 +      }
 +      error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
 +      if (error)
 +              info->error = error;
 +      return(0);
 +}
 +
 +static
 +int
 +hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
 +{
 +      return (0);
 +}
 +
 +static
 +int
 +hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
 +             struct fid *fhp, struct vnode **vpp)
 +{
 +      return (0);
 +}
 +
 +static
 +int
 +hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
 +               int *exflagsp, struct ucred **credanonp)
 +{
 +      return (0);
 +}
 +
 +/*
 + * Support code for hammer2_mount().  Read, verify, and install the volume
 + * header into the HMP
 + *
 + * XXX read four volhdrs and use the one with the highest TID whose CRC
 + *     matches.
 + *
 + * XXX check iCRCs.
 + *
 + * XXX For filesystems with fewer than 4 volhdrs, make sure not to write to
 + *     nonexistent locations.
 + *
 + * XXX Record selected volhdr and ring updates to each of 4 volhdrs
 + */
 +static
 +int
 +hammer2_install_volume_header(hammer2_mount_t *hmp)
 +{
 +      hammer2_volume_data_t *vd;
 +      struct buf *bp;
 +      hammer2_crc32_t crc0, crc, bcrc0, bcrc;
 +      int error_reported;
 +      int error;
 +      int valid;
 +      int i;
 +
 +      error_reported = 0;
 +      error = 0;
 +      valid = 0;
 +      bp = NULL;
 +
 +      /*
 +       * There are up to 4 copies of the volume header (syncs iterate
 +       * between them so there is no single master).  We don't trust the
 +       * volu_size field, so we don't know precisely how large the filesystem
 +       * is; instead we depend on the OS to return an error if we read beyond
 +       * the block device's EOF.
 +       */
 +      for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
 +              error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
 +                            HAMMER2_VOLUME_BYTES, &bp);
 +              if (error) {
 +                      brelse(bp);
 +                      bp = NULL;
 +                      continue;
 +              }
 +
 +              vd = (struct hammer2_volume_data *) bp->b_data;
 +              if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
 +                  (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
 +                      brelse(bp);
 +                      bp = NULL;
 +                      continue;
 +              }
 +
 +              if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
 +                      /* XXX: Reversed-endianness filesystem */
 +                      kprintf("hammer2: reverse-endian filesystem detected\n");
 +                      brelse(bp);
 +                      bp = NULL;
 +                      continue;
 +              }
 +
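 +              /*
 +               * Both section CRCs must validate before this copy of
 +               * the volume header can be trusted.
 +               */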
 +              crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
 +              crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
 +                                    HAMMER2_VOLUME_ICRC0_SIZE);
 +              bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
 +              bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
 +                                     HAMMER2_VOLUME_ICRC1_SIZE);
 +              if ((crc0 != crc) || (bcrc0 != bcrc)) {
 +                      kprintf("hammer2 volume header crc "
 +                              "mismatch copy #%d\t%08x %08x\n",
 +                              i, crc0, crc);
 +                      error_reported = 1;
 +                      brelse(bp);
 +                      bp = NULL;
 +                      continue;
 +              }
 +              if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
 +                      valid = 1;
 +                      hmp->voldata = *vd;
 +              }
 +              brelse(bp);
 +              bp = NULL;
 +      }
 +      if (valid) {
 +              error = 0;
 +              if (error_reported)
 +                      kprintf("hammer2: a valid volume header was found\n");
 +      } else {
 +              error = EINVAL;
 +              kprintf("hammer2: no valid volume headers found!\n");
 +      }
 +      return (error);
 +}
 +