From 8cd0a023d85e93ba91304273c892a533202c863b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 19 Nov 2007 00:53:40 +0000 Subject: [PATCH] HAMMER 3/many - more core infrastructure. * Add an in-memory B-Tree node abstraction * Add an in-memory record abstraction. * Put the B-Tree cursor code in its own source file. * Fill in more of the VOP code. * Do a major clean-up of all in-memory structures and some on-disk structures. All the major in-memory structures now use similarly named functions. * Move inter-cluster link from a B-Tree leaf node to a B-Tree internal node, giving us a left and right boundary to play with. This simplifies the algorithms by quite a bit. * Allow the B-Tree to be unbalanced by moving the sub-type from the B-Tree node header to the B-Tree element structure. * Revamp the I/O infrastructure, in particular allow B-Tree nodes to be held passively. * Implement a flexible B-Tree node cache. References into the B-Tree can be cached by inodes. If the related buffer is flushed by the system, the related cache pointers will be cleared. --- sbin/newfs_hammer/newfs_hammer.c | 31 +- sys/conf/files | 3 +- sys/vfs/hammer/Makefile | 6 +- sys/vfs/hammer/hammer.h | 290 +++- sys/vfs/hammer/hammer_btree.c | 2012 +++++++++++++-------------- sys/vfs/hammer/hammer_btree.h | 285 ++-- sys/vfs/hammer/hammer_cursor.c | 406 ++++++ sys/vfs/hammer/hammer_cursor.h | 106 ++ sys/vfs/hammer/hammer_disk.h | 47 +- sys/vfs/hammer/hammer_inode.c | 238 +++- sys/vfs/hammer/hammer_io.c | 63 +- sys/vfs/hammer/hammer_object.c | 163 ++- sys/vfs/hammer/hammer_ondisk.c | 1055 ++++++++++---- sys/vfs/hammer/hammer_subs.c | 105 +- sys/vfs/hammer/hammer_transaction.c | 6 +- sys/vfs/hammer/hammer_vfsops.c | 4 +- sys/vfs/hammer/hammer_vnops.c | 638 +++++++-- 17 files changed, 3614 insertions(+), 1844 deletions(-) create mode 100644 sys/vfs/hammer/hammer_cursor.c create mode 100644 sys/vfs/hammer/hammer_cursor.h diff --git a/sbin/newfs_hammer/newfs_hammer.c b/sbin/newfs_hammer/newfs_hammer.c index 970eff5925..56102c82bd 100644 --- a/sbin/newfs_hammer/newfs_hammer.c +++ b/sbin/newfs_hammer/newfs_hammer.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sbin/newfs_hammer/newfs_hammer.c,v 1.5 2007/11/02 00:54:26 dillon Exp $ + * $DragonFly: src/sbin/newfs_hammer/newfs_hammer.c,v 1.6 2007/11/19 00:53:39 dillon Exp $ */ #include "newfs_hammer.h" @@ -551,7 +551,8 @@ format_cluster(struct volume_info *vol, int isroot) */ ondisk->clu_btree_parent_vol_no = -1; ondisk->clu_btree_parent_clu_no = -1; - ondisk->clu_btree_parent_clu_id = -1; + ondisk->clu_btree_parent_offset = -1; + ondisk->clu_btree_parent_clu_gen = -1; /* * Cluster 0 is the root cluster. Set the B-Tree range for this @@ -560,6 +561,11 @@ format_cluster(struct volume_info *vol, int isroot) * Note that delete_tid for the ending range must be set to 0, * 0 indicates 'not deleted', aka 'the most recent'. See * hammer_btree_cmp() in sys/vfs/hammer/hammer_btree.c. + * + * The root cluster's key space represents the entire key space for + * the filesystem. The btree_end element appears to be inclusive + * only because we can't overflow our variables. It's actually + * non-inclusive... that is, it is a right-side boundary element. 
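+ *
+ * Illustrative note (a sketch based on the btree_search() code in
+ * sys/vfs/hammer/hammer_btree.c, not something newfs itself executes):
+ * a key is considered to fall within a cluster only when
+ *
+ *	hammer_btree_cmp(key, &clu_btree_beg) >= 0 &&
+ *	hammer_btree_cmp(key, &clu_btree_end) < 0
+ *
+ * which is why clu_btree_end behaves as a non-inclusive right-side
+ * boundary even though its fields are filled in with maximal values.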
*/ if (isroot) { ondisk->clu_btree_beg.obj_id = -0x8000000000000000LL; @@ -574,7 +580,7 @@ format_cluster(struct volume_info *vol, int isroot) ondisk->clu_btree_end.create_tid = 0xFFFFFFFFFFFFFFFFULL; ondisk->clu_btree_end.delete_tid = 0; /* special case */ ondisk->clu_btree_end.rec_type = 0xFFFFU; - ondisk->clu_btree_end.obj_type = 0xFFFFU; + ondisk->clu_btree_end.obj_type = 0; format_root(cluster); } @@ -595,10 +601,10 @@ format_root(struct cluster_info *cluster) int32_t btree_off; int32_t rec_off; int32_t data_off; - hammer_btree_node_t bnode; + hammer_node_ondisk_t bnode; union hammer_record_ondisk *rec; struct hammer_inode_data *idata; - hammer_btree_leaf_elm_t elm; + hammer_btree_elm_t elm; bnode = alloc_btree_element(cluster, &btree_off); rec = alloc_record_element(cluster, &rec_off); @@ -634,16 +640,15 @@ format_root(struct cluster_info *cluster) * Create the root of the B-Tree. The root is a leaf node so we * do not have to worry about boundary elements. */ - bnode->base.count = 1; - bnode->base.type = HAMMER_BTREE_LEAF_NODE; - bnode->base.subtype = 0; + bnode->count = 1; + bnode->type = HAMMER_BTREE_TYPE_LEAF; - elm = &bnode->leaf.elms[0]; + elm = &bnode->elms[0]; elm->base = rec->base.base; - elm->record.rec_offset = rec_off; - elm->record.data_offset = rec->base.data_offset; - elm->record.data_len = rec->base.data_len; - elm->record.data_crc = rec->base.data_crc; + elm->leaf.rec_offset = rec_off; + elm->leaf.data_offset = rec->base.data_offset; + elm->leaf.data_len = rec->base.data_len; + elm->leaf.data_crc = rec->base.data_crc; } void diff --git a/sys/conf/files b/sys/conf/files index 1aa013552b..9b40baa4ab 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,5 +1,5 @@ # $FreeBSD: src/sys/conf/files,v 1.340.2.137 2003/06/04 17:10:30 sam Exp $ -# $DragonFly: src/sys/conf/files,v 1.194 2007/11/18 17:53:01 pavalos Exp $ +# $DragonFly: src/sys/conf/files,v 1.195 2007/11/19 00:53:38 dillon Exp $ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and @@ -1144,6 +1144,7 @@ vfs/hammer/hammer_vfsops.c optional hammer vfs/hammer/hammer_vnops.c optional hammer vfs/hammer/hammer_subs.c optional hammer vfs/hammer/hammer_ondisk.c optional hammer +vfs/hammer/hammer_cursor.c optional hammer vfs/hammer/hammer_btree.c optional hammer vfs/hammer/hammer_io.c optional hammer vfs/hammer/hammer_transaction.c optional hammer diff --git a/sys/vfs/hammer/Makefile b/sys/vfs/hammer/Makefile index 9d0068c0d5..aac8b9d94e 100644 --- a/sys/vfs/hammer/Makefile +++ b/sys/vfs/hammer/Makefile @@ -1,11 +1,11 @@ # -# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.3 2007/11/07 00:43:24 dillon Exp $ +# $DragonFly: src/sys/vfs/hammer/Makefile,v 1.4 2007/11/19 00:53:40 dillon Exp $ KMOD= hammer SRCS= hammer_vfsops.c hammer_vnops.c hammer_inode.c \ hammer_subs.c hammer_ondisk.c hammer_io.c \ - hammer_btree.c hammer_transaction.c hammer_alist.c \ - hammer_object.c hammer_transaction.c hammer_spike.c + hammer_cursor.c hammer_btree.c hammer_transaction.c \ + hammer_alist.c hammer_object.c hammer_spike.c NOMAN= diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 9921d8fbd8..2fffec3582 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.5 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.6 2007/11/19 00:53:40 dillon Exp $ */ /* * This header file contains structures used internally by the HAMMERFS @@ -49,10 +49,10 @@ #include #include #include +#include #include #include - #include "hammer_alist.h" #include "hammer_disk.h" #include "hammer_mount.h" @@ -80,11 +80,14 @@ struct hammer_transaction { hammer_tid_t tid; }; +typedef struct hammer_transaction *hammer_transaction_t; + /* * HAMMER locks */ struct hammer_lock { int refs; + int lockcount; int wanted; struct thread *locktd; }; @@ -92,7 +95,7 @@ struct hammer_lock { static __inline int hammer_islocked(struct hammer_lock *lock) { - return(lock->refs > 0); + return(lock->lockcount != 0); } static __inline int @@ -102,46 +105,101 @@ hammer_islastref(struct hammer_lock *lock) } /* - * Structures used to internally represent an inode + * Structure used to represent an inode in-memory. + * + * The record and data associated with an inode may be out of sync with + * the disk (xDIRTY flags), or not even on the disk at all (ONDISK flag + * clear). + * + * An inode may also hold a cache of unsynchronized records, used for + * database and directories only. Unsynchronized regular file data is + * stored in the buffer cache. + * + * NOTE: A file which is created and destroyed within the initial + * synchronization period can wind up not doing any disk I/O at all. + * + * Finally, an inode may cache numerous disk-referencing B-Tree cursors. */ struct hammer_ino_rb_tree; struct hammer_inode; RB_HEAD(hammer_ino_rb_tree, hammer_inode); RB_PROTOTYPEX(hammer_ino_rb_tree, INFO, hammer_inode, rb_node, - hammer_ino_rb_compare, hammer_inode_info_t); + hammer_ino_rb_compare, hammer_inode_info_t); + +struct hammer_rec_rb_tree; +struct hammer_record; +RB_HEAD(hammer_rec_rb_tree, hammer_record); +RB_PROTOTYPEX(hammer_rec_rb_tree, INFO, hammer_record, rb_node, + hammer_rec_rb_compare, hammer_base_elm_t); + +TAILQ_HEAD(hammer_node_list, hammer_node); struct hammer_inode { RB_ENTRY(hammer_inode) rb_node; u_int64_t obj_id; /* (key) object identifier */ hammer_tid_t obj_asof; /* (key) snapshot transid or 0 */ - hammer_tid_t obj_lasttid; /* last modified tid (for fsync) */ + hammer_tid_t last_tid; /* last modified tid (for fsync) */ struct hammer_mount *hmp; int flags; struct vnode *vp; struct lockf advlock; + struct hammer_lock lock; struct hammer_inode_record ino_rec; struct hammer_inode_data ino_data; - struct hammer_lock lock; + struct hammer_rec_rb_tree rec_tree; /* red-black record tree */ + struct hammer_node *cache; /* cached B-Tree node shortcut */ }; +typedef struct hammer_inode *hammer_inode_t; + #define VTOI(vp) ((struct hammer_inode *)(vp)->v_data) -#define HAMMER_INODE_DDIRTY 0x0001 -#define HAMMER_INODE_RDIRTY 0x0002 -#define HAMMER_INODE_ITIMES 0x0004 /* mtime or atime modified */ +#define HAMMER_INODE_DDIRTY 0x0001 /* in-memory ino_data is dirty */ +#define HAMMER_INODE_RDIRTY 0x0002 /* in-memory ino_rec is dirty */ +#define HAMMER_INODE_ITIMES 0x0004 /* in-memory mtime/atime modified */ +#define HAMMER_INODE_ONDISK 0x0010 /* inode is on-disk (else not yet) */ + +#define HAMMER_MAX_INODE_CURSORS 4 /* - * Structures used to internally represent a volume and a cluster + * Structure used to represent an unsynchronized record in-memory. This + * structure is orgranized in a per-inode RB-tree. 
If the inode is not + * on disk then neither are any records and the in-memory record tree + * represents the entire contents of the inode. If the inode is on disk + * then the on-disk B-Tree is scanned in parallel with the in-memory + * RB-Tree to synthesize the current state of the file. + * + * Only current (delete_tid == 0) unsynchronized records are kept in-memory. */ +struct hammer_record { + RB_ENTRY(hammer_record) rb_node; + hammer_tid_t last_tid; + struct hammer_inode *ip; + union hammer_record_ondisk rec; + union hammer_data_ondisk *data; + int32_t data_len; + int flags; +}; +typedef struct hammer_record *hammer_record_t; + +#define HAMMER_RECF_ALLOCDATA 0x0001 +#define HAMMER_RECF_ONRBTREE 0x0002 +#define HAMMER_RECF_DELETION 0x0004 /* placemark a deletion */ + +/* + * Structures used to internally represent a volume and a cluster + */ struct hammer_volume; struct hammer_cluster; struct hammer_supercl; struct hammer_buffer; +struct hammer_node; RB_HEAD(hammer_vol_rb_tree, hammer_volume); RB_HEAD(hammer_clu_rb_tree, hammer_cluster); RB_HEAD(hammer_scl_rb_tree, hammer_supercl); RB_HEAD(hammer_buf_rb_tree, hammer_buffer); +RB_HEAD(hammer_nod_rb_tree, hammer_node); RB_PROTOTYPE2(hammer_vol_rb_tree, hammer_volume, rb_node, hammer_vol_rb_compare, int32_t); @@ -151,6 +209,8 @@ RB_PROTOTYPE2(hammer_scl_rb_tree, hammer_supercl, rb_node, hammer_scl_rb_compare, int32_t); RB_PROTOTYPE2(hammer_buf_rb_tree, hammer_buffer, rb_node, hammer_buf_rb_compare, int32_t); +RB_PROTOTYPE2(hammer_nod_rb_tree, hammer_node, rb_node, + hammer_nod_rb_compare, int32_t); /* * IO management - embedded at the head of various in-memory structures @@ -176,8 +236,10 @@ struct hammer_io { u_int released : 1; /* bp released (w/ B_LOCKED set) */ }; +typedef struct hammer_io *hammer_io_t; + /* - * In-memory volume + * In-memory volume representing on-disk buffer */ struct hammer_volume { struct hammer_io io; @@ -195,8 +257,10 @@ struct hammer_volume { int vol_flags; }; +typedef struct hammer_volume *hammer_volume_t; + /* - * In-memory super-cluster + * In-memory super-cluster representing on-disk buffer */ struct hammer_supercl { struct hammer_io io; @@ -207,8 +271,13 @@ struct hammer_supercl { int32_t scl_no; }; +typedef struct hammer_supercl *hammer_supercl_t; + /* - * In-memory cluster + * In-memory cluster representing on-disk buffer + * + * The cluster's indexing range is cached in hammer_cluster, separate + * from the ondisk info in order to allow cursors to point to it. */ struct hammer_cluster { struct hammer_io io; @@ -220,22 +289,64 @@ struct hammer_cluster { struct hammer_alist_live alist_btree; struct hammer_alist_live alist_record; struct hammer_alist_live alist_mdata; + struct hammer_nod_rb_tree rb_nods_root; /* cursors in cluster */ + struct hammer_base_elm clu_btree_beg; /* copy of on-disk info */ + struct hammer_base_elm clu_btree_end; /* copy of on-disk info */ int32_t clu_no; }; +typedef struct hammer_cluster *hammer_cluster_t; + /* - * In-memory buffer (other then volume, super-cluster, or cluster) + * In-memory buffer (other then volume, super-cluster, or cluster), + * representing an on-disk buffer. 
*/ struct hammer_buffer { struct hammer_io io; RB_ENTRY(hammer_buffer) rb_node; hammer_fsbuf_ondisk_t ondisk; - struct hammer_cluster *cluster; struct hammer_volume *volume; - struct hammer_alist_live alist; + struct hammer_cluster *cluster; int32_t buf_no; + u_int64_t buf_type; + struct hammer_alist_live alist; + struct hammer_node_list clist; + struct hammer_node *save_scan; }; +typedef struct hammer_buffer *hammer_buffer_t; + +/* + * In-memory B-Tree node, representing an on-disk B-Tree node. + * + * This is a hang-on structure which is backed by a hammer_buffer, + * indexed by a hammer_cluster, and used for fine-grained locking of + * B-Tree nodes in order to properly control lock ordering. A hammer_buffer + * can contain multiple nodes representing wildly disassociated portions + * of the B-Tree so locking cannot be done on a buffer-by-buffer basis. + * + * This structure uses a cluster-relative index to reduce the number + * of layers required to access it, and also because all on-disk B-Tree + * references are cluster-relative offsets. + */ +struct hammer_node { + struct hammer_lock lock; /* node-by-node lock */ + TAILQ_ENTRY(hammer_node) entry; /* per-buffer linkage */ + RB_ENTRY(hammer_node) rb_node; /* per-cluster linkage */ + int32_t node_offset; /* cluster-rel offset */ + struct hammer_cluster *cluster; + struct hammer_buffer *buffer; /* backing buffer */ + hammer_node_ondisk_t ondisk; /* ptr to on-disk structure */ + struct hammer_node **cache1; /* passive cache(s) */ + struct hammer_node **cache2; +}; + +typedef struct hammer_node *hammer_node_t; + +/* + * Common I/O management structure - embedded in in-memory structures + * which are backed by filesystem buffers. + */ union hammer_io_structure { struct hammer_io io; struct hammer_volume volume; @@ -246,6 +357,8 @@ union hammer_io_structure { #define HAMFS_CLUSTER_DIRTY 0x0001 +#include "hammer_cursor.h" + /* * Internal hammer mount data structure */ @@ -256,16 +369,15 @@ struct hammer_mount { struct hammer_vol_rb_tree rb_vols_root; struct hammer_volume *rootvol; struct hammer_cluster *rootcl; - /* struct hammer_volume *cache_volume */ - /* struct hammer_cluster *cache_cluster */ - /* struct hammer_buffer *cache_buffer */ char *zbuf; /* HAMMER_BUFSIZE bytes worth of all-zeros */ uuid_t fsid; udev_t fsid_udev; u_int32_t namekey_iterator; - hammer_tid_t last_tid; + hammer_tid_t last_tid; /* tid for transaction id */ + hammer_tid_t last_ino; /* inode creation iterator */ }; +typedef struct hammer_mount *hammer_mount_t; #endif @@ -286,68 +398,104 @@ int hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp); int hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp); -struct hammer_inode *hammer_get_inode(struct hammer_mount *hmp, +struct hammer_inode *hammer_get_inode(hammer_mount_t hmp, u_int64_t obj_id, int *errorp); -void hammer_lock_inode(struct hammer_inode *ip); void hammer_put_inode(struct hammer_inode *ip); void hammer_put_inode_ref(struct hammer_inode *ip); -int hammer_unload_inode(struct hammer_inode *ip, void *data __unused); -int hammer_unload_volume(struct hammer_volume *volume, void *data __unused); -int hammer_load_volume(struct hammer_mount *hmp, const char *volname); - -void hammer_lock(struct hammer_lock *lock); +int hammer_unload_inode(hammer_inode_t ip, void *data __unused); +int hammer_unload_volume(hammer_volume_t volume, void *data __unused); +int hammer_install_volume(hammer_mount_t hmp, const char *volname); + +hammer_record_ondisk_t + hammer_ip_first(hammer_cursor_t cursor, 
hammer_inode_t ip); +hammer_record_ondisk_t + hammer_ip_next(hammer_cursor_t cursor); +int hammer_ip_resolve_data(hammer_cursor_t cursor); +hammer_record_t + hammer_alloc_ip_record(struct hammer_transaction *trans, + hammer_inode_t ip); +void hammer_free_ip_record(hammer_record_t record); + +int hammer_cursor_up(hammer_cursor_t cursor); +int hammer_cursor_toroot(hammer_cursor_t cursor); +int hammer_cursor_down(hammer_cursor_t cursor); + +void hammer_lock_ex(struct hammer_lock *lock); +int hammer_lock_ex_try(struct hammer_lock *lock); +void hammer_lock_sh(struct hammer_lock *lock); void hammer_unlock(struct hammer_lock *lock); void hammer_ref(struct hammer_lock *lock); void hammer_unref(struct hammer_lock *lock); -void hammer_ref_to_lock(struct hammer_lock *lock); -void hammer_lock_to_ref(struct hammer_lock *lock); +void hammer_downgrade(struct hammer_lock *lock); + u_int32_t hammer_to_unix_xid(uuid_t *uuid); -void hammer_to_timespec(u_int64_t hammerts, struct timespec *ts); +void hammer_guid_to_uuid(uuid_t *uuid, u_int32_t guid); +void hammer_to_timespec(hammer_tid_t tid, struct timespec *ts); +hammer_tid_t hammer_timespec_to_transid(struct timespec *ts); + enum vtype hammer_get_vnode_type(u_int8_t obj_type); u_int8_t hammer_get_obj_type(enum vtype vtype); int64_t hammer_directory_namekey(void *name, int len); -int hammer_btree_lookup(hammer_btree_info_t info, - hammer_base_elm_t key, int flags); -int hammer_btree_extract(hammer_btree_info_t info, int flags); -int hammer_btree_iterate(hammer_btree_cursor_t cursor, - hammer_base_elm_t key); -int hammer_btree_insert(hammer_btree_info_t info, - hammer_btree_leaf_elm_t elm); -int hammer_btree_delete(hammer_btree_info_t info, hammer_base_elm_t key); -int hammer_btree_cursor_init(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster); -void hammer_btree_cursor_done(hammer_btree_cursor_t cusor); -int hammer_btree_info_init(hammer_btree_info_t info, - struct hammer_cluster *cluster); -void hammer_btree_info_done(hammer_btree_info_t info); +int hammer_init_cursor_hmp(hammer_cursor_t cursor, hammer_mount_t hmp); +int hammer_init_cursor_ip(hammer_cursor_t cursor, hammer_inode_t ip); +void hammer_done_cursor(hammer_cursor_t cursor); + +int hammer_btree_lookup(hammer_cursor_t cursor); +int hammer_btree_extract(hammer_cursor_t cursor, int flags); +int hammer_btree_iterate(hammer_cursor_t cursor); +int hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm); +int hammer_btree_delete(hammer_cursor_t cursor); +int hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2); void *hammer_bread(struct hammer_cluster *cluster, int32_t cloff, - u_int64_t buf_type, - int *errorp, struct hammer_buffer **bufferp); + u_int64_t buf_type, int *errorp, + struct hammer_buffer **bufferp); + +hammer_volume_t hammer_get_root_volume(hammer_mount_t hmp, int *errorp); +hammer_cluster_t hammer_get_root_cluster(hammer_mount_t hmp, int *errorp); -struct hammer_volume *hammer_get_volume(struct hammer_mount *hmp, +hammer_volume_t hammer_get_volume(hammer_mount_t hmp, int32_t vol_no, int *errorp); -struct hammer_supercl *hammer_get_supercl(struct hammer_volume *volume, +hammer_supercl_t hammer_get_supercl(hammer_volume_t volume, int32_t scl_no, int *errorp, int isnew); -struct hammer_cluster *hammer_get_cluster(struct hammer_volume *volume, +hammer_cluster_t hammer_get_cluster(hammer_volume_t volume, int32_t clu_no, int *errorp, int isnew); -struct hammer_buffer *hammer_get_buffer(struct hammer_cluster *cluster, - int32_t buf_no, int64_t buf_type, int 
*errorp); +hammer_buffer_t hammer_get_buffer(hammer_cluster_t cluster, + int32_t buf_no, u_int64_t buf_type, int *errorp); + +int hammer_ref_cluster(hammer_cluster_t cluster); +int hammer_ref_buffer(hammer_buffer_t buffer); +void hammer_flush_buffer_nodes(hammer_buffer_t buffer); + + +void hammer_rel_volume(hammer_volume_t volume, int flush); +void hammer_rel_supercl(hammer_supercl_t supercl, int flush); +void hammer_rel_cluster(hammer_cluster_t cluster, int flush); +void hammer_rel_buffer(hammer_buffer_t buffer, int flush); + +hammer_node_t hammer_get_node(hammer_cluster_t cluster, + int32_t node_offset, int *errorp); +int hammer_ref_node(hammer_node_t node); +void hammer_rel_node(hammer_node_t node); +void hammer_cache_node(hammer_node_t node, + struct hammer_node **cache); +void hammer_uncache_node(struct hammer_node **cache); +void hammer_flush_node(hammer_node_t node); + struct hammer_cluster *hammer_get_rootcl(struct hammer_mount *hmp); void hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer); void hammer_dup_cluster(struct hammer_cluster **clusterp, struct hammer_cluster *cluster); -void *hammer_alloc_btree(struct hammer_cluster *cluster, - int *errorp, struct hammer_buffer **bufferp); +hammer_node_t hammer_alloc_btree(struct hammer_cluster *cluster, int *errorp); void *hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, int *errorp, struct hammer_buffer **bufferp); void *hammer_alloc_record(struct hammer_cluster *cluster, int *errorp, struct hammer_buffer **bufferp); void hammer_free_btree_ptr(struct hammer_buffer *buffer, - hammer_btree_node_t node); + hammer_node_ondisk_t node); void hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes); void hammer_free_record_ptr(struct hammer_buffer *buffer, @@ -364,22 +512,34 @@ void hammer_put_buffer(struct hammer_buffer *buffer, int flush); void hammer_init_alist_config(void); -void hammer_start_transaction(struct hammer_mount *hmp, - struct hammer_transaction *trans); +void hammer_start_transaction(struct hammer_transaction *trans, + struct hammer_mount *hmp); void hammer_commit_transaction(struct hammer_transaction *trans); void hammer_abort_transaction(struct hammer_transaction *trans); void hammer_modify_inode(struct hammer_transaction *trans, - struct hammer_inode *ip, int flags); -int hammer_alloc_inode(struct hammer_transaction *trans, struct vattr *vap, - struct ucred *cred, struct hammer_inode **ipp); + hammer_inode_t ip, int flags); +int hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap, + struct ucred *cred, struct hammer_inode *dip, + struct hammer_inode **ipp); +void hammer_rel_inode(hammer_inode_t ip); + int hammer_add_directory(struct hammer_transaction *trans, - struct hammer_inode *dip, struct namecache *ncp, - struct hammer_inode *nip); + hammer_inode_t dip, struct namecache *ncp, + hammer_inode_t nip); +int hammer_del_directory(struct hammer_transaction *trans, + hammer_cursor_t cursor, hammer_inode_t dip, + hammer_inode_t ip); +int hammer_delete_range(struct hammer_transaction *trans, + hammer_inode_t ip, int64_t off_beg, int64_t off_end); +int hammer_add_data(struct hammer_transaction *trans, + hammer_inode_t ip, int64_t offset, + void *data, int bytes); int hammer_io_read(struct vnode *devvp, struct hammer_io *io); int hammer_io_new(struct vnode *devvp, struct hammer_io *io); void hammer_io_release(struct hammer_io *io, int flush); +int hammer_io_checkflush(hammer_io_t io); #endif @@ -410,6 +570,12 @@ hammer_modify_buffer(struct 
hammer_buffer *buffer) buffer->io.modified = 1; } +static __inline void +hammer_modify_node(struct hammer_node *node) +{ + node->buffer->io.modified = 1; +} + /* * Return the cluster-relative byte offset of an element within a buffer */ diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index 45e2cf64a4..b7ad8b7272 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -31,618 +31,136 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ /* - * HAMMER BH-Tree index + * HAMMER B-Tree index * * HAMMER implements a modified B+Tree. In documentation this will - * simply be refered to as the HAMMER BH-Tree. Basically a BH-Tree + * simply be refered to as the HAMMER B-Tree. Basically a B-Tree * looks like a B+Tree (A B-Tree which stores its records only at the leafs * of the tree), but adds two additional boundary elements which describe * the left-most and right-most element a node is able to represent. In - * otherwords, we have boundary elements at the two ends of a BH-Tree node + * otherwords, we have boundary elements at the two ends of a B-Tree node * instead of sub-tree pointers. * - * A BH-Tree internal node looks like this: + * A B-Tree internal node looks like this: * * B N N N N N N B <-- boundary and internal elements * S S S S S S S <-- subtree pointers * - * A BH-Tree leaf node basically looks like this: + * A B-Tree leaf node basically looks like this: * * L L L L L L L L <-- leaf elemenets * - * The recursion radix is reduced by 2 relative to a normal B-Tree but - * we get a number of significant benefits for our troubles. + * The radix for an internal node is 1 less then a leaf but we get a + * number of significant benefits for our troubles. * - * The big benefit to using a BH-Tree is that it is possible to cache - * pointers into the middle of the tree and not have to start searches, - * insertions, OR deletions at the root node. In particular, searches are - * able to progress in a definitive direction from any point in the tree - * without revisting nodes. This greatly improves the efficiency of many - * operations, most especially record appends. + * The big benefit to using a B-Tree containing boundary information + * is that it is possible to cache pointers into the middle of the tree + * and not have to start searches, insertions, OR deletions at the root + * node. In particular, searches are able to progress in a definitive + * direction from any point in the tree without revisting nodes. This + * greatly improves the efficiency of many operations, most especially + * record appends. * - * BH-Trees also make the stacking of trees fairly straightforward. 
- */ -#include "hammer.h" -#include -#include - -static int btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, - int flags); -static int btree_cursor_set_cluster(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster); -static int btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor, - int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id); -static int btree_cursor_up(hammer_btree_cursor_t cursor); -static int btree_cursor_down(hammer_btree_cursor_t cursor); -static int btree_split_internal(hammer_btree_cursor_t cursor); -static int btree_split_leaf(hammer_btree_cursor_t cursor); -static int btree_rebalance_node(hammer_btree_cursor_t cursor); -static int btree_collapse(hammer_btree_cursor_t cursor); - -/* - * Compare two BH-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp). - */ -static int -hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) -{ -#if 0 - kprintf("compare obj_id %016llx %016llx\n", - key1->obj_id, key2->obj_id); - kprintf("compare rec_type %04x %04x\n", - key1->rec_type, key2->rec_type); - kprintf("compare key %016llx %016llx\n", - key1->key, key2->key); -#endif - - /* - * A key1->obj_id of 0 matches any object id - */ - if (key1->obj_id) { - if (key1->obj_id < key2->obj_id) - return(-4); - if (key1->obj_id > key2->obj_id) - return(4); - } - - /* - * A key1->rec_type of 0 matches any record type. - */ - if (key1->rec_type) { - if (key1->rec_type < key2->rec_type) - return(-3); - if (key1->rec_type > key2->rec_type) - return(3); - } - - /* - * There is no special case for key. 0 means 0. - */ - if (key1->key < key2->key) - return(-2); - if (key1->key > key2->key) - return(2); - - /* - * This test has a number of special cases. create_tid in key1 is - * the as-of transction id, and delete_tid in key1 is NOT USED. - * - * A key1->create_tid of 0 matches any record regardles of when - * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be - * used to search for the most current state of the object. - * - * key2->create_tid is a HAMMER record and will never be - * 0. key2->delete_tid is the deletion transaction id or 0 if - * the record has not yet been deleted. - */ - if (key1->create_tid) { - if (key1->create_tid < key2->create_tid) - return(-1); - if (key2->delete_tid && key1->create_tid >= key2->delete_tid) - return(1); - } - - return(0); -} - -/* - * Create a separator half way inbetween key1 and key2. For fields just - * one unit apart, the separator will match key2. 
- */ -#define MAKE_SEPARATOR(key1, key2, dest, field) \ - dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); - -static void -hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, - hammer_base_elm_t dest) -{ - MAKE_SEPARATOR(key1, key2, dest, obj_id); - MAKE_SEPARATOR(key1, key2, dest, rec_type); - MAKE_SEPARATOR(key1, key2, dest, key); - MAKE_SEPARATOR(key1, key2, dest, create_tid); - dest->delete_tid = 0; - dest->obj_type = 0; - dest->reserved07 = 0; -} - -#undef MAKE_SEPARATOR - -/* - * Return whether a generic internal or leaf node is full - */ -static int -btree_node_is_full(struct hammer_base_node *node) -{ - switch(node->type) { - case HAMMER_BTREE_INTERNAL_NODE: - if (node->count == HAMMER_BTREE_INT_ELMS) - return(1); - break; - case HAMMER_BTREE_LEAF_NODE: - if (node->count == HAMMER_BTREE_LEAF_ELMS) - return(1); - break; - default: - panic("illegal btree subtype"); - } - return(0); -} - -static int -btree_max_elements(u_int8_t type) -{ - if (type == HAMMER_BTREE_LEAF_NODE) - return(HAMMER_BTREE_LEAF_ELMS); - if (type == HAMMER_BTREE_INTERNAL_NODE) - return(HAMMER_BTREE_INT_ELMS); - panic("btree_max_elements: bad type %d\n", type); -} - -/* - * Initialize a cursor, setting the starting point for a BH-Tree search. - * - * The passed cluster must be locked. This function will add a reference - * to it. - */ -int -hammer_btree_cursor_init(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster) -{ - int error; - - cursor->cluster = NULL; - cursor->node_buffer = NULL; - cursor->parent_buffer = NULL; - cursor->node = NULL; - cursor->parent = NULL; - cursor->index = 0; - cursor->parent_index = 0; - error = btree_cursor_set_cluster(cursor, cluster); - return(error); -} - -/* - * Clean up a HAMMER BH-Tree cursor after we are finished using it. - */ -void -hammer_btree_cursor_done(hammer_btree_cursor_t cursor) -{ - if (cursor->node_buffer) { - hammer_put_buffer(cursor->node_buffer, 0); - cursor->node_buffer = NULL; - } - if (cursor->parent_buffer) { - hammer_put_buffer(cursor->parent_buffer, 0); - cursor->parent_buffer = NULL; - } - if (cursor->cluster) { - hammer_put_cluster(cursor->cluster, 0); - cursor->cluster = NULL; - } - cursor->node = NULL; - cursor->parent = NULL; - cursor->left_bound = NULL; - cursor->right_bound = NULL; - cursor->index = 0; - cursor->parent_index = 0; -} - -/* - * Initialize a btree info structure and its associated cursor prior to - * running a BH-Tree operation. - */ -int -hammer_btree_info_init(hammer_btree_info_t info, struct hammer_cluster *cluster) -{ - int error; - - error = hammer_btree_cursor_init(&info->cursor, cluster); - info->data_buffer = NULL; - info->record_buffer = NULL; - info->data = NULL; - info->rec = NULL; - return (error); -} - -/* - * Clean up a BH-Tree info structure after we are finished using it. - */ -void -hammer_btree_info_done(hammer_btree_info_t info) -{ - hammer_btree_cursor_done(&info->cursor); - if (info->data_buffer) { - hammer_put_buffer(info->data_buffer, 0); - info->data_buffer = NULL; - } - if (info->record_buffer) { - hammer_put_buffer(info->record_buffer, 0); - info->record_buffer = NULL; - } - info->data = NULL; - info->rec = NULL; -} - -/* - * Search a cluster's BH-Tree. Return the matching node. The search - * normally traverses clusters but will terminate at a cluster entry - * in a leaf if HAMMER_BTREE_CLUSTER_TAG is specified. If this flag - * is specified EIO is returned if the search would otherwise have to - * cursor up into a parent cluster. 
- * - * The cursor must be completely initialized on entry. If the cursor is - * at the root of a cluster, the parent pointer & buffer may be NULL (in - * that case the bounds point to the bounds in the cluster header). The - * node_buffer and node must always be valid. - * - * The search code may be forced to iterate up the tree if the conditions - * required for an insertion or deletion are not met. This does not occur - * very often. - * - * INSERTIONS: The search will split full nodes and leaves on its way down - * and guarentee that the leaf it ends up on is not full. - * - * DELETIONS: The search will rebalance the tree on its way down. - */ -static -int -btree_search(hammer_btree_cursor_t cursor, hammer_base_elm_t key, int flags) -{ - struct hammer_btree_leaf_node *leaf; - int error; - int i; - int r; - - /* - * Move our cursor up the tree until we find a node whos range covers - * the key we are trying to locate. This may move us between - * clusters. - * - * Since the root cluster always has a root BH-Tree node, info->node - * is always non-NULL if no error occured. The parent field will be - * non-NULL unless we are at the root of a cluster. - */ - while (hammer_btree_cmp(key, cursor->left_bound) < 0 || - hammer_btree_cmp(key, cursor->right_bound) >= 0) { - /* - * Must stay in current cluster if flagged, code should never - * use the flag if it wants us to traverse to the parent - * cluster. - */ - if (cursor->parent == NULL && - (flags & HAMMER_BTREE_CLUSTER_TAG)) { - error = EIO; - goto done; - } - error = btree_cursor_up(cursor); - if (error) - goto done; - } - KKASSERT(cursor->node != NULL); - - /* - * If we are inserting we can't start at a full node if the parent - * is also full (because there is no way to split the node), - * continue running up the tree until we hit the root of the - * current cluster or until the requirement is satisfied. - */ - while (flags & HAMMER_BTREE_INSERT) { - if (btree_node_is_full(&cursor->node->base) == 0) - break; - if (cursor->parent == NULL) - break; - if (cursor->parent->base.count != HAMMER_BTREE_INT_ELMS) - break; - error = btree_cursor_up(cursor); - if (error) - goto done; - } - - /* - * If we are deleting we can't start at a node with only one element - * unless it is root, because all of our code assumes that nodes - * will never be empty. - * - * This handles the case where the cursor is sitting at a leaf and - * either the leaf or parent contain an insufficient number of - * elements. - */ - while (flags & HAMMER_BTREE_DELETE) { - if (cursor->node->base.count > 1) - break; - if (cursor->parent == NULL) - break; - KKASSERT(cursor->node->base.count != 0); - error = btree_cursor_up(cursor); - if (error) - goto done; - } - -new_cluster: - /* - * Push down through internal nodes to locate the requested key. - */ - while (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) { - struct hammer_btree_internal_node *node; - - /* - * If we are a the root node and deleting, try to collapse - * all of the root's children into the root. This is the - * only point where tree depth is reduced. - */ - if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) { - error = btree_collapse(cursor); - if (error) - goto done; - } - - /* - * Scan the node to find the subtree index to push down into. - * We go one-past, then back-up. The key should never be - * less then the left-hand boundary so I should never wind - * up 0. 
- */ - node = &cursor->node->internal; - for (i = 0; i < node->base.count; ++i) { - r = hammer_btree_cmp(key, &node->elms[i].base); - if (r < 0) - break; - } - KKASSERT(i != 0); - - /* - * The push-down index is now i - 1. - */ - --i; - cursor->index = i; - - /* - * Handle insertion and deletion requirements. - * - * If inserting split full nodes. The split code will - * adjust cursor->node and cursor->index if the current - * index winds up in the new node. - */ - if (flags & HAMMER_BTREE_INSERT) { - if (node->base.count == HAMMER_BTREE_INT_ELMS) { - error = btree_split_internal(cursor); - if (error) - goto done; - /* - * reload stale pointers - */ - i = cursor->index; - node = &cursor->node->internal; - } - } - - /* - * If deleting rebalance - do not allow the child to have - * just one element or we will not be able to delete it. - * - * Neither internal or leaf nodes (except a root-leaf) are - * allowed to drop to 0 elements. - * - * Our separators may have been reorganized after rebalancing, - * so we have to pop back up and rescan. - */ - if (flags & HAMMER_BTREE_DELETE) { - if (node->elms[i].subtree_count <= 1) { - error = btree_rebalance_node(cursor); - if (error) - goto done; - /* cursor->index is invalid after call */ - goto new_cluster; - } - } - - /* - * Push down (push into new node, existing node becomes - * the parent). - */ - error = btree_cursor_down(cursor); - if (error) - goto done; - } - - - /* - * We are at a leaf, do a linear search of the key array. - * (XXX do a binary search). On success the index is set to the - * matching element, on failure the index is set to the insertion - * point. - * - * Boundaries are not stored in leaf nodes, so the index can wind - * up to the left of element 0 (index == 0) or past the end of - * the array (index == leaf->base.count). - */ - leaf = &cursor->node->leaf; - KKASSERT(leaf->base.count <= HAMMER_BTREE_LEAF_ELMS); - - for (i = 0; i < leaf->base.count; ++i) { - r = hammer_btree_cmp(key, &leaf->elms[i].base); - if (r < 0) - break; - if (r == 0) { - /* - * Return an exact match unless this is a cluster - * element. If it is, and the cluster tag flag has - * not been set, push into the new cluster and - * continue the search. - */ - cursor->index = i; - if ((leaf->elms[i].base.obj_type & - HAMMER_OBJTYPE_CLUSTER_FLAG) && - (flags & HAMMER_BTREE_CLUSTER_TAG) == 0) { - error = btree_cursor_down(cursor); - if (error) - goto done; - goto new_cluster; - } - error = 0; - goto done; - } - } - - /* - * We could not find an exact match. Check for a cluster - * recursion. The cluster's range is bracketed by two - * leaf elements. One of the two must be in this node. - */ - if ((flags & HAMMER_BTREE_CLUSTER_TAG) == 0) { - if (i == leaf->base.count) { - if (leaf->elms[i-1].base.obj_type & - HAMMER_OBJTYPE_CLUSTER_FLAG) { - cursor->index = i - 1; - error = btree_cursor_down(cursor); - if (error) - goto done; - goto new_cluster; - } - } else { - if (leaf->elms[i].base.obj_type & - HAMMER_OBJTYPE_CLUSTER_FLAG) { - cursor->index = i; - error = btree_cursor_down(cursor); - if (error) - goto done; - goto new_cluster; - } - } - } - - /* - * If inserting split a full leaf before returning. This - * may have the side effect of adjusting cursor->node and - * cursor->index. - * - * We delayed the split in order to avoid any unnecessary splits. - * - * XXX parent's parent's subtree_count will be wrong after - * this (keep parent of parent around too? ugh). 
- */ - cursor->index = i; - if ((flags & HAMMER_BTREE_INSERT) && - leaf->base.count == HAMMER_BTREE_LEAF_ELMS) { - error = btree_split_leaf(cursor); - /* NOT USED - i = cursor->index; - node = &cursor->node->internal; - */ - if (error) - goto done; - } - error = ENOENT; + * B-Trees also make the stacking of trees fairly straightforward. + * + * INTER-CLUSTER ELEMENTS: An element of an internal node may reference + * the root of another cluster rather then a node in the current cluster. + * This is known as an inter-cluster references. Only B-Tree searches + * will cross cluster boundaries. The rebalancing and collapse code does + * not attempt to move children between clusters. A major effect of this + * is that we have to relax minimum element count requirements and allow + * trees to become somewhat unabalanced. + * + * INSERTIONS AND DELETIONS: When inserting we split full nodes on our + * way down as an optimization. I originally experimented with rebalancing + * nodes on the way down for deletions but it created a huge mess due to + * the way inter-cluster linkages work. Instead, now I simply allow + * the tree to become unbalanced and allow leaf nodes to become empty. + * The delete code will try to clean things up from the bottom-up but + * will stop if related elements are not in-core or if it cannot get a node + * lock. + */ +#include "hammer.h" +#include +#include - /* - * Set the cursor's last_error. This is used primarily by - * btree_search_fwd() to determine whether it needs to skip - * the current element or not. - */ -done: - cursor->last_error = error; - return(error); -} +static int btree_search(hammer_cursor_t cursor, int flags); +static int btree_split_internal(hammer_cursor_t cursor); +static int btree_split_leaf(hammer_cursor_t cursor); +static int btree_remove(hammer_node_t node, int index); +#if 0 +static int btree_rebalance(hammer_cursor_t cursor); +static int btree_collapse(hammer_cursor_t cursor); +#endif +static int btree_node_is_full(hammer_node_ondisk_t node); +static void hammer_make_separator(hammer_base_elm_t key1, + hammer_base_elm_t key2, hammer_base_elm_t dest); /* - * Ignoring key->key, iterate records after a search. The next record which - * matches the key (except for key->key) is returned. To synchronize with - * btree_search() we only check the record at the current cursor index if - * cursor->last_error is ENOENT, indicating that it is at an insertion point - * (so if we are not at the end of the node's array, the current record - * will be the 'next' record to check). + * Iterate records after a search. The cursor is iterated forwards past + * the current record until a record matching the key-range requirements + * is found. ENOENT is returned if the iteration goes past the ending + * key. * - * Records associated with regular files use the ending offset of the data - * block (non inclusive) as their key. E.g. if you write 20 bytes to a - * file at offset 10, the HAMMER record will use a key of 30 for that record. - * This way the cursor is properly positioned after a search so the first - * record returned by btree_iterate() is (likely) the one containing the - * desired data. This also means the results of the initial search are - * ignored. + * key_beg/key_end is an INCLUSVE range. i.e. if you are scanning to load + * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an + * offset of 4095. 
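+ *
+ * A minimal iteration sketch (illustrative only, error handling and
+ * cursor setup elided; assumes the cursor was initialized with
+ * hammer_init_cursor_ip() or hammer_init_cursor_hmp() and that
+ * key_beg/key_end have been filled in as described above):
+ *
+ *	error = hammer_btree_lookup(&cursor);
+ *	while (error == 0) {
+ *		(process the element under the cursor, e.g. via
+ *		 hammer_btree_extract())
+ *		error = hammer_btree_iterate(&cursor);
+ *	}
+ *
+ * where an ENOENT returned by the iterate step indicates the scan has
+ * gone past key_end.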
* - * This function is also used to iterate through database records, though - * in that case the caller utilizes any exact match located by btree_search() - * before diving into the iteration. + * cursor->key_beg may or may not be modified by this function during + * the iteration. */ int -hammer_btree_iterate(hammer_btree_cursor_t cursor, hammer_base_elm_t key) +hammer_btree_iterate(hammer_cursor_t cursor) { - hammer_btree_node_t node; - struct hammer_base_elm save_base; + hammer_node_ondisk_t node; + hammer_btree_elm_t elm; int64_t save_key; int error; int r; int s; /* - * If last_error is not zero we are at the insertion point and must - * start at the current element. If last_error is zero the caller - * has already processed the current cursor (or doesn't care about - * it) and we must iterate forwards. + * Skip past the current record */ - node = cursor->node; - if (cursor->index < node->base.count && cursor->last_error == 0) + node = cursor->node->ondisk; + if (node == NULL) { + KKASSERT(cursor->last_error != 0); + return(cursor->last_error); + } + if (cursor->index < node->count) ++cursor->index; + /* + * Loop until an element is found or we are done. + */ for (;;) { /* - * We iterate up the tree while we are at the last element. + * We iterate up the tree and then index over one element + * while we are at the last element in the current node. + * + * NOTE: This can pop us up to another cluster. * - * If we hit the root of the cluster we have to cursor up - * into the parent cluster, but then search to position - * the cursor at the next logical element in the iteration. - * We ignore an ENOENT error in this case. + * If we are at the root of the root cluster, cursor_up + * returns ENOENT. + * + * NOTE: hammer_cursor_up() will adjust cursor->key_beg + * when told to re-search for the cluster tag. * * XXX this could be optimized by storing the information in * the parent reference. */ -goupagain: - while (cursor->index == node->base.count) { - if (cursor->parent == NULL) { - save_base = *cursor->right_bound; - error = btree_cursor_up(cursor); - if (error) - goto done; - error = btree_search(cursor, &save_base, 0); - if (error == ENOENT) - error = 0; - if (error) - goto done; - node = cursor->node; - /* node cannot be empty. cursor->index is 0 */ - KKASSERT(cursor->index != node->base.count); - /* do not further increment the index */ - } else { - error = btree_cursor_up(cursor); - if (error) - goto done; - node = cursor->node; - KKASSERT(cursor->index != node->base.count); - ++cursor->index; - } + if (cursor->index == node->count) { + error = hammer_cursor_up(cursor); + if (error) + break; + node = cursor->node->ondisk; + KKASSERT(cursor->index != node->count); + ++cursor->index; + continue; } /* @@ -655,138 +173,161 @@ goupagain: * except for their transaction spaces. We can then skip * the node if the left and right transaction spaces are the * same sign. This directly optimized accesses to files with - * HUGE transactional histories, such as database files. + * HUGE transactional histories, such as database files, + * allowing us to avoid having to iterate through the entire + * history. 
*/ - while (node->base.type == HAMMER_BTREE_INTERNAL_NODE) { - struct hammer_btree_internal_elm *elm; - - elm = &node->internal.elms[cursor->index]; + if (node->type == HAMMER_BTREE_TYPE_INTERNAL) { + elm = &node->elms[cursor->index]; if (elm[0].base.obj_id == elm[1].base.obj_id && elm[0].base.rec_type == elm[1].base.rec_type && elm[0].base.key == elm[1].base.key) { - save_key = key->key; - key->key = elm[0].base.key; - r = hammer_btree_cmp(key, &elm[0].base); - key->key = elm[1].base.key; - s = hammer_btree_cmp(key, &elm[1].base); - key->key = save_key; + /* + * Left side transaction space + */ + save_key = cursor->key_beg.key; + cursor->key_beg.key = elm[0].base.key; + r = hammer_btree_cmp(&cursor->key_beg, + &elm[0].base); + cursor->key_beg.key = save_key; + + /* + * Right side transaction space + */ + save_key = cursor->key_end.key; + cursor->key_end.key = elm[1].base.key; + s = hammer_btree_cmp(&cursor->key_end, + &elm[1].base); + cursor->key_end.key = save_key; + + /* + * If our range is entirely on one side or + * the other side we can skip the sub-tree. + */ if ((r < 0 && s < 0) || (r > 0 && s > 0)) { ++cursor->index; - goto goupagain; + continue; } } - error = btree_cursor_down(cursor); + error = hammer_cursor_down(cursor); if (error) - goto done; - KKASSERT(cursor->index != node->base.count); - node = cursor->node; + break; + KKASSERT(cursor->index == 0); + KKASSERT(cursor->index != node->count); + node = cursor->node->ondisk; + continue; } /* - * Determine if the record at the cursor matches, sans - * key->key (which is what we are iterating). + * We are at a leaf. + * + * Determine if the record at the cursor has gone beyond the + * end of our range. Remember that our key range is inclusive. + * + * When iterating we may have to 'pick out' records matching + * our transaction requirements. A comparison return of + * +1 or -1 indicates a transactional record that is too + * old or too new but does not terminate the search. */ - { - union hammer_btree_leaf_elm *elm; - - elm = &node->leaf.elms[cursor->index]; - save_key = key->key; - key->key = elm->base.key; - r = hammer_btree_cmp(key, &elm->base); - key->key = save_key; + elm = &node->elms[cursor->index]; + r = hammer_btree_cmp(&cursor->key_end, &elm->base); + if (r == -1 || r == 1) { + ++cursor->index; + continue; } /* - * The iteration stops if the obj_id or rec_type no longer - * match (unless the original key set those to 0, meaning the - * caller WANTS to iterate through those as well), denoted - * by -3,-2 or +2,+3 return values. Otherwise the mismatch - * is due to the transaction id and we can get both negative - * and positive values... we have to skip those records and - * continue searching. + * The search ends if the element goes beyond our key_end + * (after checking transactional return values above). + * Otherwise we have a successful match. */ - if (r == 0) { - error = 0; - break; - } - if (r < -2 || r > 2) { - error = ENOENT; - break; - } - ++cursor->index; + error = (r < 0) ? ENOENT : 0; + break; } -done: cursor->last_error = error; return(error); } /* - * Look up the key in the HAMMER BH-Tree and fill in the rest of the - * info structure with the results according to flags. 0 is returned on - * success, non-zero on error. + * Lookup cursor->key_beg. 0 is returned on success, ENOENT if the entry + * could not be found, and a fatal error otherwise. + * + * The cursor is suitably positioned for a deletion on success, and suitably + * positioned for an insertion on ENOENT. 
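+ *
+ * A minimal insertion sketch (illustrative only; per the comments on
+ * hammer_btree_insert() below, the HAMMER_BTREE_INSERT flag is assumed
+ * to be carried in cursor->flags):
+ *
+ *	cursor.flags |= HAMMER_BTREE_INSERT;
+ *	error = hammer_btree_lookup(&cursor);
+ *	if (error == ENOENT)
+ *		error = hammer_btree_insert(&cursor, &elm);
+ *	else if (error == 0)
+ *		error = EEXIST;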
* - * The caller initializes (key, cluster) and makes the call. The cluster - * must be referenced and locked on call. The function can chain through - * multiple clusters and will replace the passed cluster as it does so, - * dereferencing and unlocking it, and referencing and locking the chain. - * On return info->cluster will still be referenced and locked but may - * represent a different cluster. + * The cursor may begin anywhere, the search will traverse clusters in + * either direction to locate the requested element. */ int -hammer_btree_lookup(hammer_btree_info_t info, hammer_base_elm_t key, int flags) +hammer_btree_lookup(hammer_cursor_t cursor) { int error; - error = btree_search(&info->cursor, key, flags); - if (error == 0 && - (flags & (HAMMER_BTREE_GET_RECORD|HAMMER_BTREE_GET_DATA))) { - error = hammer_btree_extract(info, flags); - } + error = btree_search(cursor, 0); + if (error == 0 && cursor->flags) + error = hammer_btree_extract(cursor, cursor->flags); return(error); } +/* + * Extract the record and/or data associated with the cursor's current + * position. Any prior record or data stored in the cursor is replaced. + * The cursor must be positioned at a leaf node. + * + * NOTE: Only records can be extracted from internal B-Tree nodes, and + * only for inter-cluster references. At the moment we only support + * extractions from leaf nodes. + */ int -hammer_btree_extract(hammer_btree_info_t info, int flags) +hammer_btree_extract(hammer_cursor_t cursor, int flags) { - struct hammer_cluster *cluster; - hammer_btree_leaf_elm_t elm; + hammer_node_ondisk_t node; + hammer_btree_elm_t elm; + hammer_cluster_t cluster; int32_t cloff; int error; - int iscl; - /* - * Extract the record and data reference if requested. - * + /* * A cluster record type has no data reference, the information - * is stored directly in the record and BH-Tree element. + * is stored directly in the record and B-Tree element. * * The case where the data reference resolves to the same buffer * as the record reference must be handled. */ - elm = &info->cursor.node->leaf.elms[info->cursor.index]; - iscl = (elm->base.obj_type & HAMMER_OBJTYPE_CLUSTER_FLAG) != 0; - cluster = info->cursor.cluster; + node = cursor->node->ondisk; + KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); + elm = &node->elms[cursor->index]; + cluster = cursor->node->cluster; error = 0; if ((flags & HAMMER_BTREE_GET_RECORD) && error == 0) { - cloff = iscl ? elm->cluster.rec_offset : elm->record.rec_offset; - - - info->rec = hammer_bread(cluster, cloff, HAMMER_FSBUF_RECORDS, - &error, &info->record_buffer); + cloff = elm->leaf.rec_offset; + cursor->record = hammer_bread(cluster, cloff, + HAMMER_FSBUF_RECORDS, &error, + &cursor->record_buffer); } else { cloff = 0; } - if ((flags & HAMMER_BTREE_GET_DATA) && iscl == 0 && error == 0) { - if ((cloff ^ elm->record.data_offset) & ~HAMMER_BUFMASK) { - info->data = hammer_bread(cluster, - elm->record.data_offset, - HAMMER_FSBUF_DATA, - &error, &info->record_buffer); + if ((flags & HAMMER_BTREE_GET_DATA) && error == 0) { + if ((cloff ^ elm->leaf.data_offset) & ~HAMMER_BUFMASK) { + /* + * Data in different buffer than record + */ + cursor->data = hammer_bread(cluster, + elm->leaf.data_offset, + HAMMER_FSBUF_DATA, &error, + &cursor->data_buffer); } else { - info->data = (void *) - ((char *)info->data_buffer->ondisk + - (elm->record.data_offset & HAMMER_BUFMASK)); + /* + * Data in same buffer as record. 
Note that we + * leave any existing data_buffer intact, even + * though we don't use it in this case, in case + * other records extracted during an iteration + * go back to it. + */ + cursor->data = (void *) + ((char *)cursor->record_buffer->ondisk + + (elm->leaf.data_offset & HAMMER_BUFMASK)); } } return(error); @@ -794,16 +335,21 @@ hammer_btree_extract(hammer_btree_info_t info, int flags) /* - * Insert a record into a BH-Tree's cluster. The caller has already - * reserved space for the record and data and must handle a ENOSPC - * return. + * Insert a leaf element into the B-Tree at the current cursor position. + * The cursor is positioned such that the element at and beyond the cursor + * are shifted to make room for the new record. + * + * The caller must call hammer_btree_lookup() with the HAMMER_BTREE_INSERT + * flag set and that call must return ENOENT before this function can be + * called. + * + * ENOSPC is returned if there is no room to insert a new record. */ int -hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm) +hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm) { - struct hammer_btree_cursor *cursor; - struct hammer_btree_internal_node *parent; - struct hammer_btree_leaf_node *leaf; + hammer_node_ondisk_t parent; + hammer_node_ondisk_t node; int i; #if 0 @@ -815,7 +361,7 @@ hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm) * The search also does some setup for our insert, so there is always * room in the leaf. */ - error = btree_search(&info->cursor, &elm->base, HAMMER_BTREE_INSERT); + error = btree_search(cursor, HAMMER_BTREE_INSERT); if (error != ENOENT) { if (error == 0) error = EEXIST; @@ -826,40 +372,55 @@ hammer_btree_insert(struct hammer_btree_info *info, hammer_btree_leaf_elm_t elm) /* * Insert the element at the leaf node and update the count in the * parent. It is possible for parent to be NULL, indicating that - * the root of the BH-Tree in the cluster is a leaf. It is also + * the root of the B-Tree in the cluster is a leaf. It is also * possible for the leaf to be empty. * * Remember that the right-hand boundary is not included in the * count. */ - cursor = &info->cursor; - leaf = &cursor->node->leaf; + node = cursor->node->ondisk; i = cursor->index; - KKASSERT(leaf->base.count < HAMMER_BTREE_LEAF_ELMS); - bcopy(&leaf->elms[i], &leaf->elms[i+1], - (leaf->base.count - i + 1) * sizeof(leaf->elms[0])); - leaf->elms[i] = *elm; - ++leaf->base.count; + KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF); + KKASSERT(node->count < HAMMER_BTREE_LEAF_ELMS); + if (i != node->count) { + bcopy(&node->elms[i], &node->elms[i+1], + (node->count - i) * sizeof(*elm)); + } + node->elms[i] = *elm; + ++node->count; + hammer_modify_node(cursor->node); - if ((parent = cursor->parent) != NULL) { + if ((parent = cursor->parent->ondisk) != NULL) { i = cursor->parent_index; - ++parent->elms[i].subtree_count; - KKASSERT(parent->elms[i].subtree_count <= leaf->base.count); - hammer_modify_buffer(cursor->parent_buffer); + ++parent->elms[i].internal.subtree_count; + KKASSERT(parent->elms[i].internal.subtree_count <= node->count); + hammer_modify_node(cursor->parent); } - hammer_modify_buffer(cursor->node_buffer); return(0); } /* - * Delete a record from the BH-Tree. + * Delete a record from the B-Tree's at the current cursor position. + * The cursor is positioned such that the current element is the one + * to be deleted. 
+ * + * The caller must call hammer_btree_lookup() with the HAMMER_BTREE_DELETE + * flag set and that call must return 0 before this function can be + * called. + * + * It is possible that we will be asked to delete the last element in a + * leaf. This case only occurs if the downward search was unable to + * rebalance us, which in turn can occur if our parent has inter-cluster + * elements. So the 0-element case for a leaf is allowed. */ int -hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key) +hammer_btree_delete(hammer_cursor_t cursor) { - struct hammer_btree_cursor *cursor; - struct hammer_btree_internal_node *parent; - struct hammer_btree_leaf_node *leaf; + hammer_node_ondisk_t ondisk; + hammer_node_t node; + hammer_node_t parent; + hammer_btree_elm_t elm; + int error; int i; #if 0 @@ -868,313 +429,393 @@ hammer_btree_delete(struct hammer_btree_info *info, hammer_base_elm_t key) * Locate the leaf element to delete. The search is also responsible * for doing some of the rebalancing work on its way down. */ - error = btree_search(&info->cursor, key, HAMMER_BTREE_DELETE); + error = btree_search(cursor, HAMMER_BTREE_DELETE); if (error) return (error); #endif /* - * Delete the element from the leaf node. We leave empty leafs alone - * and instead depend on a future search to locate and destroy them. - * Otherwise we would have to recurse back up the tree to adjust - * the parent's subtree_count and we do not want to do that. + * Delete the element from the leaf node. * - * Remember that the right-hand boundary is not included in the - * count. + * Remember that leaf nodes do not have boundaries. */ - cursor = &info->cursor; - leaf = &cursor->node->leaf; + node = cursor->node; + ondisk = node->ondisk; i = cursor->index; - KKASSERT(cursor->node->base.type == HAMMER_BTREE_LEAF_NODE); - bcopy(&leaf->elms[i+1], &leaf->elms[i], - (leaf->base.count - i) * sizeof(leaf->elms[0])); - --leaf->base.count; - if ((parent = cursor->parent) != NULL) { + KKASSERT(ondisk->type == HAMMER_BTREE_TYPE_LEAF); + if (i + 1 != ondisk->count) { + bcopy(&ondisk->elms[i+1], &ondisk->elms[i], + (ondisk->count - i - 1) * sizeof(ondisk->elms[0])); + } + --ondisk->count; + if (cursor->parent != NULL) { /* * Adjust parent's notion of the leaf's count. subtree_count - * is only approximately, it is allowed to be too small but + * is only approximate, it is allowed to be too small but * never allowed to be too large. Make sure we don't drop * the count below 0. */ - i = cursor->parent_index; - if (parent->elms[i].subtree_count) - --parent->elms[i].subtree_count; - KKASSERT(parent->elms[i].subtree_count <= leaf->base.count); - hammer_modify_buffer(cursor->parent_buffer); + parent = cursor->parent; + elm = &parent->ondisk->elms[cursor->parent_index]; + if (elm->internal.subtree_count) + --elm->internal.subtree_count; + KKASSERT(elm->internal.subtree_count <= ondisk->count); + hammer_modify_node(parent); } - hammer_modify_buffer(cursor->node_buffer); - return(0); -} -/************************************************************************ - * CURSOR SUPPORT * - ************************************************************************ - * - * These routines do basic cursor operations. This support will probably - * be expanded in the future to add link fields for linear scans. - */ + /* + * If the leaf is empty try to remove the subtree reference + * in at (parent, parent_index). This will unbalance the + * tree. 
+ * + * Note that internal nodes must have at least one element + * so their boundary information is properly laid out. If + * we would cause our parent to become empty we try to + * recurse up the tree, but if that doesn't work we just + * leave the tree with an empty leaf. + */ + if (ondisk->count == 0) { + error = btree_remove(cursor->parent, cursor->parent_index); + if (error == 0) { + hammer_free_btree(node->cluster, node->node_offset); + } else if (error == EAGAIN) { + hammer_modify_node(node); + error = 0; + } /* else a real error occured XXX */ + } else { + hammer_modify_node(node); + error = 0; + } + return(error); +} /* - * Unconditionally set the cursor to the root of the specified cluster. - * The current cursor node is set to the root node of the cluster (which - * can be an internal node or a degenerate leaf), and the parent info - * is cleaned up and cleared. + * PRIMAY B-TREE SEARCH SUPPORT PROCEDURE + * + * Search a cluster's B-Tree for cursor->key_beg, return the matching node. + * + * The search begins at the current node and will instantiate a NULL + * parent if necessary and if not at the root of the cluster. On return + * parent will be non-NULL unless the cursor is sitting at a root-leaf. + * + * The search code may be forced to iterate up the tree if the conditions + * required for an insertion or deletion are not met. This does not occur + * very often. + * + * INSERTIONS: The search will split full nodes and leaves on its way down + * and guarentee that the leaf it ends up on is not full. * - * The passed cluster must be locked. This function will add a reference - * to it. The cursor must already have a cluster assigned to it, which we - * will replace. + * DELETIONS: The search will rebalance the tree on its way down. */ -static +static int -btree_cursor_set_cluster_by_value(hammer_btree_cursor_t cursor, - int32_t vol_no, int32_t clu_no, hammer_tid_t clu_id) +btree_search(hammer_cursor_t cursor, int flags) { - struct hammer_cluster *cluster; - struct hammer_volume *volume; - int error = 0; - - if (vol_no < 0) - return(EIO); - cluster = cursor->cluster; - KKASSERT(cluster != NULL); - if (vol_no == cluster->volume->vol_no) { - cluster = hammer_get_cluster(cluster->volume, clu_no, - &error, 0); - } else { - volume = hammer_get_volume(cluster->volume->hmp, - vol_no, &error); - if (volume) { - cluster = hammer_get_cluster(volume, clu_no, - &error, 0); - hammer_put_volume(volume, 0); - } else { - cluster = NULL; - } + hammer_node_ondisk_t node; + hammer_cluster_t cluster; + int error; + int i; + int r; + + flags |= cursor->flags; + + /* + * Move our cursor up the tree until we find a node whos range covers + * the key we are trying to locate. This may move us between + * clusters. + * + * The left bound is inclusive, the right bound is non-inclusive. + * It is ok to cursor up too far so when cursoring across a cluster + * boundary. + * + * First see if we can skip the whole cluster. hammer_cursor_up() + * handles both cases but this way we don't check the cluster + * bounds when going up the tree within a cluster. + */ + cluster = cursor->node->cluster; + while ( + hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_beg) < 0 || + hammer_btree_cmp(&cursor->key_beg, &cluster->clu_btree_end) >= 0) { + error = hammer_cursor_toroot(cursor); + if (error) + goto done; + error = hammer_cursor_up(cursor); + if (error) + goto done; + cluster = cursor->node->cluster; } /* - * Make sure the cluster id matches. 
XXX At the moment the - * clu_id in the btree cluster element is only 32 bits, so only - * compare the low 32 bits. + * Deal with normal cursoring within a cluster. The right bound + * is non-inclusive. That is, the bounds form a separator. */ - if (cluster) { - if ((int32_t)cluster->ondisk->clu_id == (int32_t)clu_id) { - btree_cursor_set_cluster(cursor, cluster); - } else { - error = EIO; - } - hammer_put_cluster(cluster, 0); + while (hammer_btree_cmp(&cursor->key_beg, cursor->left_bound) < 0 || + hammer_btree_cmp(&cursor->key_beg, cursor->right_bound) >= 0) { + error = hammer_cursor_up(cursor); + if (error) + goto done; } - return (error); -} -static -int -btree_cursor_set_cluster(hammer_btree_cursor_t cursor, - struct hammer_cluster *cluster) -{ - int error = 0; - - hammer_dup_cluster(&cursor->cluster, cluster); - cursor->node = hammer_bread(cluster, - cluster->ondisk->clu_btree_root, - HAMMER_FSBUF_BTREE, - &error, - &cursor->node_buffer); - cursor->index = 0; - if (cursor->parent) { - hammer_put_buffer(cursor->parent_buffer, 0); - cursor->parent_buffer = NULL; - cursor->parent = NULL; - cursor->parent_index = 0; + /* + * We better have ended up with a node somewhere, and our second + * while loop had better not have traversed up a cluster. + */ + KKASSERT(cursor->node != NULL && cursor->node->cluster == cluster); + + /* + * If we are inserting we can't start at a full node if the parent + * is also full (because there is no way to split the node), + * continue running up the tree until we hit the root of the + * root cluster or until the requirement is satisfied. + * + * NOTE: These cursor-up's CAN continue to cross cluster boundaries. + * + * XXX as an optimization it should be possible to unbalance the tree + * and stop at the root of the current cluster. + */ + while (flags & HAMMER_BTREE_INSERT) { + if (btree_node_is_full(cursor->node->ondisk) == 0) + break; + if (cursor->parent == NULL) + break; + if (cursor->parent->ondisk->count != HAMMER_BTREE_INT_ELMS) + break; + error = hammer_cursor_up(cursor); + /* cluster and node are now may become stale */ + if (error) + goto done; } - cursor->left_bound = &cluster->ondisk->clu_btree_beg; - cursor->right_bound = &cluster->ondisk->clu_btree_end; - return (error); -} + /* cluster = cursor->node->cluster; not needed until next cluster = */ -/* - * Cursor the node up to the parent node. If at the root of a cluster, - * cursor to the root of the cluster's parent cluster. Note carefully - * that we do NOT scan the parent cluster to find the leaf that dropped - * into our current cluster. - * - * This function is primarily used by the search code to avoid having - * to search from the root of the filesystem BH-Tree. - */ -static -int -btree_cursor_up(hammer_btree_cursor_t cursor) -{ - struct hammer_cluster_ondisk *ondisk; - struct hammer_btree_internal_node *parent; - int error; - int i; - int r; +#if 0 + /* + * If we are deleting we can't start at an internal node with only + * one element unless it is root, because all of our code assumes + * that internal nodes will never be empty. Just do this generally + * for both leaf and internal nodes to get better balance. + * + * This handles the case where the cursor is sitting at a leaf and + * either the leaf or parent contain an insufficient number of + * elements. + * + * NOTE: These cursor-up's CAN continue to cross cluster boundaries. 
+ */ + while (flags & HAMMER_BTREE_DELETE) { + if (cursor->node->ondisk->count > 1) + break; + if (cursor->parent == NULL) + break; + KKASSERT(cursor->node->ondisk->count != 0); + error = hammer_cursor_up(cursor); + /* cluster and node are now may become stale */ + if (error) + goto done; + } +#endif - error = 0; - if (cursor->parent == NULL) { +/*new_cluster:*/ + /* + * Push down through internal nodes to locate the requested key. + */ + cluster = cursor->node->cluster; + node = cursor->node->ondisk; + while (node->type == HAMMER_BTREE_TYPE_INTERNAL) { +#if 0 /* - * We are at the root of the cluster, pop up to the root - * of the parent cluster. Return ENOENT if we are at the - * root cluster of the filesystem. + * If we are a the root node and deleting, try to collapse + * all of the root's children into the root. This is the + * only point where tree depth is reduced. */ - ondisk = cursor->cluster->ondisk; - if (ondisk->clu_btree_parent_vol_no < 0) { - error = ENOENT; - } else { - error = btree_cursor_set_cluster_by_value( - cursor, - ondisk->clu_btree_parent_vol_no, - ondisk->clu_btree_parent_clu_no, - ondisk->clu_btree_parent_clu_id); + if ((flags & HAMMER_BTREE_DELETE) && cursor->parent == NULL) { + error = btree_collapse(cursor); + /* node becomes stale after call */ + if (error) + goto done; } - } else { + node = cursor->node->ondisk; +#endif + + /* + * Scan the node to find the subtree index to push down into. + * We go one-past, then back-up. The key should never be + * less then the left-hand boundary so I should never wind + * up 0. + */ + for (i = 0; i < node->count; ++i) { + r = hammer_btree_cmp(&cursor->key_beg, + &node->elms[i].base); + if (r < 0) + break; + } + KKASSERT(i != 0); + + /* + * The push-down index is now i - 1. + */ + --i; + cursor->index = i; + + /* + * Handle insertion and deletion requirements. + * + * If inserting split full nodes. The split code will + * adjust cursor->node and cursor->index if the current + * index winds up in the new node. + */ + if (flags & HAMMER_BTREE_INSERT) { + if (node->count == HAMMER_BTREE_INT_ELMS) { + error = btree_split_internal(cursor); + if (error) + goto done; + /* + * reload stale pointers + */ + i = cursor->index; + node = cursor->node->ondisk; + } + } + +#if 0 /* - * Copy the current node's parent info into the node and load - * a new parent. + * If deleting rebalance - do not allow the child to have + * just one element or we will not be able to delete it. + * + * Neither internal or leaf nodes (except a root-leaf) are + * allowed to drop to 0 elements. (XXX - well, leaf nodes + * can at the moment). + * + * Our separators may have been reorganized after rebalancing, + * so we have to pop back up and rescan. + * + * XXX test for subtree_count < maxelms / 2, minus 1 or 2 + * for hysteresis? */ - cursor->index = cursor->parent_index; - cursor->node = (hammer_btree_node_t)cursor->parent; - hammer_dup_buffer(&cursor->node_buffer, cursor->parent_buffer); + if (flags & HAMMER_BTREE_DELETE) { + if (node->elms[i].internal.subtree_count <= 1) { + error = btree_rebalance(cursor); + if (error) + goto done; + /* cursor->index is invalid after call */ + goto new_cluster; + } + } +#endif /* - * Load the parent's parent into parent and figure out the - * parent's element index for its child. Just NULL it out - * if we hit the root of the cluster. + * Push down (push into new node, existing node becomes + * the parent). 
*/ - if (cursor->parent->base.parent) { - parent = hammer_bread(cursor->cluster, - cursor->node->base.parent, - HAMMER_FSBUF_BTREE, - &error, - &cursor->parent_buffer); - for (i = 0; i < parent->base.count; ++i) { - r = hammer_btree_cmp( - &cursor->node->internal.elms[0].base, - &parent->elms[i].base); - if (r < 0) - break; - } - cursor->parent = parent; - cursor->parent_index = i - 1; - KKASSERT(parent->elms[i].subtree_offset == - hammer_bclu_offset(cursor->node_buffer, - cursor->node)); - } else { - hammer_put_buffer(cursor->parent_buffer, 0); - cursor->parent = NULL; - cursor->parent_buffer = NULL; - cursor->parent_index = 0; - } + error = hammer_cursor_down(cursor); + /* node and cluster become stale */ + if (error) + goto done; + node = cursor->node->ondisk; + cluster = cursor->node->cluster; } - return(error); -} -/* - * Cursor down into (node, index) - * - * Push down into the current cursor. The current cursor becomes the parent. - * If the current cursor represents a leaf cluster element this function will - * push into the root of a new cluster and clear the parent fields. - * - * Pushin down at a leaf which is not a cluster element returns EIO. - */ -static -int -btree_cursor_down(hammer_btree_cursor_t cursor) -{ - hammer_btree_node_t node; - int error; + /* + * We are at a leaf, do a linear search of the key array. + * (XXX do a binary search). On success the index is set to the + * matching element, on failure the index is set to the insertion + * point. + * + * Boundaries are not stored in leaf nodes, so the index can wind + * up to the left of element 0 (index == 0) or past the end of + * the array (index == node->count). + */ + KKASSERT(node->count <= HAMMER_BTREE_LEAF_ELMS); + + for (i = 0; i < node->count; ++i) { + r = hammer_btree_cmp(&cursor->key_beg, &node->elms[i].base); - node = cursor->node; - if (node->base.type == HAMMER_BTREE_LEAF_NODE) { - /* - * Push into another cluster - */ - hammer_btree_leaf_elm_t elm; - - elm = &node->leaf.elms[cursor->index]; - if (elm->base.rec_type == HAMMER_RECTYPE_CLUSTER) { - error = btree_cursor_set_cluster_by_value( - cursor, - elm->cluster.vol_no, - elm->cluster.clu_no, - elm->cluster.verifier); - } else { - error = EIO; - } - } else { /* - * Push into another BH-Tree node (internal or leaf) + * Stop if we've flipped past key_beg */ - struct hammer_btree_internal_elm *elm; + if (r < 0) + break; - KKASSERT(node->base.type == HAMMER_BTREE_INTERNAL_NODE); - elm = &node->internal.elms[cursor->index]; - KKASSERT(elm->subtree_offset != 0); - cursor->parent_index = cursor->index; - cursor->parent = &cursor->node->internal; - hammer_dup_buffer(&cursor->parent_buffer, cursor->node_buffer); - - cursor->node = hammer_bread(cursor->cluster, - elm->subtree_offset, - HAMMER_FSBUF_BTREE, - &error, - &cursor->node_buffer); - cursor->index = 0; - cursor->left_bound = &elm[0].base; - cursor->right_bound = &elm[1].base; - KKASSERT(cursor->node == NULL || - cursor->node->base.parent == elm->subtree_offset); -#ifdef INVARIANTS /* - * The bounds of an internal node must match the parent's - * partitioning values. Leaf nodes do not store bounds. 
+ * Return an exact match */ - if (cursor->node->base.type == HAMMER_BTREE_INTERNAL_NODE) { - KKASSERT(hammer_btree_cmp(cursor->left_bound, &cursor->node->internal.elms[0].base) == 0); - KKASSERT(hammer_btree_cmp(cursor->right_bound, &cursor->node->internal.elms[cursor->node->base.count].base) == 0); + if (r == 0) { + cursor->index = i; + error = 0; + goto done; } -#endif } + + /* + * No exact match was found, i is now at the insertion point. + * + * If inserting split a full leaf before returning. This + * may have the side effect of adjusting cursor->node and + * cursor->index. + */ + cursor->index = i; + if ((flags & HAMMER_BTREE_INSERT) && + node->count == HAMMER_BTREE_LEAF_ELMS) { + error = btree_split_leaf(cursor); + /* NOT USED + i = cursor->index; + node = &cursor->node->internal; + */ + if (error) + goto done; + } + error = ENOENT; + + /* + * Set the cursor's last_error. + */ +done: + cursor->last_error = error; return(error); } + /************************************************************************ - * SPLITTING AND MERGING * + * SPLITTING AND MERGING * ************************************************************************ * * These routines do all the dirty work required to split and merge nodes. */ /* - * Split an internal into two nodes and move the separator at the split + * Split an internal node into two nodes and move the separator at the split * point to the parent. Note that the parent's parent's element pointing * to our parent will have an incorrect subtree_count (we don't update it). * It will be low, which is ok. * - * Cursor->index indicates the element the caller intends to push into. - * We will adjust cursor->node and cursor->index if that element winds + * (cursor->node, cursor->index) indicates the element the caller intends + * to push into. We will adjust node and index if that element winds * up in the split node. + * + * If we are at the root of a cluster a new root must be created with two + * elements, one pointing to the original root and one pointing to the + * newly allocated split node. + * + * NOTE! Being at the root of a cluster is different from being at the + * root of the root cluster. cursor->parent will not be NULL and + * cursor->node->ondisk.parent must be tested against 0. Theoretically + * we could propogate the algorithm into the parent and deal with multiple + * 'roots' in the cluster header, but it's easier not to. */ static int -btree_split_internal(hammer_btree_cursor_t cursor) +btree_split_internal(hammer_cursor_t cursor) { - struct hammer_btree_internal_node *parent; - struct hammer_btree_internal_node *node; - struct hammer_btree_internal_node *new_node; - struct hammer_btree_internal_elm *elm; - struct hammer_btree_internal_elm *parent_elm; - struct hammer_buffer *new_buffer; - int32_t parent_offset; + hammer_node_ondisk_t ondisk; + hammer_node_t node; + hammer_node_t parent; + hammer_node_t new_node; + hammer_btree_elm_t elm; + hammer_btree_elm_t parent_elm; int parent_index; int made_root; int split; int error; - const size_t esize = sizeof(struct hammer_btree_internal_elm); + const int esize = sizeof(*elm); /* * We are splitting but elms[split] will be promoted to the parent, @@ -1182,37 +823,42 @@ btree_split_internal(hammer_btree_cursor_t cursor) * insertion point will be on the left-hand side adjust the split * point to give the right hand side one additional node. 
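The split-point bias described above is worth isolating. This is a hypothetical standalone helper, not code from the patch; it just restates the arithmetic so the direction of the bias is easy to check.

static int
pick_split(int count, int insert_index)
{
        int split = (count + 1) / 2;    /* nominal midpoint */

        if (insert_index <= split)      /* insertion lands on the left side, */
                --split;                /* so leave the spare room there */
        return (split);
}
/* pick_split(13, 10) == 7, pick_split(13, 3) == 6 */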
*/ - node = &cursor->node->internal; - split = (node->base.count + 1) / 2; + node = cursor->node; + ondisk = node->ondisk; + split = (ondisk->count + 1) / 2; if (cursor->index <= split) --split; error = 0; /* - * If we are at the root of the tree, create a new root node with + * If we are at the root of the cluster, create a new root node with * 1 element and split normally. Avoid making major modifications * until we know the whole operation will work. + * + * The root of the cluster is different from the root of the root + * cluster. Use the node's on-disk structure's parent offset to + * detect the case. */ - parent = cursor->parent; - if (parent == NULL) { - parent = hammer_alloc_btree(cursor->cluster, &error, - &cursor->parent_buffer); + if (ondisk->parent == 0) { + parent = hammer_alloc_btree(node->cluster, &error); if (parent == NULL) return(error); - parent->base.count = 1; - parent->base.parent = 0; - parent->base.type = HAMMER_BTREE_INTERNAL_NODE; - parent->base.subtype = node->base.type; - parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg; - parent->elms[0].subtree_offset = - hammer_bclu_offset(cursor->node_buffer, node); - parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end; + hammer_lock_ex(&parent->lock); + ondisk = parent->ondisk; + ondisk->count = 1; + ondisk->parent = 0; + ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; + ondisk->elms[0].base = node->cluster->clu_btree_beg; + ondisk->elms[0].internal.subtree_type = node->ondisk->type; + ondisk->elms[0].internal.subtree_offset = node->node_offset; + ondisk->elms[1].base = node->cluster->clu_btree_end; made_root = 1; - cursor->parent_index = 0; /* insertion point in parent */ + parent_index = 0; /* index of current node in parent */ } else { made_root = 0; + parent = cursor->parent; + parent_index = cursor->parent_index; } - parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent); /* * Split node into new_node at the split point. @@ -1227,36 +873,38 @@ btree_split_internal(hammer_btree_cursor_t cursor) * 0 1 2 3 4 5 6 * */ - new_buffer = NULL; - new_node = hammer_alloc_btree(cursor->cluster, &error, &new_buffer); + new_node = hammer_alloc_btree(node->cluster, &error); if (new_node == NULL) { - if (made_root) - hammer_free_btree_ptr(cursor->parent_buffer, - (hammer_btree_node_t)parent); + if (made_root) { + hammer_unlock(&parent->lock); + hammer_free_btree(node->cluster, parent->node_offset); + hammer_rel_node(parent); + } return(error); } + hammer_lock_ex(&new_node->lock); /* - * Create the new node. P become the left-hand boundary in the + * Create the new node. P becomes the left-hand boundary in the * new node. Copy the right-hand boundary as well. * * elm is the new separator. */ - elm = &node->elms[split]; - bcopy(elm, &new_node->elms[0], (node->base.count - split + 1) * esize); - new_node->base.count = node->base.count - split; - new_node->base.parent = parent_offset; - new_node->base.type = HAMMER_BTREE_INTERNAL_NODE; - new_node->base.subtype = node->base.subtype; - KKASSERT(node->base.type == new_node->base.type); + ondisk = node->ondisk; + elm = &ondisk->elms[split]; + bcopy(elm, &new_node->ondisk->elms[0], + (ondisk->count - split + 1) * esize); + new_node->ondisk->count = ondisk->count - split; + new_node->ondisk->parent = parent->node_offset; + new_node->ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; + KKASSERT(ondisk->type == new_node->ondisk->type); /* * Cleanup the original node. P becomes the new boundary, its * subtree_offset was moved to the new node. 
If we had created * a new root its parent pointer may have changed. */ - node->base.parent = parent_offset; - elm->subtree_offset = 0; + elm->internal.subtree_offset = 0; /* * Insert the separator into the parent, fixup the parent's @@ -1265,27 +913,33 @@ btree_split_internal(hammer_btree_cursor_t cursor) * * Remember that base.count does not include the right-hand boundary. */ - parent_index = cursor->parent_index; - parent->elms[parent_index].subtree_count = split; - parent_elm = &parent->elms[parent_index+1]; + ondisk = parent->ondisk; + ondisk->elms[parent_index].internal.subtree_count = split; + parent_elm = &ondisk->elms[parent_index+1]; bcopy(parent_elm, parent_elm + 1, - (parent->base.count - parent_index) * esize); - parent_elm->base = elm->base; /* separator P */ - parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_node); - parent_elm->subtree_count = new_node->base.count; - - hammer_modify_buffer(new_buffer); - hammer_modify_buffer(cursor->parent_buffer); - hammer_modify_buffer(cursor->node_buffer); + (ondisk->count - parent_index) * esize); + parent_elm->internal.base = elm->base; /* separator P */ + parent_elm->internal.subtree_offset = new_node->node_offset; + parent_elm->internal.subtree_count = new_node->ondisk->count; /* * The cluster's root pointer may have to be updated. */ if (made_root) { - cursor->cluster->ondisk->clu_btree_root = node->base.parent; - hammer_modify_cluster(cursor->cluster); + node->cluster->ondisk->clu_btree_root = parent->node_offset; + hammer_modify_cluster(node->cluster); + node->ondisk->parent = parent->node_offset; + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + } + cursor->parent = parent; /* lock'd and ref'd */ } + hammer_modify_node(node); + hammer_modify_node(new_node); + hammer_modify_node(parent); + /* * Ok, now adjust the cursor depending on which element the original * index was pointing at. If we are >= the split point the push node @@ -1294,14 +948,22 @@ btree_split_internal(hammer_btree_cursor_t cursor) * NOTE: If we are at the split point itself we cannot stay with the * original node because the push index will point at the right-hand * boundary, which is illegal. + * + * NOTE: The cursor's parent or parent_index must be adjusted for + * the case where a new parent (new root) was created, and the case + * where the cursor is now pointing at the split node. 
*/ if (cursor->index >= split) { + cursor->parent_index = parent_index + 1; cursor->index -= split; - cursor->node = (hammer_btree_node_t)new_node; - hammer_put_buffer(cursor->node_buffer, 0); - cursor->node_buffer = new_buffer; + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = new_node; /* locked and ref'd */ + } else { + cursor->parent_index = parent_index; + hammer_unlock(&new_node->lock); + hammer_rel_node(new_node); } - return (0); } @@ -1310,29 +972,28 @@ btree_split_internal(hammer_btree_cursor_t cursor) */ static int -btree_split_leaf(hammer_btree_cursor_t cursor) +btree_split_leaf(hammer_cursor_t cursor) { - struct hammer_btree_internal_node *parent; - struct hammer_btree_leaf_node *leaf; - struct hammer_btree_leaf_node *new_leaf; - union hammer_btree_leaf_elm *elm; - struct hammer_btree_internal_elm *parent_elm; - struct hammer_buffer *new_buffer; - int32_t parent_offset; + hammer_node_ondisk_t ondisk; + hammer_node_t parent; + hammer_node_t leaf; + hammer_node_t new_leaf; + hammer_btree_elm_t elm; + hammer_btree_elm_t parent_elm; int parent_index; int made_root; int split; int error; - const size_t esize = sizeof(struct hammer_btree_internal_elm); + const size_t esize = sizeof(*elm); /* - * We are splitting but elms[split] will be promoted to the parent, - * leaving the right hand node with one less element. If the - * insertion point will be on the left-hand side adjust the split - * point to give the right hand side one additional node. + * Calculate the split point. If the insertion point will be on + * the left-hand side adjust the split point to give the right + * hand side one additional node. */ - leaf = &cursor->node->leaf; - split = (leaf->base.count + 1) / 2; + leaf = cursor->node; + ondisk = leaf->ondisk; + split = (ondisk->count + 1) / 2; if (cursor->index <= split) --split; error = 0; @@ -1342,26 +1003,26 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * 1 element and split normally. Avoid making major modifications * until we know the whole operation will work. */ - parent = cursor->parent; - if (parent == NULL) { - parent = hammer_alloc_btree(cursor->cluster, &error, - &cursor->parent_buffer); + if (ondisk->parent == 0) { + parent = hammer_alloc_btree(leaf->cluster, &error); if (parent == NULL) return(error); - parent->base.count = 1; - parent->base.parent = 0; - parent->base.type = HAMMER_BTREE_INTERNAL_NODE; - parent->base.subtype = leaf->base.type; - parent->elms[0].base = cursor->cluster->ondisk->clu_btree_beg; - parent->elms[0].subtree_offset = - hammer_bclu_offset(cursor->node_buffer, leaf); - parent->elms[1].base = cursor->cluster->ondisk->clu_btree_end; + hammer_lock_ex(&parent->lock); + ondisk = parent->ondisk; + ondisk->count = 1; + ondisk->parent = 0; + ondisk->type = HAMMER_BTREE_TYPE_INTERNAL; + ondisk->elms[0].base = leaf->cluster->clu_btree_beg; + ondisk->elms[0].internal.subtree_type = leaf->ondisk->type; + ondisk->elms[0].internal.subtree_offset = leaf->node_offset; + ondisk->elms[1].base = leaf->cluster->clu_btree_end; made_root = 1; - cursor->parent_index = 0; /* insertion point in parent */ + parent_index = 0; /* insertion point in parent */ } else { made_root = 0; + parent = cursor->parent; + parent_index = cursor->parent_index; } - parent_offset = hammer_bclu_offset(cursor->parent_buffer, parent); /* * Split leaf into new_leaf at the split point. 
Select a separator @@ -1375,33 +1036,35 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * / \ * L L L L L L L L */ - new_buffer = NULL; - new_leaf = hammer_alloc_btree(cursor->cluster, &error, &new_buffer); + new_leaf = hammer_alloc_btree(leaf->cluster, &error); if (new_leaf == NULL) { - if (made_root) - hammer_free_btree_ptr(cursor->parent_buffer, - (hammer_btree_node_t)parent); + if (made_root) { + hammer_unlock(&parent->lock); + hammer_free_btree(leaf->cluster, parent->node_offset); + hammer_rel_node(parent); + } return(error); } + hammer_lock_ex(&new_leaf->lock); /* * Create the new node. P become the left-hand boundary in the * new node. Copy the right-hand boundary as well. */ - elm = &leaf->elms[split]; - bcopy(elm, &new_leaf->elms[0], (leaf->base.count - split) * esize); - new_leaf->base.count = leaf->base.count - split; - new_leaf->base.parent = parent_offset; - new_leaf->base.type = HAMMER_BTREE_LEAF_NODE; - new_leaf->base.subtype = 0; - KKASSERT(leaf->base.type == new_leaf->base.type); + ondisk = leaf->ondisk; + elm = &ondisk->elms[split]; + bcopy(elm, &new_leaf->ondisk->elms[0], (ondisk->count - split) * esize); + new_leaf->ondisk->count = ondisk->count - split; + new_leaf->ondisk->parent = parent->node_offset; + new_leaf->ondisk->type = HAMMER_BTREE_TYPE_LEAF; + KKASSERT(ondisk->type == new_leaf->ondisk->type); /* - * Cleanup the original node. P becomes the new boundary, its - * subtree_offset was moved to the new node. If we had created - * a new root its parent pointer may have changed. + * Cleanup the original node. Because this is a leaf node and + * leaf nodes do not have a right-hand boundary, there + * aren't any special edge cases to clean up. */ - leaf->base.parent = parent_offset; + /* nothing to do */ /* * Insert the separator into the parent, fixup the parent's @@ -1411,29 +1074,35 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * Remember that base.count does not include the right-hand boundary. * We are copying parent_index+1 to parent_index+2, not +0 to +1. */ - parent_index = cursor->parent_index; - parent->elms[parent_index].subtree_count = split; - parent_elm = &parent->elms[parent_index+1]; - if (parent_index + 1 != parent->base.count) { + ondisk = parent->ondisk; + ondisk->elms[parent_index].internal.subtree_count = split; + parent_elm = &ondisk->elms[parent_index+1]; + if (parent_index + 1 != ondisk->count) { bcopy(parent_elm, parent_elm + 1, - (parent->base.count - parent_index - 1) * esize); + (ondisk->count - parent_index - 1) * esize); } hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base); - parent_elm->subtree_offset = hammer_bclu_offset(new_buffer, new_leaf); - parent_elm->subtree_count = new_leaf->base.count; - - hammer_modify_buffer(new_buffer); - hammer_modify_buffer(cursor->parent_buffer); - hammer_modify_buffer(cursor->node_buffer); + parent_elm->internal.subtree_offset = new_leaf->node_offset; + parent_elm->internal.subtree_count = new_leaf->ondisk->count; /* * The cluster's root pointer may have to be updated. 
*/ if (made_root) { - cursor->cluster->ondisk->clu_btree_root = leaf->base.parent; - hammer_modify_cluster(cursor->cluster); + leaf->cluster->ondisk->clu_btree_root = parent->node_offset; + hammer_modify_cluster(leaf->cluster); + leaf->ondisk->parent = parent->node_offset; + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + } + cursor->parent = parent; /* lock'd and ref'd */ } + hammer_modify_node(leaf); + hammer_modify_node(new_leaf); + hammer_modify_node(parent); + /* * Ok, now adjust the cursor depending on which element the original * index was pointing at. If we are >= the split point the push node @@ -1444,58 +1113,149 @@ btree_split_leaf(hammer_btree_cursor_t cursor) * boundary, which is illegal. */ if (cursor->index >= split) { + cursor->parent_index = parent_index + 1; cursor->index -= split; - cursor->node = (hammer_btree_node_t)new_leaf; - hammer_put_buffer(cursor->node_buffer, 0); - cursor->node_buffer = new_buffer; + cursor->node = new_leaf; + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = new_leaf; + } else { + cursor->parent_index = parent_index; + hammer_unlock(&new_leaf->lock); + hammer_rel_node(new_leaf); } - return (0); } /* - * This routine is called on an internal node prior to recursing down - * through (node, index) when (node, index) MIGHT have too few elements for - * the caller to perform a deletion. + * Remove the element at (node, index). If the internal node would become + * empty passively recurse up the tree. + * + * A locked internal node is passed to this function, the node remains + * locked on return. Leaf nodes cannot be passed to this function. + * + * Returns EAGAIN if we were unable to acquire the needed locks. The caller + * does not deal with the empty leaf until determines whether this recursion + * has succeeded or not. + */ +int +btree_remove(hammer_node_t node, int index) +{ + hammer_node_ondisk_t ondisk; + hammer_node_t parent; + int error; + + ondisk = node->ondisk; + KKASSERT(ondisk->count > 0); + + /* + * Remove the element, shifting remaining elements left one. + * Note that our move must include the right-boundary element. + */ + if (ondisk->count != 1) { + bcopy(&ondisk->elms[index+1], &ondisk->elms[index], + (ondisk->count - index) * sizeof(ondisk->elms[0])); + --ondisk->count; + hammer_modify_node(node); + return(0); + } + + /* + * Internal nodes cannot drop to 0 elements, so remove the node + * from ITS parent. If the node is the root node, convert it to + * an empty leaf node (which can drop to 0 elements). + */ + if (ondisk->parent == 0) { + ondisk->count = 0; + ondisk->type = HAMMER_BTREE_TYPE_LEAF; + hammer_modify_node(node); + return(0); + } + + /* + * Try to remove the node from its parent. Return EAGAIN if we + * cannot. 
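The EAGAIN convention above is a classic deadlock-avoidance pattern: the child is already locked, so the parent may only be acquired with a non-blocking attempt, and failure is reported upward instead of waited on. A generic sketch of the pattern using POSIX mutexes follows; the names are illustrative, not HAMMER's lock API.

#include <errno.h>
#include <pthread.h>

struct tnode {
        pthread_mutex_t lock;
        struct tnode *parent;
};

/*
 * Called with 'node' locked. Blocking on the parent here could deadlock
 * against a top-down traversal that already holds the parent and is
 * waiting on this child, so only a try-lock is permitted.
 */
static int
lock_parent_try(struct tnode *node, struct tnode **parentp)
{
        struct tnode *parent = node->parent;

        if (parent == NULL)
                return (ENOENT);
        if (pthread_mutex_trylock(&parent->lock) != 0)
                return (EAGAIN);        /* caller backs off and retries */
        *parentp = parent;
        return (0);
}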
+ */ + parent = hammer_get_node(node->cluster, ondisk->parent, &error); + if (hammer_lock_ex_try(&parent->lock)) { + hammer_rel_node(parent); + return(EAGAIN); + } + ondisk = parent->ondisk; + for (index = 0; index < ondisk->count; ++index) { + if (ondisk->elms[index].internal.subtree_offset == + node->node_offset) { + break; + } + } + if (index == ondisk->count) { + kprintf("btree_remove: lost parent linkage to node\n"); + error = EIO; + } else { + error = btree_remove(parent, index); + if (error == 0) { + hammer_free_btree(node->cluster, node->node_offset); + /* NOTE: node can be reallocated at any time now */ + } + } + hammer_unlock(&parent->lock); + hammer_rel_node(parent); + return (error); +} + +#if 0 + +/* + * This routine is called on the internal node (node) prior to recursing down + * through (node, index) when the node referenced by (node, index) MIGHT + * have too few elements for the caller to perform a deletion. * * cursor->index is invalid on return because the separators may have gotten * adjusted, the caller must rescan the node's elements. The caller may set * cursor->index to -1 if it wants us to do a general rebalancing. + * + * This routine rebalances the children of the (node), collapsing children + * together if possible. On return each child will have at least L/2-1 + * elements unless the node only has one child. * * NOTE: Because we do not update the parent's parent in the split code, * the subtree_count used by the caller may be incorrect. We correct it * here. Also note that we cannot change the depth of the tree's leaf * nodes here (see btree_collapse()). * - * This routine rebalances the children of the node, collapsing children - * together if possible. On return each child will have at least L/2-1 - * elements unless the node only has one child. + * NOTE: We make no attempt to rebalance inter-cluster elements. */ static int -btree_rebalance_node(hammer_btree_cursor_t cursor) +btree_rebalance(hammer_cursor_t cursor) { - struct hammer_btree_internal_node *node; - hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS]; - struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS]; - hammer_btree_node_t child; - hammer_btree_elm_inmemory_t elms; + hammer_node_ondisk_t ondisk; + hammer_node_t node; + hammer_node_t children[HAMMER_BTREE_INT_ELMS]; + hammer_node_t child; + hammer_btree_elm_t elm; + hammer_btree_elm_t elms; int i, j, n, nelms, goal; int maxelms, halfelms; int error; /* - * Basic setup + * If the elm being recursed through is an inter-cluster reference, + * don't worry about it. */ - node = &cursor->node->internal; - KKASSERT(node->elms[cursor->index].subtree_offset != 0); + ondisk = cursor->node->ondisk; + elm = &ondisk->elms[cursor->index]; + if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER) + return(0); + + KKASSERT(elm->internal.subtree_offset != 0); error = 0; /* * Load the children of node and do any necessary corrections - * to subtree_count. subtree_count may be incorrect due to the + * to subtree_count. subtree_count may be too low due to the * way insertions split nodes. Get a count of the total number - * of elements held by our children. + * of actual elements held by our children. 
*/ error = 0; @@ -1509,10 +1269,11 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) continue; child = hammer_bread(cursor->cluster, elm->subtree_offset, HAMMER_FSBUF_BTREE, &error, - &child_buffer[i]); + &child_buffer[i], XXX); children[i] = child; if (child == NULL) continue; + XXX KKASSERT(node->base.subtype == child->base.type); /* @@ -1536,7 +1297,7 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) child = children[i]; for (j = 0; j < child->base.count; ++j) { elms[n].owner = child; - if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) + if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) elms[n].u.leaf = child->leaf.elms[j]; else elms[n].u.internal = child->internal.elms[j]; @@ -1572,7 +1333,7 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) child = children[i]; for (j = 0; j < goal && n < nelms; ++j) { - if (node->base.subtype == HAMMER_BTREE_LEAF_NODE) { + if (node->base.subtype == HAMMER_BTREE_TYPE_LEAF) { child->leaf.elms[j] = elms[n].u.leaf; } else { child->internal.elms[j] = elms[n].u.internal; @@ -1584,12 +1345,12 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) * expensive. */ if (elms[n].owner != child && - node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) { + node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) { subchild = hammer_bread(cursor->cluster, elms[n].u.internal.subtree_offset, HAMMER_FSBUF_BTREE, &error, - &subchild_buffer); + &subchild_buffer, XXX); if (subchild) { subchild->base.parent = hammer_bclu_offset(child_buffer[i], @@ -1603,7 +1364,7 @@ btree_rebalance_node(hammer_btree_cursor_t cursor) /* * Set right boundary if the children are internal nodes. */ - if (node->base.subtype == HAMMER_BTREE_INTERNAL_NODE) + if (node->base.subtype == HAMMER_BTREE_TYPE_INTERNAL) child->internal.elms[j] = elms[n].u.internal; child->base.count = j; hammer_modify_buffer(child_buffer[i]); @@ -1665,10 +1426,10 @@ failed: */ static int -btree_collapse(hammer_btree_cursor_t cursor) +btree_collapse(hammer_cursor_t cursor) { - hammer_btree_node_t root, child; - hammer_btree_node_t children[HAMMER_BTREE_INT_ELMS]; + hammer_btree_node_ondisk_t root, child; + hammer_btree_node_ondisk_t children[HAMMER_BTREE_INT_ELMS]; struct hammer_buffer *child_buffer[HAMMER_BTREE_INT_ELMS]; int count; int i, j, n; @@ -1690,7 +1451,7 @@ btree_collapse(hammer_btree_cursor_t cursor) * children. */ KKASSERT(root->base.parent == 0); /* must be root node */ - KKASSERT(root->base.type == HAMMER_BTREE_INTERNAL_NODE); + KKASSERT(root->base.type == HAMMER_BTREE_TYPE_INTERNAL); KKASSERT(count <= HAMMER_BTREE_INT_ELMS); for (i = n = 0; i < count; ++i) { @@ -1716,7 +1477,7 @@ btree_collapse(hammer_btree_cursor_t cursor) continue; child = hammer_bread(cursor->cluster, elm->subtree_offset, HAMMER_FSBUF_BTREE, &error, - &child_buffer[i]); + &child_buffer[i], XXX); children[i] = child; if (child == NULL) continue; @@ -1745,7 +1506,7 @@ btree_collapse(hammer_btree_cursor_t cursor) * element's boundaries should match the root's left and right * boundaries. 
*/ - if (root->base.subtype == HAMMER_BTREE_LEAF_NODE) { + if (root->base.subtype == HAMMER_BTREE_TYPE_LEAF) { for (i = n = 0; i < count; ++i) { child = children[i]; for (j = 0; j < child->base.count; ++j) { @@ -1787,7 +1548,8 @@ btree_collapse(hammer_btree_cursor_t cursor) elm->subtree_offset, HAMMER_FSBUF_BTREE, &error, - &subchild_buffer); + &subchild_buffer, + XXX); if (subchild) { subchild->base.parent = root_offset; hammer_modify_buffer(subchild_buffer); @@ -1799,7 +1561,7 @@ btree_collapse(hammer_btree_cursor_t cursor) /* XXX generate a new separator? */ root->internal.elms[n] = child->internal.elms[j]; } - root->base.type = HAMMER_BTREE_INTERNAL_NODE; + root->base.type = HAMMER_BTREE_TYPE_INTERNAL; root->base.subtype = subsubtype; if (subchild_buffer) hammer_put_buffer(subchild_buffer, 0); @@ -1819,3 +1581,143 @@ done: return(error); } +#endif + +/************************************************************************ + * MISCELLANIOUS SUPPORT * + ************************************************************************/ + +/* + * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp). + * + * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c. + * + * Note that key1 and key2 are treated differently. key1 is allowed to + * wildcard some of its fields by setting them to 0, while key2 is expected + * to be in an on-disk form (no wildcards). + */ +int +hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2) +{ +#if 0 + kprintf("compare obj_id %016llx %016llx\n", + key1->obj_id, key2->obj_id); + kprintf("compare rec_type %04x %04x\n", + key1->rec_type, key2->rec_type); + kprintf("compare key %016llx %016llx\n", + key1->key, key2->key); +#endif + + /* + * A key1->obj_id of 0 matches any object id + */ + if (key1->obj_id) { + if (key1->obj_id < key2->obj_id) + return(-4); + if (key1->obj_id > key2->obj_id) + return(4); + } + + /* + * A key1->rec_type of 0 matches any record type. + */ + if (key1->rec_type) { + if (key1->rec_type < key2->rec_type) + return(-3); + if (key1->rec_type > key2->rec_type) + return(3); + } + + /* + * There is no special case for key. 0 means 0. + */ + if (key1->key < key2->key) + return(-2); + if (key1->key > key2->key) + return(2); + + /* + * This test has a number of special cases. create_tid in key1 is + * the as-of transction id, and delete_tid in key1 is NOT USED. + * + * A key1->create_tid of 0 matches any record regardles of when + * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be + * used to search for the most current state of the object. + * + * key2->create_tid is a HAMMER record and will never be + * 0. key2->delete_tid is the deletion transaction id or 0 if + * the record has not yet been deleted. + */ + if (key1->create_tid) { + if (key1->create_tid < key2->create_tid) + return(-1); + if (key2->delete_tid && key1->create_tid >= key2->delete_tid) + return(1); + } + + return(0); +} + +/* + * Create a separator half way inbetween key1 and key2. For fields just + * one unit apart, the separator will match key2. + * + * The handling of delete_tid is a little confusing. It is only possible + * to have one record in the B-Tree where all fields match except delete_tid. + * This means, worse case, two adjacent elements may have a create_tid that + * is one-apart and cause the separator to choose the right-hand element's + * create_tid. e.g. (create,delete): (1,x)(2,x) -> separator is (2,x). 
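Concretely, each separator field is the midpoint rounded up toward the right-hand key, which is what the MAKE_SEPARATOR macro below computes. A standalone restatement, for illustration only:

static long long
separator_field(long long k1, long long k2)
{
        /* midpoint between k1 and k2, rounded up toward k2 */
        return (k1 + ((k2 - k1 + 1) >> 1));
}
/*
 * separator_field(10, 20) == 15; separator_field(1, 2) == 2, i.e. fields
 * one unit apart yield the right-hand value, as noted above.
 */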
+ * + * So all we have to do is set delete_tid to the right-hand element to + * guarentee that the separator is properly between the two elements. + */ +#define MAKE_SEPARATOR(key1, key2, dest, field) \ + dest->field = key1->field + ((key2->field - key1->field + 1) >> 1); + +static void +hammer_make_separator(hammer_base_elm_t key1, hammer_base_elm_t key2, + hammer_base_elm_t dest) +{ + bzero(dest, sizeof(*dest)); + MAKE_SEPARATOR(key1, key2, dest, obj_id); + MAKE_SEPARATOR(key1, key2, dest, rec_type); + MAKE_SEPARATOR(key1, key2, dest, key); + MAKE_SEPARATOR(key1, key2, dest, create_tid); + dest->delete_tid = key2->delete_tid; +} + +#undef MAKE_SEPARATOR + +/* + * Return whether a generic internal or leaf node is full + */ +static int +btree_node_is_full(hammer_node_ondisk_t node) +{ + switch(node->type) { + case HAMMER_BTREE_TYPE_INTERNAL: + if (node->count == HAMMER_BTREE_INT_ELMS) + return(1); + break; + case HAMMER_BTREE_TYPE_LEAF: + if (node->count == HAMMER_BTREE_LEAF_ELMS) + return(1); + break; + default: + panic("illegal btree subtype"); + } + return(0); +} + +#if 0 +static int +btree_max_elements(u_int8_t type) +{ + if (type == HAMMER_BTREE_TYPE_LEAF) + return(HAMMER_BTREE_LEAF_ELMS); + if (type == HAMMER_BTREE_TYPE_INTERNAL) + return(HAMMER_BTREE_INT_ELMS); + panic("btree_max_elements: bad type %d\n", type); +} +#endif + diff --git a/sys/vfs/hammer/hammer_btree.h b/sys/vfs/hammer/hammer_btree.h index 2120d678d4..8c39a21823 100644 --- a/sys/vfs/hammer/hammer_btree.h +++ b/sys/vfs/hammer/hammer_btree.h @@ -31,57 +31,65 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.4 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.5 2007/11/19 00:53:40 dillon Exp $ */ /* - * HAMMER BH-Tree index + * HAMMER B-Tree index * - * HAMMER implements a modified B+Tree. In all documentation this will - * simply be refered to as the HAMMER BH-Tree. Basically la BH-Tree - * looks like a B+Tree (A B-Tree which stores its records only at the leafs - * of the tree), but adds two additional boundary elements which describe - * the left-most and right-most element a node is able to represent. In - * otherwords, we have boundary elements at the two ends of a BH-Tree node - * instead of sub-tree pointers. + * HAMMER implements a modified B+Tree. B+Trees store records only + * at their leaves and HAMMER's modification is to adjust the internal + * elements so there is a boundary element on each side instead of sub-tree + * pointers. * - * A BH-Tree internal node looks like this: + * We just call our modified B+Tree a 'B-Tree' in HAMMER documentation to + * reduce confusion. + * + * A B-Tree internal node looks like this: * * B N N N N N N B <-- boundary and internal elements * S S S S S S S <-- subtree pointers * - * A BH-Tree leaf node looks like this: + * A B-Tree leaf node looks like this: * * L L L L L L L L <-- leaf elemenets * (there is also a previous and next-leaf pointer) * - * The recursion radix is reduced by 2 relative to a normal B-Tree but - * as a bonus we treat internal nodes completely differently from leaf nodes, - * which allows us to pack in more elements. + * The recursion radix of an internal node is reduced by 1 relative to + * a normal B-Tree in order to accomodate the right-hand boundary. 
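Because every internal node carries an explicit left and right boundary, a node can decide locally whether a key can possibly live underneath it; this is what lets searches start from a cached node rather than the root, as the next paragraph describes. A minimal sketch of that containment test, assuming a simplified node with integer bounds rather than HAMMER's on-disk layout:

struct bounds_node {
        long long left_key;             /* inclusive left boundary */
        long long right_key;            /* non-inclusive right boundary */
};

/*
 * A search started at a cached node only has to cursor up toward the
 * root while this predicate is false for its starting key.
 */
static int
node_covers(const struct bounds_node *node, long long key)
{
        return (key >= node->left_key && key < node->right_key);
}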
+ * + * The big benefit to using a B-Tree with built-in bounds information is + * that it makes it possible to cache pointers into the middle of the tree + * and not have to start searches, insertions, OR deletions at the root node. + * The boundary elements allow searches to progress in a definitive direction + * from any point in the tree without revisting nodes. This greatly improves + * the efficiency of many operations, most especially record appends. * - * The big benefit to using a BH-Tree is that it is possible to cache - * pointers into the middle of the tree and not have to start searches, - * insertions, OR deletions at the root node. In particular, searches are - * able to progress in a definitive direction from any point in the tree - * without revisting nodes. This greatly improves the efficiency of many - * operations, most especially record appends. + * HAMMER B-Trees are per-cluster. The global multi-cluster B-Tree is + * constructed by allowing internal nodes to link to the roots of other + * clusters. Fields in the cluster header then reference back to its + * parent and use the cluster generation number to detect stale linkages. * - * BH-Trees also make stacking of trees fairly straightforward. + * The B-Tree balancing code can operate within a cluster or across the + * filesystem's ENTIRE B-Tree super-structure. A cluster's B-Tree root + * can be a leaf node in the worse case. A cluster is guarenteed to have + * sufficient free space to hold a single completely full leaf in the + * degenerate case. * - * Most of the structures below are on-disk structures. + * All of the structures below are on-disk structures. */ /* - * Common base for all B+Tree element types (40 bytes) + * Common base for all B-Tree element types (40 bytes) * - * Note that obj_type is set to the object type the record represents - * if an inode, directory entry, or a cluster range. A cluster range is + * obj_type is set to the object type the record represents if an inode, + * directory entry, or an inter-cluster reference. A cluster range is * special in that the B-Tree nodes represent a range within the B-Tree * inclusive of rec_type field, so obj_type must be used to detect the * cluster range entries. * - * The special field is used as a subtree pointer for internal nodes, - * and reserved for leaf nodes. + * subtree_type is only used by internal B-Tree elements and indicates + * whether we are referencing a cluster, a leaf, or an internal node. */ struct hammer_base_elm { int64_t obj_id; /* 00 object record is associated with */ @@ -90,9 +98,9 @@ struct hammer_base_elm { hammer_tid_t create_tid; /* 10 transaction id for record creation */ hammer_tid_t delete_tid; /* 18 transaction id for record update/del */ - u_int16_t rec_type; /* 20 */ - u_int8_t obj_type; /* 22 (special) */ - u_int8_t reserved06; /* 23 (special) */ + u_int16_t rec_type; /* 20 _RECTYPE_ */ + u_int8_t obj_type; /* 22 _OBJTYPE_ (restricted) */ + u_int8_t subtree_type; /* 23 B-Tree element type */ int32_t reserved07; /* 24 (future) */ /* 28 */ }; @@ -100,23 +108,37 @@ struct hammer_base_elm { typedef struct hammer_base_elm *hammer_base_elm_t; /* - * Internal element - 48 bytes. Internal elements are independantly - * organized. Note that internal elements are a different size than - * leaf elements. + * Internal element (40 + 16 = 56 bytes). + * + * An internal element contains the left-hand boundary and a recursion to + * another B-Tree node. 
If rec_offset is 0 it points to another node in the + * current cluster at subtree_offset. Otherwise it points to the root + * of another cluster at volno/cluid. + * + * sub-cluster references have an associated record while intra-cluster + * references do not. + * + * subtree_count is the number of elements in an intra-cluster reference. + * For inter-cluster references subtree_count is chaining depth heuristic + * used to help balance the B-Tree globally. */ struct hammer_btree_internal_elm { struct hammer_base_elm base; - int32_t subtree_offset; - int8_t subtree_count; /* hint: can be too small, but not too big */ - int8_t reserved01; - int8_t reserved02; - int8_t reserved03; + int32_t rec_offset; /* 0 indicates internal reference */ + int32_t subtree_offset; /* offset or cluster id */ + int32_t subtree_volno; /* unused or volume number */ + int32_t subtree_count; /* hint: can be too small, but not too big */ }; +#define subtree_cluid subtree_offset +#define subtree_type base.subtree_type + /* - * Leaf B-Tree element (40 + 16 = 56 bytes) + * Leaf B-Tree element (40 + 16 = 56 bytes). + * + * A leaf */ -struct hammer_btree_record_elm { +struct hammer_btree_leaf_elm { struct hammer_base_elm base; int32_t rec_offset; int32_t data_offset; @@ -124,48 +146,39 @@ struct hammer_btree_record_elm { u_int32_t data_crc; }; -/* - * Leaf Cluster-reference B-Tree element (40 + 16 = 56 bytes) - */ -struct hammer_btree_cluster_elm { - struct hammer_base_elm base; - int32_t rec_offset; /* cluster recursion record */ - u_int32_t verifier; /* low 32 bits of target clu_id */ - int32_t vol_no; - int32_t clu_no; -}; - /* * Rollup btree leaf element types - 56 byte structure */ -union hammer_btree_leaf_elm { +union hammer_btree_elm { struct hammer_base_elm base; - struct hammer_btree_record_elm record; - struct hammer_btree_cluster_elm cluster; + struct hammer_btree_leaf_elm leaf; + struct hammer_btree_internal_elm internal; }; -typedef union hammer_btree_leaf_elm *hammer_btree_leaf_elm_t; +typedef union hammer_btree_elm *hammer_btree_elm_t; /* - * B-Tree node (normal or meta) - 56 + 56 * 8 = 504 bytes + * B-Tree node (normal or meta) - 24 + 56 * 14 = 808 bytes (8-byte aligned) * - * 32 B-Tree nodes fit in a 16K filesystem buffer. Remember, the filesystem - * buffer has a 128 byte header so (16384 - 128) / 32 = 508, but we want - * our structures to be 8-byte aligned so we use 504. + * 20 B-Tree nodes fit in a 16K filesystem buffer, leaving us room for + * the 128 byte filesystem buffer header and another 96 bytes of filler. * - * We store B-Tree records as elements in internal nodes, which means we - * must have a separate index of sub-tree pointers. The sub-tree pointers - * interlace the elements (that is, the elements act as separators). Thus - * we have to have one more pointer then we do elements so we can represent - * the gamut - the sub-tree to the left of the first element AND the sub-tree - * to the right of the last element. + * Each node contains 14 elements. The last element for an internal node + * is the right-boundary so internal nodes have one fewer logical elements + * then leaf nodes. * - * subcount represents the number of elements in the subnode (1-8) + * 'count' always refers to the number of elements and is non-inclusive of + * the right-hand boundary for an internal node. * - * 'count' always refers to the number of elements. + * NOTE: The node head for an internal does not contain the subtype + * (The B-Tree node type for the nodes referenced by its elements). 
+ * Instead, each element specifies the subtype (elm->base.subtype). + * This allows us to maintain an unbalanced B-Tree and to easily identify + * special inter-cluster link elements. */ -#define HAMMER_BTREE_LEAF_ELMS 8 -#define HAMMER_BTREE_INT_ELMS 9 /* +1 more for the right boundary */ +#define HAMMER_BTREE_LEAF_ELMS 14 +#define HAMMER_BTREE_INT_ELMS (HAMMER_BTREE_LEAF_ELMS - 1) +#define HAMMER_BTREE_NODES 20 /* * It is safe to combine two adjacent nodes if the total number of elements @@ -174,126 +187,44 @@ typedef union hammer_btree_leaf_elm *hammer_btree_leaf_elm_t; #define HAMMER_BTREE_LEAF_FILL (HAMMER_BTREE_LEAF_ELMS - 3) #define HAMMER_BTREE_INT_FILL (HAMMER_BTREE_INT_ELMS - 3) -#define HAMMER_BTREE_INTERNAL_NODE ((u_int32_t)'I') -#define HAMMER_BTREE_LEAF_NODE ((u_int32_t)'L') +#define HAMMER_BTREE_TYPE_INTERNAL ((u_int8_t)'I') +#define HAMMER_BTREE_TYPE_LEAF ((u_int8_t)'L') +#define HAMMER_BTREE_TYPE_CLUSTER ((u_int8_t)'C') -struct hammer_base_node { +struct hammer_node_ondisk { + /* + * B-Tree node header (24 bytes) + */ int32_t count; - int32_t parent; + int32_t parent; /* 0 if at root of cluster */ u_int8_t type; - u_int8_t subtype; + u_int8_t reserved01; u_int16_t reserved02; - int32_t reserved03; -}; - -struct hammer_btree_internal_node { - /* - * B-Tree internal node header (24 bytes) - */ - struct hammer_base_node base; - int32_t reserved[2]; + int32_t reserved03; /* future heuristic */ + int32_t reserved04; /* future link_left */ + int32_t reserved05; /* future link_right */ /* - * Internal element array. The subtree fields are logically to - * the right of the element key. These fields are unused and - * must be 0 for the right-hand boundary element. - * - * The left-hand boundary element is at elms[0], the right-hand - * boundary element is at elms[count]. count does not include - * the right-hand boundary (so count at the termination of a - * standard for() loop will point at the right-hand boundary). + * Element array. Internal nodes have one less logical element + * (meaning: the same number of physical elements) in order to + * accomodate the right-hand boundary. The left-hand boundary + * is integrated into the first element. Leaf nodes have no + * boundary elements. */ - struct hammer_btree_internal_elm elms[HAMMER_BTREE_INT_ELMS+1]; + union hammer_btree_elm elms[HAMMER_BTREE_LEAF_ELMS]; }; -/* - * The hammer leaf node contains a smaller header - */ -struct hammer_btree_leaf_node { - /* - * B-Tree leaf node header - */ - struct hammer_base_node base; - int32_t link_left; - int32_t link_right; - int32_t reserved[8]; - - /* - * Leaf element array - */ - union hammer_btree_leaf_elm elms[HAMMER_BTREE_LEAF_ELMS]; -}; - -union hammer_btree_node { - struct hammer_base_node base; - struct hammer_btree_internal_node internal; - struct hammer_btree_leaf_node leaf; -}; - -typedef union hammer_btree_node *hammer_btree_node_t; - -/* - * This in-memory structure is used by btree_rebalance_node(). Note - * that internal elements are a different size than record and cluster - * elements. 
- */ -struct hammer_btree_elm_inmemory { - hammer_btree_node_t owner; - union { - struct hammer_base_elm base; - struct hammer_btree_internal_elm internal; - union hammer_btree_leaf_elm leaf; - } u; -}; - -typedef struct hammer_btree_elm_inmemory *hammer_btree_elm_inmemory_t; - +typedef struct hammer_node_ondisk *hammer_node_ondisk_t; /* - * B-Tree filesystem buffer + * B-Tree filesystem buffer (16K exactly) */ -#define HAMMER_BTREE_NODES \ - ((HAMMER_BUFSIZE - sizeof(struct hammer_fsbuf_head)) / \ - sizeof(union hammer_btree_node)) - struct hammer_fsbuf_btree { - struct hammer_fsbuf_head head; - char unused[128]; - union hammer_btree_node nodes[HAMMER_BTREE_NODES]; -}; - -/* - * Key and caching structures used for HAMMER B-Tree lookups. These are - * in-memory structures. - */ -struct hammer_btree_cursor { - hammer_base_elm_t left_bound; - hammer_base_elm_t right_bound; - struct hammer_cluster *cluster; - struct hammer_buffer *parent_buffer; - struct hammer_btree_internal_node *parent; - struct hammer_buffer *node_buffer; - hammer_btree_node_t node; /* internal or leaf node */ - int index; - int parent_index; - int last_error; + struct hammer_fsbuf_head head; /* 128 */ + char filler[96]; + struct hammer_node_ondisk nodes[HAMMER_BTREE_NODES]; }; -typedef struct hammer_btree_cursor *hammer_btree_cursor_t; - -struct hammer_btree_info { - struct hammer_btree_cursor cursor; - struct hammer_buffer *data_buffer; - struct hammer_buffer *record_buffer; - union hammer_data_ondisk *data; /* returned data pointer */ - union hammer_record_ondisk *rec;/* returned record pointer */ -}; - -typedef struct hammer_btree_info *hammer_btree_info_t; - -#define HAMMER_BTREE_GET_RECORD 0x0001 -#define HAMMER_BTREE_GET_DATA 0x0002 -#define HAMMER_BTREE_CLUSTER_TAG 0x0004 /* stop at the cluster tag */ -#define HAMMER_BTREE_INSERT 0x0008 /* adjust for insert */ -#define HAMMER_BTREE_DELETE 0x0010 /* adjust for delete */ - +#if HAMMER_BTREE_NODES * (HAMMER_BTREE_LEAF_ELMS * 56 + 24) + 128 + 96 != 16384 +#error "Sanity check hammer_fsbuf_btree" +#endif diff --git a/sys/vfs/hammer/hammer_cursor.c b/sys/vfs/hammer/hammer_cursor.c new file mode 100644 index 0000000000..861be29fe2 --- /dev/null +++ b/sys/vfs/hammer/hammer_cursor.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2007 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.1 2007/11/19 00:53:40 dillon Exp $ + */ + +/* + * HAMMER B-Tree index - cursor support routines + */ +#include "hammer.h" + +static int hammer_load_cursor_parent(hammer_cursor_t cursor); +static int hammer_load_cursor_parent_local(hammer_cursor_t cursor); +static int hammer_load_cursor_parent_cluster(hammer_cursor_t cursor); + +/* + * Initialize a fresh cursor at the root of the filesystem hierarchy. + * + * cursor->parent will be NULL on return since we are loading the root + * node of the root cluster. + */ +int +hammer_init_cursor_hmp(hammer_cursor_t cursor, hammer_mount_t hmp) +{ + hammer_cluster_t cluster; + int error; + + bzero(cursor, sizeof(*cursor)); + cluster = hammer_get_root_cluster(hmp, &error); + if (error == 0) { + cursor->node = hammer_get_node(cluster, + cluster->ondisk->clu_btree_root, + &error); + hammer_rel_cluster(cluster, 0); + if (error == 0) + hammer_lock_ex(&cursor->node->lock); + } + if (error == 0) + error = hammer_load_cursor_parent(cursor); + return(error); +} + +/* + * Initialize a fresh cursor using the B-Tree node cached by the + * in-memory inode. + */ +int +hammer_init_cursor_ip(hammer_cursor_t cursor, hammer_inode_t ip) +{ + int error; + + if (ip->cache) { + bzero(cursor, sizeof(*cursor)); + cursor->node = ip->cache; + error = hammer_ref_node(cursor->node); + if (error == 0) { + hammer_lock_ex(&cursor->node->lock); + error = hammer_load_cursor_parent(cursor); + } else { + hammer_rel_node(cursor->node); + cursor->node = NULL; + } + } else { + error = hammer_init_cursor_hmp(cursor, ip->hmp); + } + return(error); +} + +/* + * We are finished with a cursor. We NULL out various fields as sanity + * check, in case the structure is inappropriately used afterwords. + */ +void +hammer_done_cursor(hammer_cursor_t cursor) +{ + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + cursor->parent = NULL; + } + if (cursor->node) { + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = NULL; + } + if (cursor->data_buffer) { + hammer_rel_buffer(cursor->data_buffer, 0); + cursor->data_buffer = NULL; + } + if (cursor->record_buffer) { + hammer_rel_buffer(cursor->record_buffer, 0); + cursor->record_buffer = NULL; + } + cursor->data = NULL; + cursor->record = NULL; + cursor->left_bound = NULL; + cursor->right_bound = NULL; +} + +/* + * Load the parent of cursor->node into cursor->parent. There are several + * cases. (1) The parent is in the current cluster. (2) We are at the + * root of the cluster and the parent is in another cluster. (3) We are at + * the root of the root cluster. 
+ */ +static +int +hammer_load_cursor_parent(hammer_cursor_t cursor) +{ + hammer_cluster_t cluster; + int error; + + cluster = cursor->node->cluster; + + if (cursor->node->ondisk->parent) { + error = hammer_load_cursor_parent_local(cursor); + } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0) { + error = hammer_load_cursor_parent_cluster(cursor); + } else { + cursor->parent = NULL; + cursor->parent_index = 0; + cursor->left_bound = &cluster->ondisk->clu_btree_beg; + cursor->right_bound = &cluster->ondisk->clu_btree_end; + error = 0; + } + return(error); +} + +static +int +hammer_load_cursor_parent_local(hammer_cursor_t cursor) +{ + hammer_node_t node; + hammer_node_t parent; + hammer_btree_elm_t elm; + int error; + int i; + + node = cursor->node; + parent = hammer_get_node(node->cluster, node->ondisk->parent, &error); + if (error) + return(error); + elm = NULL; + for (i = 0; i < parent->ondisk->count; ++i) { + elm = &parent->ondisk->elms[i]; + if (parent->ondisk->elms[i].internal.subtree_offset == + node->node_offset) { + break; + } + } + KKASSERT(i != parent->ondisk->count); + KKASSERT(parent->ondisk->elms[i].internal.rec_offset == 0); + cursor->parent = parent; + cursor->parent_index = i; + cursor->left_bound = &elm[0].internal.base; + cursor->right_bound = &elm[1].internal.base; + + if (hammer_lock_ex_try(&parent->lock) != 0) { + hammer_unlock(&cursor->node->lock); + hammer_lock_ex(&parent->lock); + hammer_lock_ex(&cursor->node->lock); + /* XXX check node generation count */ + } + return(error); +} + +static +int +hammer_load_cursor_parent_cluster(hammer_cursor_t cursor) +{ + hammer_cluster_ondisk_t ondisk; + hammer_cluster_t cluster; + hammer_volume_t volume; + hammer_node_t node; + hammer_node_t parent; + hammer_btree_elm_t elm; + int32_t clu_no; + int32_t vol_no; + int error; + int i; + + node = cursor->node; + ondisk = node->cluster->ondisk; + vol_no = ondisk->clu_btree_parent_vol_no; + clu_no = ondisk->clu_btree_parent_clu_no; + + /* + * Acquire the node from (volume, cluster, offset) + */ + volume = hammer_get_volume(node->cluster->volume->hmp, vol_no, &error); + if (error) + return (error); + cluster = hammer_get_cluster(volume, clu_no, &error, 0); + hammer_rel_volume(volume, 0); + if (error) + return (error); + parent = hammer_get_node(cluster, ondisk->clu_btree_parent_offset, + &error); + hammer_rel_cluster(cluster, 0); + if (error) + return (error); + + /* + * Ok, we have the node. Locate the inter-cluster element + */ + elm = NULL; + for (i = 0; i < parent->ondisk->count; ++i) { + elm = &parent->ondisk->elms[i]; + if (elm->internal.rec_offset != 0 && + elm->internal.subtree_cluid == clu_no) { + break; + } + } + KKASSERT(i != parent->ondisk->count); + cursor->parent = parent; + cursor->parent_index = i; + cursor->left_bound = &elm[0].internal.base; + cursor->right_bound = &elm[1].internal.base; + + KKASSERT(hammer_btree_cmp(cursor->left_bound, + &parent->cluster->ondisk->clu_btree_beg) == 0); + KKASSERT(hammer_btree_cmp(cursor->right_bound, + &parent->cluster->ondisk->clu_btree_end) == 0); + + if (hammer_lock_ex_try(&parent->lock) != 0) { + hammer_unlock(&cursor->node->lock); + hammer_lock_ex(&parent->lock); + hammer_lock_ex(&cursor->node->lock); + /* XXX check node generation count */ + } + return(0); +} + + +/* + * Cursor up to our parent node. Return ENOENT if we are at the root of + * the root cluster. + */ +int +hammer_cursor_up(hammer_cursor_t cursor) +{ + int error; + + /* + * Set the node to its parent. 
If the parent is NULL we are at + * the root of the root cluster and return ENOENT. + */ + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->node = cursor->parent; + cursor->index = cursor->parent_index; + cursor->parent = NULL; + cursor->parent_index = 0; + + if (cursor->node == NULL) { + error = ENOENT; + } else { + error = hammer_load_cursor_parent(cursor); + } + return(error); +} + +/* + * Set the cursor to the root of the current cursor's cluster. + */ +int +hammer_cursor_toroot(hammer_cursor_t cursor) +{ + hammer_cluster_t cluster; + int error; + + /* + * Already at root of cluster? + */ + if (cursor->node->ondisk->parent == 0) + return (0); + + /* + * Parent is root of cluster? + */ + if (cursor->parent->ondisk->parent == 0) + return (hammer_cursor_up(cursor)); + + /* + * Ok, reload the cursor with the root of the cluster, then + * locate its parent. + */ + cluster = cursor->node->cluster; + error = hammer_ref_cluster(cluster); + if (error) + return (error); + + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + hammer_unlock(&cursor->node->lock); + hammer_rel_node(cursor->node); + cursor->parent = NULL; + cursor->parent_index = 0; + + cursor->node = hammer_get_node(cluster, cluster->ondisk->clu_btree_root, + &error); + cursor->index = 0; + hammer_rel_cluster(cluster, 0); + if (error == 0) + error = hammer_load_cursor_parent(cursor); + return(error); +} + +/* + * Cursor down through the current node, which must be an internal node. + * + * This routine adjusts the cursor and sets index to 0. + */ +int +hammer_cursor_down(hammer_cursor_t cursor) +{ + hammer_node_t node; + hammer_btree_elm_t elm; + hammer_volume_t volume; + hammer_cluster_t cluster; + int32_t vol_no; + int32_t clu_no; + int error; + + /* + * The current node becomes the current parent + */ + node = cursor->node; + KKASSERT(node->ondisk->type == HAMMER_BTREE_TYPE_INTERNAL); + KKASSERT(cursor->index >= 0 && cursor->index < node->ondisk->count); + if (cursor->parent) { + hammer_unlock(&cursor->parent->lock); + hammer_rel_node(cursor->parent); + } + cursor->parent = node; + cursor->parent_index = cursor->index; + cursor->node = NULL; + cursor->index = 0; + + /* + * Extract element to push into at (node,index) + */ + elm = &node->ondisk->elms[cursor->parent_index]; + + /* + * Ok, push down into elm. If rec_offset is non-zero this is + * an inter-cluster push, otherwise it is a intra-cluster push. + */ + if (elm->internal.rec_offset) { + vol_no = elm->internal.subtree_volno; + clu_no = elm->internal.subtree_cluid; + volume = hammer_get_volume(node->cluster->volume->hmp, + vol_no, &error); + if (error) + return(error); + cluster = hammer_get_cluster(volume, clu_no, &error, 0); + hammer_rel_volume(volume, 0); + if (error) + return(error); + node = hammer_get_node(cluster, + cluster->ondisk->clu_btree_root, + &error); + hammer_rel_cluster(cluster, 0); + } else { + node = hammer_get_node(node->cluster, + elm->internal.subtree_offset, + &error); + } + if (error == 0) { + hammer_lock_ex(&node->lock); + cursor->node = node; + cursor->index = 0; + } + return(error); +} + diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h new file mode 100644 index 0000000000..db84c61f7a --- /dev/null +++ b/sys/vfs/hammer/hammer_cursor.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2007 The DragonFly Project. All rights reserved. 
+ * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.1 2007/11/19 00:53:40 dillon Exp $ + */ + +/* + * The hammer_cursor structure is the primary in-memory management structure + * for B-Tree operations. + * + * The most important issue to make note of is that a hammer_cursor is a + * tracking structure. Any active hammer_cursor structure will be linked into + * hammer_node based lists and B-Tree operations executed by unrelated + * treads MAY MODIFY YOUR CURSOR when you are not holding an exclusive + * lock on the cursor's nodes. + * + * Most operations maintain an exclusive lock on their node and + * parent node, but certain cursor operations may temporarily release + * one or both locks. In particular, a cursor_up operation may have + * to temporarily release underlying locks to avoid a deadlock. + */ +struct hammer_cursor { + /* + * Parent B-Tree node, current B-Tree node, and related element + * indices. + */ + hammer_node_t parent; + int parent_index; + hammer_node_t node; + int index; + + /* + * Pointer to the current node's bounds. Typically points to the + * appropriate boundary elements in the parent or points to bounds + * stored in the cluster. The right-boundary is range-exclusive. + */ + hammer_base_elm_t left_bound; + hammer_base_elm_t right_bound; + + /* + * Key or key range governing search + */ + struct hammer_base_elm key_beg; + struct hammer_base_elm key_end; + + /* + * Related data and record references. Note that the related buffers + * can be NULL when data and/or record is not, typically indicating + * information referenced via an in-memory record. 
+ */ + struct hammer_buffer *data_buffer; /* extracted data & record */ + union hammer_data_ondisk *data; + struct hammer_buffer *record_buffer; + union hammer_record_ondisk *record; + + /* + * Iteration and extraction control variables + */ + int flags; + int last_error; + + /* + * Merged in-memory/on-disk iterations also use these fields. + */ + struct hammer_inode *ip; + struct hammer_record *iprec; +}; + +typedef struct hammer_cursor *hammer_cursor_t; + +#define HAMMER_BTREE_GET_RECORD 0x0001 +#define HAMMER_BTREE_GET_DATA 0x0002 +#define HAMMER_BTREE_CLUSTER_TAG 0x0004 /* stop at the cluster tag */ +#define HAMMER_BTREE_INSERT 0x0008 /* adjust for insert */ +#define HAMMER_BTREE_DELETE 0x0010 /* adjust for delete */ + diff --git a/sys/vfs/hammer/hammer_disk.h b/sys/vfs/hammer/hammer_disk.h index 6663428bfe..58a5c6b07c 100644 --- a/sys/vfs/hammer/hammer_disk.h +++ b/sys/vfs/hammer/hammer_disk.h @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.5 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_disk.h,v 1.6 2007/11/19 00:53:40 dillon Exp $ */ #ifndef _SYS_UUID_H_ @@ -202,6 +202,8 @@ struct hammer_volume_ondisk { u_int32_t vol0_bitmap[1024]; }; +typedef struct hammer_volume_ondisk *hammer_volume_ondisk_t; + #define HAMMER_VOLF_VALID 0x0001 /* valid entry */ #define HAMMER_VOLF_OPEN 0x0002 /* volume is open */ #define HAMMER_VOLF_USINGSUPERCL 0x0004 /* using superclusters */ @@ -231,6 +233,8 @@ struct hammer_supercl_ondisk { struct hammer_almeta scl_meta[HAMMER_SUPERCL_METAELMS]; }; +typedef struct hammer_supercl_ondisk *hammer_supercl_ondisk_t; + /* * HAMMER Cluster header * @@ -272,10 +276,8 @@ struct hammer_cluster_ondisk { uuid_t vol_fsid; /* identify filesystem - sanity check */ uuid_t vol_fstype; /* identify filesystem type - sanity check */ - u_int64_t clu_gen; /* identify generation number of cluster */ - u_int64_t clu_unused01; - hammer_tid_t clu_id; /* unique cluster self identification */ + hammer_tid_t clu_gen; /* generation number */ int32_t vol_no; /* cluster contained in volume (sanity) */ u_int32_t clu_flags; /* cluster flags */ @@ -296,11 +298,10 @@ struct hammer_cluster_ondisk { /* * Specify the range of information stored in this cluster as two - * btree elements. These elements exist as separate records that - * point to us in the parent cluster's B-Tree. - * - * Note that clu_btree_end is range-inclusive, not range-exclusive. - * i.e. 0-1023 instead of 0,1024. + * btree elements. These elements match the left and right + * boundary elements in the internal B-Tree node of the parent + * cluster that points to the root of our cluster. Because these + * are boundary elements, the right boundary is range-NONinclusive. */ struct hammer_base_elm clu_btree_beg; struct hammer_base_elm clu_btree_end; @@ -308,12 +309,16 @@ struct hammer_cluster_ondisk { /* * The cluster's B-Tree root can change as a side effect of insertion * and deletion operations so store an offset instead of embedding - * the root node. + * the root node. The parent_offset is stale if the generation number + * does not match. + * + * Parent linkages are explicit. 
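Taken together with hammer_cursor.c above, the intended calling pattern for
the new cursor is short.  The sketch below mirrors what hammer_get_inode()
does later in this patch; hmp, obj_id and asof_tid stand in for values the
caller already has, and error handling is abbreviated:

	struct hammer_cursor cursor;
	int error;

	/*
	 * Point the cursor at the root of the filesystem hierarchy and
	 * describe the key we are looking for.
	 */
	error = hammer_init_cursor_hmp(&cursor, hmp);
	cursor.key_beg.obj_id = obj_id;
	cursor.key_beg.key = 0;
	cursor.key_beg.create_tid = asof_tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
	cursor.key_beg.obj_type = 0;
	cursor.flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA;

	if (error == 0)
		error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		/* cursor.record and cursor.data now point at the result */
	}

	/*
	 * Always release the cursor; it drops whatever node locks, node
	 * refs and buffer refs it is still holding.
	 */
	hammer_done_cursor(&cursor);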
*/ int32_t clu_btree_root; int32_t clu_btree_parent_vol_no; int32_t clu_btree_parent_clu_no; - hammer_tid_t clu_btree_parent_clu_id; + int32_t clu_btree_parent_offset; + hammer_tid_t clu_btree_parent_clu_gen; u_int64_t synchronized_rec_id; @@ -323,6 +328,8 @@ struct hammer_cluster_ondisk { struct hammer_almeta clu_mdata_meta[HAMMER_CLU_SLAVE_METAELMS]; }; +typedef struct hammer_cluster_ondisk *hammer_cluster_ondisk_t; + /* * HAMMER records are 96 byte entities encoded into 16K filesystem buffers. * Each record has a 64 byte header and a 32 byte extension. 170 records @@ -370,12 +377,23 @@ struct hammer_base_record { * targets and even though they are built within a HAMMER filesystem they * get their own obj_id space (and thus can serve as a replication target) * and look like a mount point to the system. + * + * Inter-cluster records are special-cased in the B-Tree. These records + * are referenced from a B-Tree INTERNAL node, NOT A LEAF. This means + * that the element in the B-Tree node is actually a boundary element whos + * base element fields, including rec_type, reflect the boundary, NOT + * the inter-cluster record type. + * + * HAMMER_RECTYPE_CLUSTER - only set in the actual inter-cluster record, + * not set in the left or right boundary elements around the inter-cluster + * reference of an internal node in the B-Tree (because doing so would + * interfere with the boundary tests). */ #define HAMMER_RECTYPE_UNKNOWN 0 #define HAMMER_RECTYPE_LOWEST 1 /* lowest record type avail */ #define HAMMER_RECTYPE_INODE 1 /* inode in obj_id space */ #define HAMMER_RECTYPE_PSEUDO_INODE 2 /* pseudo filesysem */ -#define HAMMER_RECTYPE_CLUSTER 3 /* cluster reference */ +#define HAMMER_RECTYPE_CLUSTER 3 /* inter-cluster reference */ #define HAMMER_RECTYPE_DATA 0x10 #define HAMMER_RECTYPE_DIRENTRY 0x11 #define HAMMER_RECTYPE_DB 0x12 @@ -391,10 +409,6 @@ struct hammer_base_record { #define HAMMER_OBJTYPE_SOFTLINK 7 #define HAMMER_OBJTYPE_PSEUDOFS 8 /* pseudo filesystem obj */ -#define HAMMER_OBJTYPE_CLUSTER_FLAG 0x20 -#define HAMMER_OBJTYPE_CLUSTER_BEG 0x20 -#define HAMMER_OBJTYPE_CLUSTER_END 0x21 - /* * Generic full-sized record */ @@ -570,6 +584,7 @@ struct hammer_inode_data { u_int64_t parent_obj_id;/* parent directory obj_id */ uuid_t uid; uuid_t gid; + /* XXX device, softlink extension */ }; #define HAMMER_INODE_DATA_VERSION 1 diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index adfdc7eb39..0ac3bd062e 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ #include "hammer.h" @@ -63,10 +63,6 @@ hammer_vop_reclaim(struct vop_reclaim_args *ap) /* * Obtain a vnode for the specified inode number. An exclusively locked * vnode is returned. - * - * To avoid deadlocks we cannot hold the inode lock while we are creating - * a new vnode. We can prevent the inode from going away, however. If - * we race another vget we just throw away our newly created vnode. 
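The parent linkage in a child cluster now carries both the parent node offset
and the parent cluster's generation, so a stale back-pointer can be detected
instead of trusted.  This patch does not implement the test yet (note the XXX
generation-check comments in hammer_cursor.c); the helper below only sketches
the intended comparison and its name is invented:

/*
 * Hypothetical staleness test, not part of this patch.  The child recorded
 * clu_btree_parent_clu_gen when the linkage was made; if the parent cluster
 * has since been regenerated, clu_btree_parent_offset can no longer be
 * trusted and the linkage must be re-resolved.
 */
static __inline int
hammer_parent_link_stale(hammer_cluster_ondisk_t child,
			 hammer_cluster_ondisk_t parent)
{
	return (child->clu_btree_parent_clu_gen != parent->clu_gen);
}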
*/ int hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) @@ -85,9 +81,8 @@ hammer_vfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp) *vpp = NULL; return(error); } - hammer_lock_to_ref(&ip->lock); error = hammer_get_vnode(ip, LK_EXCLUSIVE, vpp); - hammer_put_inode_ref(ip); + hammer_rel_inode(ip); return (error); } @@ -107,42 +102,48 @@ hammer_get_vnode(struct hammer_inode *ip, int lktype, struct vnode **vpp) error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0); if (error) break; - if (ip->vp == NULL) { - vp = *vpp; - ip->vp = vp; - vp->v_type = hammer_get_vnode_type( - ip->ino_rec.base.base.obj_type); - vp->v_data = (void *)ip; - /* vnode locked by getnewvnode() */ - break; - } - vp->v_type = VBAD; - vx_put(vp); - } else { - /* - * loop if the vget fails (aka races), or if the vp - * no longer matches ip->vp. - */ - if (vget(vp, LK_EXCLUSIVE) == 0) { - if (vp == ip->vp) - break; - vput(vp); + hammer_lock_ex(&ip->lock); + if (ip->vp != NULL) { + hammer_unlock(&ip->lock); + vp->v_type = VBAD; + vx_put(vp); + continue; } + hammer_ref(&ip->lock); + vp = *vpp; + ip->vp = vp; + vp->v_type = hammer_get_vnode_type( + ip->ino_rec.base.base.obj_type); + vp->v_data = (void *)ip; + /* vnode locked by getnewvnode() */ + /* make related vnode dirty if inode dirty? */ + hammer_unlock(&ip->lock); + break; + } + + /* + * loop if the vget fails (aka races), or if the vp + * no longer matches ip->vp. + */ + if (vget(vp, LK_EXCLUSIVE) == 0) { + if (vp == ip->vp) + break; + vput(vp); } } return(error); } /* - * Get and lock a HAMMER inode. These functions do not attach or detach - * the related vnode. + * Acquire a HAMMER inode. The returned inode is not locked. These functions + * do not attach or detach the related vnode (use hammer_get_vnode() for + * that). */ struct hammer_inode * hammer_get_inode(struct hammer_mount *hmp, u_int64_t obj_id, int *errorp) { - struct hammer_btree_info binfo; struct hammer_inode_info iinfo; - struct hammer_base_elm key; + struct hammer_cursor cursor; struct hammer_inode *ip; /* @@ -154,7 +155,7 @@ hammer_get_inode(struct hammer_mount *hmp, u_int64_t obj_id, int *errorp) loop: ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo); if (ip) { - hammer_lock(&ip->lock); + hammer_ref(&ip->lock); *errorp = 0; return(ip); } @@ -163,21 +164,24 @@ loop: ip->obj_id = obj_id; ip->obj_asof = iinfo.obj_asof; ip->hmp = hmp; + RB_INIT(&ip->rec_tree); /* + * Locate the on-disk inode. * If we do not have an inode cached search the HAMMER on-disk B-Tree * for it. */ - hammer_btree_info_init(&binfo, hmp->rootcl); - key.obj_id = ip->obj_id; - key.key = 0; - key.create_tid = iinfo.obj_asof; - key.delete_tid = 0; - key.rec_type = HAMMER_RECTYPE_INODE; - key.obj_type = 0; - *errorp = hammer_btree_lookup(&binfo, &key, HAMMER_BTREE_GET_RECORD | - HAMMER_BTREE_GET_DATA); + hammer_init_cursor_hmp(&cursor, hmp); + cursor.key_beg.obj_id = ip->obj_id; + cursor.key_beg.key = 0; + cursor.key_beg.create_tid = iinfo.obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE; + cursor.key_beg.obj_type = 0; + cursor.flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA; + + *errorp = hammer_btree_lookup(&cursor); /* * On success the B-Tree lookup will hold the appropriate @@ -185,10 +189,11 @@ loop: * information. Copy the information to the in-memory inode. 
*/ if (*errorp == 0) { - ip->ino_rec = binfo.rec->inode; - ip->ino_data = binfo.data->inode; + ip->ino_rec = cursor.record->inode; + ip->ino_data = cursor.data->inode; } - hammer_btree_info_done(&binfo); + hammer_cache_node(cursor.node, &ip->cache); + hammer_done_cursor(&cursor); /* * On success load the inode's record and data and insert the @@ -196,7 +201,10 @@ loop: * insertion of the same inode so deal with that condition too. */ if (*errorp == 0) { + hammer_ref(&ip->lock); if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) { + hammer_uncache_node(&ip->cache); + hammer_unref(&ip->lock); kfree(ip, M_HAMMER); goto loop; } @@ -207,39 +215,93 @@ loop: return (ip); } -void -hammer_lock_inode(struct hammer_inode *ip) +/* + * Create a new filesystem object, returning the inode in *ipp. The + * returned inode will be referenced but not locked. + * + * The inode is created in-memory and will be delay-synchronized to the + * disk. + */ +int +hammer_create_inode(struct hammer_transaction *trans, struct vattr *vap, + struct ucred *cred, struct hammer_inode *dip, + struct hammer_inode **ipp) { - hammer_lock(&ip->lock); -} + struct hammer_mount *hmp; + struct hammer_inode *ip; -void -hammer_put_inode(struct hammer_inode *ip) -{ - hammer_unlock(&ip->lock); + hmp = trans->hmp; + ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO); + ip->obj_id = ++hmp->last_ino; + KKASSERT(ip->obj_id != 0); + ip->obj_asof = HAMMER_MAX_TID; /* XXX */ + ip->hmp = hmp; + ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY | + HAMMER_INODE_ITIMES; + ip->last_tid = trans->tid; + + RB_INIT(&ip->rec_tree); + + ip->ino_rec.ino_atime = trans->tid; + ip->ino_rec.ino_mtime = trans->tid; + ip->ino_rec.ino_size = 0; + ip->ino_rec.ino_nlinks = 0; + /* XXX */ + ip->ino_rec.base.rec_id = ++hmp->rootvol->ondisk->vol0_recid; + hammer_modify_volume(hmp->rootvol); + KKASSERT(ip->ino_rec.base.rec_id != 0); + ip->ino_rec.base.base.obj_id = ip->obj_id; + ip->ino_rec.base.base.key = 0; + ip->ino_rec.base.base.create_tid = trans->tid; + ip->ino_rec.base.base.delete_tid = 0; + ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE; + ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type); + + ip->ino_data.version = HAMMER_INODE_DATA_VERSION; + ip->ino_data.mode = vap->va_mode; + ip->ino_data.ctime = trans->tid; + ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0; + if (vap->va_vaflags & VA_UID_UUID_VALID) + ip->ino_data.uid = vap->va_uid_uuid; + else + hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid); + if (vap->va_vaflags & VA_GID_UUID_VALID) + ip->ino_data.gid = vap->va_gid_uuid; + else + hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid); + + hammer_ref(&ip->lock); + if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) { + hammer_unref(&ip->lock); + panic("hammer_create_inode: duplicate obj_id"); + } + *ipp = ip; + return(0); } void -hammer_put_inode_ref(struct hammer_inode *ip) +hammer_rel_inode(struct hammer_inode *ip) { + /* XXX check last ref */ hammer_unref(&ip->lock); } /* + * Unload and destroy the specified inode. 
+ * * (called via RB_SCAN) */ int hammer_unload_inode(struct hammer_inode *ip, void *data __unused) { - struct vnode *vp; - KKASSERT(ip->lock.refs == 0); - if ((vp = ip->vp) != NULL) { - ip->vp = NULL; - vp->v_data = NULL; - /* XXX */ - } + KKASSERT(ip->vp == NULL); + hammer_ref(&ip->lock); RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip); + + hammer_uncache_node(&ip->cache); + + /* XXX flush */ kfree(ip, M_HAMMER); return(0); } @@ -253,25 +315,61 @@ hammer_modify_inode(struct hammer_transaction *trans, struct hammer_inode *ip, int flags) { ip->flags |= flags; + ip->last_tid = trans->tid; +} + +/************************************************************************ + * HAMMER INODE MERGED-RECORD FUNCTIONS * + ************************************************************************ + * + * These functions augment the B-Tree scanning functions in hammer_btree.c + * by merging in-memory records with on-disk records. + */ + +hammer_record_ondisk_t +hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip) +{ + KKASSERT(0); + return(NULL); +} + +hammer_record_ondisk_t +hammer_ip_next(hammer_cursor_t cursor) +{ + KKASSERT(0); + return(NULL); +} + +int +hammer_ip_resolve_data(hammer_cursor_t cursor) +{ KKASSERT(0); + return(NULL); } /* * Access the filesystem buffer containing the cluster-relative byte * offset, validate the buffer type, load *bufferp and return a - * pointer to the requested data. + * pointer to the requested data. The buffer is reference and locked on + * return. * * If buf_type is 0 the buffer is assumed to be a pure-data buffer and * no type or crc check is performed. * + * If *bufferp is not NULL on entry it is assumed to contain a locked + * and referenced buffer which will then be replaced. + * + * If the caller is holding another unrelated buffer locked it must be + * passed in reorderbuf so we can properly order buffer locks. + * * XXX add a flag for the buffer type and check the CRC here XXX */ void * -hammer_bread(struct hammer_cluster *cluster, int32_t cloff, - u_int64_t buf_type, - int *errorp, struct hammer_buffer **bufferp) +hammer_bread(hammer_cluster_t cluster, int32_t cloff, + u_int64_t buf_type, int *errorp, + struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; int32_t buf_no; int32_t buf_off; @@ -282,16 +380,19 @@ hammer_bread(struct hammer_cluster *cluster, int32_t cloff, buffer = *bufferp; if (buffer == NULL || buffer->cluster != cluster || buffer->buf_no != buf_no) { - if (buffer) - hammer_put_buffer(buffer, 0); + if (buffer) { + hammer_unlock(&buffer->io.lock); + hammer_rel_buffer(buffer, 0); + } buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); *bufferp = buffer; if (buffer == NULL) return(NULL); + hammer_lock_ex(&buffer->io.lock); } /* - * Validate the buffer type and crc XXX + * Validate the buffer type */ buf_off = cloff & HAMMER_BUFMASK; if (buf_type) { @@ -306,7 +407,6 @@ hammer_bread(struct hammer_cluster *cluster, int32_t cloff, *errorp = EIO; return(NULL); } - /* XXX crc */ } /* diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 48db524450..5addc7f4a6 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
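The merged-record functions above are left as KKASSERT(0) stubs for now.  For
orientation only, the eventual iteration will presumably interleave the
inode's in-memory rec_tree with the on-disk B-Tree scan along these lines;
this is a sketch, not the implementation, and it glosses over locking,
deletions and iteration state:

/*
 * Illustrative sketch of an eventual hammer_ip_first(): pick whichever of
 * the first in-memory record and the first on-disk record sorts lower, so
 * the caller sees a single merged, key-ordered stream.
 */
static hammer_record_ondisk_t
hammer_ip_first_sketch(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	hammer_record_t memrec;
	hammer_record_ondisk_t diskrec;

	memrec = RB_MIN(hammer_rec_rb_tree, &ip->rec_tree);
	diskrec = (hammer_btree_lookup(cursor) == 0) ? cursor->record : NULL;

	if (memrec == NULL)
		return (diskrec);
	if (diskrec == NULL ||
	    hammer_btree_cmp(&memrec->rec.base.base, &diskrec->base.base) < 0)
		return (&memrec->rec);
	return (diskrec);
}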
* - * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.1 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.2 2007/11/19 00:53:40 dillon Exp $ */ /* * IO Primitives and buffer cache management @@ -268,7 +268,8 @@ hammer_io_complete(struct buf *bp) /* * Callback from kernel when it wishes to deallocate a passively * associated structure. This can only occur if the buffer is - * passively associated with the structure. + * passively associated with the structure. The kernel has locked + * the buffer. * * If we cannot disassociate we set B_LOCKED to prevent the buffer * from getting reused. @@ -282,26 +283,55 @@ hammer_io_deallocate(struct buf *bp) KKASSERT(io->io.released); crit_enter(); - if (io->io.lock.refs) { - bp->b_flags |= B_LOCKED; - } else { - hammer_lock(&io->io.lock); - KKASSERT(io->io.lock.refs == 1); + + /* + * First, ref the structure to prevent either the buffer or the + * structure from going away. + */ + hammer_ref(&io->io.lock); + + /* + * Buffers can have active references from cached hammer_node's, + * even if those nodes are themselves passively cached. Attempt + * to clean them out. This may not succeed. + */ + if (io->io.type == HAMMER_STRUCTURE_BUFFER && + hammer_lock_ex_try(&io->io.lock) == 0) { + hammer_flush_buffer_nodes(&io->buffer); + hammer_unlock(&io->io.lock); + } + + if (hammer_islastref(&io->io.lock)) { + /* + * If we are the only ref left we can disassociate the + * I/O. It had better still be in a released state because + * the kernel is holding a lock on the buffer. + */ + KKASSERT(io->io.released); + hammer_io_disassociate(io); + bp->b_flags &= ~B_LOCKED; + switch(io->io.type) { case HAMMER_STRUCTURE_VOLUME: - hammer_put_volume(&io->volume, 1); + hammer_rel_volume(&io->volume, 1); break; case HAMMER_STRUCTURE_SUPERCL: - hammer_put_supercl(&io->supercl, 1); + hammer_rel_supercl(&io->supercl, 1); break; case HAMMER_STRUCTURE_CLUSTER: - hammer_put_cluster(&io->cluster, 1); + hammer_rel_cluster(&io->cluster, 1); break; case HAMMER_STRUCTURE_BUFFER: - hammer_put_buffer(&io->buffer, 1); + hammer_rel_buffer(&io->buffer, 1); break; } /* NOTE: io may be invalid (kfree'd) here */ + } else { + /* + * Otherwise tell the kernel not to destroy the buffer + */ + bp->b_flags |= B_LOCKED; + hammer_unlock(&io->io.lock); } crit_exit(); } @@ -356,6 +386,17 @@ hammer_io_checkwrite(struct buf *bp) } } +/* + * Return non-zero if the caller should flush the structure associated + * with this io sub-structure. + */ +int +hammer_io_checkflush(struct hammer_io *io) +{ + if (io->bp == NULL || (io->bp->b_flags & B_LOCKED)) + return(1); + return(0); +} /* * Return non-zero if we wish to delay the kernel's attempt to flush diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index 9ee4c33876..6f8e303dbf 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,41 +31,180 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.1 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.2 2007/11/19 00:53:40 dillon Exp $ */ #include "hammer.h" +static int hammer_add_record(hammer_transaction_t trans, + hammer_record_t record); + /* - * Allocate a new filesystem object + * Red-black tree support. 
*/ -int -hammer_alloc_inode(struct hammer_transaction *trans, struct vattr *vap, - struct ucred *cred, struct hammer_inode **ipp) +static int +hammer_rec_rb_compare(struct hammer_record *rec1, struct hammer_record *rec2) +{ + if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type) + return(-1); + if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type) + return(1); + + if (rec1->rec.base.base.key < rec2->rec.base.base.key) + return(-1); + if (rec1->rec.base.base.key > rec2->rec.base.base.key) + return(1); + + if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid) + return(-1); + if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid) + return(1); + return(0); +} + +static int +hammer_rec_compare(struct hammer_base_elm *info, struct hammer_record *rec) { - return(EINVAL); + /* + * A key1->rec_type of 0 matches any record type. + */ + if (info->rec_type) { + if (info->rec_type < rec->rec.base.base.rec_type) + return(-3); + if (info->rec_type > rec->rec.base.base.rec_type) + return(3); + } + + /* + * There is no special case for key. 0 means 0. + */ + if (info->key < rec->rec.base.base.key) + return(-2); + if (info->key > rec->rec.base.base.key) + return(2); + + /* + * This test has a number of special cases. create_tid in key1 is + * the as-of transction id, and delete_tid in key1 is NOT USED. + * + * A key1->create_tid of 0 matches any record regardles of when + * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be + * used to search for the most current state of the object. + * + * key2->create_tid is a HAMMER record and will never be + * 0. key2->delete_tid is the deletion transaction id or 0 if + * the record has not yet been deleted. + */ + if (info->create_tid) { + if (info->create_tid < rec->rec.base.base.create_tid) + return(-1); + if (rec->rec.base.base.delete_tid && + info->create_tid >= rec->rec.base.base.delete_tid) { + return(1); + } + } + return(0); } +RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare); +RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node, + hammer_rec_compare, hammer_base_elm_t); + +/* + * Add a directory entry (dip,ncp) which references inode (ip). + * + * Note that the low 32 bits of the namekey are set temporarily to create + * a unique in-memory record, and may be modified a second time when the + * record is synchronized to disk. In particular, the low 32 bits cannot be + * all 0's when synching to disk, which is not handled here. 
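The create_tid handling in hammer_rec_compare() gives in-memory records the
same as-of visibility rule the on-disk B-Tree uses: once rec_type and key have
matched, a non-zero as-of tid selects the record when it falls in the
half-open interval [create_tid, delete_tid), with delete_tid == 0 meaning the
record has not been deleted yet.  With invented numbers, a record created at
tid 100 and deleted at tid 200 behaves like this:

/*
 * rec.create_tid = 100, rec.delete_tid = 200
 *
 *	as-of  50 -> -1  (record did not exist yet at tid 50)
 *	as-of 100 ->  0  (match: 100 >= create_tid, 100 < delete_tid)
 *	as-of 199 ->  0  (match)
 *	as-of 200 ->  1  (record was already deleted as of tid 200)
 *	as-of   0 ->  0  (0 is a wildcard and matches any lifetime)
 *
 * Equivalent visibility predicate for a non-zero as-of tid:
 *
 *	visible = asof >= rec.create_tid &&
 *		  (rec.delete_tid == 0 || asof < rec.delete_tid);
 */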
+ */ int hammer_add_directory(struct hammer_transaction *trans, struct hammer_inode *dip, struct namecache *ncp, struct hammer_inode *ip) { - return(EINVAL); + struct hammer_record *record; + int error; + int bytes; + + record = hammer_alloc_ip_record(trans, dip); + + bytes = ncp->nc_nlen + 1; + + record->rec.entry.base.base.obj_id = dip->obj_id; + record->rec.entry.base.base.key = hammer_directory_namekey(ncp->nc_name, bytes - 1); + record->rec.entry.base.base.key += trans->hmp->namekey_iterator++; + record->rec.entry.base.base.create_tid = trans->tid; + record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY; + record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type; + record->rec.entry.base.base.obj_id = ip->obj_id; + if (bytes <= sizeof(record->rec.entry.den_name)) { + record->data = (void *)record->rec.entry.den_name; + } else { + record->data = kmalloc(bytes, M_HAMMER, M_WAITOK); + record->flags |= HAMMER_RECF_ALLOCDATA; + bcopy(ncp->nc_name, record->data, bytes); + } + record->data_len = bytes; + ++dip->ino_rec.ino_nlinks; + hammer_modify_inode(trans, dip, HAMMER_INODE_RDIRTY); + error = hammer_add_record(trans, record); + return(error); +} + +/* + * Allocate a record for the caller to finish filling in + */ +struct hammer_record * +hammer_alloc_ip_record(struct hammer_transaction *trans, hammer_inode_t ip) +{ + hammer_record_t record; + + record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO); + record->last_tid = trans->tid; + record->ip = ip; + return (record); } -#if 0 +/* + * Free a record. Clean the structure up even though we are throwing it + * away as a sanity check. + */ +void +hammer_free_ip_record(struct hammer_record *record) +{ + if (record->flags & HAMMER_RECF_ONRBTREE) { + RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record); + record->flags &= ~HAMMER_RECF_ONRBTREE; + } + if (record->flags & HAMMER_RECF_ALLOCDATA) { + kfree(record->data, M_HAMMER); + record->flags &= ~HAMMER_RECF_ALLOCDATA; + } + record->data = NULL; + kfree(record, M_HAMMER); +} /* - * + * Add the record to the inode's rec_tree. Directory entries */ +static int -hammer_add_record(struct hammer_transaction *trans, - struct hammer_inode *ip, hammer_record_ondisk_t rec, - void *data, int bytes) +hammer_add_record(struct hammer_transaction *trans, hammer_record_t record) { + while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) { + if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){ + hammer_free_ip_record(record); + return (EEXIST); + } + record->rec.base.base.key &= ~(0xFFFFFFFFLL); + record->rec.base.base.key |= trans->hmp->namekey_iterator++; + } + record->flags |= HAMMER_RECF_ONRBTREE; + return(0); } +#if 0 /* * Delete records belonging to the specified range. Deal with edge and * overlap cases. This function sets the delete tid and breaks adds diff --git a/sys/vfs/hammer/hammer_ondisk.c b/sys/vfs/hammer/hammer_ondisk.c index e4e8633416..740cde76b1 100644 --- a/sys/vfs/hammer/hammer_ondisk.c +++ b/sys/vfs/hammer/hammer_ondisk.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ /* * Manage HAMMER's on-disk structures. 
These routines are primarily @@ -45,27 +45,27 @@ #include #include -static void hammer_free_volume(struct hammer_volume *volume); -static struct hammer_cluster * - hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, - int new); +static void hammer_free_volume(hammer_volume_t volume); +static int hammer_load_volume(hammer_volume_t volume); +static int hammer_load_supercl(hammer_supercl_t supercl, int isnew); +static int hammer_load_cluster(hammer_cluster_t cluster, int isnew); +static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type); +static void hammer_remove_node_clist(hammer_buffer_t buffer, + hammer_node_t node); static void initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type); -static void alloc_new_buffer(struct hammer_cluster *cluster, +static void alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live, u_int64_t type, int32_t nelements, int32_t start, int *errorp, struct hammer_buffer **bufferp); #if 0 -static void readhammerbuf(struct hammer_volume *vol, void *data, +static void readhammerbuf(hammer_volume_t vol, void *data, int64_t offset); -static void writehammerbuf(struct hammer_volume *vol, const void *data, +static void writehammerbuf(hammer_volume_t vol, const void *data, int64_t offset); #endif -static int64_t calculate_cluster_offset(struct hammer_volume *vol, - int32_t clu_no); -static int64_t calculate_supercl_offset(struct hammer_volume *vol, - int32_t scl_no); - +static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no); +static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no); struct hammer_alist_config Buf_alist_config; struct hammer_alist_config Vol_normal_alist_config; @@ -78,7 +78,7 @@ struct hammer_alist_config Clu_slave_alist_config; * Red-Black tree support for various structures */ static int -hammer_ino_rb_compare(struct hammer_inode *ip1, struct hammer_inode *ip2) +hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2) { if (ip1->obj_id < ip2->obj_id) return(-1); @@ -92,7 +92,7 @@ hammer_ino_rb_compare(struct hammer_inode *ip1, struct hammer_inode *ip2) } static int -hammer_inode_info_cmp(hammer_inode_info_t info, struct hammer_inode *ip) +hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip) { if (info->obj_id < ip->obj_id) return(-1); @@ -106,7 +106,7 @@ hammer_inode_info_cmp(hammer_inode_info_t info, struct hammer_inode *ip) } static int -hammer_vol_rb_compare(struct hammer_volume *vol1, struct hammer_volume *vol2) +hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2) { if (vol1->vol_no < vol2->vol_no) return(-1); @@ -116,7 +116,7 @@ hammer_vol_rb_compare(struct hammer_volume *vol1, struct hammer_volume *vol2) } static int -hammer_scl_rb_compare(struct hammer_supercl *cl1, struct hammer_supercl *cl2) +hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2) { if (cl1->scl_no < cl2->scl_no) return(-1); @@ -126,7 +126,7 @@ hammer_scl_rb_compare(struct hammer_supercl *cl1, struct hammer_supercl *cl2) } static int -hammer_clu_rb_compare(struct hammer_cluster *cl1, struct hammer_cluster *cl2) +hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2) { if (cl1->clu_no < cl2->clu_no) return(-1); @@ -136,7 +136,7 @@ hammer_clu_rb_compare(struct hammer_cluster *cl1, struct hammer_cluster *cl2) } static int -hammer_buf_rb_compare(struct hammer_buffer *buf1, struct hammer_buffer *buf2) +hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2) { if (buf1->buf_no < buf2->buf_no) return(-1); @@ -145,6 +145,16 @@ 
hammer_buf_rb_compare(struct hammer_buffer *buf1, struct hammer_buffer *buf2) return(0); } +static int +hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2) +{ + if (node1->node_offset < node2->node_offset) + return(-1); + if (node1->node_offset < node2->node_offset) + return(1); + return(0); +} + /* * Note: The lookup function for hammer_ino_rb_tree winds up being named * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info). The other lookup @@ -161,8 +171,13 @@ RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node, hammer_clu_rb_compare, int32_t, clu_no); RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node, hammer_buf_rb_compare, int32_t, buf_no); +RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node, + hammer_nod_rb_compare, int32_t, node_offset); -/* +/************************************************************************ + * VOLUMES * + ************************************************************************ + * * Load a HAMMER volume by name. Returns 0 on success or a positive error * code on failure. Volumes must be loaded at mount time, get_volume() will * not load a new volume. @@ -170,10 +185,10 @@ RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node, * Calls made to hammer_load_volume() or single-threaded */ int -hammer_load_volume(struct hammer_mount *hmp, const char *volname) +hammer_install_volume(struct hammer_mount *hmp, const char *volname) { struct mount *mp; - struct hammer_volume *volume; + hammer_volume_t volume; struct hammer_volume_ondisk *ondisk; struct nlookupdata nd; struct buf *bp = NULL; @@ -257,13 +272,15 @@ hammer_load_volume(struct hammer_mount *hmp, const char *volname) /* * Set the root volume and load the root cluster. HAMMER special * cases rootvol and rootcl and will not deallocate the structures. + * We do not hold a ref because this would prevent related I/O + * from being flushed. */ if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) { hmp->rootvol = volume; hmp->rootcl = hammer_get_cluster(volume, ondisk->vol0_root_clu_no, &error, 0); - hammer_put_cluster(hmp->rootcl, 0); + hammer_rel_cluster(hmp->rootcl, 0); hmp->fsid_udev = dev2udev(vn_todev(volume->devvp)); } late_failure: @@ -282,10 +299,10 @@ late_failure: * so returns -1 on failure. */ int -hammer_unload_volume(struct hammer_volume *volume, void *data __unused) +hammer_unload_volume(hammer_volume_t volume, void *data __unused) { struct hammer_mount *hmp = volume->hmp; - struct hammer_cluster *rootcl; + hammer_cluster_t rootcl; int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0); /* @@ -296,18 +313,17 @@ hammer_unload_volume(struct hammer_volume *volume, void *data __unused) * Clean up the root cluster, which is held unlocked in the root * volume. */ + hammer_ref(&volume->io.lock); if (hmp->rootvol == volume) { - if ((rootcl = hmp->rootcl) != NULL) { - hammer_lock(&rootcl->io.lock); + if ((rootcl = hmp->rootcl) != NULL) hmp->rootcl = NULL; - hammer_put_cluster(rootcl, 1); - } hmp->rootvol = NULL; } /* * Flush the volume */ + KKASSERT(volume->io.lock.refs == 1); hammer_io_release(&volume->io, 1); volume->ondisk = NULL; if (volume->devvp) { @@ -330,7 +346,7 @@ hammer_unload_volume(struct hammer_volume *volume, void *data __unused) static void -hammer_free_volume(struct hammer_volume *volume) +hammer_free_volume(hammer_volume_t volume) { if (volume->vol_name) { kfree(volume->vol_name, M_HAMMER); @@ -346,11 +362,10 @@ hammer_free_volume(struct hammer_volume *volume) /* * Get a HAMMER volume. The volume must already exist. 
*/ -struct hammer_volume * +hammer_volume_t hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) { struct hammer_volume *volume; - struct hammer_volume_ondisk *ondisk; /* * Locate the volume structure @@ -360,18 +375,64 @@ hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) *errorp = ENOENT; return(NULL); } + hammer_ref(&volume->io.lock); /* - * Load the ondisk buffer if necessary. Create a b_dep dependancy - * for the buffer that allows us to manage our structure with the - * buffer (mostly) left in a released state. + * Deal with on-disk info */ - hammer_lock(&volume->io.lock); if (volume->ondisk == NULL) { - *errorp = hammer_io_read(volume->devvp, &volume->io); + *errorp = hammer_load_volume(volume); if (*errorp) { + hammer_rel_volume(volume, 1); + volume = NULL; + } + } else { + *errorp = 0; + } + return(volume); +} + +hammer_volume_t +hammer_get_root_volume(struct hammer_mount *hmp, int *errorp) +{ + hammer_volume_t volume; + + volume = hmp->rootvol; + KKASSERT(volume != NULL); + hammer_ref(&volume->io.lock); + + /* + * Deal with on-disk info + */ + if (volume->ondisk == NULL) { + *errorp = hammer_load_volume(volume); + if (*errorp) { + hammer_rel_volume(volume, 1); + volume = NULL; + } + } else { + *errorp = 0; + } + return (volume); +} + +/* + * Load a volume's on-disk information. The volume must be referenced and + * not locked. We temporarily acquire an exclusive lock to interlock + * against releases or multiple get's. + */ +static int +hammer_load_volume(hammer_volume_t volume) +{ + struct hammer_volume_ondisk *ondisk; + int error; + + hammer_lock_ex(&volume->io.lock); + if (volume->ondisk == NULL) { + error = hammer_io_read(volume->devvp, &volume->io); + if (error) { hammer_unlock(&volume->io.lock); - return(NULL); + return (error); } volume->ondisk = ondisk = (void *)volume->io.bp->b_data; @@ -389,34 +450,44 @@ hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp) volume->alist.info = NULL; } hammer_alist_init(&volume->alist); + } else { + error = 0; } - *errorp = 0; - return(volume); + hammer_unlock(&volume->io.lock); + return(0); } /* - * Unlock and release a volume. - * - * Buffer cache interactions (applies to volumes, superclusters, clusters, - * and buffer structures): - * + * Release a volume. Call hammer_io_release on the last reference. We have + * to acquire an exclusive lock to interlock against volume->ondisk tests + * in hammer_load_volume(). */ void -hammer_put_volume(struct hammer_volume *volume, int flush) +hammer_rel_volume(hammer_volume_t volume, int flush) { if (hammer_islastref(&volume->io.lock)) { - hammer_io_release(&volume->io, flush); - } else { + hammer_lock_ex(&volume->io.lock); + if (hammer_islastref(&volume->io.lock)) { + volume->ondisk = NULL; + hammer_io_release(&volume->io, flush); + } hammer_unlock(&volume->io.lock); } + hammer_unref(&volume->io.lock); } -struct hammer_supercl * -hammer_get_supercl(struct hammer_volume *volume, int32_t scl_no, +/************************************************************************ + * SUPER-CLUSTERS * + ************************************************************************ + * + * Manage super-clusters. Note that a supercl holds a reference to its + * associated volume. 
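From here on the same acquisition shape repeats for volumes, super-clusters,
clusters and buffers: take a reference, and only if the ondisk pointer is
missing call the matching load routine, which grabs the exclusive lock,
re-tests ondisk and performs the I/O.  Reduced to its generic form (xxx is a
stand-in for volume/supercl/cluster/buffer, not a real function; the supercl,
cluster and buffer variants also force a load when creating a new object):

	/* generic shape of hammer_get_xxx() after this patch */
	hammer_ref(&obj->io.lock);
	if (obj->ondisk == NULL) {
		*errorp = hammer_load_xxx(obj);	/* lock_ex, re-test, read */
		if (*errorp) {
			hammer_rel_xxx(obj, 1);
			obj = NULL;
		}
	} else {
		*errorp = 0;
	}
	return (obj);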
+ */ +hammer_supercl_t +hammer_get_supercl(hammer_volume_t volume, int32_t scl_no, int *errorp, int isnew) { - struct hammer_supercl_ondisk *ondisk; - struct hammer_supercl *supercl; + hammer_supercl_t supercl; /* * Locate and lock the super-cluster structure, creating one @@ -430,34 +501,53 @@ again: supercl->volume = volume; supercl->io.offset = calculate_supercl_offset(volume, scl_no); supercl->io.type = HAMMER_STRUCTURE_SUPERCL; - hammer_lock(&supercl->io.lock); + hammer_ref(&supercl->io.lock); /* * Insert the cluster into the RB tree and handle late * collisions. */ if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) { - hammer_unlock(&supercl->io.lock); + hammer_unref(&supercl->io.lock); kfree(supercl, M_HAMMER); goto again; } hammer_ref(&volume->io.lock); } else { - hammer_lock(&supercl->io.lock); + hammer_ref(&supercl->io.lock); } /* - * Load the cluster's on-disk info + * Deal with on-disk info */ - *errorp = 0; + if (supercl->ondisk == NULL || isnew) { + *errorp = hammer_load_supercl(supercl, isnew); + if (*errorp) { + hammer_rel_supercl(supercl, 1); + supercl = NULL; + } + } else { + *errorp = 0; + } + return(supercl); +} + +static int +hammer_load_supercl(hammer_supercl_t supercl, int isnew) +{ + struct hammer_supercl_ondisk *ondisk; + hammer_volume_t volume = supercl->volume; + int error; + + hammer_lock_ex(&supercl->io.lock); if (supercl->ondisk == NULL) { if (isnew) - *errorp = hammer_io_new(volume->devvp, &supercl->io); + error = hammer_io_new(volume->devvp, &supercl->io); else - *errorp = hammer_io_read(volume->devvp, &supercl->io); - if (*errorp) { - hammer_put_supercl(supercl, 1); - return(NULL); + error = hammer_io_read(volume->devvp, &supercl->io); + if (error) { + hammer_unlock(&supercl->io.lock); + return (error); } supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data; @@ -480,47 +570,61 @@ again: hammer_alist_init(&supercl->alist); } } else if (isnew) { - *errorp = hammer_io_new(volume->devvp, &supercl->io); - if (*errorp) { - hammer_put_supercl(supercl, 1); - supercl = NULL; - } + error = hammer_io_new(volume->devvp, &supercl->io); + } else { + error = 0; } - return (supercl); + hammer_unlock(&supercl->io.lock); + return (error); } +/* + * Release a super-cluster. We have to deal with several places where + * another thread can ref the super-cluster. + * + * Only destroy the structure itself if the related buffer cache buffer + * was disassociated from it. This ties the management of the structure + * to the buffer cache subsystem. 
+ */ void -hammer_put_supercl(struct hammer_supercl *supercl, int flush) +hammer_rel_supercl(hammer_supercl_t supercl, int flush) { - struct hammer_volume *volume; + hammer_volume_t volume; if (hammer_islastref(&supercl->io.lock)) { - volume = supercl->volume; - hammer_io_release(&supercl->io, flush); - if (supercl->io.bp == NULL && - hammer_islastref(&supercl->io.lock)) { - RB_REMOVE(hammer_scl_rb_tree, - &volume->rb_scls_root, supercl); - kfree(supercl, M_HAMMER); - if (hammer_islastref(&volume->io.lock)) { - hammer_ref_to_lock(&volume->io.lock); - hammer_put_volume(volume, 0); + hammer_lock_ex(&supercl->io.lock); + if (hammer_islastref(&supercl->io.lock)) { + supercl->ondisk = NULL; + hammer_io_release(&supercl->io, flush); + if (supercl->io.bp == NULL && + hammer_islastref(&supercl->io.lock)) { + volume = supercl->volume; + RB_REMOVE(hammer_scl_rb_tree, + &volume->rb_scls_root, supercl); + supercl->volume = NULL; /* sanity */ + kfree(supercl, M_HAMMER); + hammer_rel_volume(volume, 0); + return; } } - } else { hammer_unlock(&supercl->io.lock); } + hammer_unref(&supercl->io.lock); } -struct hammer_cluster * -hammer_get_cluster(struct hammer_volume *volume, int32_t clu_no, +/************************************************************************ + * CLUSTERS * + ************************************************************************ + * + * Manage clusters. Note that a cluster holds a reference to its + * associated volume. + */ +hammer_cluster_t +hammer_get_cluster(hammer_volume_t volume, int32_t clu_no, int *errorp, int isnew) { - struct hammer_cluster *cluster; + hammer_cluster_t cluster; - /* - * Locate and lock the cluster structure, creating one if necessary. - */ again: cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no); if (cluster == NULL) { @@ -529,55 +633,83 @@ again: cluster->volume = volume; cluster->io.offset = calculate_cluster_offset(volume, clu_no); RB_INIT(&cluster->rb_bufs_root); + RB_INIT(&cluster->rb_nods_root); cluster->io.type = HAMMER_STRUCTURE_CLUSTER; - hammer_lock(&cluster->io.lock); + hammer_ref(&cluster->io.lock); /* * Insert the cluster into the RB tree and handle late * collisions. 
*/ if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) { - hammer_unlock(&cluster->io.lock); + hammer_unref(&cluster->io.lock); kfree(cluster, M_HAMMER); goto again; } hammer_ref(&volume->io.lock); } else { - hammer_lock(&cluster->io.lock); + hammer_ref(&cluster->io.lock); } - return(hammer_load_cluster(cluster, errorp, isnew)); + + /* + * Deal with on-disk info + */ + if (cluster->ondisk == NULL || isnew) { + *errorp = hammer_load_cluster(cluster, isnew); + if (*errorp) { + hammer_rel_cluster(cluster, 1); + cluster = NULL; + } + } else { + *errorp = 0; + } + return (cluster); } -struct hammer_cluster * -hammer_get_rootcl(struct hammer_mount *hmp) +hammer_cluster_t +hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp) { - struct hammer_cluster *cluster = hmp->rootcl; - int dummy_error; + hammer_cluster_t cluster; - hammer_lock(&cluster->io.lock); - return(hammer_load_cluster(cluster, &dummy_error, 0)); -} + cluster = hmp->rootcl; + KKASSERT(cluster != NULL); + hammer_ref(&cluster->io.lock); + /* + * Deal with on-disk info + */ + if (cluster->ondisk == NULL) { + *errorp = hammer_load_cluster(cluster, 0); + if (*errorp) { + hammer_rel_cluster(cluster, 1); + cluster = NULL; + } + } else { + *errorp = 0; + } + return (cluster); +} static -struct hammer_cluster * -hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, int isnew) +int +hammer_load_cluster(hammer_cluster_t cluster, int isnew) { - struct hammer_volume *volume = cluster->volume; + hammer_volume_t volume = cluster->volume; struct hammer_cluster_ondisk *ondisk; + int error; /* * Load the cluster's on-disk info */ - *errorp = 0; + hammer_lock_ex(&cluster->io.lock); if (cluster->ondisk == NULL) { if (isnew) - *errorp = hammer_io_new(volume->devvp, &cluster->io); + error = hammer_io_new(volume->devvp, &cluster->io); else - *errorp = hammer_io_read(volume->devvp, &cluster->io); - if (*errorp) { - hammer_put_cluster(cluster, 1); - return(NULL); + error = hammer_io_read(volume->devvp, &cluster->io); + if (error) { + hammer_unlock(&cluster->io.lock); + return (error); } cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data; @@ -593,6 +725,9 @@ hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, int isnew) cluster->alist_mdata.meta = ondisk->clu_mdata_meta; cluster->alist_mdata.info = cluster; + cluster->clu_btree_beg = ondisk->clu_btree_beg; + cluster->clu_btree_end = ondisk->clu_btree_end; + /* * If this is a new cluster we have to initialize * various ondisk structural elements. The caller is @@ -612,52 +747,108 @@ hammer_load_cluster(struct hammer_cluster *cluster, int *errorp, int isnew) hammer_alist_init(&cluster->alist_mdata); } } else if (isnew) { - *errorp = hammer_io_new(volume->devvp, &cluster->io); - if (*errorp) { - hammer_put_cluster(cluster, 1); - cluster = NULL; - } + error = hammer_io_new(volume->devvp, &cluster->io); + } else { + error = 0; + } + hammer_unlock(&cluster->io.lock); + return (error); +} + +/* + * Reference a cluster that is either already referenced or via a specially + * handled pointer (aka rootcl). + */ +int +hammer_ref_cluster(hammer_cluster_t cluster) +{ + int error; + + KKASSERT(cluster != NULL); + hammer_ref(&cluster->io.lock); + + /* + * Deal with on-disk info + */ + if (cluster->ondisk == NULL) { + error = hammer_load_cluster(cluster, 0); + if (error) + hammer_rel_cluster(cluster, 1); + } else { + error = 0; } - return(cluster); + return(error); } +/* + * Release a cluster. 
We have to deal with several places where + * another thread can ref the cluster. + * + * Only destroy the structure itself if the related buffer cache buffer + * was disassociated from it. This ties the management of the structure + * to the buffer cache subsystem. + */ void -hammer_put_cluster(struct hammer_cluster *cluster, int flush) +hammer_rel_cluster(hammer_cluster_t cluster, int flush) { - struct hammer_volume *volume; + hammer_node_t node; + hammer_volume_t volume; if (hammer_islastref(&cluster->io.lock)) { - volume = cluster->volume; - hammer_io_release(&cluster->io, flush); - if (cluster->io.bp == NULL && - volume->hmp->rootcl != cluster && - hammer_islastref(&cluster->io.lock)) { - RB_REMOVE(hammer_clu_rb_tree, - &volume->rb_clus_root, cluster); - kfree(cluster, M_HAMMER); - if (hammer_islastref(&volume->io.lock)) { - hammer_ref_to_lock(&volume->io.lock); - hammer_put_volume(volume, 0); + hammer_lock_ex(&cluster->io.lock); + if (hammer_islastref(&cluster->io.lock)) { + cluster->ondisk = NULL; + hammer_io_release(&cluster->io, flush); + + /* + * Clean out the B-Tree node cache, if any, then + * clean up the volume ref and free the cluster. + * + * If the cluster acquires a new reference while we + * are trying to clean it out, abort the cleaning. + * + * There really shouldn't be any nodes at this point + * but we allow a node with no buffer association + * so handle the case. + */ + while (cluster->io.bp == NULL && + hammer_islastref(&cluster->io.lock) && + (node = RB_ROOT(&cluster->rb_nods_root)) != NULL + ) { + KKASSERT(node->lock.refs == 0); + hammer_flush_node(node); + } + if (cluster->io.bp == NULL && + hammer_islastref(&cluster->io.lock)) { + volume = cluster->volume; + RB_REMOVE(hammer_clu_rb_tree, + &volume->rb_clus_root, cluster); + cluster->volume = NULL; /* sanity */ + kfree(cluster, M_HAMMER); + hammer_rel_volume(volume, 0); + return; } } - } else { hammer_unlock(&cluster->io.lock); } + hammer_unref(&cluster->io.lock); } -/* - * Get a buffer from a cluster. Note that buffer #0 is the cluster header - * itself and may not be retrieved with this function. +/************************************************************************ + * BUFFERS * + ************************************************************************ + * + * Manage buffers. Note that a buffer holds a reference to its associated + * cluster, and its cluster will hold a reference to the cluster's volume. * - * If buf_type is 0 the buffer already exists in-memory or on-disk. - * Otherwise a new buffer is initialized with the specified buffer type. + * A non-zero buf_type indicates that a new buffer should be created and + * zero'd. */ -struct hammer_buffer * -hammer_get_buffer(struct hammer_cluster *cluster, int32_t buf_no, - int64_t buf_type, int *errorp) +hammer_buffer_t +hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no, + u_int64_t buf_type, int *errorp) { - hammer_fsbuf_ondisk_t ondisk; - struct hammer_buffer *buffer; + hammer_buffer_t buffer; /* * Find the buffer. Note that buffer 0 corresponds to the cluster @@ -678,34 +869,59 @@ again: buffer->io.offset = cluster->io.offset + (buf_no * HAMMER_BUFSIZE); buffer->io.type = HAMMER_STRUCTURE_BUFFER; - hammer_lock(&buffer->io.lock); + TAILQ_INIT(&buffer->clist); + hammer_ref(&buffer->io.lock); /* * Insert the cluster into the RB tree and handle late * collisions. 
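The cluster and buffer routines above all follow one lifecycle: look the structure up in the parent's RB tree (creating it and retrying if another thread wins the RB_INSERT race), take a reference, lazily instantiate the on-disk image, and on the final release re-check the reference count before tearing the structure down. What follows is a minimal, single-threaded user-space sketch of that idiom; a linked list stands in for the RB tree, plain integers stand in for hammer_lock, and every name here is illustrative rather than part of the HAMMER API.

/*
 * Single-threaded sketch of the get/ref/rel lifecycle used by
 * hammer_get_cluster()/hammer_rel_cluster().  Illustrative names only.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        struct obj      *next;          /* stands in for the RB tree */
        int             key;
        int             refs;           /* hammer_ref()/hammer_unref() */
        void            *ondisk;        /* lazily loaded on-disk image */
};

static struct obj *obj_list;

static struct obj *
obj_get(int key, int *errorp)
{
        struct obj *obj;

        for (obj = obj_list; obj; obj = obj->next) {    /* lookup */
                if (obj->key == key)
                        break;
        }
        if (obj == NULL) {                              /* create */
                obj = calloc(1, sizeof(*obj));
                obj->key = key;
                obj->next = obj_list;
                obj_list = obj;
                /* the kernel version retries here if RB_INSERT collides */
        }
        ++obj->refs;                                    /* hammer_ref() */
        if (obj->ondisk == NULL) {                      /* lazy load */
                obj->ondisk = malloc(64);               /* "hammer_io_read()" */
                if (obj->ondisk == NULL) {
                        --obj->refs;                    /* release on error */
                        *errorp = ENOMEM;
                        return (NULL);
                }
        }
        *errorp = 0;
        return (obj);
}

static void
obj_rel(struct obj *obj)
{
        if (--obj->refs == 0) {
                /*
                 * Last reference: drop the backing store.  The kernel
                 * version re-checks under an exclusive lock before this.
                 */
                free(obj->ondisk);
                obj->ondisk = NULL;
        }
}

int
main(void)
{
        int error;
        struct obj *obj;

        if ((obj = obj_get(42, &error)) != NULL) {
                printf("key %d refs %d\n", obj->key, obj->refs);
                obj_rel(obj);
        }
        return (0);
}

The property this buys, and the one the release paths above are careful about, is that the in-memory structure can outlive its buffer cache backing; it is only destroyed once both the references and the backing buffer are gone.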
*/ if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) { - hammer_unlock(&buffer->io.lock); + hammer_unref(&buffer->io.lock); kfree(buffer, M_HAMMER); goto again; } hammer_ref(&cluster->io.lock); } else { - hammer_lock(&buffer->io.lock); + hammer_ref(&buffer->io.lock); } - *errorp = 0; + /* + * Deal with on-disk info + */ + if (buffer->ondisk == NULL || buf_type) { + *errorp = hammer_load_buffer(buffer, buf_type); + if (*errorp) { + hammer_rel_buffer(buffer, 1); + buffer = NULL; + } + } else { + *errorp = 0; + } + return(buffer); +} + +static int +hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type) +{ + hammer_volume_t volume; + hammer_fsbuf_ondisk_t ondisk; + int error; + + /* + * Load the buffer's on-disk info + */ + volume = buffer->volume; + hammer_lock_ex(&buffer->io.lock); if (buffer->ondisk == NULL) { if (buf_type) { - *errorp = hammer_io_new(buffer->volume->devvp, - &buffer->io); + error = hammer_io_new(volume->devvp, &buffer->io); } else { - *errorp = hammer_io_read(buffer->volume->devvp, - &buffer->io); + error = hammer_io_read(volume->devvp, &buffer->io); } - if (*errorp) { - hammer_put_buffer(buffer, 1); - return(NULL); + if (error) { + hammer_unlock(&buffer->io.lock); + return (error); } buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data; buffer->alist.config = &Buf_alist_config; @@ -714,65 +930,389 @@ again: if (buf_type) { initbuffer(&buffer->alist, &ondisk->head, buf_type); } + buffer->buf_type = ondisk->head.buf_type; } else if (buf_type) { - *errorp = hammer_io_new(buffer->volume->devvp, - &buffer->io); - if (*errorp) { - hammer_put_buffer(buffer, 1); - buffer = NULL; + error = hammer_io_new(volume->devvp, &buffer->io); + } else { + error = 0; + } + hammer_unlock(&buffer->io.lock); + return (error); +} + +/* + * Reference a buffer that is either already referenced or via a specially + * handled pointer (aka cursor->buffer). + */ +int +hammer_ref_buffer(hammer_buffer_t buffer) +{ + int error; + + hammer_ref(&buffer->io.lock); + if (buffer->ondisk == NULL) { + error = hammer_load_buffer(buffer, 0); + if (error) { + hammer_rel_buffer(buffer, 1); + /* + * NOTE: buffer pointer can become stale after + * the above release. + */ + } else { + KKASSERT(buffer->buf_type == + buffer->ondisk->head.buf_type); } + } else { + error = 0; } - return (buffer); + return(error); } +/* + * Release a buffer. We have to deal with several places where + * another thread can ref the buffer. + * + * Only destroy the structure itself if the related buffer cache buffer + * was disassociated from it. This ties the management of the structure + * to the buffer cache subsystem. buffer->ondisk determines whether the + * embedded io is referenced or not. 
+ */ void -hammer_put_buffer(struct hammer_buffer *buffer, int flush) +hammer_rel_buffer(hammer_buffer_t buffer, int flush) { - struct hammer_cluster *cluster; + hammer_cluster_t cluster; + hammer_node_t node; if (hammer_islastref(&buffer->io.lock)) { - hammer_io_release(&buffer->io, flush); - if (buffer->io.bp == NULL && - hammer_islastref(&buffer->io.lock)) { - RB_REMOVE(hammer_buf_rb_tree, - &buffer->cluster->rb_bufs_root, buffer); - cluster = buffer->cluster; - kfree(buffer, M_HAMMER); - if (hammer_islastref(&cluster->io.lock)) { - hammer_ref_to_lock(&cluster->io.lock); - hammer_put_cluster(cluster, 0); + hammer_lock_ex(&buffer->io.lock); + if (hammer_islastref(&buffer->io.lock)) { + buffer->ondisk = NULL; + hammer_io_release(&buffer->io, flush); + + /* + * Clean out the B-Tree node cache, if any, then + * clean up the cluster ref and free the buffer. + * + * If the buffer acquires a new reference while we + * are trying to clean it out, abort the cleaning. + */ + while (buffer->io.bp == NULL && + hammer_islastref(&buffer->io.lock) && + (node = TAILQ_FIRST(&buffer->clist)) != NULL + ) { + KKASSERT(node->lock.refs == 0); + hammer_flush_node(node); + } + if (buffer->io.bp == NULL && + hammer_islastref(&buffer->io.lock)) { + cluster = buffer->cluster; + RB_REMOVE(hammer_buf_rb_tree, + &cluster->rb_bufs_root, buffer); + buffer->cluster = NULL; /* sanity */ + kfree(buffer, M_HAMMER); + hammer_rel_cluster(cluster, 0); + return; } } - } else { hammer_unlock(&buffer->io.lock); } + hammer_unref(&buffer->io.lock); } +/* + * Flush passively cached B-Tree nodes associated with this buffer. + * + * NOTE: The buffer is referenced and locked. + */ void -hammer_dup_buffer(struct hammer_buffer **bufferp, struct hammer_buffer *buffer) +hammer_flush_buffer_nodes(hammer_buffer_t buffer) { - if (buffer != *bufferp) { - if (buffer) - hammer_lock(&buffer->io.lock); - if (*bufferp) - hammer_put_buffer(*bufferp, 0); - *bufferp = buffer; + hammer_node_t node; + + node = TAILQ_FIRST(&buffer->clist); + while (node) { + buffer->save_scan = TAILQ_NEXT(node, entry); + if (node->lock.refs == 0) + hammer_flush_node(node); + node = buffer->save_scan; + } +} + +/************************************************************************ + * NODES * + ************************************************************************ + * + * Manage B-Tree nodes. B-Tree nodes represent the primary indexing + * method used by the HAMMER filesystem. + * + * Unlike other HAMMER structures, a hammer_node can be PASSIVELY + * associated with its buffer. It can have an active buffer reference + * even when the node itself has no references. The node also passively + * associates itself with its cluster without holding any cluster refs. + * The cluster ref is indirectly maintained by the active buffer ref when + * a node is acquired. + * + * A hammer_node can also be passively associated with other HAMMER + * structures, such as inodes, while retaining 0 references. These + * associations can be cleared backwards using a pointer-to-pointer in + * the hammer_node. + * + * This allows the HAMMER implementation to cache hammer_node's long-term + * and short-cut a great deal of the infrastructure's complexity. In + * most cases a cached node can be reacquired without having to dip into + * either the buffer or cluster management code. + * + * The caller must pass a referenced cluster on call and will retain + * ownership of the reference on return. The node will acquire its own + * additional references, if necessary. 
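The comment block above describes the passive cache used for B-Tree nodes: another structure (an inode, for instance) can hold a bare pointer to a node without holding a reference, and the node keeps a pointer back to that pointer so the association can be severed from either side. A reduced user-space sketch of the idea follows; the real hammer_node carries two such back-pointers (cache1/cache2), only one slot is modeled here, and the names are illustrative only.

/*
 * Minimal sketch of the pointer-to-pointer passive cache described above.
 */
#include <stdio.h>
#include <stddef.h>

struct node {
        struct node     **cache;        /* back-pointer to whoever caches us */
        int               refs;
};

struct inode {
        struct node      *cache;        /* passive pointer, no node ref held */
};

static void
node_cache(struct node *node, struct node **cache)
{
        if (node->cache && node->cache != cache)
                *node->cache = NULL;    /* steal the slot from the old owner */
        node->cache = cache;
        *cache = node;                  /* caller can re-reference via this */
}

static void
node_flush(struct node *node)
{
        /* Backing buffer went away: invalidate every passive pointer. */
        if (node->cache) {
                *node->cache = NULL;
                node->cache = NULL;
        }
        /* with refs == 0 and no cache left the node could be freed here */
}

int
main(void)
{
        struct node n = { NULL, 0 };
        struct inode ip = { NULL };

        node_cache(&n, &ip.cache);
        printf("cached: %p\n", (void *)ip.cache);
        node_flush(&n);
        printf("after flush: %p\n", (void *)ip.cache);  /* NULL */
        return (0);
}

The payoff is the one described in the patch header: an inode can remember its place in the B-Tree without pinning a node or its buffer, and if the kernel reclaims the buffer the cached pointer is simply cleared and the next lookup falls back to a full descent.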
+ */ +hammer_node_t +hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp) +{ + hammer_node_t node; + + /* + * Locate the structure, allocating one if necessary. + */ +again: + node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root, + node_offset); + if (node == NULL) { + node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO); + node->node_offset = node_offset; + node->cluster = cluster; + if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root, + node)) { + kfree(node, M_HAMMER); + goto again; + } + } + *errorp = hammer_ref_node(node); + if (*errorp) { + /* + * NOTE: The node pointer may be stale on error return. + * In fact, its probably been destroyed. + */ + node = NULL; + } + return(node); +} + +/* + * Reference the node to prevent disassociations, then associate and + * load the related buffer. This routine can also be called to reference + * a node from a cache pointer. + * + * NOTE: Because the caller does not have a ref on the node, the caller's + * node pointer will be stale if an error is returned. We may also wind + * up clearing the related cache pointers. + * + * NOTE: The cluster is indirectly referenced by our buffer ref. + */ +int +hammer_ref_node(hammer_node_t node) +{ + hammer_buffer_t buffer; + int32_t buf_no; + int error; + + hammer_ref(&node->lock); + if (node->ondisk == NULL) { + hammer_lock_ex(&node->lock); + if (node->ondisk == NULL) { + /* + * This is a little confusing but the jist is that + * node->buffer determines whether the node is on + * the buffer's clist and node->ondisk determines + * whether the buffer is referenced. + */ + if ((buffer = node->buffer) != NULL) { + error = hammer_ref_buffer(buffer); + } else { + buf_no = node->node_offset / HAMMER_BUFSIZE; + buffer = hammer_get_buffer(node->cluster, + buf_no, 0, &error); + if (buffer) { + KKASSERT(error == 0); + TAILQ_INSERT_TAIL(&buffer->clist, + node, entry); + node->buffer = buffer; + } + } + if (error == 0) { + node->ondisk = (void *)((char *)buffer->ondisk + + (node->node_offset & HAMMER_BUFMASK)); + } + } + hammer_unlock(&node->lock); } + if (error) + hammer_rel_node(node); + return (error); } +/* + * Release a hammer_node. The node retains a passive association with + * its cluster, buffer and caches. + * + * However, to avoid cluttering up kernel memory with tons of B-Tree + * node cache structures we destroy the node if no passive cache or + * (instantiated) buffer references exist. + */ void -hammer_dup_cluster(struct hammer_cluster **clusterp, - struct hammer_cluster *cluster) -{ - if (cluster != *clusterp) { - if (cluster) - hammer_lock(&cluster->io.lock); - if (*clusterp) - hammer_put_cluster(*clusterp, 0); - *clusterp = cluster; +hammer_rel_node(hammer_node_t node) +{ + hammer_cluster_t cluster; + hammer_buffer_t buffer; + + if (hammer_islastref(&node->lock)) { + cluster = node->cluster; + /* + * Clutter control, this case only occurs after a failed + * load since otherwise ondisk will be non-NULL. + */ + if (node->cache1 == NULL && node->cache2 == NULL && + node->ondisk == NULL) { + RB_REMOVE(hammer_nod_rb_tree, &cluster->rb_nods_root, + node); + if ((buffer = node->buffer) != NULL) { + node->buffer = NULL; + hammer_remove_node_clist(buffer, node); + } + kfree(node, M_HAMMER); + return; + } + + /* + * node->ondisk determines whether we have a buffer reference + * to get rid of or not. Only get rid of the reference if + * the kernel tried to flush the buffer. + * + * NOTE: Once unref'd the node can be physically destroyed, + * so our node is stale afterwords. 
+ * + * This case occurs if the node still has cache references. + * We could remove the references and free the structure + * but for now we allow them (and the node structure) to + * remain intact. + */ + if (node->ondisk && hammer_io_checkflush(&node->buffer->io)) { + buffer = node->buffer; + node->buffer = NULL; + node->ondisk = NULL; + hammer_remove_node_clist(buffer, node); + hammer_unref(&node->lock); + hammer_rel_buffer(buffer, 0); + } else { + hammer_unref(&node->lock); + } + } else { + hammer_unref(&node->lock); } } +/* + * Cache-and-release a hammer_node. Kinda like catching and releasing a + * fish, but keeping an eye on him. The node is passively cached in *cache. + * + * NOTE! HAMMER may NULL *cache at any time, even after you have + * referenced the node! + */ +void +hammer_cache_node(hammer_node_t node, struct hammer_node **cache) +{ + if (node->cache1 != cache) { + if (node->cache2 == cache) { + struct hammer_node **tmp; + tmp = node->cache1; + node->cache1 = node->cache2; + node->cache2 = tmp; + } else { + if (node->cache2) + *node->cache2 = NULL; + node->cache2 = node->cache1; + node->cache1 = cache; + *cache = node; + } + } + hammer_rel_node(node); +} + +void +hammer_uncache_node(struct hammer_node **cache) +{ + hammer_node_t node; + + if ((node = *cache) != NULL) { + *cache = NULL; + if (node->cache1 == cache) { + node->cache1 = node->cache2; + node->cache2 = NULL; + } else if (node->cache2 == cache) { + node->cache2 = NULL; + } else { + panic("hammer_uncache_node: missing cache linkage"); + } + if (node->cache1 == NULL && node->cache2 == NULL && + node->lock.refs == 0) { + hammer_flush_node(node); + } + } +} + +/* + * Remove a node's cache references and destroy the node if it has no + * references. This is typically called from the buffer handling code. + * + * The node may have an active buffer reference (ondisk != NULL) even + * if the node itself has no references. + * + * Note that a caller iterating through nodes via a buffer must have its + * own reference on the buffer or our hammer_rel_buffer() call below may + * rip it out from under the caller. + */ +void +hammer_flush_node(hammer_node_t node) +{ + hammer_buffer_t buffer; + + if (node->cache1) + *node->cache1 = NULL; + if (node->cache2) + *node->cache2 = NULL; + if (node->lock.refs == 0) { + RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root, + node); + if ((buffer = node->buffer) != NULL) { + node->buffer = NULL; + hammer_remove_node_clist(buffer, node); + if (node->ondisk) { + node->ondisk = NULL; + hammer_rel_buffer(buffer, 0); + } + } + kfree(node, M_HAMMER); + } +} + +/* + * Remove a node from the buffer's clist. Adjust save_scan as appropriate. + * This is in its own little routine to properly handle interactions with + * save_scan, so it is possible to block while scanning a buffer's node list. + */ +static +void +hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node) +{ + if (buffer->save_scan == node) + buffer->save_scan = TAILQ_NEXT(node, entry); + TAILQ_REMOVE(&buffer->clist, node, entry); +} + +/************************************************************************ + * A-LIST ALLOCATORS * + ************************************************************************/ + /* * Allocate HAMMER elements - btree nodes, data storage, and record elements * @@ -786,19 +1326,21 @@ hammer_dup_cluster(struct hammer_cluster **clusterp, * fails a new buffer is allocated and associated with the buffer type * A-list and the element is allocated out of the new buffer. 
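The allocators below share the fallback sequence spelled out in the comment above: try the cluster's type-specific a-list first (forward from a rotating hint), and if the existing buffers of that type are full, format a fresh buffer of the right type and allocate out of it, returning ENOSPC only when that also fails. A compact user-space model of that sequence is sketched here, with a flat bitmap standing in for the a-list; none of these names are the real API.

/*
 * Sketch of the allocate-with-fallback pattern used by hammer_alloc_btree()
 * and friends.  Illustrative names only.
 */
#include <errno.h>
#include <stdio.h>

#define BLKS_PER_BUF    4
#define MAX_BLKS        16

static unsigned char map[MAX_BLKS];     /* 1 = allocated, 0 = free */
static int           nblks;             /* blocks made available so far */

static int
alist_alloc(int atblk)
{
        int i;

        for (i = atblk; i < nblks; ++i) {
                if (map[i] == 0) {
                        map[i] = 1;
                        return (i);
                }
        }
        return (-1);                    /* HAMMER_ALIST_BLOCK_NONE */
}

static int
alloc_element(int hint, int *errorp)
{
        int elm_no;

        elm_no = alist_alloc(hint);             /* forward from the hint */
        if (elm_no < 0 && nblks + BLKS_PER_BUF <= MAX_BLKS) {
                nblks += BLKS_PER_BUF;          /* "alloc_new_buffer()" */
                elm_no = alist_alloc(hint);
        }
        if (elm_no < 0) {
                *errorp = ENOSPC;
                return (-1);
        }
        *errorp = 0;
        return (elm_no);
}

int
main(void)
{
        int error, i, elm;

        for (i = 0; i < 6; ++i) {
                elm = alloc_element(0, &error);
                printf("alloc -> %d (error %d)\n", elm, error);
        }
        return (0);
}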
*/ -void * -hammer_alloc_btree(struct hammer_cluster *cluster, - int *errorp, struct hammer_buffer **bufferp) + +hammer_node_t +hammer_alloc_btree(hammer_cluster_t cluster, int *errorp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; hammer_alist_t live; + hammer_node_t node; int32_t elm_no; int32_t buf_no; - void *item; + int32_t node_offset; /* * Allocate a B-Tree element */ + buffer = NULL; live = &cluster->alist_btree; elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index); if (elm_no == HAMMER_ALIST_BLOCK_NONE) @@ -806,10 +1348,12 @@ hammer_alloc_btree(struct hammer_cluster *cluster, if (elm_no == HAMMER_ALIST_BLOCK_NONE) { alloc_new_buffer(cluster, live, HAMMER_FSBUF_BTREE, HAMMER_BTREE_NODES, - cluster->ondisk->idx_index, errorp, bufferp); + cluster->ondisk->idx_index, errorp, &buffer); elm_no = hammer_alist_alloc(live, 1); if (elm_no == HAMMER_ALIST_BLOCK_NONE) { *errorp = ENOSPC; + if (buffer) + hammer_rel_buffer(buffer, 0); return(NULL); } } @@ -819,26 +1363,27 @@ hammer_alloc_btree(struct hammer_cluster *cluster, * Load and return the B-Tree element */ buf_no = elm_no / HAMMER_FSBUF_MAXBLKS; - buffer = *bufferp; - if (buffer == NULL || buffer->cluster != cluster || - buffer->buf_no != buf_no) { - if (buffer) - hammer_put_buffer(buffer, 0); - buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); - *bufferp = buffer; + node_offset = buf_no * HAMMER_BUFSIZE + + offsetof(union hammer_fsbuf_ondisk, + btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]); + node = hammer_get_node(cluster, node_offset, errorp); + if (node) { + bzero(node->ondisk, sizeof(*node->ondisk)); + } else { + hammer_alist_free(live, elm_no, 1); + hammer_rel_node(node); + node = NULL; } - KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_BTREE); - item = &buffer->ondisk->btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]; - bzero(item, sizeof(union hammer_btree_node)); - *errorp = 0; - return(item); + if (buffer) + hammer_rel_buffer(buffer, 0); + return(node); } void * -hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, - int *errorp, struct hammer_buffer **bufferp) +hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes, + int *errorp, struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; hammer_alist_t live; int32_t elm_no; int32_t buf_no; @@ -873,7 +1418,7 @@ hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, if (buffer == NULL || buffer->cluster != cluster || buffer->buf_no != buf_no) { if (buffer) - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); *bufferp = buffer; } @@ -885,10 +1430,10 @@ hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes, } void * -hammer_alloc_record(struct hammer_cluster *cluster, - int *errorp, struct hammer_buffer **bufferp) +hammer_alloc_record(hammer_cluster_t cluster, + int *errorp, struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; hammer_alist_t live; int32_t elm_no; int32_t buf_no; @@ -921,7 +1466,7 @@ hammer_alloc_record(struct hammer_cluster *cluster, if (buffer == NULL || buffer->cluster != cluster || buffer->buf_no != buf_no) { if (buffer) - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); buffer = hammer_get_buffer(cluster, buf_no, 0, errorp); *bufferp = buffer; } @@ -937,12 +1482,12 @@ hammer_alloc_record(struct hammer_cluster *cluster, * or a cluster-relative byte offset. 
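Note that hammer_alloc_btree() above no longer hands back a raw pointer: it converts the a-list element number into a cluster-relative node_offset and lets hammer_get_node() instantiate the node. The arithmetic is just a buffer number plus the element's offset inside the typed buffer, as the small example below shows. The constants and structure layout here are placeholders picked for the example, not the real HAMMER values.

#include <stdio.h>
#include <stddef.h>

#define BUFSIZE         16384   /* assumed filesystem buffer size */
#define FSBUF_MAXBLKS   64      /* assumed a-list blocks per buffer */
#define FSBUF_BLKMASK   (FSBUF_MAXBLKS - 1)

struct node_ondisk {            /* stand-in for the on-disk B-Tree node */
        char    pad[64];
};

struct fsbuf {                  /* stand-in for union hammer_fsbuf_ondisk */
        char                    head[128];
        struct node_ondisk      nodes[FSBUF_MAXBLKS];
};

int
main(void)
{
        int     elm_no = 300;   /* cluster-wide a-list element number */
        int     buf_no = elm_no / FSBUF_MAXBLKS;
        long    node_offset;

        /*
         * Cluster-relative byte offset of the node.  The kernel code
         * computes this with offsetof(..., btree.nodes[elm_no & MASK]).
         */
        node_offset = (long)buf_no * BUFSIZE +
            offsetof(struct fsbuf, nodes) +
            (long)(elm_no & FSBUF_BLKMASK) * sizeof(struct node_ondisk);

        printf("elm %d -> buffer %d, offset %ld\n", elm_no, buf_no, node_offset);
        return (0);
}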
*/ void -hammer_free_btree_ptr(struct hammer_buffer *buffer, hammer_btree_node_t node) +hammer_free_btree_ptr(hammer_buffer_t buffer, hammer_node_ondisk_t node) { int32_t elm_no; hammer_alist_t live; - elm_no = node - buffer->ondisk->btree.nodes; + elm_no = node - &buffer->ondisk->btree.nodes[0]; KKASSERT(elm_no >= 0 && elm_no < HAMMER_BTREE_NODES); elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS; live = &buffer->cluster->alist_btree; @@ -950,7 +1495,7 @@ hammer_free_btree_ptr(struct hammer_buffer *buffer, hammer_btree_node_t node) } void -hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes) +hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes) { int32_t elm_no; int32_t nblks; @@ -966,8 +1511,7 @@ hammer_free_data_ptr(struct hammer_buffer *buffer, void *data, int bytes) } void -hammer_free_record_ptr(struct hammer_buffer *buffer, - union hammer_record_ondisk *rec) +hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec) { int32_t elm_no; hammer_alist_t live; @@ -980,9 +1524,9 @@ hammer_free_record_ptr(struct hammer_buffer *buffer, } void -hammer_free_btree(struct hammer_cluster *cluster, int32_t bclu_offset) +hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset) { - const int32_t blksize = sizeof(union hammer_btree_node); + const int32_t blksize = sizeof(struct hammer_node_ondisk); int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK; hammer_alist_t live; int32_t elm_no; @@ -996,8 +1540,7 @@ hammer_free_btree(struct hammer_cluster *cluster, int32_t bclu_offset) } void -hammer_free_data(struct hammer_cluster *cluster, int32_t bclu_offset, - int32_t bytes) +hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes) { const int32_t blksize = HAMMER_DATA_BLKSIZE; int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK; @@ -1015,7 +1558,7 @@ hammer_free_data(struct hammer_cluster *cluster, int32_t bclu_offset, } void -hammer_free_record(struct hammer_cluster *cluster, int32_t bclu_offset) +hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset) { const int32_t blksize = sizeof(union hammer_record_ondisk); int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK; @@ -1037,11 +1580,11 @@ hammer_free_record(struct hammer_cluster *cluster, int32_t bclu_offset) * type-specific A-list and initialized. 
*/ static void -alloc_new_buffer(struct hammer_cluster *cluster, hammer_alist_t live, +alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live, u_int64_t type, int32_t nelements, int start, int *errorp, struct hammer_buffer **bufferp) { - struct hammer_buffer *buffer; + hammer_buffer_t buffer; int32_t buf_no; start = start / HAMMER_FSBUF_MAXBLKS; /* convert to buf_no */ @@ -1070,7 +1613,7 @@ alloc_new_buffer(struct hammer_cluster *cluster, hammer_alist_t live, */ buffer = hammer_get_buffer(cluster, buf_no, type, errorp); if (*bufferp) - hammer_put_buffer(*bufferp, 0); + hammer_rel_buffer(*bufferp, 0); *bufferp = buffer; } @@ -1086,17 +1629,17 @@ alloc_new_buffer(struct hammer_cluster *cluster, hammer_alist_t live, void flush_all_volumes(void) { - struct hammer_volume *vol; + hammer_volume_t vol; for (vol = VolBase; vol; vol = vol->next) flush_volume(vol); } void -flush_volume(struct hammer_volume *vol) +flush_volume(hammer_volume_t vol) { - struct hammer_supercl *supercl; - struct hammer_cluster *cl; + hammer_supercl_t supercl; + hammer_cluster_t cl; for (supercl = vol->supercl_base; supercl; supercl = supercl->next) flush_supercl(supercl); @@ -1106,7 +1649,7 @@ flush_volume(struct hammer_volume *vol) } void -flush_supercl(struct hammer_supercl *supercl) +flush_supercl(hammer_supercl_t supercl) { int64_t supercl_offset; @@ -1115,9 +1658,9 @@ flush_supercl(struct hammer_supercl *supercl) } void -flush_cluster(struct hammer_cluster *cl) +flush_cluster(hammer_cluster_t cl) { - struct hammer_buffer *buf; + hammer_buffer_t buf; int64_t cluster_offset; for (buf = cl->buffer_base; buf; buf = buf->next) @@ -1127,7 +1670,7 @@ flush_cluster(struct hammer_cluster *cl) } void -flush_buffer(struct hammer_buffer *buf) +flush_buffer(hammer_buffer_t buf) { int64_t buffer_offset; @@ -1154,7 +1697,7 @@ initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type) * is the number of clusters a supercluster can manage. */ static int64_t -calculate_cluster_offset(struct hammer_volume *volume, int32_t clu_no) +calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no) { int32_t scl_group; int64_t scl_group_size; @@ -1188,7 +1731,7 @@ calculate_cluster_offset(struct hammer_volume *volume, int32_t clu_no) * Calculate a super-cluster's offset in the volume. 
*/ static int64_t -calculate_supercl_offset(struct hammer_volume *volume, int32_t scl_no) +calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no) { int64_t off; int32_t scl_group; @@ -1228,8 +1771,8 @@ calculate_supercl_offset(struct hammer_volume *volume, int32_t scl_no) static int buffer_alist_init(void *info, int32_t blk, int32_t radix) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int error = 0; @@ -1241,7 +1784,7 @@ buffer_alist_init(void *info, int32_t blk, int32_t radix) buf_no = blk / HAMMER_FSBUF_MAXBLKS; buffer = hammer_get_buffer(cluster, buf_no, 0, &error); if (buffer) - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); return (error); } @@ -1255,8 +1798,8 @@ static int buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int32_t r; int error = 0; @@ -1270,7 +1813,7 @@ buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&buffer->alist); - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; } @@ -1281,8 +1824,8 @@ static int buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int32_t r; int error = 0; @@ -1296,7 +1839,7 @@ buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&buffer->alist); - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; *fullp = 0; @@ -1308,8 +1851,8 @@ static void buffer_alist_free(void *info, int32_t blk, int32_t radix, int32_t base_blk, int32_t count, int32_t *emptyp) { - struct hammer_cluster *cluster = info; - struct hammer_buffer *buffer; + hammer_cluster_t cluster = info; + hammer_buffer_t buffer; int32_t buf_no; int error = 0; @@ -1319,7 +1862,7 @@ buffer_alist_free(void *info, int32_t blk, int32_t radix, KKASSERT(buffer->ondisk->head.buf_type != 0); hammer_alist_free(&buffer->alist, base_blk, count); *emptyp = hammer_alist_isempty(&buffer->alist); - hammer_put_buffer(buffer, 0); + hammer_rel_buffer(buffer, 0); } else { *emptyp = 0; } @@ -1341,8 +1884,8 @@ buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab) static int super_alist_init(void *info, int32_t blk, int32_t radix) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int error = 0; @@ -1353,7 +1896,7 @@ super_alist_init(void *info, int32_t blk, int32_t radix) scl_no = blk / HAMMER_SCL_MAXCLUSTERS; supercl = hammer_get_supercl(volume, scl_no, &error, 1); if (supercl) - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); return (error); } @@ -1367,8 +1910,8 @@ static int super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int32_t r; int error = 0; @@ -1380,7 +1923,7 
@@ super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&supercl->alist); - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; *fullp = 0; @@ -1392,8 +1935,8 @@ static int super_alist_alloc_rev(void *info, int32_t blk, int32_t radix, int32_t count, int32_t atblk, int32_t *fullp) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int32_t r; int error = 0; @@ -1405,7 +1948,7 @@ super_alist_alloc_rev(void *info, int32_t blk, int32_t radix, if (r != HAMMER_ALIST_BLOCK_NONE) r += blk; *fullp = hammer_alist_isfull(&supercl->alist); - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); } else { r = HAMMER_ALIST_BLOCK_NONE; *fullp = 0; @@ -1417,8 +1960,8 @@ static void super_alist_free(void *info, int32_t blk, int32_t radix, int32_t base_blk, int32_t count, int32_t *emptyp) { - struct hammer_volume *volume = info; - struct hammer_supercl *supercl; + hammer_volume_t volume = info; + hammer_supercl_t supercl; int32_t scl_no; int error = 0; @@ -1427,7 +1970,7 @@ super_alist_free(void *info, int32_t blk, int32_t radix, if (supercl) { hammer_alist_free(&supercl->alist, base_blk, count); *emptyp = hammer_alist_isempty(&supercl->alist); - hammer_put_supercl(supercl, 0); + hammer_rel_supercl(supercl, 0); } else { *emptyp = 0; } diff --git a/sys/vfs/hammer/hammer_subs.c b/sys/vfs/hammer/hammer_subs.c index 3b3502d71e..b9f8e812a1 100644 --- a/sys/vfs/hammer/hammer_subs.c +++ b/sys/vfs/hammer/hammer_subs.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.2 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_subs.c,v 1.3 2007/11/19 00:53:40 dillon Exp $ */ /* * HAMMER structural locking @@ -40,12 +40,12 @@ #include "hammer.h" void -hammer_lock(struct hammer_lock *lock) +hammer_lock_ex(struct hammer_lock *lock) { thread_t td = curthread; + KKASSERT(lock->refs > 0); crit_enter(); - ++lock->refs; if (lock->locktd != td) { while (lock->locktd != NULL) { lock->wanted = 1; @@ -53,55 +53,101 @@ hammer_lock(struct hammer_lock *lock) } lock->locktd = td; } + ++lock->lockcount; crit_exit(); } +/* + * Try to obtain an exclusive lock + */ +int +hammer_lock_ex_try(struct hammer_lock *lock) +{ + thread_t td = curthread; + + KKASSERT(lock->refs > 0); + crit_enter(); + if (lock->locktd != td) { + if (lock->locktd != NULL) + return(EAGAIN); + lock->locktd = td; + } + ++lock->lockcount; + crit_exit(); + return(0); +} + + void -hammer_unlock(struct hammer_lock *lock) +hammer_lock_sh(struct hammer_lock *lock) { - KKASSERT(lock->locktd == curthread); KKASSERT(lock->refs > 0); crit_enter(); - if (--lock->refs == 0) { - lock->locktd = NULL; - if (lock->wanted) { - lock->wanted = 0; - wakeup(lock); + while (lock->locktd != NULL) { + if (lock->locktd == curthread) { + ++lock->lockcount; + crit_exit(); + return; } + lock->wanted = 1; + tsleep(lock, 0, "hmrlck", 0); } + KKASSERT(lock->lockcount <= 0); + --lock->lockcount; crit_exit(); } void -hammer_ref(struct hammer_lock *lock) +hammer_downgrade(struct hammer_lock *lock) { + KKASSERT(lock->lockcount == 1); crit_enter(); - ++lock->refs; + lock->lockcount = -1; + lock->locktd = NULL; + if (lock->wanted) { + lock->wanted = 0; + wakeup(lock); + } crit_exit(); + /* XXX memory barrier */ } void -hammer_unref(struct 
hammer_lock *lock) +hammer_unlock(struct hammer_lock *lock) { crit_enter(); - --lock->refs; + KKASSERT(lock->lockcount != 0); + if (lock->lockcount < 0) { + if (++lock->lockcount == 0 && lock->wanted) { + lock->wanted = 0; + wakeup(lock); + } + } else { + KKASSERT(lock->locktd == curthread); + if (--lock->lockcount == 0) { + lock->locktd = NULL; + if (lock->wanted) { + lock->wanted = 0; + wakeup(lock); + } + } + + } crit_exit(); } void -hammer_lock_to_ref(struct hammer_lock *lock) +hammer_ref(struct hammer_lock *lock) { crit_enter(); ++lock->refs; - hammer_unlock(lock); crit_exit(); } void -hammer_ref_to_lock(struct hammer_lock *lock) +hammer_unref(struct hammer_lock *lock) { crit_enter(); - hammer_lock(lock); --lock->refs; crit_exit(); } @@ -113,12 +159,29 @@ hammer_to_unix_xid(uuid_t *uuid) } void -hammer_to_timespec(u_int64_t hammerts, struct timespec *ts) +hammer_guid_to_uuid(uuid_t *uuid, u_int32_t guid) +{ + bzero(uuid, sizeof(*uuid)); + *(u_int32_t *)&uuid->node[2] = guid; +} + +void +hammer_to_timespec(hammer_tid_t tid, struct timespec *ts) { - ts->tv_sec = hammerts / 1000000000; - ts->tv_nsec = hammerts % 1000000000; + ts->tv_sec = tid / 1000000000; + ts->tv_nsec = tid % 1000000000; } +hammer_tid_t +hammer_timespec_to_transid(struct timespec *ts) +{ + hammer_tid_t tid; + + tid = ts->tv_nsec + (unsigned long)ts->tv_sec * 1000000000LL; + return(tid); +} + + /* * Convert a HAMMER filesystem object type to a vnode type */ diff --git a/sys/vfs/hammer/hammer_transaction.c b/sys/vfs/hammer/hammer_transaction.c index 35d5fede48..3879b0baef 100644 --- a/sys/vfs/hammer/hammer_transaction.c +++ b/sys/vfs/hammer/hammer_transaction.c @@ -31,14 +31,14 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.1 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.2 2007/11/19 00:53:40 dillon Exp $ */ #include "hammer.h" void -hammer_start_transaction(struct hammer_mount *hmp, - struct hammer_transaction *trans) +hammer_start_transaction(struct hammer_transaction *trans, + struct hammer_mount *hmp) { struct timespec ts; diff --git a/sys/vfs/hammer/hammer_vfsops.c b/sys/vfs/hammer/hammer_vfsops.c index 9ac2cff487..41f2e9fcbb 100644 --- a/sys/vfs/hammer/hammer_vfsops.c +++ b/sys/vfs/hammer/hammer_vfsops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.3 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.4 2007/11/19 00:53:40 dillon Exp $ */ #include @@ -118,7 +118,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data, if (error == 0) error = copyinstr(upath, path, MAXPATHLEN, NULL); if (error == 0) - error = hammer_load_volume(hmp, path); + error = hammer_install_volume(hmp, path); if (error) break; } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index 077605efce..21a048933e 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
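The reworked locking in hammer_subs.c above separates reference counting from locking and encodes the lock state in a single lockcount field: positive values are the exclusive recursion depth, negative values count shared holders, and zero means unlocked. The toy below only models that bookkeeping (the blocking and wakeup side is omitted); the field and function names are illustrative, not the kernel API.

#include <assert.h>
#include <stdio.h>

struct hlock {
        int     refs;           /* pins the structure in memory */
        int     lockcount;      /* >0 exclusive depth, <0 shared holders */
};

static void
lock_ex(struct hlock *l)
{
        assert(l->lockcount >= 0);      /* real code blocks on other holders */
        ++l->lockcount;
}

static void
lock_sh(struct hlock *l)
{
        assert(l->lockcount <= 0);      /* real code blocks on an exclusive */
        --l->lockcount;
}

static void
unlock(struct hlock *l)
{
        assert(l->lockcount != 0);
        if (l->lockcount < 0)
                ++l->lockcount;         /* drop one shared holder */
        else
                --l->lockcount;         /* unwind one exclusive level */
        /* lockcount reaching 0 is where the real code does wakeup() */
}

int
main(void)
{
        struct hlock l = { 1, 0 };      /* one ref, unlocked */

        lock_ex(&l); lock_ex(&l);       /* recursive exclusive: 2 */
        unlock(&l);  unlock(&l);        /* back to 0 */
        lock_sh(&l); lock_sh(&l);       /* two shared holders: -2 */
        unlock(&l);  unlock(&l);        /* back to 0 */
        printf("final lockcount=%d refs=%d\n", l.lockcount, l.refs);
        return (0);
}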
* - * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.2 2007/11/07 00:43:24 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.3 2007/11/19 00:53:40 dillon Exp $ */ #include @@ -106,6 +106,11 @@ struct vop_ops hammer_vnode_vops = { .vop_nwhiteout = hammer_vop_nwhiteout }; +static int hammer_dounlink(struct nchandle *nch, struct vnode *dvp, + struct ucred *cred, int flags); +static int hammer_vop_strategy_read(struct vop_strategy_args *ap); +static int hammer_vop_strategy_write(struct vop_strategy_args *ap); + #if 0 static int @@ -139,13 +144,15 @@ hammer_vop_read(struct vop_read_args *ap) struct uio *uio; int error; int n; + int seqcount; if (ap->a_vp->v_type != VREG) return (EINVAL); ip = VTOI(ap->a_vp); error = 0; + seqcount = ap->a_ioflag >> 16; - hammer_start_transaction(ip->hmp, &trans); + hammer_start_transaction(&trans, ip->hmp); /* * Access the data in HAMMER_BUFSIZE blocks via the buffer cache. @@ -153,12 +160,14 @@ hammer_vop_read(struct vop_read_args *ap) uio = ap->a_uio; while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) { offset = uio->uio_offset & HAMMER_BUFMASK; - error = bread(ap->a_vp, uio->uio_offset - offset, - HAMMER_BUFSIZE, &bp); + error = cluster_read(ap->a_vp, ip->ino_rec.ino_size, + uio->uio_offset - offset, HAMMER_BUFSIZE, + MAXBSIZE, seqcount, &bp); if (error) { brelse(bp); break; } + bp->b_flags |= B_CLUSTEROK; n = HAMMER_BUFSIZE - offset; if (n > uio->uio_resid) n = uio->uio_resid; @@ -166,7 +175,7 @@ hammer_vop_read(struct vop_read_args *ap) n = (int)(ip->ino_rec.ino_size - uio->uio_offset); error = uiomove((char *)bp->b_data + offset, n, uio); if (error) { - brelse(bp); + bqrelse(bp); break; } ip->ino_rec.ino_atime = trans.tid; @@ -200,7 +209,7 @@ hammer_vop_write(struct vop_write_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(ip->hmp, &trans); + hammer_start_transaction(&trans, ip->hmp); uio = ap->a_uio; /* @@ -243,6 +252,7 @@ hammer_vop_write(struct vop_write_args *ap) brelse(bp); break; } + bp->b_flags |= B_CLUSTEROK; if (ip->ino_rec.ino_size < uio->uio_offset) { ip->ino_rec.ino_size = uio->uio_offset; ip->ino_rec.ino_mtime = trans.tid; @@ -252,10 +262,8 @@ hammer_vop_write(struct vop_write_args *ap) if (ap->a_ioflag & IO_SYNC) { bwrite(bp); } else if (ap->a_ioflag & IO_DIRECT) { - /* XXX B_CLUSTEROK SUPPORT */ bawrite(bp); } else { - /* XXX B_CLUSTEROK SUPPORT */ bdwrite(bp); } } @@ -330,20 +338,19 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Create a new filesystem object of the requested type. The - * returned inode will be locked. We cannot hold the new - * inode locked while doing other manipulations. + * returned inode will be referenceds but not locked. */ - error = hammer_alloc_inode(&trans, ap->a_vap, ap->a_cred, &nip); + + error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip); if (error) { hammer_abort_transaction(&trans); *ap->a_vpp = NULL; return (error); } - hammer_lock_to_ref(&nip->lock); /* * Add the new filesystem object to the directory. This will also @@ -355,13 +362,13 @@ hammer_vop_ncreate(struct vop_ncreate_args *ap) * Finish up. 
*/ if (error) { - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); hammer_abort_transaction(&trans); *ap->a_vpp = NULL; } else { hammer_commit_transaction(&trans); error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp); - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); } return (error); } @@ -423,58 +430,58 @@ static int hammer_vop_nresolve(struct vop_nresolve_args *ap) { - struct hammer_base_elm key; struct namecache *ncp; struct hammer_inode *dip; - struct hammer_btree_info info; + struct hammer_cursor cursor; + union hammer_record_ondisk *rec; struct vnode *vp; int64_t namekey; int error; - const int flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA; + /* + * Calculate the namekey and setup the key range for the scan. This + * works kinda like a chained hash table where the lower 32 bits + * of the namekey synthesize the chain. + * + * The key range is inclusive of both key_beg and key_end. + */ dip = VTOI(ap->a_dvp); ncp = ap->a_nch->ncp; namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); - hammer_btree_info_init(&info, dip->hmp->rootcl); - key.obj_id = dip->obj_id; - key.key = namekey; - key.create_tid = dip->obj_asof; - key.delete_tid = 0; - key.rec_type = HAMMER_RECTYPE_DIRENTRY; - key.obj_type = 0; + hammer_init_cursor_ip(&cursor, dip); + cursor.key_beg.obj_id = dip->obj_id; + cursor.key_beg.key = namekey; + cursor.key_beg.create_tid = dip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; + cursor.key_beg.obj_type = 0; - /* - * Issue a lookup on the namekey. The entry should not be found - * since the low bits of the key are 0. This positions our cursor - * properly for the iteration. - */ - error = hammer_btree_lookup(&info, &key, 0); - if (error != ENOENT) { - if (error == 0) - error = EIO; - goto done; - } + cursor.key_end = cursor.key_beg; + cursor.key_end.key |= 0xFFFFFFFFULL; /* - * Iterate through the keys as long as the upper 32 bits are - * the same. + * Scan all matching records (the chain), locate the one matching + * the requested path component. info->last_error contains the + * error code on search termination and could be 0, ENOENT, or + * something else. + * + * The hammer_ip_*() functions merge in-memory records with on-disk + * records for the purposes of the search. */ - while ((error = hammer_btree_iterate(&info.cursor, &key)) == 0) { - if ((error = hammer_btree_extract(&info, flags)) != 0) - break; - if ((namekey ^ info.rec->base.base.key) & - (int64_t)0xFFFFFFFF00000000ULL) { - error = ENOENT; + rec = hammer_ip_first(&cursor, dip); + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) /* sets last_error */ break; - } - if (ncp->nc_nlen == info.rec->base.data_len && - bcmp(ncp->nc_name, (void *)info.data, ncp->nc_nlen) == 0) { + if (ncp->nc_nlen == rec->entry.base.data_len && + bcmp(ncp->nc_name, (void *)cursor.data, ncp->nc_nlen) == 0) { break; } + rec = hammer_ip_next(&cursor); } + error = cursor.last_error; if (error == 0) { - error = hammer_vfs_vget(dip->hmp->mp, info.rec->entry.obj_id, &vp); + error = hammer_vfs_vget(dip->hmp->mp, rec->entry.obj_id, &vp); if (error == 0) { vn_unlock(vp); cache_setvp(ap->a_nch, vp); @@ -483,8 +490,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap) } else if (error == ENOENT) { cache_setvp(ap->a_nch, NULL); } -done: - hammer_btree_info_done(&info); + hammer_done_cursor(&cursor); return (error); } @@ -533,7 +539,7 @@ hammer_vop_nlink(struct vop_nlink_args *ap) /* * Create a transaction to cover the operations we perform. 
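hammer_vop_nresolve() above, and hammer_vop_nrename() and hammer_dounlink() later in this patch, resolve directory entries with a 64-bit key that behaves like a chained hash table: the upper 32 bits hash the component name, the lower 32 bits distinguish colliding entries, and the lookup scans the inclusive range [namekey, namekey | 0xFFFFFFFF] while comparing names. The sketch below models that scheme in user space; the hash is a placeholder, since hammer_directory_namekey() itself is not part of this patch, and the record array merely stands in for the cursor iteration.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct dirent_rec {
        uint64_t        key;
        const char      *name;
};

static uint64_t
namekey(const char *name)
{
        uint32_t hash = 2166136261u;            /* placeholder FNV-1a hash */

        while (*name)
                hash = (hash ^ (uint8_t)*name++) * 16777619u;
        return ((uint64_t)hash << 32);          /* low 32 bits = chain slot */
}

static const struct dirent_rec *
dir_lookup(const struct dirent_rec *recs, int nrecs, const char *name)
{
        uint64_t key_beg = namekey(name);
        uint64_t key_end = key_beg | 0xFFFFFFFFULL;
        int i;

        for (i = 0; i < nrecs; ++i) {           /* stands in for the B-Tree scan */
                if (recs[i].key < key_beg || recs[i].key > key_end)
                        continue;
                if (strcmp(recs[i].name, name) == 0)
                        return (&recs[i]);      /* chain hit */
        }
        return (NULL);
}

int
main(void)
{
        struct dirent_rec recs[2];

        recs[0].key = namekey("passwd");        /* first entry in its chain */
        recs[0].name = "passwd";
        recs[1].key = namekey("passwd") + 1;    /* hypothetical collision */
        recs[1].name = "not-passwd";

        printf("found: %s\n", dir_lookup(recs, 2, "passwd")->name);
        return (0);
}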
*/ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Add the filesystem object to the directory. Note that neither @@ -575,20 +581,18 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Create a new filesystem object of the requested type. The - * returned inode will be locked. We cannot hold the new - * inode locked while doing other manipulations. + * returned inode will be referenced but not locked. */ - error = hammer_alloc_inode(&trans, ap->a_vap, ap->a_cred, &nip); + error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip); if (error) { hammer_abort_transaction(&trans); *ap->a_vpp = NULL; return (error); } - hammer_lock_to_ref(&nip->lock); /* * Add the new filesystem object to the directory. This will also @@ -600,13 +604,13 @@ hammer_vop_nmkdir(struct vop_nmkdir_args *ap) * Finish up. */ if (error) { - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); hammer_abort_transaction(&trans); *ap->a_vpp = NULL; } else { hammer_commit_transaction(&trans); error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp); - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); } return (error); } @@ -633,20 +637,18 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) /* * Create a transaction to cover the operations we perform. */ - hammer_start_transaction(dip->hmp, &trans); + hammer_start_transaction(&trans, dip->hmp); /* * Create a new filesystem object of the requested type. The - * returned inode will be locked. We cannot hold the new - * inode locked while doing other manipulations. + * returned inode will be referenced but not locked. */ - error = hammer_alloc_inode(&trans, ap->a_vap, ap->a_cred, &nip); + error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip); if (error) { hammer_abort_transaction(&trans); *ap->a_vpp = NULL; return (error); } - hammer_lock_to_ref(&nip->lock); /* * Add the new filesystem object to the directory. This will also @@ -658,13 +660,13 @@ hammer_vop_nmknod(struct vop_nmknod_args *ap) * Finish up. */ if (error) { - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); hammer_abort_transaction(&trans); *ap->a_vpp = NULL; } else { hammer_commit_transaction(&trans); error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp); - hammer_put_inode_ref(nip); + hammer_rel_inode(nip); } return (error); } @@ -726,7 +728,7 @@ static int hammer_vop_nremove(struct vop_nremove_args *ap) { - return EOPNOTSUPP; + return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0)); } /* @@ -736,7 +738,93 @@ static int hammer_vop_nrename(struct vop_nrename_args *ap) { - return EOPNOTSUPP; + struct hammer_transaction trans; + struct namecache *fncp; + struct namecache *tncp; + struct hammer_inode *fdip; + struct hammer_inode *tdip; + struct hammer_inode *ip; + struct hammer_cursor cursor; + union hammer_record_ondisk *rec; + int64_t namekey; + int error; + + fdip = VTOI(ap->a_fdvp); + tdip = VTOI(ap->a_tdvp); + fncp = ap->a_fnch->ncp; + tncp = ap->a_tnch->ncp; + hammer_start_transaction(&trans, fdip->hmp); + + /* + * Extract the hammer_inode from fncp and add link to the target + * directory. + */ + ip = VTOI(fncp->nc_vp); + KKASSERT(ip != NULL); + + error = hammer_add_directory(&trans, tdip, tncp, ip); + + /* + * Locate the record in the originating directory and remove it. + * + * Calculate the namekey and setup the key range for the scan. 
This + * works kinda like a chained hash table where the lower 32 bits + * of the namekey synthesize the chain. + * + * The key range is inclusive of both key_beg and key_end. + */ + namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen); + + hammer_init_cursor_ip(&cursor, fdip); + cursor.key_beg.obj_id = fdip->obj_id; + cursor.key_beg.key = namekey; + cursor.key_beg.create_tid = fdip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; + cursor.key_beg.obj_type = 0; + + cursor.key_end = cursor.key_beg; + cursor.key_end.key |= 0xFFFFFFFFULL; + + /* + * Scan all matching records (the chain), locate the one matching + * the requested path component. info->last_error contains the + * error code on search termination and could be 0, ENOENT, or + * something else. + * + * The hammer_ip_*() functions merge in-memory records with on-disk + * records for the purposes of the search. + */ + rec = hammer_ip_first(&cursor, fdip); + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) + break; + if (fncp->nc_nlen == rec->entry.base.data_len && + bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) { + break; + } + rec = hammer_ip_next(&cursor); + } + error = cursor.last_error; + + /* + * If all is ok we have to get the inode so we can adjust nlinks. + */ + if (error) + goto done; + error = hammer_del_directory(&trans, &cursor, fdip, ip); + if (error == 0) { + cache_rename(ap->a_fnch, ap->a_tnch); + cache_setvp(ap->a_tnch, ip->vp); + } +done: + if (error == 0) { + hammer_commit_transaction(&trans); + } else { + hammer_abort_transaction(&trans); + } + hammer_done_cursor(&cursor); + return (error); } /* @@ -746,7 +834,9 @@ static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap) { - return EOPNOTSUPP; + /* XXX check that directory is empty */ + + return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0)); } /* @@ -756,7 +846,97 @@ static int hammer_vop_setattr(struct vop_setattr_args *ap) { - return EOPNOTSUPP; + struct hammer_transaction trans; + struct vattr *vap; + struct hammer_inode *ip; + int modflags; + int error; + u_int32_t flags; + uuid_t uuid; + + vap = ap->a_vap; + ip = ap->a_vp->v_data; + modflags = 0; + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return(EROFS); + + hammer_start_transaction(&trans, ip->hmp); + error = 0; + + if (vap->va_flags != VNOVAL) { + flags = ip->ino_data.uflags; + error = vop_helper_setattr_flags(&flags, vap->va_flags, + hammer_to_unix_xid(&ip->ino_data.uid), + ap->a_cred); + if (error == 0) { + if (ip->ino_data.uflags != flags) { + ip->ino_data.uflags = flags; + modflags |= HAMMER_INODE_DDIRTY; + } + if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { + error = 0; + goto done; + } + } + goto done; + } + if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto done; + } + if (vap->va_uid != (uid_t)VNOVAL) { + hammer_guid_to_uuid(&uuid, vap->va_uid); + if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) == 0) { + ip->ino_data.uid = uuid; + modflags |= HAMMER_INODE_DDIRTY; + } + } + if (vap->va_gid != (uid_t)VNOVAL) { + hammer_guid_to_uuid(&uuid, vap->va_uid); + if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) == 0) { + ip->ino_data.gid = uuid; + modflags |= HAMMER_INODE_DDIRTY; + } + } + if (vap->va_size != VNOVAL) { + switch(ap->a_vp->v_type) { + case VREG: + case VDATABASE: + error = hammer_delete_range(&trans, ip, + vap->va_size, + 0x7FFFFFFFFFFFFFFFLL); + break; + default: + error = EINVAL; + goto done; + } + } + if (vap->va_atime.tv_sec != VNOVAL) { + ip->ino_rec.ino_atime = + 
hammer_timespec_to_transid(&vap->va_atime); + modflags |= HAMMER_INODE_ITIMES; + } + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->ino_rec.ino_mtime = + hammer_timespec_to_transid(&vap->va_mtime); + modflags |= HAMMER_INODE_ITIMES; + } + if (vap->va_mode != (mode_t)VNOVAL) { + if (ip->ino_data.mode != vap->va_mode) { + ip->ino_data.mode = vap->va_mode; + modflags |= HAMMER_INODE_DDIRTY; + } + } +done: + if (error) { + hammer_abort_transaction(&trans); + } else { + if (modflags) + hammer_modify_inode(&trans, ip, modflags); + hammer_commit_transaction(&trans); + } + return (error); } /* @@ -776,115 +956,287 @@ static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap) { - return EOPNOTSUPP; + return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags)); } /* * hammer_vop_strategy { vp, bio } + * + * Strategy call, used for regular file read & write only. Note that the + * bp may represent a cluster. + * + * To simplify operation and allow better optimizations in the future, + * this code does not make any assumptions with regards to buffer alignment + * or size. */ static int hammer_vop_strategy(struct vop_strategy_args *ap) { - return EOPNOTSUPP; + struct buf *bp; + int error; + + bp = ap->a_bio->bio_buf; + + switch(bp->b_cmd) { + case BUF_CMD_READ: + error = hammer_vop_strategy_read(ap); + break; + case BUF_CMD_WRITE: + error = hammer_vop_strategy_write(ap); + break; + default: + error = EINVAL; + break; + } + bp->b_error = error; + if (error) + bp->b_flags |= B_ERROR; + biodone(ap->a_bio); + return (error); } -#if 0 - struct hammer_data_record *data; - struct hammer_base_elm_t key; - hammer_btree_info info; - const int flags = HAMMER_BTREE_GET_RECORD | HAMMER_BTREE_GET_DATA; - int64_t base_offset; - int didinit; - int o; - - hammer_btree_info_init(&info, ip->hmp->rootcl); - key.obj_id = ip->obj_id; - key.create_tid = ip->obj_asof; - key.delete_tid = 0; - key.rec_type = HAMMER_RECTYPE_DATA; - key.obj_type = 0; - key.key = uio->uio_offset; - - /* - * Iterate through matching records. Note that for data records - * the base offset is the key - data_len, NOT the key. This way - * we don't have to special case a ranged search. - */ - error = hammer_btree_lookup(&info, &key, 0); - if (error && error != ENOENT) - goto done; - while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) { - if ((error = hammer_btree_iterate(&info.cursor, &key)) != 0) +/* + * Read from a regular file. Iterate the related records and fill in the + * BIO/BUF. Gaps are zero-filled. + * + * The support code in hammer_object.c should be used to deal with mixed + * in-memory and on-disk records. + * + * XXX atime update + */ +static +int +hammer_vop_strategy_read(struct vop_strategy_args *ap) +{ + struct hammer_inode *ip = ap->a_vp->v_data; + struct hammer_cursor cursor; + hammer_record_ondisk_t rec; + hammer_base_elm_t base; + struct bio *bio; + struct buf *bp; + int64_t rec_offset; + int error; + int boff; + int roff; + int n; + + bio = ap->a_bio; + bp = bio->bio_buf; + + hammer_init_cursor_ip(&cursor, ip); + + /* + * Key range (begin and end inclusive) to scan. 
Note that the key's + * stored in the actual records represent the + */ + cursor.key_beg.obj_id = ip->obj_id; + cursor.key_beg.create_tid = ip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; + cursor.key_beg.obj_type = 0; + cursor.key_beg.key = bio->bio_offset; + + cursor.key_end = cursor.key_beg; + cursor.key_end.key = bio->bio_offset + bp->b_bufsize - 1; + + rec = hammer_ip_first(&cursor, ip); + boff = 0; + + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) break; + base = &rec->base.base; + + rec_offset = base->key - rec->data.base.data_len; + /* - * XXX - possible to optimize the extract + * Zero-fill any gap */ - if ((error = hammer_btree_extract(&info, flags)) != 0) - break; - data = &info.rec->data; - base_offset = data->base.key - data->base.data_len; - if (uio->uio_offset < base_offset) { - if (base_offset - uio->uio_offset > HAMMER_BUFSIZE) - n = HAMMER_BUFSIZE; - else - n = (int)(base_offset - uio->uio_offset); - error = uiomove(ip->hmp->zbuf, n, uio); - } else { - o = (int)uio->uio_offset - base_offset; - if (o < data->base.data_len) { - n = data->base.data_len - o; - if (n > uio->uio_resid) - n = uio->uio_resid; - error = uiomove((char *)info.data + o, n, uio); - } + n = (int)(rec_offset - (bio->bio_offset + boff)); + if (n > 0) { + kprintf("zfill %d bytes\n", n); + bzero((char *)bp->b_data + boff, n); + boff += n; + n = 0; } - if (error) + + /* + * Calculate the data offset in the record and the number + * of bytes we can copy. + */ + roff = -n; + n = rec->data.base.data_len - roff; + KKASSERT(n > 0); + if (n > bp->b_bufsize - boff) + n = bp->b_bufsize - boff; + bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n); + boff += n; + if (boff == bp->b_bufsize) break; + rec = hammer_ip_next(&cursor); } + hammer_done_cursor(&cursor); /* - * Issue a lookup on the namekey. The entry should not be found - * since the low bits of the key are 0. This positions our cursor - * properly for the iteration. + * There may have been a gap after the last record */ - if (error != ENOENT) { - if (error == 0) - error = EIO; - goto done; + error = cursor.last_error; + if (error == ENOENT) + error = 0; + if (error == 0 && boff != bp->b_bufsize) { + bzero((char *)bp->b_data + boff, bp->b_bufsize - boff); + /* boff = bp->b_bufsize; */ + } + bp->b_resid = 0; + return(error); +} + +/* + * Write to a regular file. Iterate the related records and mark for + * deletion. If existing edge records (left and right side) overlap our + * write they have to be marked deleted and new records created, usually + * referencing a portion of the original data. Then add a record to + * represent the buffer. + * + * The support code in hammer_object.c should be used to deal with mixed + * in-memory and on-disk records. + */ +static +int +hammer_vop_strategy_write(struct vop_strategy_args *ap) +{ + struct hammer_transaction trans; + hammer_inode_t ip; + struct bio *bio; + struct buf *bp; + int error; + + bio = ap->a_bio; + bp = bio->bio_buf; + ip = ap->a_vp->v_data; + hammer_start_transaction(&trans, ip->hmp); + + /* + * Delete any records overlapping our range. This function will + * properly + */ + error = hammer_delete_range(&trans, ip, bio->bio_offset, + bio->bio_offset + bp->b_bufsize - 1); + + /* + * Add a single record to cover the write + */ + if (error == 0) { + error = hammer_add_data(&trans, ip, bio->bio_offset, + bp->b_data, bp->b_bufsize); } /* - * Iterate through the keys as long as the upper 32 bits are - * the same. 
+ * If an error occured abort the transaction */ - while ((error = hammer_btree_iterate(&info, &key, flags)) == 0) { - if ((namekey ^ info.rec->base.base.key) & - (int64_t)0xFFFFFFFF00000000ULL) { - error = ENOENT; + if (error) { + /* XXX undo deletion */ + hammer_abort_transaction(&trans); + bp->b_resid = bp->b_bufsize; + } else { + hammer_commit_transaction(&trans); + bp->b_resid = 0; + } + return(error); +} + +/* + * dounlink - disconnect a directory entry + * + * XXX whiteout support not really in yet + */ +static int +hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred, + int flags) +{ + struct hammer_transaction trans; + struct namecache *ncp; + hammer_inode_t dip; + hammer_inode_t ip; + hammer_record_ondisk_t rec; + struct hammer_cursor cursor; + struct vnode *vp; + int64_t namekey; + int error; + + /* + * Calculate the namekey and setup the key range for the scan. This + * works kinda like a chained hash table where the lower 32 bits + * of the namekey synthesize the chain. + * + * The key range is inclusive of both key_beg and key_end. + */ + dip = VTOI(dvp); + ncp = nch->ncp; + namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen); + + hammer_init_cursor_ip(&cursor, dip); + cursor.key_beg.obj_id = dip->obj_id; + cursor.key_beg.key = namekey; + cursor.key_beg.create_tid = dip->obj_asof; + cursor.key_beg.delete_tid = 0; + cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY; + cursor.key_beg.obj_type = 0; + + cursor.key_end = cursor.key_beg; + cursor.key_end.key |= 0xFFFFFFFFULL; + + hammer_start_transaction(&trans, dip->hmp); + + /* + * Scan all matching records (the chain), locate the one matching + * the requested path component. info->last_error contains the + * error code on search termination and could be 0, ENOENT, or + * something else. + * + * The hammer_ip_*() functions merge in-memory records with on-disk + * records for the purposes of the search. + */ + rec = hammer_ip_first(&cursor, dip); + while (rec) { + if (hammer_ip_resolve_data(&cursor) != 0) break; - } - if (ncp->nc_nlen == info.rec->base.data_len && - bcmp(ncp->nc_name, (void *)info->data, ncp->nc_nlen) == 0) { + if (ncp->nc_nlen == rec->entry.base.data_len && + bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) { break; } + rec = hammer_ip_next(&cursor); } + error = cursor.last_error; + + /* + * If all is ok we have to get the inode so we can adjust nlinks. + */ if (error == 0) { - error = hammer_vfs_vget(dip->hmp->mp, info->rec->entry.obj_id, &vp); + ip = hammer_get_inode(dip->hmp, rec->entry.obj_id, &error); + if (error == 0) + error = hammer_del_directory(&trans, &cursor, dip, ip); + if (error == 0) { + cache_setunresolved(nch); + cache_setvp(nch, NULL); + /* XXX locking */ + if (ip->vp) + cache_inval_vp(ip->vp, CINV_DESTROY); + } + hammer_rel_inode(ip); + + error = hammer_vfs_vget(dip->hmp->mp, rec->entry.obj_id, &vp); if (error == 0) { vn_unlock(vp); cache_setvp(nch, vp); vrele(vp); + hammer_commit_transaction(&trans); + } else { + hammer_abort_transaction(&trans); } - } else if (error == ENOENT) { - cache_setvp(nch, NULL); } -done: - hammer_btree_info_done(&info); + hammer_done_cursor(&cursor); return (error); - - - return EOPNOTSUPP; } -#endif -- 2.41.0