HAMMER 11/many - initial spike commit.
authorMatthew Dillon <dillon@dragonflybsd.org>
Sat, 29 Dec 2007 09:01:27 +0000 (09:01 +0000)
committerMatthew Dillon <dillon@dragonflybsd.org>
Sat, 29 Dec 2007 09:01:27 +0000 (09:01 +0000)
* Fix a bug in the cluster disk offset calculation.
* Implement the cluster allocator and related header initialization.
* Remove wildcarding from hammer_btree_cmp().
* Move the historical filter out of hammer_btree_cmp() and into its own
  procedure.
* Allow the ending element in a B-Tree range iteration to be inclusive or
  exclusive of the range.
* Add infrastructure for cluster-localized searches.
* Initial commit of the spike code (still in progress).

This commit brings in most of the infrastructure needed for the spike code.
The spike code is what glues one cluster's B-Tree to another cluster's B-Tree.
At the moment we spike by taking the B-Tree leaf node at the cursor after
a search and moving all of its elements to a new cluster, then replacing
the pointer to that leaf node in the parent B-Tree node with a cluster
reference.

This is non-optimal at the moment.  Several optimizations are possible and
will eventually be implemented.

15 files changed:
sbin/hammer/ondisk.c
sbin/newfs_hammer/ondisk.c
sys/vfs/hammer/hammer.h
sys/vfs/hammer/hammer_btree.c
sys/vfs/hammer/hammer_btree.h
sys/vfs/hammer/hammer_cursor.c
sys/vfs/hammer/hammer_cursor.h
sys/vfs/hammer/hammer_inode.c
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_object.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_spike.c
sys/vfs/hammer/hammer_transaction.c
sys/vfs/hammer/hammer_vfsops.c
sys/vfs/hammer/hammer_vnops.c

index 1b250c2..34b551b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sbin/hammer/ondisk.c,v 1.5 2007/12/14 08:05:37 dillon Exp $
+ * $DragonFly: src/sbin/hammer/ondisk.c,v 1.6 2007/12/29 09:01:26 dillon Exp $
  */
 
 #include "newfs_hammer.h"
@@ -187,7 +187,7 @@ get_cluster(struct volume_info *vol, int32_t clu_no)
                                scl_group * scl_group_size +
                                (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
                                 ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS * HAMMER_VOL_SUPERCLUSTER_GROUP)) *
-                                HAMMER_BUFSIZE;
+                                ClusterSize;
                } else {
                        cl->clu_offset = vol->ondisk->vol_clo_beg +
                                         (int64_t)clu_no * ClusterSize;
index 70077ef..b7cbf1c 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sbin/newfs_hammer/Attic/ondisk.c,v 1.5 2007/12/14 08:05:37 dillon Exp $
+ * $DragonFly: src/sbin/newfs_hammer/Attic/ondisk.c,v 1.6 2007/12/29 09:01:26 dillon Exp $
  */
 
 #include "newfs_hammer.h"
@@ -187,7 +187,7 @@ get_cluster(struct volume_info *vol, int32_t clu_no)
                                scl_group * scl_group_size +
                                (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
                                 ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS * HAMMER_VOL_SUPERCLUSTER_GROUP)) *
-                                HAMMER_BUFSIZE;
+                                ClusterSize;
                } else {
                        cl->clu_offset = vol->ondisk->vol_clo_beg +
                                         (int64_t)clu_no * ClusterSize;
index c331a21..5001e50 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.13 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer.h,v 1.14 2007/12/29 09:01:27 dillon Exp $
  */
 /*
  * This header file contains structures used internally by the HAMMERFS
@@ -255,6 +255,7 @@ struct hammer_volume {
        struct hammer_alist_live alist;
        int32_t vol_no;
        int32_t vol_clsize;
+       int32_t clu_iterator;   /* cluster allocation iterator */
        int64_t nblocks;        /* note: special calculation for statfs */
        int64_t cluster_base;   /* base offset of cluster 0 */
        char    *vol_name;
@@ -382,10 +383,11 @@ struct hammer_mount {
        struct hammer_vol_rb_tree rb_vols_root;
        struct hammer_volume *rootvol;
        struct hammer_cluster *rootcl;
-       char *zbuf;     /* HAMMER_BUFSIZE bytes worth of all-zeros */
+       char    *zbuf;  /* HAMMER_BUFSIZE bytes worth of all-zeros */
        int     hflags;
        int     ronly;
        int     nvolumes;
+       int     volume_iterator;
        uuid_t  fsid;
        udev_t  fsid_udev;
        hammer_tid_t asof;
@@ -444,7 +446,7 @@ int hammer_sync_buffer(hammer_buffer_t buffer, void *data);
 hammer_record_t
        hammer_alloc_mem_record(hammer_inode_t ip);
 void   hammer_rel_mem_record(struct hammer_record **recordp);
-void   hammer_free_mem_record(hammer_record_t record);
+void   hammer_drop_mem_record(hammer_record_t record, int delete);
 
 int    hammer_cursor_up(hammer_cursor_t cursor, int nonblock);
 int    hammer_cursor_toroot(hammer_cursor_t cursor);
@@ -472,18 +474,20 @@ u_int8_t hammer_get_obj_type(enum vtype vtype);
 int64_t hammer_directory_namekey(void *name, int len);
 
 int    hammer_init_cursor_hmp(hammer_cursor_t cursor, hammer_mount_t hmp);
+int    hammer_init_cursor_cluster(hammer_cursor_t cursor, hammer_cluster_t cluster);
 int    hammer_init_cursor_ip(hammer_cursor_t cursor, hammer_inode_t ip);
 
 void   hammer_done_cursor(hammer_cursor_t cursor);
 void   hammer_mem_done(hammer_cursor_t cursor);
 
 int    hammer_btree_lookup(hammer_cursor_t cursor);
+int    hammer_btree_first(hammer_cursor_t cursor);
 int    hammer_btree_extract(hammer_cursor_t cursor, int flags);
 int    hammer_btree_iterate(hammer_cursor_t cursor);
 int    hammer_btree_insert(hammer_cursor_t cursor, hammer_btree_elm_t elm);
 int    hammer_btree_delete(hammer_cursor_t cursor);
 int    hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2);
-int    hammer_btree_range_cmp(hammer_cursor_t cursor, hammer_base_elm_t key2);
+int    hammer_btree_chkts(hammer_tid_t ts, hammer_base_elm_t key);
 void   hammer_print_btree_node(hammer_node_ondisk_t ondisk);
 void   hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i);
 
@@ -527,6 +531,11 @@ void hammer_dup_buffer(struct hammer_buffer **bufferp,
                        struct hammer_buffer *buffer);
 void hammer_dup_cluster(struct hammer_cluster **clusterp,
                        struct hammer_cluster *cluster);
+hammer_cluster_t hammer_alloc_cluster(hammer_mount_t hmp,
+                       hammer_cluster_t cluster_hint, int *errorp);
+void hammer_init_cluster(hammer_cluster_t cluster,
+                       hammer_base_elm_t left_bound,
+                       hammer_base_elm_t right_bound);
 hammer_node_t hammer_alloc_btree(struct hammer_cluster *cluster, int *errorp);
 void *hammer_alloc_data(struct hammer_cluster *cluster, int32_t bytes,
                        int *errorp, struct hammer_buffer **bufferp);
@@ -536,6 +545,7 @@ void hammer_free_data_ptr(struct hammer_buffer *buffer,
                        void *data, int bytes);
 void hammer_free_record_ptr(struct hammer_buffer *buffer,
                        union hammer_record_ondisk *rec);
+void hammer_free_cluster(hammer_cluster_t cluster);
 void hammer_free_btree(struct hammer_cluster *cluster, int32_t bclu_offset);
 void hammer_free_data(struct hammer_cluster *cluster, int32_t bclu_offset,
                        int32_t bytes);
@@ -568,11 +578,17 @@ int  hammer_ip_del_directory(struct hammer_transaction *trans,
                        hammer_cursor_t cursor, hammer_inode_t dip,
                        hammer_inode_t ip);
 int  hammer_ip_delete_range(struct hammer_transaction *trans,
-                       hammer_inode_t ip, int64_t ran_beg, int64_t ran_end);
+                       hammer_inode_t ip, int64_t ran_beg, int64_t ran_end,
+                       struct hammer_cursor **spikep);
 int  hammer_ip_sync_data(struct hammer_transaction *trans,
                        hammer_inode_t ip, int64_t offset,
-                       void *data, int bytes);
-int  hammer_ip_sync_record(hammer_record_t rec);
+                       void *data, int bytes, struct hammer_cursor **spikep);
+int  hammer_ip_sync_record(hammer_record_t rec, struct hammer_cursor **spikep);
+int  hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t rec,
+                       void *data, int cursor_flags);
+
+void hammer_load_spike(hammer_cursor_t cursor, struct hammer_cursor **spikep);
+int hammer_spike(struct hammer_cursor **spikep);
 
 int hammer_io_read(struct vnode *devvp, struct hammer_io *io);
 int hammer_io_new(struct vnode *devvp, struct hammer_io *io);
index e73033b..ef8f428 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.10 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.c,v 1.11 2007/12/29 09:01:27 dillon Exp $
  */
 
 /*
@@ -107,12 +107,13 @@ static void hammer_make_separator(hammer_base_elm_t key1,
  * is found.  ENOENT is returned if the iteration goes past the ending
  * key.
  *
- * key_beg/key_end is an INCLUSVE range.  i.e. if you are scanning to load
- * a 4096 byte buffer key_beg might specify an offset of 0 and key_end an
- * offset of 4095.
+ * The iteration is inclusive of key_beg and can be inclusive or exclusive
+ * of key_end depending on whether HAMMER_CURSOR_END_INCLUSIVE is set.
  *
  * cursor->key_beg may or may not be modified by this function during
- * the iteration.
+ * the iteration.  XXX future - in case of an inverted lock we may have
+ * to reinitiate the lookup and set key_beg to properly pick up where we
+ * left off.
  */
 int
 hammer_btree_iterate(hammer_cursor_t cursor)
@@ -121,10 +122,7 @@ hammer_btree_iterate(hammer_cursor_t cursor)
        hammer_btree_elm_t elm;
        int error;
        int r;
-#if 0
        int s;
-       int64_t save_key;
-#endif
 
        /*
         * Skip past the current record
@@ -170,89 +168,61 @@ hammer_btree_iterate(hammer_cursor_t cursor)
                }
 
                /*
-                * Iterate down the tree while we are at an internal node.
-                * Nodes cannot be empty, assert the case because if one is
-                * we will wind up in an infinite loop.
+                * Check internal or leaf element.  Determine if the record
+                * at the cursor has gone beyond the end of our range.
+                * Remember that our key range is end-exclusive.
                 *
-                * We can avoid iterating through large swaths of transaction
-                * id space if the left and right separators are the same
-                * except for their transaction spaces.  We can then skip
-                * the node if the left and right transaction spaces are the
-                * same sign.  This directly optimized accesses to files with
-                * HUGE transactional histories, such as database files,
-                * allowing us to avoid having to iterate through the entire
-                * history.
+                * Generally we recurse down through internal nodes.  An
+                * internal node can only be returned if INCLUSTER is set
+                * and the node represents a cluster-push record.  Internal
+                * elements do not contain create_tid/delete_tid information.
                 */
                if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
-                       KKASSERT(node->count != 0);
                        elm = &node->elms[cursor->index];
-#if 0
-                       /*
-                        * temporarily disable this optimization, it needs
-                        * more of a theoretical review.
-                        */
-                       if (elm[0].base.obj_id == elm[1].base.obj_id &&
-                           elm[0].base.rec_type == elm[1].base.rec_type &&
-                           elm[0].base.key == elm[1].base.key) {
-                               /*
-                                * Left side transaction space
-                                */
-                               save_key = cursor->key_beg.key;
-                               cursor->key_beg.key = elm[0].base.key;
-                               r = hammer_btree_cmp(&cursor->key_beg,
-                                                    &elm[0].base);
-                               cursor->key_beg.key = save_key;
-
-                               /*
-                                * Right side transaction space
-                                */
-                               save_key = cursor->key_end.key;
-                               cursor->key_end.key = elm[1].base.key;
-                               s = hammer_btree_cmp(&cursor->key_end,
-                                                    &elm[1].base);
-                               cursor->key_end.key = save_key;
-
-                               /*
-                                * If our range is entirely on one side or
-                                * the other side we can skip the sub-tree.
-                                */
-                               if ((r < 0 && s < 0) || (r > 0 && s > 0)) {
-                                       ++cursor->index;
-                                       continue;
-                               }
+                       r = hammer_btree_cmp(&cursor->key_end, &elm[0].base);
+                       s = hammer_btree_cmp(&cursor->key_beg, &elm[1].base);
+                       kprintf("SCAN %d r/s %d/%d\n", cursor->index, r, s);
+                       if (r < 0) {
+                               error = ENOENT;
+                               break;
                        }
-#endif
-                       error = hammer_cursor_down(cursor);
-                       if (error)
+                       if (r == 0 && (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
+                               error = ENOENT;
                                break;
-                       KKASSERT(cursor->index == 0);
-                       node = cursor->node->ondisk;
-                       continue;
-               }
-
-               /*
-                * We are at a leaf.
-                *
-                * Determine if the record at the cursor has gone beyond the
-                * end of our range.  Remember that our key range is inclusive.
-                *
-                * When iterating we may have to 'pick out' records matching
-                * our transaction requirements.  A comparison return of
-                * +1 or -1 indicates a transactional record that is too
-                * old or too new but does not terminate the search.
-                */
-               elm = &node->elms[cursor->index];
-               r = hammer_btree_range_cmp(cursor, &elm->base);
-               if (r == -1 || r == 1) {
-                       ++cursor->index;
-                       continue;
+                       }
+                       KKASSERT(s <= 0);
+                       if ((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0 ||
+                           elm->internal.rec_offset == 0) {
+                               error = hammer_cursor_down(cursor);
+                               if (error)
+                                       break;
+                               KKASSERT(cursor->index == 0);
+                               node = cursor->node->ondisk;
+                               continue;
+                       }
+               } else {
+                       elm = &node->elms[cursor->index];
+                       r = hammer_btree_cmp(&cursor->key_end, &elm->base);
+                       if (r < 0) {
+                               error = ENOENT;
+                               break;
+                       }
+                       if (r == 0 && (cursor->flags & HAMMER_CURSOR_END_INCLUSIVE) == 0) {
+                               error = ENOENT;
+                               break;
+                       }
+                       if ((cursor->flags & HAMMER_CURSOR_ALLHISTORY) == 0 &&
+                           hammer_btree_chkts(cursor->key_beg.create_tid,
+                                              &elm->base) != 0) {
+                               ++cursor->index;
+                               continue;
+                       }
                }
 
                /*
-                * We either found a match or are now out of bounds.
+                * Return entry
                 */
-               error = r ? ENOENT : 0;
-               break;
+               return(0);
        }
        return(error);
 }
@@ -278,14 +248,33 @@ hammer_btree_lookup(hammer_cursor_t cursor)
        return(error);
 }
 
+/*
+ * Execute the logic required to start an iteration.  The first record
+ * located within the specified range is returned and iteration control
+ * flags are adjusted for successive hammer_btree_iterate() calls.
+ */
+int
+hammer_btree_first(hammer_cursor_t cursor)
+{
+       int error;
+
+       error = hammer_btree_lookup(cursor);
+       if (error == ENOENT) {
+               cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
+               error = hammer_btree_iterate(cursor);
+       }
+       cursor->flags |= HAMMER_CURSOR_ATEDISK;
+       return(error);
+}
+
 /*
  * Extract the record and/or data associated with the cursor's current
  * position.  Any prior record or data stored in the cursor is replaced.
  * The cursor must be positioned at a leaf node.
  *
- * NOTE: Only records can be extracted from internal B-Tree nodes, and
- *       only for inter-cluster references.  At the moment we only support
- *      extractions from leaf nodes.
+ * NOTE: Most extractions occur at the leaf of the B-Tree.  The only
+ *      extraction allowed at an internal element is at a cluster-push.
+ *      Cluster-push elements have records but no data.
  */
 int
 hammer_btree_extract(hammer_cursor_t cursor, int flags)
@@ -295,6 +284,7 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags)
        hammer_cluster_t cluster;
        u_int64_t buf_type;
        int32_t cloff;
+       int32_t roff;
        int error;
 
        /*
@@ -305,11 +295,28 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags)
         * as the record reference must be handled.
         */
        node = cursor->node->ondisk;
-       KKASSERT(node->type == HAMMER_BTREE_TYPE_LEAF);
        elm = &node->elms[cursor->index];
        cluster = cursor->node->cluster;
+       cursor->flags &= ~HAMMER_CURSOR_DATA_EMBEDDED;
+       cursor->data = NULL;
        error = 0;
 
+       /*
+        * Internal elements can only be cluster pushes.  A cluster push has
+        * no data reference.
+        */
+       if (node->type == HAMMER_BTREE_TYPE_INTERNAL) {
+               cloff = elm->leaf.rec_offset;
+               KKASSERT(cloff != 0);
+               cursor->record = hammer_bread(cluster, cloff,
+                                             HAMMER_FSBUF_RECORDS, &error,
+                                             &cursor->record_buffer);
+               return(error);
+       }
+
+       /*
+        * Leaf element.
+        */
        if ((flags & HAMMER_CURSOR_GET_RECORD) && error == 0) {
                cloff = elm->leaf.rec_offset;
                cursor->record = hammer_bread(cluster, cloff,
@@ -347,11 +354,17 @@ hammer_btree_extract(hammer_cursor_t cursor, int flags)
                         * other records extracted during an iteration
                         * go back to it.
                         *
+                        * The data must be embedded in the record for this
+                        * case to be hit.
+                        *
                         * Just assume the buffer type is correct.
                         */
                        cursor->data = (void *)
                                ((char *)cursor->record_buffer->ondisk +
                                 (elm->leaf.data_offset & HAMMER_BUFMASK));
+                       roff = (char *)cursor->data - (char *)cursor->record;
+                       KKASSERT (roff >= 0 && roff < HAMMER_RECORD_SIZE);
+                       cursor->flags |= HAMMER_CURSOR_DATA_EMBEDDED;
                }
        }
        return(error);
@@ -519,21 +532,22 @@ hammer_btree_delete(hammer_cursor_t cursor)
  *
  * Search a cluster's B-Tree for cursor->key_beg, return the matching node.
  *
- * The search begins at the current node and will instantiate a NULL
- * parent if necessary and if not at the root of the cluster.  On return
- * parent will be non-NULL unless the cursor is sitting at a root-leaf.
- *
- * The search code may be forced to iterate up the tree if the conditions
- * required for an insertion or deletion are not met.  This does not occur
- * very often.
- *
+ * The search can begin ANYWHERE in the B-Tree.  As a first step the search
+ * iterates up the tree as necessary to properly position itself prior to
+ * actually doing the sarch.
+ * 
  * INSERTIONS: The search will split full nodes and leaves on its way down
- * and guarentee that the leaf it ends up on is not full.
+ * and guarentee that the leaf it ends up on is not full.  If we run out
+ * of space the search continues to the leaf (to position the cursor for
+ * the spike), but ENOSPC is returned.
  *
- * DELETIONS: The search will rebalance the tree on its way down.
+ * DELETIONS: The search will rebalance the tree on its way down. XXX
  *
  * The search is only guarenteed to end up on a leaf if an error code of 0
  * is returned, or if inserting and an error code of ENOENT is returned.
+ * Otherwise it can stop at an internal node.  On success a search returns
+ * a leaf node unless INCLUSTER is set and the search located a cluster push
+ * node (which is an internal node).
  */
 static 
 int
@@ -542,6 +556,7 @@ btree_search(hammer_cursor_t cursor, int flags)
        hammer_node_ondisk_t node;
        hammer_cluster_t cluster;
        int error;
+       int enospc = 0;
        int i;
        int r;
 
@@ -559,6 +574,9 @@ btree_search(hammer_cursor_t cursor, int flags)
         * First see if we can skip the whole cluster.  hammer_cursor_up()
         * handles both cases but this way we don't check the cluster
         * bounds when going up the tree within a cluster.
+        *
+        * NOTE: If INCLUSTER is set and we are at the root of the cluster,
+        * hammer_cursor_up() will return ENOENT.
         */
        cluster = cursor->node->cluster;
        while (
@@ -661,6 +679,7 @@ btree_search(hammer_cursor_t cursor, int flags)
                if ((flags & HAMMER_CURSOR_DELETE) && cursor->parent == NULL) {
                        error = btree_collapse(cursor);
                        /* node becomes stale after call */
+                       /* XXX ENOSPC */
                        if (error)
                                goto done;
                }
@@ -693,9 +712,16 @@ btree_search(hammer_cursor_t cursor, int flags)
                 * actually related to the internal element's child
                 * specification and not to the key.  These have to be
                 * retained.
+                *
+                * If we terminate at i == count it is still possible to
+                * be to the RIGHT of the RIGHT boundary but still to the
+                * LEFT of the parent's RIGHT boundary.  The solution is to
+                * adjust the RIGHT boundary to match the parent.  This
+                * case can occur due to deletions (see btree_remove()).
                 */
                if (i == 0) {
                        u_int8_t save;
+
                        if ((flags & HAMMER_CURSOR_INSERT) == 0) {
                                cursor->index = 0;
                                return(ENOENT);
@@ -704,6 +730,32 @@ btree_search(hammer_cursor_t cursor, int flags)
                        node->elms[0].base = *cursor->left_bound;
                        node->elms[0].subtree_type = save;
                        hammer_modify_node(cursor->node);
+               } else if (i == node->count) {
+                       /*
+                        * Terminate early if not inserting and the key is
+                        * beyond the uncorrected right hand boundary.  The
+                        * index must be PAST the last element to prevent an
+                        * iteration from returning elements to the left of
+                        * key_beg.
+                        */
+                       if ((flags & HAMMER_CURSOR_INSERT) == 0 &&
+                           hammer_btree_cmp(&cursor->key_beg,
+                                            &node->elms[i].base) >= 0
+                       ) {
+                               cursor->index = i;
+                               return(ENOENT);
+                       }
+
+                       /*
+                        * Correct a right-hand boundary mismatch.  The push
+                        * index is the last element (i-1).
+                        */
+                       if (hammer_btree_cmp(&node->elms[i].base,
+                                            cursor->right_bound) != 0) {
+                               node->elms[i].base = *cursor->right_bound;
+                               hammer_modify_node(cursor->node);
+                       }
+                       --i;
                } else {
                        /*
                         * The push-down index is now i - 1.
@@ -722,8 +774,12 @@ btree_search(hammer_cursor_t cursor, int flags)
                if (flags & HAMMER_CURSOR_INSERT) {
                        if (node->count == HAMMER_BTREE_INT_ELMS) {
                                error = btree_split_internal(cursor);
-                               if (error)
-                                       goto done;
+                               if (error) {
+                                       if (error != ENOSPC)
+                                               goto done;
+                                       enospc = 1;
+                                       flags &= ~HAMMER_CURSOR_INSERT;
+                               }
                                /*
                                 * reload stale pointers
                                 */
@@ -759,10 +815,26 @@ btree_search(hammer_cursor_t cursor, int flags)
                        }
                }
 #endif
+               /*
+                * Cluster pushes are done with internal elements.  If this
+                * is a cluster push (rec_offset != 0), and INCLUSTER is set,
+                * we stop here.
+                *
+                * However, because this is an internal node we have to
+                * determine whether key_beg is within its range and return
+                * 0 or ENOENT appropriately.
+                */
+               if ((flags & HAMMER_CURSOR_INCLUSTER) &&
+                   node->elms[i].internal.rec_offset) {
+                       r = hammer_btree_cmp(&cursor->key_beg,
+                                            &node->elms[i+1].base);
+                       error = (r < 0) ? 0 : (enospc ? ENOSPC : ENOENT);
+                       goto done;
+               }
 
                /*
                 * Push down (push into new node, existing node becomes
-                * the parent).
+                * the parent) and continue the search.
                 */
                error = hammer_cursor_down(cursor);
                /* node and cluster become stale */
@@ -774,9 +846,12 @@ btree_search(hammer_cursor_t cursor, int flags)
 
        /*
         * We are at a leaf, do a linear search of the key array.
-        * (XXX do a binary search).  On success the index is set to the
-        * matching element, on failure the index is set to the insertion
-        * point.
+        *
+        * On success the index is set to the matching element and 0
+        * is returned.
+        *
+        * On failure the index is set to the insertion point and ENOENT
+        * is returned.
         *
         * Boundaries are not stored in leaf nodes, so the index can wind
         * up to the left of element 0 (index == 0) or past the end of
@@ -814,14 +889,27 @@ btree_search(hammer_cursor_t cursor, int flags)
        if ((flags & HAMMER_CURSOR_INSERT) &&
            node->count == HAMMER_BTREE_LEAF_ELMS) {
                error = btree_split_leaf(cursor);
+               if (error) {
+                       if (error != ENOSPC)
+                               goto done;
+                       enospc = 1;
+                       flags &= ~HAMMER_CURSOR_INSERT;
+               }
+               /*
+                * reload stale pointers
+                */
                /* NOT USED
                i = cursor->index;
                node = &cursor->node->internal;
                */
-               if (error)
-                       goto done;
        }
-       error = ENOENT;
+
+       /*
+        * We reached a leaf but did not find the key we were looking for.
+        * If this is an insert we will be properly positioned for an insert
+        * (ENOENT) or spike (ENOSPC) operation.
+        */
+       error = enospc ? ENOSPC : ENOENT;
 done:
        return(error);
 }
@@ -970,6 +1058,7 @@ btree_split_internal(hammer_cursor_t cursor)
         * Remember that base.count does not include the right-hand boundary.
         */
        ondisk = parent->ondisk;
+       KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
        ondisk->elms[parent_index].internal.subtree_count = split;
        parent_elm = &ondisk->elms[parent_index+1];
        bcopy(parent_elm, parent_elm + 1,
@@ -1154,12 +1243,11 @@ btree_split_leaf(hammer_cursor_t cursor)
         * We are copying parent_index+1 to parent_index+2, not +0 to +1.
         */
        ondisk = parent->ondisk;
+       KKASSERT(ondisk->count != HAMMER_BTREE_INT_ELMS);
        ondisk->elms[parent_index].internal.subtree_count = split;
        parent_elm = &ondisk->elms[parent_index+1];
-       if (parent_index + 1 != ondisk->count) {
-               bcopy(parent_elm, parent_elm + 1,
-                     (ondisk->count - parent_index - 1) * esize);
-       }
+       bcopy(parent_elm, parent_elm + 1,
+             (ondisk->count - parent_index) * esize);
        hammer_make_separator(&elm[-1].base, &elm[0].base, &parent_elm->base);
        parent_elm->internal.subtree_offset = new_leaf->node_offset;
        parent_elm->internal.subtree_count = new_leaf->ondisk->count;
@@ -1308,13 +1396,22 @@ btree_remove(hammer_cursor_t cursor)
         * Remove the element at (node, index) and adjust the parent's
         * subtree_count.
         *
-        * NOTE!  Removing the element will cause the left-hand boundary
-        * to no longer match the child node:
+        * NOTE! If removing element 0 an internal node's left-hand boundary
+        * will no longer match its parent.  If removing a mid-element the
+        * boundary will no longer match a child's left hand or right hand
+        * boundary.
+        *
+        *      BxBxBxB         remove a (x[0]): internal node's left-hand
+        *       | | |                           boundary no longer matches
+        *       a b c                           parent.
+        *
+        *                      remove b (x[1]): a's right hand boundary no
+        *                                       longer matches parent.
         *
-        * LxMxR (left, middle, right).  If the first 'x' element is
-        * removed then you get LxR and L no longer matches the left-hand
-        * boundary of the sub-node pointed to by what used to be the
-        * second x.  This case is handled in btree_search().
+        *                      remove c (x[2]): b's right hand boundary no
+        *                                       longer matches parent.
+        *
+        * These cases are corrected in btree_search().
         */
 #if 0
        kprintf("BTREE_REMOVE: Removing element %d\n", cursor->index);
@@ -1383,11 +1480,11 @@ btree_set_parent(hammer_node_t node, hammer_btree_elm_t elm)
                break;
        case HAMMER_BTREE_TYPE_CLUSTER:
                volume = hammer_get_volume(node->cluster->volume->hmp,
-                                       elm->internal.subtree_volno, &error);
+                                       elm->internal.subtree_vol_no, &error);
                if (error)
                        break;
                cluster = hammer_get_cluster(volume,
-                                       elm->internal.subtree_cluid,
+                                       elm->internal.subtree_clu_no,
                                        &error, 0);
                 hammer_rel_volume(volume, 0);
                if (error)
@@ -1795,132 +1892,41 @@ done:
  ************************************************************************/
 
 /*
- * Compare two B-Tree elements, return -1, 0, or +1 (e.g. similar to strcmp).
+ * Compare two B-Tree elements, return -N, 0, or +N (e.g. similar to strcmp).
  *
  * See also hammer_rec_rb_compare() and hammer_rec_cmp() in hammer_object.c.
- *
- * Note that key1 and key2 are treated differently.  key1 is allowed to
- * wildcard some of its fields by setting them to 0, while key2 is expected
- * to be in an on-disk form (no wildcards).
  */
 int
 hammer_btree_cmp(hammer_base_elm_t key1, hammer_base_elm_t key2)
 {
-#if 0
-       kprintf("compare obj_id %016llx %016llx\n",
-               key1->obj_id, key2->obj_id);
-       kprintf("compare rec_type %04x %04x\n",
-               key1->rec_type, key2->rec_type);
-       kprintf("compare key %016llx %016llx\n",
-               key1->key, key2->key);
-#endif
+       if (key1->obj_id < key2->obj_id)
+               return(-4);
+       if (key1->obj_id > key2->obj_id)
+               return(4);
 
-       /*
-        * A key1->obj_id of 0 matches any object id
-        */
-       if (key1->obj_id) {
-               if (key1->obj_id < key2->obj_id)
-                       return(-4);
-               if (key1->obj_id > key2->obj_id)
-                       return(4);
-       }
-
-       /*
-        * A key1->rec_type of 0 matches any record type.
-        */
-       if (key1->rec_type) {
-               if (key1->rec_type < key2->rec_type)
-                       return(-3);
-               if (key1->rec_type > key2->rec_type)
-                       return(3);
-       }
+       if (key1->rec_type < key2->rec_type)
+               return(-3);
+       if (key1->rec_type > key2->rec_type)
+               return(3);
 
-       /*
-        * There is no special case for key.  0 means 0.
-        */
        if (key1->key < key2->key)
                return(-2);
        if (key1->key > key2->key)
                return(2);
-
-       /*
-        * This test has a number of special cases.  create_tid in key1 is
-        * the as-of transction id, and delete_tid in key1 is NOT USED.
-        *
-        * A key1->create_tid of 0 matches any record regardles of when
-        * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
-        * used to search for the most current state of the object.
-        *
-        * key2->create_tid is a HAMMER record and will never be
-        * 0.   key2->delete_tid is the deletion transaction id or 0 if 
-        * the record has not yet been deleted.
-        */
-       if (key1->create_tid) {
-               if (key1->create_tid < key2->create_tid)
-                       return(-1);
-               if (key2->delete_tid && key1->create_tid >= key2->delete_tid)
-                       return(1);
-       }
-
        return(0);
 }
 
 /*
- * Compare the element against the cursor's beginning and ending keys
+ * Test a non-zero timestamp against an element to determine whether the
+ * element is visible.
  */
 int
-hammer_btree_range_cmp(hammer_cursor_t cursor, hammer_base_elm_t key2)
+hammer_btree_chkts(hammer_tid_t create_tid, hammer_base_elm_t base)
 {
-       /*
-        * A cursor->key_beg.obj_id of 0 matches any object id
-        */
-       if (cursor->key_beg.obj_id) {
-               if (cursor->key_end.obj_id < key2->obj_id)
-                       return(-4);
-               if (cursor->key_beg.obj_id > key2->obj_id)
-                       return(4);
-       }
-
-       /*
-        * A cursor->key_beg.rec_type of 0 matches any record type.
-        */
-       if (cursor->key_beg.rec_type) {
-               if (cursor->key_end.rec_type < key2->rec_type)
-                       return(-3);
-               if (cursor->key_beg.rec_type > key2->rec_type)
-                       return(3);
-       }
-
-       /*
-        * There is no special case for key.  0 means 0.
-        */
-       if (cursor->key_end.key < key2->key)
-               return(-2);
-       if (cursor->key_beg.key > key2->key)
-               return(2);
-
-       /*
-        * This test has a number of special cases.  create_tid in key1 is
-        * the as-of transction id, and delete_tid in key1 is NOT USED.
-        *
-        * A key1->create_tid of 0 matches any record regardles of when
-        * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
-        * used to search for the most current state of the object.
-        *
-        * key2->create_tid is a HAMMER record and will never be
-        * 0.   key2->delete_tid is the deletion transaction id or 0 if 
-        * the record has not yet been deleted.
-        *
-        * NOTE: only key_beg.create_tid is used for create_tid, we can only
-        * do as-of scans at the moment.
-        */
-       if (cursor->key_beg.create_tid) {
-               if (cursor->key_beg.create_tid < key2->create_tid)
-                       return(-1);
-               if (key2->delete_tid && cursor->key_beg.create_tid >= key2->delete_tid)
-                       return(1);
-       }
-
+       if (create_tid < base->create_tid)
+               return(-1);
+       if (base->delete_tid && create_tid >= base->delete_tid)
+               return(1);
        return(0);
 }
 
@@ -2029,9 +2035,9 @@ hammer_print_btree_elm(hammer_btree_elm_t elm, u_int8_t type, int i)
                        kprintf("\tcluster_rec  = %08x\n",
                                elm->internal.rec_offset);
                        kprintf("\tcluster_id   = %08x\n",
-                               elm->internal.subtree_cluid);
+                               elm->internal.subtree_clu_no);
                        kprintf("\tvolno        = %08x\n",
-                               elm->internal.subtree_volno);
+                               elm->internal.subtree_vol_no);
                } else {
                        kprintf("\tsubtree_off  = %08x\n",
                                elm->internal.subtree_offset);
index 171aca1..4a04c80 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.6 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_btree.h,v 1.7 2007/12/29 09:01:27 dillon Exp $
  */
 
 /*
@@ -128,11 +128,11 @@ struct hammer_btree_internal_elm {
        struct hammer_base_elm base;
        int32_t rec_offset;     /* 0 indicates internal reference */
        int32_t subtree_offset; /* offset or cluster id */
-       int32_t subtree_volno;  /* unused or volume number */
+       int32_t subtree_vol_no; /* unused or volume number */
        int32_t subtree_count;  /* hint: can be too small, but not too big */
 };
 
-#define subtree_cluid  subtree_offset
+#define subtree_clu_no subtree_offset
 #define subtree_type   base.subtree_type
 
 /*
index a0b4f31..192315b 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.5 2007/11/30 00:16:56 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.c,v 1.6 2007/12/29 09:01:27 dillon Exp $
  */
 
 /*
@@ -70,6 +70,27 @@ hammer_init_cursor_hmp(hammer_cursor_t cursor, hammer_mount_t hmp)
        return(error);
 }
 
+/*
+ * Initialize a fresh cursor at the root of the specified cluster and
+ * limit operations to within the cluster.
+ */
+int
+hammer_init_cursor_cluster(hammer_cursor_t cursor, hammer_cluster_t cluster)
+{
+       int error;
+
+       bzero(cursor, sizeof(*cursor));
+       cursor->flags |= HAMMER_CURSOR_INCLUSTER;
+       cursor->node = hammer_get_node(cluster,
+                                      cluster->ondisk->clu_btree_root,
+                                      &error);
+       if (error == 0) {
+               hammer_lock_ex(&cursor->node->lock);
+               error = hammer_load_cursor_parent(cursor);
+       }
+       return(error);
+}
+
 /*
  * Initialize a fresh cursor using the B-Tree node cached by the
  * in-memory inode.
@@ -183,6 +204,7 @@ hammer_get_parent_node(hammer_node_t node, int *errorp)
                /*
                 * At root of root cluster, there is no parent.
                 */
+               KKASSERT(cluster->ondisk->clu_btree_parent_vol_no == -1);
                parent = NULL;
                *errorp = 0;
        }
@@ -194,6 +216,10 @@ hammer_get_parent_node(hammer_node_t node, int *errorp)
  * cases.  (1) The parent is in the current cluster.  (2) We are at the
  * root of the cluster and the parent is in another cluster.  (3) We are at
  * the root of the root cluster.
+ *
+ * If HAMMER_CURSOR_INCLUSTER is set and we are at the root of the cluster,
+ * we do not access the parent cluster at all and make the cursor look like
+ * it's at the root.
  */
 static
 int
@@ -206,7 +232,9 @@ hammer_load_cursor_parent(hammer_cursor_t cursor)
 
        if (cursor->node->ondisk->parent) {
                error = hammer_load_cursor_parent_local(cursor);
-       } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0) {
+       } else if (cluster->ondisk->clu_btree_parent_vol_no >= 0 &&
+                  ((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0)
+       ) {
                error = hammer_load_cursor_parent_cluster(cursor);
        } else {
                cursor->parent = NULL;
@@ -261,7 +289,8 @@ int
 hammer_load_cursor_parent_cluster(hammer_cursor_t cursor)
 {
        hammer_cluster_ondisk_t ondisk;
-       hammer_cluster_t cluster;
+       hammer_cluster_t pcluster;
+       hammer_cluster_t ccluster;
        hammer_volume_t volume;
        hammer_node_t node;
        hammer_node_t parent;
@@ -272,23 +301,25 @@ hammer_load_cursor_parent_cluster(hammer_cursor_t cursor)
        int i;
 
        node = cursor->node;
-       ondisk = node->cluster->ondisk;
+       ccluster = node->cluster;
+       ondisk = ccluster->ondisk;
        vol_no = ondisk->clu_btree_parent_vol_no;
        clu_no = ondisk->clu_btree_parent_clu_no;
 
        /*
         * Acquire the node from (volume, cluster, offset)
         */
-       volume = hammer_get_volume(node->cluster->volume->hmp, vol_no, &error);
+       volume = hammer_get_volume(ccluster->volume->hmp, vol_no, &error);
        if (error)
                return (error);
-       cluster = hammer_get_cluster(volume, clu_no, &error, 0);
+       pcluster = hammer_get_cluster(volume, clu_no, &error, 0);
        hammer_rel_volume(volume, 0);
        if (error)
                return (error);
-       parent = hammer_get_node(cluster, ondisk->clu_btree_parent_offset,
+       parent = hammer_get_node(pcluster, ondisk->clu_btree_parent_offset,
                                 &error);
-       hammer_rel_cluster(cluster, 0);
+       hammer_rel_cluster(pcluster, 0);
+       kprintf("parent %p clu_no %d\n", parent, clu_no);
        if (error)
                return (error);
 
@@ -298,8 +329,11 @@ hammer_load_cursor_parent_cluster(hammer_cursor_t cursor)
        elm = NULL;
        for (i = 0; i < parent->ondisk->count; ++i) {
                elm = &parent->ondisk->elms[i];
+               if (elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER)
+                       kprintf("SUBTEST CLU %d\n", elm->internal.subtree_clu_no);
                if (elm->internal.rec_offset != 0 &&
-                   elm->internal.subtree_cluid == clu_no) {
+                   elm->internal.subtree_type == HAMMER_BTREE_TYPE_CLUSTER &&
+                   elm->internal.subtree_clu_no == cursor->node->cluster->clu_no) {
                        break;
                }
        }
@@ -310,9 +344,9 @@ hammer_load_cursor_parent_cluster(hammer_cursor_t cursor)
        cursor->right_bound = &elm[1].internal.base;
 
        KKASSERT(hammer_btree_cmp(cursor->left_bound,
-                &parent->cluster->ondisk->clu_btree_beg) == 0);
+                &ccluster->ondisk->clu_btree_beg) == 0);
        KKASSERT(hammer_btree_cmp(cursor->right_bound,
-                &parent->cluster->ondisk->clu_btree_end) == 0);
+                &ccluster->ondisk->clu_btree_end) == 0);
 
        if (hammer_lock_ex_try(&parent->lock) != 0) {
                hammer_unlock(&cursor->node->lock);
@@ -367,6 +401,10 @@ hammer_cursor_up(hammer_cursor_t cursor, int nonblock)
 
        if (cursor->node == NULL) {
                error = ENOENT;
+       } else if ((cursor->flags & HAMMER_CURSOR_INCLUSTER) &&
+                  cursor->node->ondisk->parent == 0
+       ) {
+               error = ENOENT;
        } else {
                error = hammer_load_cursor_parent(cursor);
        }
@@ -464,10 +502,14 @@ hammer_cursor_down(hammer_cursor_t cursor)
        /*
         * Ok, push down into elm.  If rec_offset is non-zero this is
         * an inter-cluster push, otherwise it is a intra-cluster push.
+        *
+        * Cursoring down through a cluster transition when the INCLUSTER
+        * flag is set is not legal.
         */
        if (elm->internal.rec_offset) {
-               vol_no = elm->internal.subtree_volno;
-               clu_no = elm->internal.subtree_cluid;
+               KKASSERT((cursor->flags & HAMMER_CURSOR_INCLUSTER) == 0);
+               vol_no = elm->internal.subtree_vol_no;
+               clu_no = elm->internal.subtree_clu_no;
                volume = hammer_get_volume(node->cluster->volume->hmp,
                                           vol_no, &error);
                if (error)
@@ -481,9 +523,14 @@ hammer_cursor_down(hammer_cursor_t cursor)
                                       &error);
                hammer_rel_cluster(cluster, 0);
        } else {
+               KKASSERT(elm->internal.subtree_offset != 0);
                node = hammer_get_node(node->cluster,
                                       elm->internal.subtree_offset,
                                       &error);
+               if (error == 0) {
+                       KKASSERT(elm->internal.subtree_type == node->ondisk->type);
+                       KKASSERT(node->ondisk->parent == cursor->parent->node_offset);
+               }
        }
        if (error == 0) {
                hammer_lock_ex(&node->lock);
index 29a36af..b5c8adc 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.4 2007/11/30 00:16:56 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_cursor.h,v 1.5 2007/12/29 09:01:27 dillon Exp $
  */
 
 /*
@@ -100,13 +100,17 @@ typedef struct hammer_cursor *hammer_cursor_t;
 
 #define HAMMER_CURSOR_GET_RECORD       0x0001
 #define HAMMER_CURSOR_GET_DATA         0x0002
-#define HAMMER_CURSOR_CLUSTER_TAG      0x0004  /* stop at the cluster tag */
+#define HAMMER_CURSOR_INCLUSTER                0x0004  /* stay in the cluster */
 #define HAMMER_CURSOR_INSERT           0x0008  /* adjust for insert */
 #define HAMMER_CURSOR_DELETE           0x0010  /* adjust for delete */
+#define HAMMER_CURSOR_END_INCLUSIVE    0x0020  /* key_end is inclusive */
+#define HAMMER_CURSOR_END_EXCLUSIVE    0x0040  /* key_end is exclusive (def) */
+#define HAMMER_CURSOR_ALLHISTORY       0x0080  /* return entire history */
 
 #define HAMMER_CURSOR_ATEDISK          0x0100
 #define HAMMER_CURSOR_ATEMEM           0x0200
 #define HAMMER_CURSOR_DISKEOF          0x0400
 #define HAMMER_CURSOR_MEMEOF           0x0800
 #define HAMMER_CURSOR_DELBTREE         0x1000  /* ip_delete from b-tree */
+#define HAMMER_CURSOR_DATA_EMBEDDED    0x2000  /* embedded flag on extract */
 
index be8b1e0..341489a 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.11 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.12 2007/12/29 09:01:27 dillon Exp $
  */
 
 #include "hammer.h"
@@ -274,7 +274,6 @@ hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
        ip->ino_rec.ino_size = 0;
        ip->ino_rec.ino_nlinks = 0;
        /* XXX */
-       kprintf("rootvol %p ondisk %p\n", hmp->rootvol, hmp->rootvol->ondisk);
        ip->ino_rec.base.rec_id = hammer_alloc_recid(trans);
        KKASSERT(ip->ino_rec.base.rec_id != 0);
        ip->ino_rec.base.base.obj_id = ip->obj_id;
@@ -321,6 +320,7 @@ int
 hammer_update_inode(hammer_inode_t ip)
 {
        struct hammer_cursor cursor;
+       struct hammer_cursor *spike = NULL;
        hammer_record_t record;
        int error;
 
@@ -335,6 +335,7 @@ hammer_update_inode(hammer_inode_t ip)
         * XXX Update the inode record and data in-place if the retention
         * policy allows it.
         */
+retry:
        error = 0;
 
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
@@ -373,17 +374,26 @@ hammer_update_inode(hammer_inode_t ip)
                record->rec.inode.base.base.create_tid = ip->last_tid;
                record->rec.inode.base.data_len = sizeof(ip->ino_data);
                record->data = (void *)&ip->ino_data;
-               error = hammer_ip_sync_record(record);
-               hammer_free_mem_record(record);
-               ip->flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
-                              HAMMER_INODE_DELONDISK);
-               if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
-                       ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
-                       hammer_modify_volume(ip->hmp->rootvol);
-                       ip->flags |= HAMMER_INODE_ONDISK;
+               error = hammer_ip_sync_record(record, &spike);
+               hammer_drop_mem_record(record, 1);
+               if (error == ENOSPC) {
+                       error = hammer_spike(&spike);
+                       if (error == 0)
+                               goto retry;
+               }
+               KKASSERT(spike == NULL);
+               if (error == 0) {
+                       ip->flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
+                                      HAMMER_INODE_DELONDISK);
+                       if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
+                               ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
+                               hammer_modify_volume(ip->hmp->rootvol);
+                               ip->flags |= HAMMER_INODE_ONDISK;
+                       }
                }
        }
-       ip->flags &= ~HAMMER_INODE_TID;
+       if (error == 0)
+               ip->flags &= ~HAMMER_INODE_TID;
        return(error);
 }
 
@@ -448,22 +458,36 @@ hammer_modify_inode(struct hammer_transaction *trans,
  * overriding any intermediate TIDs that were used for records.  Note
  * that the dirty buffer cache buffers do not have any knowledge of
  * the transaction id they were modified under.
+ *
+ * If we can't sync due to a cluster becoming full the spike structure
+ * will be filled in and ENOSPC returned.  We must return -ENOSPC to
+ * terminate the RB_SCAN.
  */
 static int
-hammer_sync_inode_callback(hammer_record_t rec, void *data __unused)
+hammer_sync_inode_callback(hammer_record_t rec, void *data)
 {
+       struct hammer_cursor **spike = data;
        int error;
 
-       error = 0;
+       hammer_ref(&rec->lock);
+       hammer_lock_ex(&rec->lock);
        if ((rec->flags & HAMMER_RECF_DELETED) == 0)
-               error = hammer_ip_sync_record(rec);
+               error = hammer_ip_sync_record(rec, spike);
+       else
+               error = 0;
+
+       if (error == ENOSPC) {
+               hammer_drop_mem_record(rec, 0);
+               return(-error);
+       }
 
        if (error) {
                kprintf("hammer_sync_inode_callback: sync failed rec %p\n",
                        rec);
-               return(-1);
+               hammer_drop_mem_record(rec, 0);
+               return(-error);
        }
-       hammer_free_mem_record(rec);
+       hammer_drop_mem_record(rec, 1); /* ref & lock eaten by call */
        return(0);
 }
 
@@ -474,8 +498,8 @@ int
 hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete)
 {
        struct hammer_transaction trans;
+       struct hammer_cursor *spike = NULL;
        int error;
-       int r;
 
        hammer_lock_ex(&ip->lock);
        hammer_start_transaction(&trans, ip->hmp);
@@ -492,12 +516,17 @@ hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete)
         * force the inode update later on to use our transaction id or
         * the delete_tid of the inode may be less then the create_tid of
         * the inode update.  XXX shouldn't happen but don't take the chance.
+        *
+        * NOTE: The call to hammer_ip_delete_range() cannot return ENOSPC
+        * so we can pass a NULL spike structure, because no partial data
+        * deletion can occur (yet).
         */
        if (ip->ino_rec.ino_nlinks == 0 && handle_delete) {
                if (ip->vp)
                        vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
                error = hammer_ip_delete_range(&trans, ip,
-                                              HAMMER_MIN_KEY, HAMMER_MAX_KEY);
+                                              HAMMER_MIN_KEY, HAMMER_MAX_KEY,
+                                              NULL);
                KKASSERT(RB_EMPTY(&ip->rec_tree));
                ip->flags &= ~HAMMER_INODE_TID;
                ip->ino_rec.base.base.delete_tid = trans.tid;
@@ -518,11 +547,18 @@ hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete)
        /*
         * Now sync related records
         */
-       if (error == 0) {
-               r = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
-                           hammer_sync_inode_callback, NULL);
-               if (r < 0)
-                       error = EIO;
+       for (;;) {
+               error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
+                               hammer_sync_inode_callback, &spike);
+               KKASSERT(error <= 0);
+               if (error < 0)
+                       error = -error;
+               if (error == ENOSPC) {
+                       error = hammer_spike(&spike);
+                       if (error == 0)
+                               continue;
+               }
+               break;
        }
 
        /*
@@ -541,8 +577,12 @@ hammer_sync_inode(hammer_inode_t ip, int waitfor, int handle_delete)
                 * flushed to the disk in the first place.
                 */
                ip->flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY);
-               while (RB_ROOT(&ip->rec_tree))
-                       hammer_free_mem_record(RB_ROOT(&ip->rec_tree));
+               while (RB_ROOT(&ip->rec_tree)) {
+                       hammer_record_t rec = RB_ROOT(&ip->rec_tree);
+                       hammer_ref(&rec->lock);
+                       hammer_lock_ex(&rec->lock);
+                       hammer_drop_mem_record(rec, 1);
+               }
                break;
        case HAMMER_INODE_ONDISK:
                /*
index 37864c3..eaf7493 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.6 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.7 2007/12/29 09:01:27 dillon Exp $
  */
 /*
  * IO Primitives and buffer cache management
@@ -267,12 +267,12 @@ hammer_io_release(struct hammer_io *io, int flush)
                 */
                if (io->released)
                        regetblk(bp);
+               io->released = 1;
                bp->b_flags &= ~B_LOCKED;
                if (io->modified || (bp->b_flags & B_DELWRI))
                        bawrite(bp);
                else
                        bqrelse(bp);
-               io->released = 1;
                hammer_io_disassociate(iou);
        }
 }
@@ -312,8 +312,14 @@ hammer_io_flush(struct hammer_io *io, struct hammer_sync_info *info)
                        io->modified = 0;
                        bawrite(bp);
                } else {
-                       kprintf("can't flush, %d refs\n", io->lock.refs);
-                       /* structure is in-use, don't race the write */
+                       /*
+                        * structure is in-use, don't race the write, but
+                        * also set B_LOCKED so we know something tried to
+                        * flush it.
+                        */
+                       kprintf("can't flush bp %p, %d refs - delaying\n",
+                               bp, io->lock.refs);
+                       bp->b_flags |= B_LOCKED;
                        bqrelse(bp);
                }
        }
index ab0e2a8..bb0bd65 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.9 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.10 2007/12/29 09:01:27 dillon Exp $
  */
 
 #include "hammer.h"
@@ -40,6 +40,7 @@ static int hammer_mem_add(hammer_transaction_t trans,
                             hammer_record_t record);
 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
+static void hammer_free_mem_record(hammer_record_t record);
 
 /*
  * Red-black tree support.
@@ -67,19 +68,11 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
 static int
 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
 {
-        /*
-         * A key1->rec_type of 0 matches any record type.
-         */
-        if (info->rec_type) {
-                if (info->rec_type < rec->rec.base.base.rec_type)
-                        return(-3);
-                if (info->rec_type > rec->rec.base.base.rec_type)
-                        return(3);
-        }
+       if (info->rec_type < rec->rec.base.base.rec_type)
+               return(-3);
+       if (info->rec_type > rec->rec.base.base.rec_type)
+               return(3);
 
-        /*
-         * There is no special case for key.  0 means 0.
-         */
         if (info->key < rec->rec.base.base.key)
                 return(-2);
         if (info->key > rec->rec.base.base.key)
@@ -141,7 +134,8 @@ RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
                    hammer_rec_compare, hammer_base_elm_t);
 
 /*
- * Allocate a record for the caller to finish filling in
+ * Allocate a record for the caller to finish filling in.  The record is
+ * returned referenced and locked.
  */
 hammer_record_t
 hammer_alloc_mem_record(hammer_inode_t ip)
@@ -150,12 +144,14 @@ hammer_alloc_mem_record(hammer_inode_t ip)
 
        record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
        record->ip = ip;
+       hammer_ref(&record->lock);
+       hammer_lock_ex(&record->lock);
        return (record);
 }
 
 /*
- * Release a memory record.  If the record is marked for defered deletion,
- * destroy the record when the last reference goes away.
+ * Release a memory record.  If the record was marked for deferred deletion,
+ * and no references remain, the record is physically destroyed.
  */
 void
 hammer_rel_mem_record(struct hammer_record **recordp)
@@ -163,24 +159,37 @@ hammer_rel_mem_record(struct hammer_record **recordp)
        hammer_record_t rec;
 
        if ((rec = *recordp) != NULL) {
-               if (hammer_islastref(&rec->lock)) {
-                       hammer_unref(&rec->lock);
+               hammer_unref(&rec->lock);
+               if (rec->lock.refs == 0) {
                        if (rec->flags & HAMMER_RECF_DELETED)
                                hammer_free_mem_record(rec);
-               } else {
-                       hammer_unref(&rec->lock);
                }
                *recordp = NULL;
        }
 }
 
+/*
+ * Drop a locked hammer in-memory record.  This function unlocks and
+ * dereferences the record.  If delete != 0 the record is marked for
+ * deletion.  Physical deletion only occurs when the last reference goes
+ * away.
+ */
+void
+hammer_drop_mem_record(hammer_record_t rec, int delete)
+{
+       if (delete)
+               rec->flags |= HAMMER_RECF_DELETED;
+       hammer_unlock(&rec->lock);
+       hammer_rel_mem_record(&rec);
+}
+
 /*
  * Free a record.  Clean the structure up even though we are throwing it
  * away as a sanity check.  The actual free operation is delayed while
  * the record is referenced.  However, the record is removed from the RB
  * tree immediately.
  */
-void
+static void
 hammer_free_mem_record(hammer_record_t record)
 {
        if (record->flags & HAMMER_RECF_ONRBTREE) {
@@ -401,7 +410,8 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
  */
 int
 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
-                      int64_t offset, void *data, int bytes)
+                      int64_t offset, void *data, int bytes,
+                      struct hammer_cursor **spike)
 {
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
@@ -473,6 +483,12 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
 fail1:
        hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
 done:
+       /*
+        * If ENOSPC in cluster fill in the spike structure and return
+        * ENOSPC.
+        */
+       if (error == ENOSPC)
+               hammer_load_spike(&cursor, spike);
        hammer_done_cursor(&cursor);
        return(error);
 }
@@ -483,7 +499,7 @@ done:
  * writing a record out to the disk.
  */
 int
-hammer_ip_sync_record(hammer_record_t record)
+hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
 {
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
@@ -498,7 +514,12 @@ hammer_ip_sync_record(hammer_record_t record)
        cursor.flags = HAMMER_CURSOR_INSERT;
 
        /*
-        * Issue a lookup to position the cursor and locate the cluster
+        * Issue a lookup to position the cursor and locate the cluster.  The
+        * target key should not exist.
+        *
+        * If we run out of space trying to adjust the B-Tree for the
+        * insert, re-lookup without the insert flag so the cursor
+        * is properly positioned for the spike.
         */
        error = hammer_btree_lookup(&cursor);
        if (error == 0) {
@@ -574,19 +595,135 @@ hammer_ip_sync_record(hammer_record_t record)
 fail1:
        if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
                hammer_free_data_ptr(cursor.data_buffer, bdata,
-                                    rec->base.data_len);
+                                    record->rec.base.data_len);
        }
 done:
+       /*
+        * If ENOSPC in cluster fill in the spike structure and return
+        * ENOSPC.
+        */
+       if (error == ENOSPC)
+               hammer_load_spike(&cursor, spike);
        hammer_done_cursor(&cursor);
        return(error);
 }
 
+/*
+ * Write out a record using the specified cursor.  The caller does not have
+ * to seek the cursor.  The flags are used to determine whether the data
+ * (if any) is embedded in the record or not.
+ *
+ * The target cursor will be modified by this call.  Note in particular
+ * that HAMMER_CURSOR_INSERT is set.
+ */
+int
+hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
+                   void *data, int cursor_flags)
+{
+       union hammer_btree_elm elm;
+       hammer_record_ondisk_t nrec;
+       void *bdata;
+       int error;
+
+       cursor->key_beg = orec->base.base;
+       cursor->flags |= HAMMER_CURSOR_INSERT;
+
+       /*
+        * Issue a lookup to position the cursor and locate the cluster.  The
+        * target key should not exist.
+        *
+        * If we run out of space trying to adjust the B-Tree for the
+        * insert, re-lookup without the insert flag so the cursor
+        * is properly positioned for the spike.
+        */
+       error = hammer_btree_lookup(cursor);
+       if (error == 0) {
+               kprintf("hammer_write_record: duplicate rec at (%016llx)\n",
+                       orec->base.base.key);
+               error = EIO;
+       }
+       if (error != ENOENT)
+               goto done;
+
+       /*
+        * Allocate record and data space now that we know which cluster
+        * the B-Tree node ended up in.
+        */
+       if (data == NULL ||
+           (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
+               bdata = data;
+       } else {
+               bdata = hammer_alloc_data(cursor->node->cluster,
+                                         orec->base.data_len, &error,
+                                         &cursor->data_buffer);
+               if (bdata == NULL)
+                       goto done;
+       }
+       nrec = hammer_alloc_record(cursor->node->cluster, &error,
+                                 &cursor->record_buffer);
+       if (nrec == NULL)
+               goto fail1;
+
+       /*
+        * Fill everything in and insert our B-Tree node.
+        *
+        * XXX assign rec_id here
+        */
+       *nrec = *orec;
+       nrec->base.data_offset = 0;
+       if (bdata) {
+               nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
+               if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
+                       /*
+                        * Data embedded in record
+                        */
+                       nrec->base.data_offset = ((char *)bdata - (char *)orec);
+                       KKASSERT(nrec->base.data_offset >= 0 && 
+                                nrec->base.data_offset + nrec->base.data_len <
+                                 sizeof(*nrec));
+                       nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
+               } else {
+                       /*
+                        * Data separate from record
+                        */
+                       nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
+                       bcopy(data, bdata, nrec->base.data_len);
+                       hammer_modify_buffer(cursor->data_buffer);
+               }
+       }
+       nrec->base.rec_id = 0;  /* XXX */
+
+       hammer_modify_buffer(cursor->record_buffer);
+
+       elm.leaf.base = nrec->base.base;
+       elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
+       elm.leaf.data_offset = nrec->base.data_offset;
+       elm.leaf.data_len = nrec->base.data_len;
+       elm.leaf.data_crc = nrec->base.data_crc;
+
+       error = hammer_btree_insert(cursor, &elm);
+       if (error == 0)
+               goto done;
+
+       hammer_free_record_ptr(cursor->record_buffer, nrec);
+fail1:
+       if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
+               hammer_free_data_ptr(cursor->data_buffer, bdata,
+                                    orec->base.data_len);
+       }
+done:
+       /* leave cursor intact */
+       return(error);
+}
 
 /*
  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
  * entry's key is used to deal with hash collisions in the upper 32 bits.
  * A unique 64 bit key is generated in-memory and may be regenerated a
  * second time when the directory record is flushed to the on-disk B-Tree.
+ *
+ * A locked and referenced record is passed to this function.  This function
+ * eats the lock and reference.
  */
 static
 int
@@ -594,7 +731,7 @@ hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
 {
        while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
-                       hammer_free_mem_record(record);
+                       hammer_drop_mem_record(record, 1);
                        return (EEXIST);
                }
                if (++trans->hmp->namekey_iterator == 0)
@@ -603,6 +740,7 @@ hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
                record->rec.base.base.key |= trans->hmp->namekey_iterator;
        }
        record->flags |= HAMMER_RECF_ONRBTREE;
+       hammer_drop_mem_record(record, 0);
        return(0);
 }
 
@@ -676,7 +814,8 @@ hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
         * function to validate the first record after the begin key.
         *
         * The ATEDISK flag is used by hammer_btree_iterate to determine
-        * whether it must index forwards or not.
+        * whether it must index forwards or not.  It is also used here
+        * to select the next record from in-memory or on-disk.
         */
        if (ip->flags & HAMMER_INODE_ONDISK) {
                error = hammer_btree_lookup(cursor);
@@ -793,8 +932,7 @@ hammer_ip_next(hammer_cursor_t cursor)
                 * Both entries valid
                 */
                elm = &cursor->node->ondisk->elms[cursor->index];
-               r = hammer_btree_cmp(&elm->base,
-                                    &cursor->iprec->rec.base.base);
+               r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
                if (r < 0) {
                        error = hammer_btree_extract(cursor,
                                                     HAMMER_CURSOR_GET_RECORD);
@@ -857,10 +995,13 @@ hammer_ip_resolve_data(hammer_cursor_t cursor)
  *
  * NOTE: Record keys for regular file data have to be special-cased since
  * they indicate the end of the range (key = base + bytes).
+ *
+ * NOTE: The spike structure must be filled in if we return ENOSPC.
  */
 int
 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
-                      int64_t ran_beg, int64_t ran_end)
+                      int64_t ran_beg, int64_t ran_end,
+                      struct hammer_cursor **spike)
 {
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
@@ -901,6 +1042,7 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
                        cursor.key_end.key = ran_end + MAXPHYS + 1;
                isregfile = 1;
        }
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 
        error = hammer_ip_first(&cursor, ip);
 
@@ -992,7 +1134,7 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
         */
        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
        if (cursor->record == &cursor->iprec->rec) {
-               hammer_free_mem_record(cursor->iprec);
+               hammer_free_mem_record(cursor->iprec); /* XXX */
                return(0);
        }
 
@@ -1043,8 +1185,8 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        }
                        hammer_free_record(cluster, rec_offset);
-                       if (data_offset - rec_offset < 0 ||
-                           data_offset - rec_offset >= HAMMER_RECORD_SIZE) {
+                       if (data_offset && (data_offset - rec_offset < 0 ||
+                           data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
                                hammer_free_data(cluster, data_offset,data_len);
                        }
                }
index 8e06bac..6436f26 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.10 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.11 2007/12/29 09:01:27 dillon Exp $
  */
 /*
  * Manage HAMMER's on-disk structures.  These routines are primarily
@@ -495,7 +495,6 @@ hammer_load_volume(hammer_volume_t volume)
                        volume->alist.meta = ondisk->vol_almeta.normal;
                        volume->alist.info = NULL;
                }
-               hammer_alist_init(&volume->alist);
        } else {
                error = 0;
        }
@@ -786,8 +785,10 @@ hammer_load_cluster(hammer_cluster_t cluster, int isnew)
                cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
                cluster->alist_mdata.info = cluster;
 
-               cluster->clu_btree_beg = ondisk->clu_btree_beg;
-               cluster->clu_btree_end = ondisk->clu_btree_end;
+               if (isnew == 0) {
+                       cluster->clu_btree_beg = ondisk->clu_btree_beg;
+                       cluster->clu_btree_end = ondisk->clu_btree_end;
+               }
        } else if (isnew) {
                error = hammer_io_new(volume->devvp, &cluster->io);
        } else {
@@ -800,8 +801,12 @@ hammer_load_cluster(hammer_cluster_t cluster, int isnew)
                 * responsible for the remainder.
                 */
                struct hammer_alist_live dummy;
+               hammer_node_t croot;
+               hammer_volume_ondisk_t voldisk;
+               int32_t nbuffers;
 
                ondisk = cluster->ondisk;
+               voldisk = volume->ondisk;
 
                dummy.config = &Buf_alist_config;
                dummy.meta = ondisk->head.buf_almeta;
@@ -812,6 +817,46 @@ hammer_load_cluster(hammer_cluster_t cluster, int isnew)
                hammer_alist_init(&cluster->alist_btree);
                hammer_alist_init(&cluster->alist_record);
                hammer_alist_init(&cluster->alist_mdata);
+
+               ondisk->vol_fsid = voldisk->vol_fsid;
+               ondisk->vol_fstype = voldisk->vol_fstype;
+               ondisk->clu_gen = 1;
+               ondisk->clu_id = 0;     /* XXX */
+               ondisk->clu_no = cluster->clu_no;
+               ondisk->clu_flags = 0;
+               ondisk->clu_start = HAMMER_BUFSIZE;
+               KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
+               if (voldisk->vol_clo_end - cluster->io.offset >
+                   voldisk->vol_clsize) {
+                       ondisk->clu_limit = voldisk->vol_clsize;
+               } else {
+                       ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
+                                                     cluster->io.offset);
+               }
+               nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
+               hammer_alist_free(&cluster->alist_master, 1, nbuffers - 1);
+               ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
+               ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
+               ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
+
+               /*
+                * Initialize the B-Tree.  We don't know what the caller
+                * intends to do with the cluster so make sure it causes
+                * an assertion if the caller makes no changes.
+                */
+               ondisk->clu_btree_parent_vol_no = -2;
+               ondisk->clu_btree_parent_clu_no = -2;
+               ondisk->clu_btree_parent_offset = -2;
+               ondisk->clu_btree_parent_clu_gen = -2;
+
+               croot = hammer_alloc_btree(cluster, &error);
+               if (error == 0) {
+                       bzero(croot->ondisk, sizeof(*croot->ondisk));
+                       croot->ondisk->count = 0;
+                       croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
+                       ondisk->clu_btree_root = croot->node_offset;
+                       hammer_modify_cluster(cluster);
+               }
        }
        hammer_unlock(&cluster->io.lock);
        return (error);
@@ -1415,6 +1460,109 @@ hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node)
  *                             A-LIST ALLOCATORS                       *
  ************************************************************************/
 
+/*
+ * Allocate HAMMER clusters
+ */
+hammer_cluster_t
+hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
+                    int *errorp)
+{
+       hammer_volume_t volume;
+       hammer_cluster_t cluster;
+       int32_t clu_no;
+       int32_t clu_hint;
+       int32_t vol_beg;
+       int32_t vol_no;
+
+       /*
+        * Figure out our starting volume and hint.
+        */
+       if (cluster_hint) {
+               vol_beg = cluster_hint->volume->vol_no;
+               clu_hint = cluster_hint->clu_no;
+       } else {
+               vol_beg = hmp->volume_iterator;
+               clu_hint = -1;
+       }
+
+       /*
+        * Loop through volumes looking for a free cluster.  If allocating
+        * a new cluster relative to an existing cluster try to find a free
+        * cluster on either side (clu_hint >= 0), otherwise just do a
+        * forwards iteration.
+        */
+       vol_no = vol_beg;
+       do {
+               volume = hammer_get_volume(hmp, vol_no, errorp);
+               kprintf("VOLUME %p %d\n", volume, vol_no);
+               if (*errorp) {
+                       clu_no = HAMMER_ALIST_BLOCK_NONE;
+                       break;
+               }
+               if (clu_hint == -1) {
+                       clu_hint = volume->clu_iterator;
+                       clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
+                                                       clu_hint);
+                       if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
+                               clu_no = hammer_alist_alloc_fwd(&volume->alist,
+                                                               1, 0);
+                       }
+               } else {
+                       clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
+                                                       clu_hint);
+                       if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
+                               clu_no = hammer_alist_alloc_rev(&volume->alist,
+                                                               1, clu_hint);
+                       }
+               }
+               if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
+                       hammer_modify_volume(volume);
+                       break;
+               }
+               hammer_rel_volume(volume, 0);
+               volume = NULL;
+               *errorp = ENOSPC;
+               vol_no = (vol_no + 1) % hmp->nvolumes;
+               clu_hint = -1;
+       } while (vol_no != vol_beg);
+
+       /*
+        * Acquire the cluster.  On success this will force *errorp to 0.
+        */
+       if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
+               kprintf("ALLOC CLUSTER %d\n", clu_no);
+               cluster = hammer_get_cluster(volume, clu_no, errorp, 1);
+               volume->clu_iterator = clu_no;
+               hammer_rel_volume(volume, 0);
+       } else {
+               cluster = NULL;
+       }
+       if (cluster)
+               hammer_lock_ex(&cluster->io.lock);
+       return(cluster);
+}
+
+void
+hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
+                   hammer_base_elm_t right_bound)
+{
+       hammer_cluster_ondisk_t ondisk = cluster->ondisk;
+
+       ondisk->clu_btree_beg = *left_bound;
+       ondisk->clu_btree_end = *right_bound;
+       cluster->clu_btree_beg = ondisk->clu_btree_beg;
+       cluster->clu_btree_end = ondisk->clu_btree_end;
+}
+
+/*
+ * Deallocate a cluster
+ */
+void
+hammer_free_cluster(hammer_cluster_t cluster)
+{
+       hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
+}
+
 /*
  * Allocate HAMMER elements - btree nodes, data storage, and record elements
  *
@@ -1742,7 +1890,12 @@ alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live,
 
        isfwd = (type != HAMMER_FSBUF_RECORDS);
        buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
-       KKASSERT(buf_no != HAMMER_ALIST_BLOCK_NONE); /* XXX */
+       if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
+               *errorp = ENOSPC;
+               *bufferp = NULL;
+               return;
+       }
+
        hammer_modify_cluster(cluster);
 
        /*
@@ -1936,7 +2089,7 @@ calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
                      (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
                      ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
                       HAMMER_VOL_SUPERCLUSTER_GROUP))
-                     * HAMMER_BUFSIZE;
+                     * volume->vol_clsize;
        } else {
                off = volume->cluster_base +
                      (int64_t)clu_no * volume->vol_clsize;
@@ -2226,7 +2379,7 @@ super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
        int error = 0;
 
        scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
-       supercl = hammer_get_supercl(volume, scl_no, &error, 1);
+       supercl = hammer_get_supercl(volume, scl_no, &error, 0);
        if (supercl) {
                r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
                if (r != HAMMER_ALIST_BLOCK_NONE)
@@ -2252,7 +2405,7 @@ super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
        int error = 0;
 
        scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
-       supercl = hammer_get_supercl(volume, scl_no, &error, 1);
+       supercl = hammer_get_supercl(volume, scl_no, &error, 0);
        if (supercl) {
                r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
                if (r != HAMMER_ALIST_BLOCK_NONE)
@@ -2277,7 +2430,7 @@ super_alist_free(void *info, int32_t blk, int32_t radix,
        int error = 0;
 
        scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
-       supercl = hammer_get_supercl(volume, scl_no, &error, 1);
+       supercl = hammer_get_supercl(volume, scl_no, &error, 0);
        if (supercl) {
                hammer_alist_free(&supercl->alist, base_blk, count);
                *emptyp = hammer_alist_isempty(&supercl->alist);
index 14dbae8..6ee7271 100644 (file)
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/Attic/hammer_spike.c,v 1.1 2007/11/07 00:43:24 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/Attic/hammer_spike.c,v 1.2 2007/12/29 09:01:27 dillon Exp $
  */
 
+#include "hammer.h"
+
+/*
+ * Load spike info given a cursor.  The cursor must point to the leaf node
+ * that needs to be spiked.
+ */
+void
+hammer_load_spike(hammer_cursor_t cursor, struct hammer_cursor **spikep)
+{
+       hammer_cursor_t spike;
+
+       KKASSERT(cursor->node->ondisk->type == HAMMER_BTREE_TYPE_LEAF);
+       KKASSERT(*spikep == NULL);
+       *spikep = spike = kmalloc(sizeof(*spike), M_HAMMER, M_WAITOK|M_ZERO);
+
+       spike->parent = cursor->parent;
+       spike->parent_index = cursor->parent_index;
+       spike->node = cursor->node;
+       spike->index = cursor->index;
+       spike->left_bound = cursor->left_bound;
+       spike->right_bound = cursor->right_bound;
+
+       if (spike->parent) {
+               hammer_ref_node(spike->parent);
+               hammer_lock_ex(&spike->parent->lock);
+       }
+       hammer_ref_node(spike->node);
+       hammer_lock_ex(&spike->node->lock);
+       kprintf("LOAD SPIKE %p\n", spike);
+}
+
 /*
  * Spike code - make room in a cluster by spiking in a new cluster.
+ *
+ * The spike structure contains a locked and referenced B-Tree leaf node.
+ * The spike at a minimum must replace the node with a cluster reference
+ * and then delete the contents of the node.
+ *
+ * Various optimizations are desirable, including merging the spike node
+ * with an adjacent node that has already been spiked, if its cluster is
+ * not full, or promoting the spike node to the parent cluster of the current
+ * cluster when it represents the right hand boundary leaf node in the
+ * cluster (to avoid append chains).
  */
+int
+hammer_spike(struct hammer_cursor **spikep)
+{
+       hammer_cursor_t spike;
+       struct hammer_cursor ncursor;
+       hammer_cluster_t ocluster;
+       hammer_cluster_t ncluster;
+       hammer_node_t onode;
+       int error;
+
+       kprintf("hammer_spike: ENOSPC in cluster, spiking\n");
+       /*Debugger("ENOSPC");*/
+
+       /*
+        * Lock and flush the cluster XXX
+        */
+       spike = *spikep;
+       KKASSERT(spike != NULL);
+       KKASSERT(spike->parent &&
+                spike->parent->cluster == spike->node->cluster);
+
+       error = 0;
+       onode = spike->node;
+       ocluster = onode->cluster;
+       hammer_lock_ex(&ocluster->io.lock);
+
+       /* XXX */
+
+       /*
+        * Allocate and lock a new cluster, initialize its bounds.
+        */
+       ncluster = hammer_alloc_cluster(ocluster->volume->hmp, ocluster,
+                                       &error);
+       if (ncluster == NULL)
+               goto failed3;
+       hammer_init_cluster(ncluster, spike->left_bound, spike->right_bound);
+
+       /*
+        * Get a cursor for the new cluster.  Operations will be limited to
+        * this cluster.
+        */
+       error = hammer_init_cursor_cluster(&ncursor, ncluster);
+       if (error)
+               goto failed2;
+
+       /*
+        * Copy the elements in the leaf node.
+        */
+       for (spike->index = 0; spike->index < onode->ondisk->count; 
+            ++spike->index) {
+               error = hammer_btree_extract(spike, HAMMER_CURSOR_GET_RECORD |
+                                                   HAMMER_CURSOR_GET_DATA);
+               if (error)
+                       goto failed1;
+               kprintf("EXTRACT %04x %016llx %016llx\n",
+                       spike->record->base.base.rec_type,
+                       spike->record->base.base.obj_id,
+                       spike->record->base.base.key);
+               error = hammer_write_record(&ncursor, spike->record,
+                                           spike->data, spike->flags);
+               if (error == ENOSPC) {
+                       kprintf("impossible ENOSPC error on spike\n");
+                       error = EIO;
+               }
+               if (error)
+                       goto failed1;
+       }
+
+       /*
+        * Success!  Replace the parent reference to the leaf node with a
+        * parent reference to the new cluster and fixup the new cluster's
+        * parent reference.  This completes the spike operation.
+        */
+       {
+               hammer_node_ondisk_t ondisk;
+               hammer_btree_elm_t elm;
+               
+               ondisk = spike->parent->ondisk;
+               elm = &ondisk->elms[spike->parent_index];
+               elm->internal.subtree_type = HAMMER_BTREE_TYPE_CLUSTER;
+               elm->internal.rec_offset = -1; /* XXX */
+               elm->internal.subtree_clu_no = ncluster->clu_no;
+               elm->internal.subtree_vol_no = ncluster->volume->vol_no;
+               elm->internal.subtree_count = onode->ondisk->count; /*XXX*/
+               hammer_modify_node(spike->parent);
+               hammer_flush_node(onode);
+       }
+       {
+               hammer_cluster_ondisk_t ondisk;
+
+               ondisk = ncluster->ondisk;
+               ondisk->clu_btree_parent_vol_no = ocluster->volume->vol_no;
+               ondisk->clu_btree_parent_clu_no = ocluster->clu_no;
+               ondisk->clu_btree_parent_offset = spike->parent->node_offset;
+               ondisk->clu_btree_parent_clu_gen = ocluster->ondisk->clu_gen;
+               hammer_modify_cluster(ncluster);
+       }
+
+       /*
+        * Delete the records and data associated with the old leaf node,
+        * then free the old leaf node (nothing references it any more).
+        */
+       for (spike->index = 0; spike->index < onode->ondisk->count; 
+            ++spike->index) {
+               hammer_btree_elm_t elm;
+               int32_t roff;
+
+               elm = &onode->ondisk->elms[spike->index];
+               KKASSERT(elm->leaf.rec_offset > 0);
+               hammer_free_record(ocluster, elm->leaf.rec_offset);
+               if (elm->leaf.data_offset) {
+                       roff = elm->leaf.data_offset - elm->leaf.rec_offset;
+                       if (roff < 0 || roff >= HAMMER_RECORD_SIZE) {
+                               hammer_free_data(ocluster,
+                                                elm->leaf.data_offset,
+                                                elm->leaf.data_len);
+                       }
+               }
+       }
+       hammer_free_btree(ocluster, onode->node_offset);
+
+       /*
+        * XXX I/O dependency - new cluster must be flushed before current
+        * cluster can be flushed.
+        */
+       /*Debugger("COPY COMPLETE");*/
+       hammer_done_cursor(&ncursor);
+       goto success;
+
+       /*
+        * Cleanup
+        */
+failed1:
+       hammer_done_cursor(&ncursor);
+failed2:
+       hammer_free_cluster(ncluster);
+success:
+       hammer_unlock(&ncluster->io.lock);
+       hammer_rel_cluster(ncluster, 0);
+failed3:
+       kprintf("UNLOAD SPIKE %p %d\n", spike, error);
+       hammer_unlock(&ocluster->io.lock);
+       hammer_done_cursor(spike);
+       kfree(spike, M_HAMMER);
+       *spikep = NULL;
+       return (error);
+}
 
index ce83a58..7d004ab 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.3 2007/11/20 07:16:28 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_transaction.c,v 1.4 2007/12/29 09:01:27 dillon Exp $
  */
 
 #include "hammer.h"
@@ -52,7 +52,6 @@ void
 hammer_abort_transaction(struct hammer_transaction *trans)
 {
        hammer_rel_volume(trans->rootvol, 0);
-       KKASSERT(0);
 }
 
 void
index f267bf9..75171fd 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.9 2007/12/14 08:05:39 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vfsops.c,v 1.10 2007/12/29 09:01:27 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -143,6 +143,7 @@ hammer_vfs_mount(struct mount *mp, char *mntpt, caddr_t data,
         * Load volumes
         */
        path = objcache_get(namei_oc, M_WAITOK);
+       hmp->nvolumes = info.nvolumes;
        for (i = 0; i < info.nvolumes; ++i) {
                error = copyin(&info.volumes[i], &upath, sizeof(char *));
                if (error == 0)
index 1ae9964..dec31cf 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.9 2007/11/30 00:16:56 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.10 2007/12/29 09:01:27 dillon Exp $
  */
 
 #include <sys/param.h>
@@ -535,6 +535,7 @@ hammer_vop_nresolve(struct vop_nresolve_args *ap)
 
        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 
        /*
         * Scan all matching records (the chain), locate the one matching
@@ -829,6 +830,7 @@ hammer_vop_readdir(struct vop_readdir_args *ap)
 
        cursor.key_end = cursor.key_beg;
        cursor.key_end.key = HAMMER_MAX_KEY;
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
@@ -965,6 +967,7 @@ hammer_vop_nrename(struct vop_nrename_args *ap)
 
        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 
        /*
         * Scan all matching records (the chain), locate the one matching
@@ -1025,6 +1028,7 @@ int
 hammer_vop_setattr(struct vop_setattr_args *ap)
 {
        struct hammer_transaction trans;
+       struct hammer_cursor *spike = NULL;
        struct vattr *vap;
        struct hammer_inode *ip;
        int modflags;
@@ -1078,7 +1082,7 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
-       if (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
+       while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
                switch(ap->a_vp->v_type) {
                case VREG:
                        if (vap->va_size < ip->ino_rec.ino_size) {
@@ -1091,14 +1095,16 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                                        ~(int64_t)HAMMER_BUFMASK;
                        error = hammer_ip_delete_range(&trans, ip,
                                                    aligned_size,
-                                                   0x7FFFFFFFFFFFFFFFLL);
+                                                   0x7FFFFFFFFFFFFFFFLL,
+                                                   &spike);
                        ip->ino_rec.ino_size = vap->va_size;
                        modflags |= HAMMER_INODE_RDIRTY;
                        break;
                case VDATABASE:
                        error = hammer_ip_delete_range(&trans, ip,
                                                    vap->va_size,
-                                                   0x7FFFFFFFFFFFFFFFLL);
+                                                   0x7FFFFFFFFFFFFFFFLL,
+                                                   &spike);
                        ip->ino_rec.ino_size = vap->va_size;
                        modflags |= HAMMER_INODE_RDIRTY;
                        break;
@@ -1106,6 +1112,13 @@ hammer_vop_setattr(struct vop_setattr_args *ap)
                        error = EINVAL;
                        goto done;
                }
+               if (error == ENOSPC) {
+                       error = hammer_spike(&spike);
+                       if (error == 0)
+                               continue;
+               }
+               KKASSERT(spike == NULL);
+               break;
        }
        if (vap->va_atime.tv_sec != VNOVAL) {
                ip->ino_rec.ino_atime =
@@ -1250,6 +1263,7 @@ hammer_vop_strategy_read(struct vop_strategy_args *ap)
                else
                        cursor.key_end.key = ran_end + MAXPHYS + 1;
        }
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 
        error = hammer_ip_first(&cursor, ip);
        boff = 0;
@@ -1325,6 +1339,7 @@ int
 hammer_vop_strategy_write(struct vop_strategy_args *ap)
 {
        struct hammer_transaction trans;
+       struct hammer_cursor *spike = NULL;
        hammer_inode_t ip;
        struct bio *bio;
        struct buf *bp;
@@ -1335,16 +1350,19 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
        ip = ap->a_vp->v_data;
        hammer_start_transaction(&trans, ip->hmp);
 
+retry:
        /*
         * Delete any records overlapping our range.  This function will
-        * properly
+        * (eventually) properly truncate partial overlaps.
         */
        if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
                error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
-                                              bio->bio_offset);
+                                              bio->bio_offset, &spike);
        } else {
                error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
-                                      bio->bio_offset + bp->b_bufsize - 1);
+                                              bio->bio_offset +
+                                               bp->b_bufsize - 1,
+                                              &spike);
        }
 
        /*
@@ -1352,8 +1370,20 @@ hammer_vop_strategy_write(struct vop_strategy_args *ap)
         */
        if (error == 0) {
                error = hammer_ip_sync_data(&trans, ip, bio->bio_offset,
-                                           bp->b_data, bp->b_bufsize);
+                                           bp->b_data, bp->b_bufsize,
+                                           &spike);
+       }
+
+       /*
+        * If we ran out of space the spike structure will be filled in
+        * and we must call hammer_spike with it, then retry.
+        */
+       if (error == ENOSPC) {
+               error = hammer_spike(&spike);
+               if (error == 0)
+                       goto retry;
        }
+       KKASSERT(spike == NULL);
 
        /*
         * If an error occured abort the transaction
@@ -1408,6 +1438,7 @@ hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred,
 
        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 
        hammer_start_transaction(&trans, dip->hmp);