HAMMER 21/many: B-Tree node locking finalization.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
index f1b3d22..ce571f1 100644 (file)
@@ -31,7 +31,7 @@
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * 
- * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.18 2008/01/10 07:41:03 dillon Exp $
+ * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.21 2008/01/18 07:02:41 dillon Exp $
  */
 
 #include "hammer.h"
@@ -57,9 +57,17 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
        if (rec1->rec.base.base.key > rec2->rec.base.base.key)
                return(1);
 
-       if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
+       if (rec1->rec.base.base.delete_tid == 0) {
+               if (rec2->rec.base.base.delete_tid == 0)
+                       return(0);
+               return(1);
+       }
+       if (rec2->rec.base.base.delete_tid == 0)
+               return(-1);
+
+       if (rec1->rec.base.base.delete_tid < rec2->rec.base.base.delete_tid)
                return(-1);
-       if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
+       if (rec1->rec.base.base.delete_tid > rec2->rec.base.base.delete_tid)
                return(1);
         return(0);
 }
@@ -77,26 +85,17 @@ hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
         if (info->key > rec->rec.base.base.key)
                 return(2);
 
-        /*
-         * This test has a number of special cases.  create_tid in key1 is
-         * the as-of transction id, and delete_tid in key1 is NOT USED.
-         *
-         * A key1->create_tid of 0 matches any record regardles of when
-         * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
-         * used to search for the most current state of the object.
-         *
-         * key2->create_tid is a HAMMER record and will never be
-         * 0.   key2->delete_tid is the deletion transaction id or 0 if
-         * the record has not yet been deleted.
-         */
-        if (info->create_tid) {
-                if (info->create_tid < rec->rec.base.base.create_tid)
-                        return(-1);
-                if (rec->rec.base.base.delete_tid &&
-                   info->create_tid >= rec->rec.base.base.delete_tid) {
-                        return(1);
-               }
-        }
+       if (info->delete_tid == 0) {
+               if (rec->rec.base.base.delete_tid == 0)
+                       return(0);
+               return(1);
+       }
+       if (rec->rec.base.base.delete_tid == 0)
+               return(-1);
+       if (info->delete_tid < rec->rec.base.base.delete_tid)
+               return(-1);
+       if (info->delete_tid > rec->rec.base.base.delete_tid)
+               return(1);
         return(0);
 }
 
@@ -120,8 +119,6 @@ hammer_rec_scan_cmp(hammer_record_t rec, void *data)
        r = hammer_rec_compare(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
-       if (r == 0)
-               return(0);
        r = hammer_rec_compare(&cursor->key_end, rec);
        if (r < -1)
                return(1);
@@ -144,6 +141,7 @@ hammer_alloc_mem_record(hammer_inode_t ip)
        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
        record->ip = ip;
+       record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        hammer_ref(&record->lock);
        return (record);
 }
@@ -227,12 +225,11 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data)
        /*
         * Skip if not visible due to our as-of TID
         */
-        if (cursor->key_beg.create_tid) {
-                if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
+        if (cursor->flags & HAMMER_CURSOR_ASOF) {
+                if (cursor->asof < rec->rec.base.base.create_tid)
                         return(0);
                 if (rec->rec.base.base.delete_tid &&
-                   cursor->key_beg.create_tid >=
-                    rec->rec.base.base.delete_tid) {
+                   cursor->asof >= rec->rec.base.base.delete_tid) {
                         return(0);
                }
         }
@@ -352,6 +349,9 @@ hammer_ip_add_directory(struct hammer_transaction *trans,
  * cursor must be seeked to the directory entry record being deleted.
  *
  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
+ *
+ * This function can return EDEADLK requiring the caller to terminate
+ * the cursor and retry.
  */
 int
 hammer_ip_del_directory(struct hammer_transaction *trans,
@@ -366,12 +366,16 @@ hammer_ip_del_directory(struct hammer_transaction *trans,
         * One less link.  The file may still be open in the OS even after
         * all links have gone away so we only try to sync if the OS has
         * no references and nlinks falls to 0.
+        *
+        * We have to terminate the cursor before syncing the inode to
+        * avoid deadlocking against ourselves.
         */
        if (error == 0) {
                --ip->ino_rec.ino_nlinks;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
                if (ip->ino_rec.ino_nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
+                       hammer_done_cursor(cursor);
                        hammer_sync_inode(ip, MNT_NOWAIT, 1);
                }
 
@@ -436,15 +440,17 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
        void *bdata;
        int error;
 
+retry:
        error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
        if (error)
                return(error);
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.key = offset + bytes;
-       cursor.key_beg.create_tid = trans->tid;
+       cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
-       cursor.flags = HAMMER_CURSOR_INSERT;
+       cursor.asof = trans->tid;
+       cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_ASOF;
 
        /*
         * Issue a lookup to position the cursor and locate the cluster
@@ -477,7 +483,12 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
         * Fill everything in and insert our B-Tree node.
         */
        hammer_modify_buffer(cursor.record_buffer);
-       rec->base.base = cursor.key_beg;
+       rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
+       rec->base.base.obj_id = ip->obj_id;
+       rec->base.base.key = offset + bytes;
+       rec->base.base.create_tid = trans->tid;
+       rec->base.base.delete_tid = 0;
+       rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
        rec->base.data_crc = crc32(data, bytes);
        rec->base.rec_id = 0;   /* XXX */
        rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
@@ -486,7 +497,7 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
        hammer_modify_buffer(cursor.data_buffer);
        bcopy(data, bdata, bytes);
 
-       elm.leaf.base = cursor.key_beg;
+       elm.leaf.base = rec->base.base;
        elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
        elm.leaf.data_offset = rec->base.data_offset;
        elm.leaf.data_len = bytes;
@@ -516,6 +527,8 @@ done:
        if (error == ENOSPC)
                hammer_load_spike(&cursor, spike);
        hammer_done_cursor(&cursor);
+       if (error == EDEADLK)
+               goto retry;
        return(error);
 }
 
@@ -534,12 +547,13 @@ hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
        void *bdata;
        int error;
 
+retry:
        error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
                                       record->ip->hmp);
        if (error)
                return(error);
        cursor.key_beg = record->rec.base.base;
-       cursor.flags = HAMMER_CURSOR_INSERT;
+       cursor.flags |= HAMMER_CURSOR_INSERT;
 
        /*
         * Issue a lookup to position the cursor and locate the cluster.  The
@@ -551,21 +565,23 @@ hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
         * insert, re-lookup without the insert flag so the cursor
         * is properly positioned for the spike.
         */
-again:
-       error = hammer_btree_lookup(&cursor);
-       if (error == 0) {
-               if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
-                       hmp = cursor.node->cluster->volume->hmp;
-                       if (++hmp->namekey_iterator == 0)
-                               ++hmp->namekey_iterator;
-                       record->rec.base.base.key &= ~(0xFFFFFFFFLL);
-                       record->rec.base.base.key |= hmp->namekey_iterator;
-                       goto again;
+       for (;;) {
+               error = hammer_btree_lookup(&cursor);
+               if (error)
+                       break;
+               if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
+                       kprintf("hammer_ip_sync_record: duplicate rec "
+                               "at (%016llx)\n", record->rec.base.base.key);
+                       Debugger("duplicate record1");
+                       error = EIO;
+                       break;
                }
-               kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
-                       record->rec.base.base.key);
-               Debugger("duplicate record1");
-               error = EIO;
+               hmp = cursor.node->cluster->volume->hmp;
+               if (++hmp->namekey_iterator == 0)
+                       ++hmp->namekey_iterator;
+               record->rec.base.base.key &= ~(0xFFFFFFFFLL);
+               record->rec.base.base.key |= hmp->namekey_iterator;
+               cursor.key_beg.key = record->rec.base.base.key;
        }
        if (error != ENOENT)
                goto done;
@@ -637,7 +653,7 @@ again:
        }
        rec->base.rec_id = 0;   /* XXX */
 
-       elm.leaf.base = cursor.key_beg;
+       elm.leaf.base = record->rec.base.base;
        elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
        elm.leaf.data_offset = rec->base.data_offset;
        elm.leaf.data_len = rec->base.data_len;
@@ -672,6 +688,8 @@ done:
        if (error == ENOSPC)
                hammer_load_spike(&cursor, spike);
        hammer_done_cursor(&cursor);
+       if (error == EDEADLK)
+               goto retry;
        return(error);
 }
 
@@ -682,6 +700,9 @@ done:
  *
  * The target cursor will be modified by this call.  Note in particular
  * that HAMMER_CURSOR_INSERT is set.
+ *
+ * NOTE: This can return EDEADLK, requiring the caller to release its cursor
+ * and retry the operation.
  */
 int
 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
@@ -865,6 +886,9 @@ hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
  *
  * When 0 is returned hammer_ip_next() may be used to iterate additional
  * records within the requested range.
+ *
+ * This function can return EDEADLK, requiring the caller to terminate
+ * the cursor and try again.
  */
 int
 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
@@ -890,10 +914,14 @@ hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
         * The ATEDISK flag is used by hammer_btree_iterate to determine
         * whether it must index forwards or not.  It is also used here
         * to select the next record from in-memory or on-disk.
+        *
+        * EDEADLK can only occur if the lookup hit an empty internal
+        * element and couldn't delete it.  Since this could only occur
+        * in-range, we can just iterate from the failure point.
         */
        if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
                error = hammer_btree_lookup(cursor);
-               if (error == ENOENT) {
+               if (error == ENOENT || error == EDEADLK) {
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        error = hammer_btree_iterate(cursor);
                }
@@ -1087,12 +1115,15 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
        int error;
        int64_t off;
 
+retry:
        hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
 
        cursor.key_beg.obj_id = ip->obj_id;
-       cursor.key_beg.create_tid = ip->obj_asof;
+       cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
+       cursor.asof = ip->obj_asof;
+       cursor.flags |= HAMMER_CURSOR_ASOF;
 
        cursor.key_end = cursor.key_beg;
        if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
@@ -1187,6 +1218,8 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);
+       if (error == EDEADLK)
+               goto retry;
        if (error == ENOENT)
                error = 0;
        return(error);
@@ -1204,10 +1237,11 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
        hammer_base_elm_t base;
        int error;
 
+retry:
        hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
 
        cursor.key_beg.obj_id = ip->obj_id;
-       cursor.key_beg.create_tid = ip->obj_asof;
+       cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
@@ -1217,7 +1251,8 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
        cursor.key_end.rec_type = 0xFFFF;
        cursor.key_end.key = HAMMER_MAX_KEY;
 
-       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
+       cursor.asof = ip->obj_asof;
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
 
        error = hammer_ip_first(&cursor, ip);
 
@@ -1243,13 +1278,18 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);
+       if (error == EDEADLK)
+               goto retry;
        if (error == ENOENT)
                error = 0;
        return(error);
 }
 
 /*
- * Delete the record at the current cursor
+ * Delete the record at the current cursor.
+ *
+ * NOTE: This can return EDEADLK, requiring the caller to terminate the
+ * cursor and retry.
  */
 int
 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
@@ -1277,10 +1317,14 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
                hammer_modify_buffer(cursor->record_buffer);
                cursor->record->base.base.delete_tid = tid;
 
-               hammer_modify_node(cursor->node);
-               elm = &cursor->node->ondisk->elms[cursor->index];
-               elm->leaf.base.delete_tid = tid;
-               hammer_update_syncid(cursor->record_buffer->cluster, tid);
+               error = hammer_cursor_upgrade(cursor);
+               if (error == 0) {
+                       hammer_modify_node(cursor->node);
+                       elm = &cursor->node->ondisk->elms[cursor->index];
+                       elm->leaf.base.delete_tid = tid;
+                       hammer_update_syncid(cursor->record_buffer->cluster,
+                                            tid);
+               }
        }
 
        /*
@@ -1342,7 +1386,7 @@ hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
        hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
 
        cursor.key_beg.obj_id = ip->obj_id;
-       cursor.key_beg.create_tid = ip->obj_asof;
+       cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
@@ -1352,7 +1396,8 @@ hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
        cursor.key_end.rec_type = 0xFFFF;
        cursor.key_end.key = HAMMER_MAX_KEY;
 
-       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
+       cursor.asof = ip->obj_asof;
+       cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
 
        error = hammer_ip_first(&cursor, ip);
        if (error == ENOENT)