X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/blobdiff_plain/f5fd56a2b6e2c3063c3273222d56018a80e5e2db..6a37e7e4b492073cdb3c293d5a142462d05342eb:/sys/vfs/hammer/hammer_object.c diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index f1b3d22a6a..ce571f1783 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -31,7 +31,7 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.18 2008/01/10 07:41:03 dillon Exp $ + * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.21 2008/01/18 07:02:41 dillon Exp $ */ #include "hammer.h" @@ -57,9 +57,17 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) if (rec1->rec.base.base.key > rec2->rec.base.base.key) return(1); - if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid) + if (rec1->rec.base.base.delete_tid == 0) { + if (rec2->rec.base.base.delete_tid == 0) + return(0); + return(1); + } + if (rec2->rec.base.base.delete_tid == 0) + return(-1); + + if (rec1->rec.base.base.delete_tid < rec2->rec.base.base.delete_tid) return(-1); - if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid) + if (rec1->rec.base.base.delete_tid > rec2->rec.base.base.delete_tid) return(1); return(0); } @@ -77,26 +85,17 @@ hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec) if (info->key > rec->rec.base.base.key) return(2); - /* - * This test has a number of special cases. create_tid in key1 is - * the as-of transction id, and delete_tid in key1 is NOT USED. - * - * A key1->create_tid of 0 matches any record regardles of when - * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be - * used to search for the most current state of the object. - * - * key2->create_tid is a HAMMER record and will never be - * 0. key2->delete_tid is the deletion transaction id or 0 if - * the record has not yet been deleted. - */ - if (info->create_tid) { - if (info->create_tid < rec->rec.base.base.create_tid) - return(-1); - if (rec->rec.base.base.delete_tid && - info->create_tid >= rec->rec.base.base.delete_tid) { - return(1); - } - } + if (info->delete_tid == 0) { + if (rec->rec.base.base.delete_tid == 0) + return(0); + return(1); + } + if (rec->rec.base.base.delete_tid == 0) + return(-1); + if (info->delete_tid < rec->rec.base.base.delete_tid) + return(-1); + if (info->delete_tid > rec->rec.base.base.delete_tid) + return(1); return(0); } @@ -120,8 +119,6 @@ hammer_rec_scan_cmp(hammer_record_t rec, void *data) r = hammer_rec_compare(&cursor->key_beg, rec); if (r > 1) return(-1); - if (r == 0) - return(0); r = hammer_rec_compare(&cursor->key_end, rec); if (r < -1) return(1); @@ -144,6 +141,7 @@ hammer_alloc_mem_record(hammer_inode_t ip) ++hammer_count_records; record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO); record->ip = ip; + record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD; hammer_ref(&record->lock); return (record); } @@ -227,12 +225,11 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data) /* * Skip if not visible due to our as-of TID */ - if (cursor->key_beg.create_tid) { - if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid) + if (cursor->flags & HAMMER_CURSOR_ASOF) { + if (cursor->asof < rec->rec.base.base.create_tid) return(0); if (rec->rec.base.base.delete_tid && - cursor->key_beg.create_tid >= - rec->rec.base.base.delete_tid) { + cursor->asof >= rec->rec.base.base.delete_tid) { return(0); } } @@ -352,6 +349,9 @@ hammer_ip_add_directory(struct hammer_transaction *trans, * cursor must be seeked to the directory entry record being deleted. * * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag. + * + * This function can return EDEADLK requiring the caller to terminate + * the cursor and retry. */ int hammer_ip_del_directory(struct hammer_transaction *trans, @@ -366,12 +366,16 @@ hammer_ip_del_directory(struct hammer_transaction *trans, * One less link. The file may still be open in the OS even after * all links have gone away so we only try to sync if the OS has * no references and nlinks falls to 0. + * + * We have to terminate the cursor before syncing the inode to + * avoid deadlocking against ourselves. */ if (error == 0) { --ip->ino_rec.ino_nlinks; hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY); if (ip->ino_rec.ino_nlinks == 0 && (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { + hammer_done_cursor(cursor); hammer_sync_inode(ip, MNT_NOWAIT, 1); } @@ -436,15 +440,17 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip, void *bdata; int error; +retry: error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); if (error) return(error); cursor.key_beg.obj_id = ip->obj_id; cursor.key_beg.key = offset + bytes; - cursor.key_beg.create_tid = trans->tid; + cursor.key_beg.create_tid = 0; cursor.key_beg.delete_tid = 0; cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; - cursor.flags = HAMMER_CURSOR_INSERT; + cursor.asof = trans->tid; + cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_ASOF; /* * Issue a lookup to position the cursor and locate the cluster @@ -477,7 +483,12 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip, * Fill everything in and insert our B-Tree node. */ hammer_modify_buffer(cursor.record_buffer); - rec->base.base = cursor.key_beg; + rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD; + rec->base.base.obj_id = ip->obj_id; + rec->base.base.key = offset + bytes; + rec->base.base.create_tid = trans->tid; + rec->base.base.delete_tid = 0; + rec->base.base.rec_type = HAMMER_RECTYPE_DATA; rec->base.data_crc = crc32(data, bytes); rec->base.rec_id = 0; /* XXX */ rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata); @@ -486,7 +497,7 @@ hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip, hammer_modify_buffer(cursor.data_buffer); bcopy(data, bdata, bytes); - elm.leaf.base = cursor.key_beg; + elm.leaf.base = rec->base.base; elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec); elm.leaf.data_offset = rec->base.data_offset; elm.leaf.data_len = bytes; @@ -516,6 +527,8 @@ done: if (error == ENOSPC) hammer_load_spike(&cursor, spike); hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; return(error); } @@ -534,12 +547,13 @@ hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike) void *bdata; int error; +retry: error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0], record->ip->hmp); if (error) return(error); cursor.key_beg = record->rec.base.base; - cursor.flags = HAMMER_CURSOR_INSERT; + cursor.flags |= HAMMER_CURSOR_INSERT; /* * Issue a lookup to position the cursor and locate the cluster. The @@ -551,21 +565,23 @@ hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike) * insert, re-lookup without the insert flag so the cursor * is properly positioned for the spike. */ -again: - error = hammer_btree_lookup(&cursor); - if (error == 0) { - if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) { - hmp = cursor.node->cluster->volume->hmp; - if (++hmp->namekey_iterator == 0) - ++hmp->namekey_iterator; - record->rec.base.base.key &= ~(0xFFFFFFFFLL); - record->rec.base.base.key |= hmp->namekey_iterator; - goto again; + for (;;) { + error = hammer_btree_lookup(&cursor); + if (error) + break; + if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) { + kprintf("hammer_ip_sync_record: duplicate rec " + "at (%016llx)\n", record->rec.base.base.key); + Debugger("duplicate record1"); + error = EIO; + break; } - kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n", - record->rec.base.base.key); - Debugger("duplicate record1"); - error = EIO; + hmp = cursor.node->cluster->volume->hmp; + if (++hmp->namekey_iterator == 0) + ++hmp->namekey_iterator; + record->rec.base.base.key &= ~(0xFFFFFFFFLL); + record->rec.base.base.key |= hmp->namekey_iterator; + cursor.key_beg.key = record->rec.base.base.key; } if (error != ENOENT) goto done; @@ -637,7 +653,7 @@ again: } rec->base.rec_id = 0; /* XXX */ - elm.leaf.base = cursor.key_beg; + elm.leaf.base = record->rec.base.base; elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec); elm.leaf.data_offset = rec->base.data_offset; elm.leaf.data_len = rec->base.data_len; @@ -672,6 +688,8 @@ done: if (error == ENOSPC) hammer_load_spike(&cursor, spike); hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; return(error); } @@ -682,6 +700,9 @@ done: * * The target cursor will be modified by this call. Note in particular * that HAMMER_CURSOR_INSERT is set. + * + * NOTE: This can return EDEADLK, requiring the caller to release its cursor + * and retry the operation. */ int hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec, @@ -865,6 +886,9 @@ hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip) * * When 0 is returned hammer_ip_next() may be used to iterate additional * records within the requested range. + * + * This function can return EDEADLK, requiring the caller to terminate + * the cursor and try again. */ int hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip) @@ -890,10 +914,14 @@ hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip) * The ATEDISK flag is used by hammer_btree_iterate to determine * whether it must index forwards or not. It is also used here * to select the next record from in-memory or on-disk. + * + * EDEADLK can only occur if the lookup hit an empty internal + * element and couldn't delete it. Since this could only occur + * in-range, we can just iterate from the failure point. */ if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { error = hammer_btree_lookup(cursor); - if (error == ENOENT) { + if (error == ENOENT || error == EDEADLK) { cursor->flags &= ~HAMMER_CURSOR_ATEDISK; error = hammer_btree_iterate(cursor); } @@ -1087,12 +1115,15 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip, int error; int64_t off; +retry: hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); cursor.key_beg.obj_id = ip->obj_id; - cursor.key_beg.create_tid = ip->obj_asof; + cursor.key_beg.create_tid = 0; cursor.key_beg.delete_tid = 0; cursor.key_beg.obj_type = 0; + cursor.asof = ip->obj_asof; + cursor.flags |= HAMMER_CURSOR_ASOF; cursor.key_end = cursor.key_beg; if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) { @@ -1187,6 +1218,8 @@ hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip, error = hammer_ip_next(&cursor); } hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; if (error == ENOENT) error = 0; return(error); @@ -1204,10 +1237,11 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip) hammer_base_elm_t base; int error; +retry: hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); cursor.key_beg.obj_id = ip->obj_id; - cursor.key_beg.create_tid = ip->obj_asof; + cursor.key_beg.create_tid = 0; cursor.key_beg.delete_tid = 0; cursor.key_beg.obj_type = 0; cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; @@ -1217,7 +1251,8 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip) cursor.key_end.rec_type = 0xFFFF; cursor.key_end.key = HAMMER_MAX_KEY; - cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; + cursor.asof = ip->obj_asof; + cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; error = hammer_ip_first(&cursor, ip); @@ -1243,13 +1278,18 @@ hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip) error = hammer_ip_next(&cursor); } hammer_done_cursor(&cursor); + if (error == EDEADLK) + goto retry; if (error == ENOENT) error = 0; return(error); } /* - * Delete the record at the current cursor + * Delete the record at the current cursor. + * + * NOTE: This can return EDEADLK, requiring the caller to terminate the + * cursor and retry. */ int hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) @@ -1277,10 +1317,14 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) hammer_modify_buffer(cursor->record_buffer); cursor->record->base.base.delete_tid = tid; - hammer_modify_node(cursor->node); - elm = &cursor->node->ondisk->elms[cursor->index]; - elm->leaf.base.delete_tid = tid; - hammer_update_syncid(cursor->record_buffer->cluster, tid); + error = hammer_cursor_upgrade(cursor); + if (error == 0) { + hammer_modify_node(cursor->node); + elm = &cursor->node->ondisk->elms[cursor->index]; + elm->leaf.base.delete_tid = tid; + hammer_update_syncid(cursor->record_buffer->cluster, + tid); + } } /* @@ -1342,7 +1386,7 @@ hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp); cursor.key_beg.obj_id = ip->obj_id; - cursor.key_beg.create_tid = ip->obj_asof; + cursor.key_beg.create_tid = 0; cursor.key_beg.delete_tid = 0; cursor.key_beg.obj_type = 0; cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; @@ -1352,7 +1396,8 @@ hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) cursor.key_end.rec_type = 0xFFFF; cursor.key_end.key = HAMMER_MAX_KEY; - cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE; + cursor.asof = ip->obj_asof; + cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; error = hammer_ip_first(&cursor, ip); if (error == ENOENT)