2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.3 2007/11/20 07:16:28 dillon Exp $
39 static int hammer_mem_add(hammer_transaction_t trans,
40 hammer_record_t record);
41 static int hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip);
44 * Red-black tree support.
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51 if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
54 if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56 if (rec1->rec.base.base.key > rec2->rec.base.base.key)
59 if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61 if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
67 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
70 * A key1->rec_type of 0 matches any record type.
73 if (info->rec_type < rec->rec.base.base.rec_type)
75 if (info->rec_type > rec->rec.base.base.rec_type)
80 * There is no special case for key. 0 means 0.
82 if (info->key < rec->rec.base.base.key)
84 if (info->key > rec->rec.base.base.key)
88 * This test has a number of special cases. create_tid in key1 is
89 * the as-of transction id, and delete_tid in key1 is NOT USED.
91 * A key1->create_tid of 0 matches any record regardles of when
92 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
93 * used to search for the most current state of the object.
95 * key2->create_tid is a HAMMER record and will never be
96 * 0. key2->delete_tid is the deletion transaction id or 0 if
97 * the record has not yet been deleted.
99 if (info->create_tid) {
100 if (info->create_tid < rec->rec.base.base.create_tid)
102 if (rec->rec.base.base.delete_tid &&
103 info->create_tid >= rec->rec.base.base.delete_tid) {
110 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
111 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
112 hammer_rec_compare, hammer_base_elm_t);
115 * Allocate a record for the caller to finish filling in
118 hammer_alloc_mem_record(struct hammer_transaction *trans, hammer_inode_t ip)
120 hammer_record_t record;
122 record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
128 * Release a memory record. If the record is marked for defered deletion,
129 * destroy the record when the last reference goes away.
132 hammer_rel_mem_record(struct hammer_record **recordp)
136 if ((rec = *recordp) != NULL) {
137 if (hammer_islastref(&rec->lock)) {
138 hammer_unref(&rec->lock);
139 if (rec->flags & HAMMER_RECF_DELETED)
140 hammer_free_mem_record(rec);
142 hammer_unref(&rec->lock);
149 * Free a record. Clean the structure up even though we are throwing it
150 * away as a sanity check. The actual free operation is delayed while
151 * the record is referenced.
154 hammer_free_mem_record(hammer_record_t record)
156 if (record->lock.refs) {
157 record->flags |= HAMMER_RECF_DELETED;
161 if (record->flags & HAMMER_RECF_ONRBTREE) {
162 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
163 record->flags &= ~HAMMER_RECF_ONRBTREE;
165 if (record->flags & HAMMER_RECF_ALLOCDATA) {
166 kfree(record->data, M_HAMMER);
167 record->flags &= ~HAMMER_RECF_ALLOCDATA;
170 kfree(record, M_HAMMER);
174 * Lookup an in-memory record given the key specified in the cursor. Works
175 * just like hammer_btree_lookup() but operates on an inode's in-memory
180 hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip)
185 hammer_rel_mem_record(&cursor->iprec);
187 cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
188 &ip->rec_tree, &cursor->key_beg);
189 if (cursor->iprec == NULL) {
192 hammer_ref(&cursor->iprec->lock);
198 /************************************************************************
199 * HAMMER IN-MEMORY RECORD FUNCTIONS *
200 ************************************************************************
202 * These functions manipulate in-memory records. Such records typically
203 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
207 * Add a directory entry (dip,ncp) which references inode (ip).
209 * Note that the low 32 bits of the namekey are set temporarily to create
210 * a unique in-memory record, and may be modified a second time when the
211 * record is synchronized to disk. In particular, the low 32 bits cannot be
212 * all 0's when synching to disk, which is not handled here.
215 hammer_ip_add_directory(struct hammer_transaction *trans,
216 struct hammer_inode *dip, struct namecache *ncp,
217 struct hammer_inode *ip)
219 hammer_record_t record;
223 record = hammer_alloc_mem_record(trans, dip);
225 bytes = ncp->nc_nlen + 1;
227 record->rec.entry.base.base.obj_id = dip->obj_id;
228 record->rec.entry.base.base.key = hammer_directory_namekey(ncp->nc_name, bytes - 1);
229 record->rec.entry.base.base.key += trans->hmp->namekey_iterator++;
230 record->rec.entry.base.base.create_tid = trans->tid;
231 record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
232 record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
233 record->rec.entry.base.base.obj_id = ip->obj_id;
234 if (bytes <= sizeof(record->rec.entry.den_name)) {
235 record->data = (void *)record->rec.entry.den_name;
237 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
238 record->flags |= HAMMER_RECF_ALLOCDATA;
239 bcopy(ncp->nc_name, record->data, bytes);
241 record->data_len = bytes;
242 ++ip->ino_rec.ino_nlinks;
243 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
244 error = hammer_mem_add(trans, record);
249 * Delete the directory entry and update the inode link count. The
250 * cursor must be seeked to the directory entry record being deleted.
252 * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag.
255 hammer_ip_del_directory(struct hammer_transaction *trans,
256 hammer_cursor_t cursor, struct hammer_inode *dip,
257 struct hammer_inode *ip)
261 if (cursor->record == &cursor->iprec->rec) {
263 * The directory entry was in-memory, just scrap the
266 hammer_free_mem_record(cursor->iprec);
270 * The directory entry was on-disk, mark the record and
271 * B-Tree entry as deleted. The B-Tree entry does not
272 * have to be reindexed because a 'current' delete transid
273 * will wind up in the same position as the live record.
275 KKASSERT(ip->flags & HAMMER_INODE_ONDISK);
276 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
278 cursor->node->ondisk->elms[cursor->index].base.delete_tid = trans->tid;
279 cursor->record->base.base.delete_tid = trans->tid;
280 hammer_modify_node(cursor->node);
281 hammer_modify_buffer(cursor->record_buffer);
290 --ip->ino_rec.ino_nlinks;
291 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
297 * Add a data record to the filesystem.
299 * This is called via the strategy code, typically when the kernel wants to
300 * flush a buffer cache buffer, so this operation goes directly to the disk.
303 hammer_ip_add_data(hammer_transaction_t trans, hammer_inode_t ip,
304 int64_t offset, void *data, int bytes)
306 panic("hammer_ip_add_data");
310 * Add the record to the inode's rec_tree. The low 32 bits of a directory
311 * entry's key is used to deal with hash collisions in the upper 32 bits.
312 * A unique 64 bit key is generated in-memory and may be regenerated a
313 * second time when the directory record is flushed to the on-disk B-Tree.
317 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
319 while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
320 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
321 hammer_free_mem_record(record);
324 if (++trans->hmp->namekey_iterator == 0)
325 ++trans->hmp->namekey_iterator;
326 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
327 record->rec.base.base.key |= trans->hmp->namekey_iterator;
329 record->flags |= HAMMER_RECF_ONRBTREE;
333 /************************************************************************
334 * HAMMER INODE MERGED-RECORD FUNCTIONS *
335 ************************************************************************
337 * These functions augment the B-Tree scanning functions in hammer_btree.c
338 * by merging in-memory records with on-disk records.
342 * Locate a particular record either in-memory or on-disk.
344 * NOTE: This is basically a standalone routine, hammer_ip_next() may
345 * NOT be called to iterate results.
348 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
353 * If the element is in-memory return it without searching the
356 error = hammer_mem_search(cursor, ip);
358 cursor->record = &cursor->iprec->rec;
365 * If the inode has on-disk components search the on-disk B-Tree.
367 if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
369 error = hammer_btree_lookup(cursor);
371 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
376 * Locate the first record within the cursor's key_beg/key_end range,
377 * restricted to a particular inode. 0 is returned on success, ENOENT
378 * if no records matched the requested range, or some other error.
380 * When 0 is returned hammer_ip_next() may be used to iterate additional
381 * records within the requested range.
384 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
389 * Clean up fields and setup for merged scan
391 cursor->flags &= ~(HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM);
392 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
394 hammer_rel_mem_record(&cursor->iprec);
397 * Search the on-disk B-Tree
399 if (ip->flags & HAMMER_INODE_ONDISK) {
400 error = hammer_btree_lookup(cursor);
401 if (error && error != ENOENT)
404 cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
408 * Search the in-memory record list (Red-Black tree)
410 error = hammer_mem_search(cursor, ip);
411 if (error && error != ENOENT)
414 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
417 * This will return the first matching record.
419 return(hammer_ip_next(cursor));
423 * Retrieve the next record in a merged iteration within the bounds of the
424 * cursor. This call may be made multiple times after the cursor has been
425 * initially searched with hammer_ip_first().
427 * 0 is returned on success, ENOENT if no further records match the
428 * requested range, or some other error code is returned.
431 hammer_ip_next(hammer_cursor_t cursor)
433 hammer_btree_elm_t elm;
439 * Load the current on-disk and in-memory record. If we ate any
440 * records we have to get the next one.
442 * Get the next on-disk record
444 if (cursor->flags & HAMMER_CURSOR_ATEDISK) {
445 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
446 error = hammer_btree_iterate(cursor);
448 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
450 cursor->flags |= HAMMER_CURSOR_DISKEOF;
455 * Get the next in-memory record. Records marked for defered
456 * deletion must be skipped.
458 if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
459 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
462 rec = hammer_rec_rb_tree_RB_NEXT(rec);
463 } while(rec && (rec->flags & HAMMER_RECF_DELETED));
465 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
466 hammer_ref(&rec->lock);
468 cursor->flags |= HAMMER_CURSOR_MEMEOF;
470 hammer_rel_mem_record(&cursor->iprec);
476 * Extract either the disk or memory record depending on their
480 switch(cursor->flags & (HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF)) {
485 elm = &cursor->node->ondisk->elms[cursor->index];
486 r = hammer_btree_cmp(&elm->base,
487 &cursor->iprec->rec.base.base);
489 error = hammer_btree_extract(cursor,
490 HAMMER_CURSOR_GET_RECORD);
491 cursor->flags |= HAMMER_CURSOR_ATEDISK;
494 /* fall through to the memory entry */
495 case HAMMER_CURSOR_DISKEOF:
497 * Only the memory entry is valid
499 cursor->record = &cursor->iprec->rec;
500 cursor->flags |= HAMMER_CURSOR_ATEMEM;
502 case HAMMER_CURSOR_MEMEOF:
504 * Only the disk entry is valid
506 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
507 cursor->flags |= HAMMER_CURSOR_ATEDISK;
511 * Neither entry is valid
513 * XXX error not set properly
515 cursor->record = NULL;
523 * Resolve the cursor->data pointer for the current cursor position in
524 * a merged iteration.
527 hammer_ip_resolve_data(hammer_cursor_t cursor)
531 if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
532 cursor->data = cursor->iprec->data;
535 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
541 * Delete all records within the specified range for inode ip.
543 * NOTE: An unaligned range will cause new records to be added to cover
546 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
549 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
550 int64_t ran_beg, int64_t ran_end)
552 struct hammer_cursor cursor;
553 hammer_record_ondisk_t rec;
554 hammer_base_elm_t base;
558 hammer_init_cursor_ip(&cursor, ip);
560 cursor.key_beg.obj_id = ip->obj_id;
561 cursor.key_beg.create_tid = ip->obj_asof;
562 cursor.key_beg.delete_tid = 0;
563 cursor.key_beg.obj_type = 0;
564 cursor.key_beg.key = ran_beg;
565 cursor.key_end = cursor.key_beg;
566 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
567 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
568 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
569 cursor.key_end.key = ran_end;
571 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
572 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
573 if (ran_end + MAXPHYS < ran_end)
574 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
576 cursor.key_end.key = ran_end + MAXPHYS;
579 error = hammer_ip_first(&cursor, ip);
582 * Iterate through matching records and mark them as deleted.
586 base = &rec->base.base;
588 KKASSERT(base->delete_tid == 0);
591 * There may be overlap cases for regular file data. Also
592 * remember the key for a regular file record is the offset
593 * of the last byte of the record (base + len - 1), NOT the
596 if (base->rec_type == HAMMER_RECTYPE_DATA) {
597 off = base->key - rec->base.data_len + 1;
599 * Check the left edge case
602 panic("hammer left edge case\n");
606 * Check the right edge case. Note that the
607 * record can be completely out of bounds, which
608 * terminates the search.
610 * base->key is (base_offset + bytes - 1), ran_end
611 * works the same way.
613 if (base->key > ran_end) {
614 if (base->key - rec->base.data_len + 1 > ran_end) {
615 kprintf("right edge OOB\n");
618 panic("hammer right edge case\n");
623 * Mark the record and B-Tree entry as deleted
625 if (cursor.record == &cursor.iprec->rec) {
626 hammer_free_mem_record(cursor.iprec);
629 cursor.node->ondisk->elms[cursor.index].base.delete_tid = trans->tid;
630 cursor.record->base.base.delete_tid = trans->tid;
631 hammer_modify_node(cursor.node);
632 hammer_modify_buffer(cursor.record_buffer);
634 error = hammer_ip_next(&cursor);
636 hammer_done_cursor(&cursor);