2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.4 2007/11/20 22:55:40 dillon Exp $
39 static int hammer_mem_add(hammer_transaction_t trans,
40 hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip);
45 * Red-black tree support.
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
50 if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
52 if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
55 if (rec1->rec.base.base.key < rec2->rec.base.base.key)
57 if (rec1->rec.base.base.key > rec2->rec.base.base.key)
60 if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
62 if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
71 * A key1->rec_type of 0 matches any record type.
74 if (info->rec_type < rec->rec.base.base.rec_type)
76 if (info->rec_type > rec->rec.base.base.rec_type)
81 * There is no special case for key. 0 means 0.
83 if (info->key < rec->rec.base.base.key)
85 if (info->key > rec->rec.base.base.key)
89 * This test has a number of special cases. create_tid in key1 is
90 * the as-of transction id, and delete_tid in key1 is NOT USED.
92 * A key1->create_tid of 0 matches any record regardles of when
93 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
94 * used to search for the most current state of the object.
96 * key2->create_tid is a HAMMER record and will never be
97 * 0. key2->delete_tid is the deletion transaction id or 0 if
98 * the record has not yet been deleted.
100 if (info->create_tid) {
101 if (info->create_tid < rec->rec.base.base.create_tid)
103 if (rec->rec.base.base.delete_tid &&
104 info->create_tid >= rec->rec.base.base.delete_tid) {
112 * RB_SCAN comparison code for hammer_mem_search(). The argument order
113 * is reversed so the comparison result has to be negated. key_beg and
114 * key_end are both inclusive boundaries.
118 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
120 hammer_cursor_t cursor = data;
123 r = hammer_rec_compare(&cursor->key_beg, rec);
128 r = hammer_rec_compare(&cursor->key_end, rec);
134 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
135 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
136 hammer_rec_compare, hammer_base_elm_t);
139 * Allocate a record for the caller to finish filling in
142 hammer_alloc_mem_record(struct hammer_transaction *trans, hammer_inode_t ip)
144 hammer_record_t record;
146 record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
152 * Release a memory record. If the record is marked for defered deletion,
153 * destroy the record when the last reference goes away.
156 hammer_rel_mem_record(struct hammer_record **recordp)
160 if ((rec = *recordp) != NULL) {
161 if (hammer_islastref(&rec->lock)) {
162 hammer_unref(&rec->lock);
163 if (rec->flags & HAMMER_RECF_DELETED)
164 hammer_free_mem_record(rec);
166 hammer_unref(&rec->lock);
173 * Free a record. Clean the structure up even though we are throwing it
174 * away as a sanity check. The actual free operation is delayed while
175 * the record is referenced. However, the record is removed from the RB
179 hammer_free_mem_record(hammer_record_t record)
181 if (record->flags & HAMMER_RECF_ONRBTREE) {
182 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
183 record->flags &= ~HAMMER_RECF_ONRBTREE;
185 if (record->lock.refs) {
186 record->flags |= HAMMER_RECF_DELETED;
189 if (record->flags & HAMMER_RECF_ALLOCDATA) {
190 kfree(record->data, M_HAMMER);
191 record->flags &= ~HAMMER_RECF_ALLOCDATA;
194 kfree(record, M_HAMMER);
198 * Lookup an in-memory record given the key specified in the cursor. Works
199 * just like hammer_btree_lookup() but operates on an inode's in-memory
202 * The lookup must fail if the record is marked for deferred deletion.
206 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
211 hammer_rel_mem_record(&cursor->iprec);
213 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
214 &cursor->ip->rec_tree);
217 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
218 cursor->scan.node = NULL;
219 cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
220 &ip->rec_tree, &cursor->key_beg);
221 if (cursor->iprec == NULL) {
224 hammer_ref(&cursor->iprec->lock);
231 * hammer_mem_search() - locate the first in-memory record matching the
234 * The RB_SCAN function we use is designed as a callback. We terminate it
235 * (return -1) as soon as we get a match.
239 hammer_rec_scan_callback(hammer_record_t rec, void *data)
241 hammer_cursor_t cursor = data;
243 if (cursor->iprec == NULL) {
245 hammer_ref(&rec->lock);
253 hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip)
256 hammer_rel_mem_record(&cursor->iprec);
258 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
259 &cursor->ip->rec_tree);
262 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
263 cursor->scan.node = NULL;
264 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
265 hammer_rec_scan_callback, cursor);
267 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
274 hammer_mem_done(hammer_cursor_t cursor)
277 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
278 &cursor->ip->rec_tree);
282 hammer_rel_mem_record(&cursor->iprec);
285 /************************************************************************
286 * HAMMER IN-MEMORY RECORD FUNCTIONS *
287 ************************************************************************
289 * These functions manipulate in-memory records. Such records typically
290 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
294 * Add a directory entry (dip,ncp) which references inode (ip).
296 * Note that the low 32 bits of the namekey are set temporarily to create
297 * a unique in-memory record, and may be modified a second time when the
298 * record is synchronized to disk. In particular, the low 32 bits cannot be
299 * all 0's when synching to disk, which is not handled here.
302 hammer_ip_add_directory(struct hammer_transaction *trans,
303 struct hammer_inode *dip, struct namecache *ncp,
304 struct hammer_inode *ip)
306 hammer_record_t record;
310 record = hammer_alloc_mem_record(trans, dip);
312 bytes = ncp->nc_nlen; /* NOTE: terminating \0 is NOT included */
313 if (++trans->hmp->namekey_iterator == 0)
314 ++trans->hmp->namekey_iterator;
316 record->rec.entry.base.base.obj_id = dip->obj_id;
317 record->rec.entry.base.base.key =
318 hammer_directory_namekey(ncp->nc_name, bytes);
319 record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
320 record->rec.entry.base.base.create_tid = trans->tid;
321 record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
322 record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
323 record->rec.entry.obj_id = ip->obj_id;
324 if (bytes <= sizeof(record->rec.entry.den_name)) {
325 record->data = (void *)record->rec.entry.den_name;
327 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
328 record->flags |= HAMMER_RECF_ALLOCDATA;
330 bcopy(ncp->nc_name, record->data, bytes);
331 record->rec.entry.base.data_len = bytes;
332 ++ip->ino_rec.ino_nlinks;
333 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
334 error = hammer_mem_add(trans, record);
339 * Delete the directory entry and update the inode link count. The
340 * cursor must be seeked to the directory entry record being deleted.
342 * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag.
345 hammer_ip_del_directory(struct hammer_transaction *trans,
346 hammer_cursor_t cursor, struct hammer_inode *dip,
347 struct hammer_inode *ip)
351 if (cursor->record == &cursor->iprec->rec) {
353 * The directory entry was in-memory, just scrap the
356 hammer_free_mem_record(cursor->iprec);
360 * The directory entry was on-disk, mark the record and
361 * B-Tree entry as deleted. The B-Tree entry does not
362 * have to be reindexed because a 'current' delete transid
363 * will wind up in the same position as the live record.
365 KKASSERT(ip->flags & HAMMER_INODE_ONDISK);
366 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
368 cursor->node->ondisk->elms[cursor->index].base.delete_tid = trans->tid;
369 cursor->record->base.base.delete_tid = trans->tid;
370 hammer_modify_node(cursor->node);
371 hammer_modify_buffer(cursor->record_buffer);
377 * One less link. Mark the inode and all of its records as deleted
378 * when the last link goes away. The inode will be automatically
379 * flushed when its last reference goes away.
382 --ip->ino_rec.ino_nlinks;
383 if (ip->ino_rec.ino_nlinks == 0)
384 ip->ino_rec.base.base.delete_tid = trans->tid;
385 error = hammer_ip_delete_range(trans, ip,
386 HAMMER_MIN_KEY, HAMMER_MAX_KEY);
387 KKASSERT(RB_EMPTY(&ip->rec_tree));
388 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
394 * Add a data record to the filesystem.
396 * This is called via the strategy code, typically when the kernel wants to
397 * flush a buffer cache buffer, so this operation goes directly to the disk.
400 hammer_ip_add_data(hammer_transaction_t trans, hammer_inode_t ip,
401 int64_t offset, void *data, int bytes)
403 panic("hammer_ip_add_data");
407 * Add the record to the inode's rec_tree. The low 32 bits of a directory
408 * entry's key is used to deal with hash collisions in the upper 32 bits.
409 * A unique 64 bit key is generated in-memory and may be regenerated a
410 * second time when the directory record is flushed to the on-disk B-Tree.
414 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
416 while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
417 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
418 hammer_free_mem_record(record);
421 if (++trans->hmp->namekey_iterator == 0)
422 ++trans->hmp->namekey_iterator;
423 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
424 record->rec.base.base.key |= trans->hmp->namekey_iterator;
426 record->flags |= HAMMER_RECF_ONRBTREE;
430 /************************************************************************
431 * HAMMER INODE MERGED-RECORD FUNCTIONS *
432 ************************************************************************
434 * These functions augment the B-Tree scanning functions in hammer_btree.c
435 * by merging in-memory records with on-disk records.
439 * Locate a particular record either in-memory or on-disk.
441 * NOTE: This is basically a standalone routine, hammer_ip_next() may
442 * NOT be called to iterate results.
445 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
450 * If the element is in-memory return it without searching the
453 error = hammer_mem_lookup(cursor, ip);
455 cursor->record = &cursor->iprec->rec;
462 * If the inode has on-disk components search the on-disk B-Tree.
464 if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
466 error = hammer_btree_lookup(cursor);
468 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
473 * Locate the first record within the cursor's key_beg/key_end range,
474 * restricted to a particular inode. 0 is returned on success, ENOENT
475 * if no records matched the requested range, or some other error.
477 * When 0 is returned hammer_ip_next() may be used to iterate additional
478 * records within the requested range.
481 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
486 * Clean up fields and setup for merged scan
488 cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
489 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
491 hammer_rel_mem_record(&cursor->iprec);
494 * Search the on-disk B-Tree
496 if (ip->flags & HAMMER_INODE_ONDISK) {
497 error = hammer_btree_lookup(cursor);
498 if (error && error != ENOENT)
501 cursor->flags &= ~HAMMER_CURSOR_DISKEOF ;
502 cursor->flags &= ~HAMMER_CURSOR_ATEDISK ;
507 * Search the in-memory record list (Red-Black tree)
509 error = hammer_mem_search(cursor, ip);
510 if (error && error != ENOENT)
513 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
514 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
518 * This will return the first matching record.
520 return(hammer_ip_next(cursor));
524 * Retrieve the next record in a merged iteration within the bounds of the
525 * cursor. This call may be made multiple times after the cursor has been
526 * initially searched with hammer_ip_first().
528 * 0 is returned on success, ENOENT if no further records match the
529 * requested range, or some other error code is returned.
532 hammer_ip_next(hammer_cursor_t cursor)
534 hammer_btree_elm_t elm;
540 * Load the current on-disk and in-memory record. If we ate any
541 * records we have to get the next one.
543 * Get the next on-disk record
545 if (cursor->flags & HAMMER_CURSOR_ATEDISK) {
546 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
547 error = hammer_btree_iterate(cursor);
549 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
551 cursor->flags |= HAMMER_CURSOR_DISKEOF;
556 * Get the next in-memory record. The record can be ripped out
557 * of the RB tree so we maintain a scan_info structure to track
560 if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
561 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
562 rec = cursor->scan.node; /* next node */
564 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
565 hammer_ref(&rec->lock);
567 hammer_rec_rb_tree_RB_NEXT(rec);
569 cursor->flags |= HAMMER_CURSOR_MEMEOF;
571 hammer_rel_mem_record(&cursor->iprec);
577 * Extract either the disk or memory record depending on their
581 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
586 elm = &cursor->node->ondisk->elms[cursor->index];
587 r = hammer_btree_cmp(&elm->base,
588 &cursor->iprec->rec.base.base);
590 error = hammer_btree_extract(cursor,
591 HAMMER_CURSOR_GET_RECORD);
592 cursor->flags |= HAMMER_CURSOR_ATEDISK;
595 /* fall through to the memory entry */
596 case HAMMER_CURSOR_ATEDISK:
598 * Only the memory entry is valid
600 cursor->record = &cursor->iprec->rec;
601 cursor->flags |= HAMMER_CURSOR_ATEMEM;
603 case HAMMER_CURSOR_ATEMEM:
605 * Only the disk entry is valid
607 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
608 cursor->flags |= HAMMER_CURSOR_ATEDISK;
612 * Neither entry is valid
614 * XXX error not set properly
616 cursor->record = NULL;
624 * Resolve the cursor->data pointer for the current cursor position in
625 * a merged iteration.
628 hammer_ip_resolve_data(hammer_cursor_t cursor)
632 if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
633 cursor->data = cursor->iprec->data;
636 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
642 * Delete all records within the specified range for inode ip.
644 * NOTE: An unaligned range will cause new records to be added to cover
647 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
650 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
651 int64_t ran_beg, int64_t ran_end)
653 struct hammer_cursor cursor;
654 hammer_record_ondisk_t rec;
655 hammer_base_elm_t base;
659 hammer_init_cursor_ip(&cursor, ip);
661 cursor.key_beg.obj_id = ip->obj_id;
662 cursor.key_beg.create_tid = ip->obj_asof;
663 cursor.key_beg.delete_tid = 0;
664 cursor.key_beg.obj_type = 0;
665 cursor.key_beg.key = ran_beg;
666 cursor.key_end = cursor.key_beg;
667 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
668 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
669 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
670 cursor.key_end.key = ran_end;
672 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
673 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
674 if (ran_end + MAXPHYS < ran_end)
675 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
677 cursor.key_end.key = ran_end + MAXPHYS;
680 error = hammer_ip_first(&cursor, ip);
683 * Iterate through matching records and mark them as deleted.
687 base = &rec->base.base;
689 KKASSERT(base->delete_tid == 0);
692 * There may be overlap cases for regular file data. Also
693 * remember the key for a regular file record is the offset
694 * of the last byte of the record (base + len - 1), NOT the
697 if (base->rec_type == HAMMER_RECTYPE_DATA) {
698 off = base->key - rec->base.data_len + 1;
700 * Check the left edge case
703 panic("hammer left edge case\n");
707 * Check the right edge case. Note that the
708 * record can be completely out of bounds, which
709 * terminates the search.
711 * base->key is (base_offset + bytes - 1), ran_end
712 * works the same way.
714 if (base->key > ran_end) {
715 if (base->key - rec->base.data_len + 1 > ran_end) {
716 kprintf("right edge OOB\n");
719 panic("hammer right edge case\n");
724 * Mark the record and B-Tree entry as deleted
726 if (cursor.record == &cursor.iprec->rec) {
727 hammer_free_mem_record(cursor.iprec);
730 cursor.node->ondisk->elms[cursor.index].base.delete_tid = trans->tid;
731 cursor.record->base.base.delete_tid = trans->tid;
732 hammer_modify_node(cursor.node);
733 hammer_modify_buffer(cursor.record_buffer);
735 error = hammer_ip_next(&cursor);
737 hammer_done_cursor(&cursor);