2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.8 2007/11/30 00:16:56 dillon Exp $
/*
 * Forward declarations for the static in-memory record helpers defined
 * later in this file.
 * NOTE(review): this listing elides lines (the embedded original line
 * numbers jump), so blank lines and some statements are not visible here.
 */
39 static int hammer_mem_add(hammer_transaction_t trans,
40 hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
45 * Red-black tree support.
/*
 * RB-tree ordering comparator for in-memory records: compare by
 * rec_type first, then key, then create_tid.  The -1/+1/0 return
 * statements between the tests are on elided lines (original line
 * numbers jump), as are the function's braces.
 */
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
50 if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
52 if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
55 if (rec1->rec.base.base.key < rec2->rec.base.base.key)
57 if (rec1->rec.base.base.key > rec2->rec.base.base.key)
60 if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
62 if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
/*
 * Compare a lookup key (info) against an in-memory record (rec).
 * Used by the generated INFO lookup and, via the scan comparator below,
 * by range scans.  Return statements are on elided lines.
 */
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
71 * A key1->rec_type of 0 matches any record type.
74 if (info->rec_type < rec->rec.base.base.rec_type)
76 if (info->rec_type > rec->rec.base.base.rec_type)
81 * There is no special case for key. 0 means 0.
83 if (info->key < rec->rec.base.base.key)
85 if (info->key > rec->rec.base.base.key)
89 * This test has a number of special cases. create_tid in key1 is
90 * the as-of transaction id, and delete_tid in key1 is NOT USED.
92 * A key1->create_tid of 0 matches any record regardless of when
93 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
94 * used to search for the most current state of the object.
96 * key2->create_tid is a HAMMER record and will never be
97 * 0. key2->delete_tid is the deletion transaction id or 0 if
98 * the record has not yet been deleted.
100 if (info->create_tid) {
101 if (info->create_tid < rec->rec.base.base.create_tid)
103 if (rec->rec.base.base.delete_tid &&
104 info->create_tid >= rec->rec.base.base.delete_tid) {
112 * RB_SCAN comparison code for hammer_mem_first(). The argument order
113 * is reversed so the comparison result has to be negated. key_beg and
114 * key_end are both range-inclusive.
116 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
117 * These do not stop the scan.
119 * Localized deletions are not cached in-memory.
123 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
125 hammer_cursor_t cursor = data;
/*
 * Compare the record against both ends of the cursor's range; the
 * negation/early-return logic between these calls is on elided lines.
 */
128 r = hammer_rec_compare(&cursor->key_beg, rec);
133 r = hammer_rec_compare(&cursor->key_end, rec);
/*
 * Generate the RB-tree support functions plus the XLOOKUP variant keyed
 * by hammer_base_elm_t (used by hammer_mem_lookup()).
 */
139 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
140 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
141 hammer_rec_compare, hammer_base_elm_t);
144 * Allocate a record for the caller to finish filling in
/*
 * Zeroed, sleepable allocation; the reference/ip initialization and the
 * return are on elided lines.
 */
147 hammer_alloc_mem_record(hammer_inode_t ip)
149 hammer_record_t record;
151 record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
157 * Release a memory record. If the record is marked for deferred deletion,
158 * destroy the record when the last reference goes away.
161 hammer_rel_mem_record(struct hammer_record **recordp)
165 if ((rec = *recordp) != NULL) {
166 if (hammer_islastref(&rec->lock)) {
167 hammer_unref(&rec->lock);
/* Last reference just dropped: honor a pending deferred deletion. */
168 if (rec->flags & HAMMER_RECF_DELETED)
169 hammer_free_mem_record(rec);
/* Not the last reference (else-branch brace elided): just unref. */
171 hammer_unref(&rec->lock);
178 * Free a record. Clean the structure up even though we are throwing it
179 * away as a sanity check. The actual free operation is delayed while
180 * the record is referenced. However, the record is removed from the RB
/* (comment continues on elided lines) */
184 hammer_free_mem_record(hammer_record_t record)
186 if (record->flags & HAMMER_RECF_ONRBTREE) {
187 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
188 record->flags &= ~HAMMER_RECF_ONRBTREE;
/*
 * Still referenced: only mark for deferred deletion; an early return
 * presumably follows on an elided line (hammer_rel_mem_record() frees
 * later).
 */
190 if (record->lock.refs) {
191 record->flags |= HAMMER_RECF_DELETED;
/* Release separately-allocated data before freeing the record itself. */
194 if (record->flags & HAMMER_RECF_ALLOCDATA) {
195 kfree(record->data, M_HAMMER);
196 record->flags &= ~HAMMER_RECF_ALLOCDATA;
199 kfree(record, M_HAMMER);
203 * Lookup an in-memory record given the key specified in the cursor. Works
204 * just like hammer_btree_lookup() but operates on an inode's in-memory
/* (comment continues on elided lines) */
207 * The lookup must fail if the record is marked for deferred deletion.
211 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
/* Drop record/scan state left over from any prior use of this cursor. */
216 hammer_rel_mem_record(&cursor->iprec);
218 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
219 &cursor->ip->rec_tree);
222 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
223 cursor->scan.node = NULL;
/* Exact-key lookup via the INFO variant generated by RB_GENERATE_XLOOKUP. */
224 cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
225 &ip->rec_tree, &cursor->key_beg);
226 if (cursor->iprec == NULL) {
/* Found: hold a reference for the caller (ENOENT path elided above). */
229 hammer_ref(&cursor->iprec->lock);
236 * hammer_mem_first() - locate the first in-memory record matching the
/* (comment continues on elided lines) */
239 * The RB_SCAN function we use is designed as a callback. We terminate it
240 * (return -1) as soon as we get a match.
244 hammer_rec_scan_callback(hammer_record_t rec, void *data)
246 hammer_cursor_t cursor = data;
249 * Skip if not visible due to our as-of TID
251 if (cursor->key_beg.create_tid) {
/* Record created after the as-of TID, or already deleted by then: skip. */
252 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
254 if (rec->rec.base.base.delete_tid &&
255 cursor->key_beg.create_tid >=
256 rec->rec.base.base.delete_tid) {
262 * Return the first matching record and stop the scan
264 if (cursor->iprec == NULL) {
/* Reference the record so it survives third-party RB-tree changes. */
266 hammer_ref(&rec->lock);
/*
 * Position the cursor at the first in-memory record intersecting
 * [key_beg, key_end], using the RB_SCAN comparator/callback pair above.
 */
274 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
/* Drop any stale record/scan state before (re)linking to this inode. */
277 hammer_rel_mem_record(&cursor->iprec);
279 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
280 &cursor->ip->rec_tree);
283 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
284 cursor->scan.node = NULL;
285 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
286 hammer_rec_scan_callback, cursor);
289 * Adjust scan.node and keep it linked into the RB-tree so we can
290 * hold the cursor through third party modifications of the RB-tree.
293 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
/*
 * Tear down the cursor's in-memory scan state: unlink the scan info from
 * the inode's rec_tree and release the referenced record, if any.
 */
300 hammer_mem_done(hammer_cursor_t cursor)
303 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
304 &cursor->ip->rec_tree);
308 hammer_rel_mem_record(&cursor->iprec);
311 /************************************************************************
312 * HAMMER IN-MEMORY RECORD FUNCTIONS *
313 ************************************************************************
315 * These functions manipulate in-memory records. Such records typically
316 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
320 * Add a directory entry (dip,ncp) which references inode (ip).
322 * Note that the low 32 bits of the namekey are set temporarily to create
323 * a unique in-memory record, and may be modified a second time when the
324 * record is synchronized to disk. In particular, the low 32 bits cannot be
325 * all 0's when synching to disk, which is not handled here.
328 hammer_ip_add_directory(struct hammer_transaction *trans,
329 struct hammer_inode *dip, struct namecache *ncp,
330 struct hammer_inode *ip)
332 hammer_record_t record;
336 record = hammer_alloc_mem_record(dip);
338 bytes = ncp->nc_nlen; /* NOTE: terminating \0 is NOT included */
/* Keep the iterator non-zero so the low 32 key bits are never all 0. */
339 if (++trans->hmp->namekey_iterator == 0)
340 ++trans->hmp->namekey_iterator;
/* Key = name hash (upper bits) + per-mount iterator (low 32 bits). */
342 record->rec.entry.base.base.obj_id = dip->obj_id;
343 record->rec.entry.base.base.key =
344 hammer_directory_namekey(ncp->nc_name, bytes);
345 record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
346 record->rec.entry.base.base.create_tid = trans->tid;
347 record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
348 record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
349 record->rec.entry.obj_id = ip->obj_id;
/* Short names embed in the record; longer ones get a heap copy. */
350 if (bytes <= sizeof(record->rec.entry.den_name)) {
351 record->data = (void *)record->rec.entry.den_name;
352 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
354 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
355 record->flags |= HAMMER_RECF_ALLOCDATA;
357 bcopy(ncp->nc_name, record->data, bytes);
358 record->rec.entry.base.data_len = bytes;
/* The new directory entry bumps the target inode's link count. */
359 ++ip->ino_rec.ino_nlinks;
360 hammer_modify_inode(trans, ip,
361 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
362 error = hammer_mem_add(trans, record);
367 * Delete the directory entry and update the inode link count. The
368 * cursor must be seeked to the directory entry record being deleted.
370 * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag.
373 hammer_ip_del_directory(struct hammer_transaction *trans,
374 hammer_cursor_t cursor, struct hammer_inode *dip,
375 struct hammer_inode *ip)
379 error = hammer_ip_delete_record(cursor, trans->tid);
382 * One less link. The file may still be open in the OS even after
383 * all links have gone away so we don't destroy the inode's data
/* (comment continues on elided lines) */
387 --ip->ino_rec.ino_nlinks;
388 hammer_modify_inode(trans, ip,
389 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
/* If the vnode is gone or inactive, push the inode out right away. */
390 if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
391 hammer_sync_inode(ip, MNT_NOWAIT, 1);
398 * Sync data from a buffer cache buffer (typically) to the filesystem. This
399 * is called via the strategy called from a cached data source. This code
400 * is responsible for actually writing a data record out to the disk.
403 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
404 int64_t offset, void *data, int bytes)
406 struct hammer_cursor cursor;
407 hammer_record_ondisk_t rec;
408 union hammer_btree_elm elm;
412 error = hammer_init_cursor_ip(&cursor, ip);
/* Data records are keyed by the END of their byte range (offset + bytes). */
415 cursor.key_beg.obj_id = ip->obj_id;
416 cursor.key_beg.key = offset + bytes;
417 cursor.key_beg.create_tid = trans->tid;
418 cursor.key_beg.delete_tid = 0;
419 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
420 cursor.flags = HAMMER_CURSOR_INSERT;
423 * Issue a lookup to position the cursor and locate the cluster
/* A successful lookup here means a duplicate record already exists. */
425 error = hammer_btree_lookup(&cursor);
427 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
429 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
430 HAMMER_BTREE_TYPE_LEAF, cursor.index);
437 * Allocate record and data space now that we know which cluster
438 * the B-Tree node ended up in.
440 bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
441 &cursor.data_buffer);
444 rec = hammer_alloc_record(cursor.node->cluster, &error,
445 &cursor.record_buffer);
450 * Fill everything in and insert our B-Tree node.
452 rec->base.base = cursor.key_beg;
453 rec->base.data_crc = crc32(data, bytes);
454 rec->base.rec_id = 0; /* XXX */
455 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
456 rec->base.data_len = bytes;
457 hammer_modify_buffer(cursor.record_buffer);
459 bcopy(data, bdata, bytes);
460 hammer_modify_buffer(cursor.data_buffer);
/* Build the leaf element that points at the record and its data. */
462 elm.leaf.base = cursor.key_beg;
463 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
464 elm.leaf.data_offset = rec->base.data_offset;
465 elm.leaf.data_len = bytes;
466 elm.leaf.data_crc = rec->base.data_crc;
468 error = hammer_btree_insert(&cursor, &elm);
/*
 * Error unwind: free the allocations (reached via error labels on
 * elided lines), then release the cursor.
 */
472 hammer_free_record_ptr(cursor.record_buffer, rec);
474 hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
476 hammer_done_cursor(&cursor);
481 * Sync an in-memory record to the disk. This is typically called via fsync
482 * from a cached record source. This code is responsible for actually
483 * writing a record out to the disk.
486 hammer_ip_sync_record(hammer_record_t record)
488 struct hammer_cursor cursor;
489 hammer_record_ondisk_t rec;
490 union hammer_btree_elm elm;
494 error = hammer_init_cursor_ip(&cursor, record->ip);
497 cursor.key_beg = record->rec.base.base;
498 cursor.flags = HAMMER_CURSOR_INSERT;
501 * Issue a lookup to position the cursor and locate the cluster
/* A successful lookup means the record already exists on-disk. */
503 error = hammer_btree_lookup(&cursor);
505 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
506 record->rec.base.base.key);
513 * Allocate record and data space now that we know which cluster
514 * the B-Tree node ended up in.
/*
 * Embedded (or absent) data needs no separate allocation; otherwise
 * allocate space in the same cluster as the B-Tree node.
 */
516 if (record->data == NULL ||
517 (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
518 bdata = record->data;
520 bdata = hammer_alloc_data(cursor.node->cluster,
521 record->rec.base.data_len, &error,
522 &cursor.data_buffer);
526 rec = hammer_alloc_record(cursor.node->cluster, &error,
527 &cursor.record_buffer);
532 * Fill everything in and insert our B-Tree node.
534 * XXX assign rec_id here
538 rec->base.data_crc = crc32(record->data,
539 record->rec.base.data_len);
540 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
542 * Data embedded in record
/* data_offset is relative to the record, then rebased to the cluster. */
544 rec->base.data_offset = ((char *)bdata -
545 (char *)&record->rec);
546 KKASSERT(rec->base.data_offset >= 0 &&
547 rec->base.data_offset + rec->base.data_len <
549 rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
552 * Data separate from record
554 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
555 bcopy(record->data, bdata, rec->base.data_len);
556 hammer_modify_buffer(cursor.data_buffer);
559 rec->base.rec_id = 0; /* XXX */
561 hammer_modify_buffer(cursor.record_buffer);
/* Build the leaf element referencing the new record and data. */
563 elm.leaf.base = cursor.key_beg;
564 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
565 elm.leaf.data_offset = rec->base.data_offset;
566 elm.leaf.data_len = rec->base.data_len;
567 elm.leaf.data_crc = rec->base.data_crc;
569 error = hammer_btree_insert(&cursor, &elm);
/*
 * Error unwind (labels elided): free the record, and the data only if
 * it was separately allocated (not embedded).
 */
573 hammer_free_record_ptr(cursor.record_buffer, rec);
575 if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
576 hammer_free_data_ptr(cursor.data_buffer, bdata,
580 hammer_done_cursor(&cursor);
586 * Add the record to the inode's rec_tree. The low 32 bits of a directory
587 * entry's key is used to deal with hash collisions in the upper 32 bits.
588 * A unique 64 bit key is generated in-memory and may be regenerated a
589 * second time when the directory record is flushed to the on-disk B-Tree.
593 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
/* Retry insertion, regenerating the low 32 key bits on each collision. */
595 while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
/*
 * Only directory entries may iterate the key; any other collision
 * frees the record (error return presumably on an elided line).
 */
596 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
597 hammer_free_mem_record(record);
600 if (++trans->hmp->namekey_iterator == 0)
601 ++trans->hmp->namekey_iterator;
602 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
603 record->rec.base.base.key |= trans->hmp->namekey_iterator;
605 record->flags |= HAMMER_RECF_ONRBTREE;
609 /************************************************************************
610 * HAMMER INODE MERGED-RECORD FUNCTIONS *
611 ************************************************************************
613 * These functions augment the B-Tree scanning functions in hammer_btree.c
614 * by merging in-memory records with on-disk records.
618 * Locate a particular record either in-memory or on-disk.
620 * NOTE: This is basically a standalone routine, hammer_ip_next() may
621 * NOT be called to iterate results.
624 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
629 * If the element is in-memory return it without searching the
/* (comment continues on elided lines) */
632 error = hammer_mem_lookup(cursor, ip);
634 cursor->record = &cursor->iprec->rec;
641 * If the inode has on-disk components search the on-disk B-Tree.
/* Inode has no on-disk presence: the memory miss is final (return elided). */
643 if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
645 error = hammer_btree_lookup(cursor);
647 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
652 * Locate the first record within the cursor's key_beg/key_end range,
653 * restricted to a particular inode. 0 is returned on success, ENOENT
654 * if no records matched the requested range, or some other error.
656 * When 0 is returned hammer_ip_next() may be used to iterate additional
657 * records within the requested range.
660 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
665 * Clean up fields and setup for merged scan
/* Start pessimistic: both sources marked consumed and at-EOF. */
667 cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
668 cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
669 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
671 hammer_rel_mem_record(&cursor->iprec);
674 * Search the on-disk B-Tree. hammer_btree_lookup() only does an
675 * exact lookup so if we get ENOENT we have to call the iterate
676 * function to validate the first record after the begin key.
678 * The ATEDISK flag is used by hammer_btree_iterate to determine
679 * whether it must index forwards or not.
681 if (ip->flags & HAMMER_INODE_ONDISK) {
682 error = hammer_btree_lookup(cursor);
683 if (error == ENOENT) {
684 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
685 error = hammer_btree_iterate(cursor);
687 if (error && error != ENOENT)
/* Disk hit: clear the EOF/consumed bits (else-branch braces elided). */
690 cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
691 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
693 cursor->flags |= HAMMER_CURSOR_ATEDISK;
698 * Search the in-memory record list (Red-Black tree). Unlike the
699 * B-Tree search, mem_first checks for records in the range.
701 error = hammer_mem_first(cursor, ip);
702 if (error && error != ENOENT)
705 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
706 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
710 * This will return the first matching record.
712 return(hammer_ip_next(cursor));
716 * Retrieve the next record in a merged iteration within the bounds of the
717 * cursor. This call may be made multiple times after the cursor has been
718 * initially searched with hammer_ip_first().
720 * 0 is returned on success, ENOENT if no further records match the
721 * requested range, or some other error code is returned.
724 hammer_ip_next(hammer_cursor_t cursor)
726 hammer_btree_elm_t elm;
732 * Load the current on-disk and in-memory record. If we ate any
733 * records we have to get the next one.
735 * If we deleted the last on-disk record we had scanned ATEDISK will
736 * be clear and DELBTREE will be set, forcing a call to iterate. The
737 * fact that ATEDISK is clear causes iterate to re-test the 'current'
738 * element. If ATEDISK is set, iterate will skip the 'current'
741 * Get the next on-disk record
743 if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
744 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
745 error = hammer_btree_iterate(cursor);
747 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
/* Iterate failed/ran out: mark the disk side at EOF and consumed. */
749 cursor->flags |= HAMMER_CURSOR_DISKEOF |
750 HAMMER_CURSOR_ATEDISK;
755 * Get the next in-memory record. The record can be ripped out
756 * of the RB tree so we maintain a scan_info structure to track
/* (comment continues on elided lines) */
759 * hammer_rec_scan_cmp: Is the record still in our general range,
760 * (non-inclusive of snapshot exclusions)?
761 * hammer_rec_scan_callback: Is the record in our snapshot?
763 if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
764 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
765 hammer_rel_mem_record(&cursor->iprec);
766 rec = cursor->scan.node; /* next node */
/* Walk forward until the record leaves the range or matches (loop
 * header elided). */
768 if (hammer_rec_scan_cmp(rec, cursor) != 0)
770 if (hammer_rec_scan_callback(rec, cursor) != 0)
772 rec = hammer_rec_rb_tree_RB_NEXT(rec);
/* Match found: reference it and pre-advance scan.node (assignment
 * target on elided line 777). */
775 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
776 hammer_ref(&cursor->iprec->lock);
778 hammer_rec_rb_tree_RB_NEXT(rec);
780 cursor->flags |= HAMMER_CURSOR_MEMEOF;
786 * Extract either the disk or memory record depending on their
/* (comment continues on elided lines) */
790 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
/* Case 0 (both valid): pick whichever record compares lower. */
795 elm = &cursor->node->ondisk->elms[cursor->index];
796 r = hammer_btree_cmp(&elm->base,
797 &cursor->iprec->rec.base.base);
799 error = hammer_btree_extract(cursor,
800 HAMMER_CURSOR_GET_RECORD);
801 cursor->flags |= HAMMER_CURSOR_ATEDISK;
804 /* fall through to the memory entry */
805 case HAMMER_CURSOR_ATEDISK:
807 * Only the memory entry is valid
809 cursor->record = &cursor->iprec->rec;
810 cursor->flags |= HAMMER_CURSOR_ATEMEM;
812 case HAMMER_CURSOR_ATEMEM:
814 * Only the disk entry is valid
816 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
817 cursor->flags |= HAMMER_CURSOR_ATEDISK;
821 * Neither entry is valid
823 * XXX error not set properly
825 cursor->record = NULL;
833 * Resolve the cursor->data pointer for the current cursor position in
834 * a merged iteration.
837 hammer_ip_resolve_data(hammer_cursor_t cursor)
/* Memory records carry their own data pointer; disk records need extract. */
841 if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
842 cursor->data = cursor->iprec->data;
845 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
851 * Delete all records within the specified range for inode ip.
853 * NOTE: An unaligned range will cause new records to be added to cover
854 * the edge cases. (XXX not implemented yet).
856 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
858 * NOTE: Record keys for regular file data have to be special-cased since
859 * they indicate the end of the range (key = base + bytes).
862 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
863 int64_t ran_beg, int64_t ran_end)
865 struct hammer_cursor cursor;
866 hammer_record_ondisk_t rec;
867 hammer_base_elm_t base;
872 hammer_init_cursor_ip(&cursor, ip);
874 cursor.key_beg.obj_id = ip->obj_id;
875 cursor.key_beg.create_tid = ip->obj_asof;
876 cursor.key_beg.delete_tid = 0;
877 cursor.key_beg.obj_type = 0;
879 cursor.key_end = cursor.key_beg;
/* DB files use the range keys directly; regular files are end-keyed. */
880 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
881 cursor.key_beg.key = ran_beg;
882 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
883 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
884 cursor.key_end.key = ran_end;
888 * The key in the B-Tree is (base+bytes), so the first possible
889 * matching key is ran_beg + 1.
893 cursor.key_beg.key = ran_beg + 1;
894 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
895 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
/* Pad the end key by MAXPHYS+1, guarding against signed overflow. */
897 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */
899 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
901 cursor.key_end.key = ran_end + MAXPHYS + 1;
905 error = hammer_ip_first(&cursor, ip);
908 * Iterate through matching records and mark them as deleted.
/* Loop header elided; rec presumably reloaded from the cursor each pass. */
912 base = &rec->base.base;
914 KKASSERT(base->delete_tid == 0);
917 * There may be overlap cases for regular file data. Also
918 * remember the key for a regular file record is the offset
919 * of the last byte of the record (base + len - 1), NOT the
/* (comment continues on elided lines) */
922 kprintf("delete_range rec_type %02x\n", base->rec_type);
923 if (base->rec_type == HAMMER_RECTYPE_DATA) {
924 kprintf("delete_range loop key %016llx\n", base->key - rec->base.data_len);
925 off = base->key - rec->base.data_len;
927 * Check the left edge case. We currently do not
928 * split existing records.
/* Condition for the left-edge panic is on elided lines. */
931 panic("hammer left edge case %016llx %d\n",
932 base->key, rec->base.data_len);
936 * Check the right edge case. Note that the
937 * record can be completely out of bounds, which
938 * terminates the search.
940 * base->key is exclusive of the right edge while
941 * ran_end is inclusive of the right edge. The
942 * (key - data_len) left boundary is inclusive.
944 * XXX theory-check this test at some point, are
945 * we missing a + 1 somewhere? Note that ran_end
/* (comment continues on elided lines) */
948 if (base->key > ran_end) {
949 if (base->key - rec->base.data_len > ran_end) {
950 kprintf("right edge OOB\n");
/* Fully out of bounds breaks the loop; partial overlap panics. */
953 panic("hammer right edge case\n");
958 * Mark the record and B-Tree entry as deleted. This will
959 * also physically delete the B-Tree entry, record, and
960 * data if the retention policy dictates. The function
961 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
962 * uses to perform a fixup.
964 error = hammer_ip_delete_record(&cursor, trans->tid);
967 error = hammer_ip_next(&cursor);
969 hammer_done_cursor(&cursor);
976 * Delete the record at the current cursor
/* NOTE(review): the function continues past the end of this listing. */
979 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
981 hammer_btree_elm_t elm;
986 * In-memory (unsynchronized) records can simply be freed.
988 cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
989 if (cursor->record == &cursor->iprec->rec) {
990 hammer_free_mem_record(cursor->iprec);
995 * On-disk records are marked as deleted by updating their delete_tid.
997 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
999 hmp = cursor->node->cluster->volume->hmp;
/* Stamp delete_tid on both the record and its B-Tree leaf element. */
1002 elm = &cursor->node->ondisk->elms[cursor->index];
1003 cursor->record->base.base.delete_tid = tid;
1004 elm->leaf.base.delete_tid = tid;
1005 hammer_modify_buffer(cursor->record_buffer);
1006 hammer_modify_node(cursor->node);
1010 * If we were mounted with the nohistory option, we physically
1011 * delete the record.
1013 if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1015 int32_t data_offset;
1017 hammer_cluster_t cluster;
/* Capture offsets before the B-Tree delete invalidates the element. */
1019 rec_offset = elm->leaf.rec_offset;
1020 data_offset = elm->leaf.data_offset;
1021 data_len = elm->leaf.data_len;
1022 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1023 rec_offset, data_offset, data_len);
/* Hold the cluster across the delete so the offsets stay valid. */
1024 cluster = cursor->node->cluster;
1025 hammer_ref_cluster(cluster);
1027 error = hammer_btree_delete(cursor);
1030 * This forces a fixup for the iteration because
1031 * the cursor is now either sitting at the 'next'
1032 * element or sitting at the end of a leaf.
1034 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1035 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1036 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
/* Free separately-stored data only (embedded data lives in the record). */
1038 hammer_free_record(cluster, rec_offset);
1039 if (data_offset - rec_offset < 0 ||
1040 data_offset - rec_offset >= HAMMER_RECORD_SIZE) {
1041 hammer_free_data(cluster, data_offset,data_len);
1044 hammer_rel_cluster(cluster, 0);
1046 kprintf("hammer_ip_delete_record: unable to physically delete the record!\n");