/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.10 2007/12/29 09:01:27 dillon Exp $
 */
/*
 * Forward declarations for the file-local (static) helpers defined below.
 */
39 static int hammer_mem_add(hammer_transaction_t trans,
40 hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43 static void hammer_free_mem_record(hammer_record_t record);
46 * Red-black tree support.
49 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
51 if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
53 if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
56 if (rec1->rec.base.base.key < rec2->rec.base.base.key)
58 if (rec1->rec.base.base.key > rec2->rec.base.base.key)
61 if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
63 if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
69 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
71 if (info->rec_type < rec->rec.base.base.rec_type)
73 if (info->rec_type > rec->rec.base.base.rec_type)
76 if (info->key < rec->rec.base.base.key)
78 if (info->key > rec->rec.base.base.key)
82 * This test has a number of special cases. create_tid in key1 is
83 * the as-of transction id, and delete_tid in key1 is NOT USED.
85 * A key1->create_tid of 0 matches any record regardles of when
86 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
87 * used to search for the most current state of the object.
89 * key2->create_tid is a HAMMER record and will never be
90 * 0. key2->delete_tid is the deletion transaction id or 0 if
91 * the record has not yet been deleted.
93 if (info->create_tid) {
94 if (info->create_tid < rec->rec.base.base.create_tid)
96 if (rec->rec.base.base.delete_tid &&
97 info->create_tid >= rec->rec.base.base.delete_tid) {
105 * RB_SCAN comparison code for hammer_mem_first(). The argument order
106 * is reversed so the comparison result has to be negated. key_beg and
107 * key_end are both range-inclusive.
109 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
110 * These do not stop the scan.
112 * Localized deletions are not cached in-memory.
116 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
118 hammer_cursor_t cursor = data;
121 r = hammer_rec_compare(&cursor->key_beg, rec);
126 r = hammer_rec_compare(&cursor->key_end, rec);
/*
 * Instantiate the red-black tree support functions for the per-inode
 * record tree, including the INFO xlookup variant keyed by a
 * hammer_base_elm_t via hammer_rec_compare().
 */
132 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
133 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
134 hammer_rec_compare, hammer_base_elm_t);
137 * Allocate a record for the caller to finish filling in. The record is
138 * returned referenced and locked.
141 hammer_alloc_mem_record(hammer_inode_t ip)
143 hammer_record_t record;
145 record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
147 hammer_ref(&record->lock);
148 hammer_lock_ex(&record->lock);
153 * Release a memory record. If the record was marked for defered deletion,
154 * and no references remain, the record is physically destroyed.
157 hammer_rel_mem_record(struct hammer_record **recordp)
161 if ((rec = *recordp) != NULL) {
162 hammer_unref(&rec->lock);
163 if (rec->lock.refs == 0) {
164 if (rec->flags & HAMMER_RECF_DELETED)
165 hammer_free_mem_record(rec);
172 * Drop a locked hammer in-memory record. This function unlocks and
173 * dereferences the record. If delete != 0 the record is marked for
174 * deletion. Physical deletion only occurs when the last reference goes
178 hammer_drop_mem_record(hammer_record_t rec, int delete)
181 rec->flags |= HAMMER_RECF_DELETED;
182 hammer_unlock(&rec->lock);
183 hammer_rel_mem_record(&rec);
187 * Free a record. Clean the structure up even though we are throwing it
188 * away as a sanity check. The actual free operation is delayed while
189 * the record is referenced. However, the record is removed from the RB
193 hammer_free_mem_record(hammer_record_t record)
195 if (record->flags & HAMMER_RECF_ONRBTREE) {
196 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
197 record->flags &= ~HAMMER_RECF_ONRBTREE;
199 if (record->lock.refs) {
200 record->flags |= HAMMER_RECF_DELETED;
203 if (record->flags & HAMMER_RECF_ALLOCDATA) {
204 kfree(record->data, M_HAMMER);
205 record->flags &= ~HAMMER_RECF_ALLOCDATA;
208 kfree(record, M_HAMMER);
212 * Lookup an in-memory record given the key specified in the cursor. Works
213 * just like hammer_btree_lookup() but operates on an inode's in-memory
216 * The lookup must fail if the record is marked for deferred deletion.
220 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
225 hammer_rel_mem_record(&cursor->iprec);
227 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
228 &cursor->ip->rec_tree);
231 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
232 cursor->scan.node = NULL;
233 cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
234 &ip->rec_tree, &cursor->key_beg);
235 if (cursor->iprec == NULL) {
238 hammer_ref(&cursor->iprec->lock);
245 * hammer_mem_first() - locate the first in-memory record matching the
248 * The RB_SCAN function we use is designed as a callback. We terminate it
249 * (return -1) as soon as we get a match.
253 hammer_rec_scan_callback(hammer_record_t rec, void *data)
255 hammer_cursor_t cursor = data;
258 * Skip if not visible due to our as-of TID
260 if (cursor->key_beg.create_tid) {
261 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
263 if (rec->rec.base.base.delete_tid &&
264 cursor->key_beg.create_tid >=
265 rec->rec.base.base.delete_tid) {
271 * Return the first matching record and stop the scan
273 if (cursor->iprec == NULL) {
275 hammer_ref(&rec->lock);
283 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
286 hammer_rel_mem_record(&cursor->iprec);
288 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
289 &cursor->ip->rec_tree);
292 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
293 cursor->scan.node = NULL;
294 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
295 hammer_rec_scan_callback, cursor);
298 * Adjust scan.node and keep it linked into the RB-tree so we can
299 * hold the cursor through third party modifications of the RB-tree.
302 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
309 hammer_mem_done(hammer_cursor_t cursor)
312 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
313 &cursor->ip->rec_tree);
317 hammer_rel_mem_record(&cursor->iprec);
/************************************************************************
 *			HAMMER IN-MEMORY RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */
329 * Add a directory entry (dip,ncp) which references inode (ip).
331 * Note that the low 32 bits of the namekey are set temporarily to create
332 * a unique in-memory record, and may be modified a second time when the
333 * record is synchronized to disk. In particular, the low 32 bits cannot be
334 * all 0's when synching to disk, which is not handled here.
337 hammer_ip_add_directory(struct hammer_transaction *trans,
338 struct hammer_inode *dip, struct namecache *ncp,
339 struct hammer_inode *ip)
341 hammer_record_t record;
345 record = hammer_alloc_mem_record(dip);
347 bytes = ncp->nc_nlen; /* NOTE: terminating \0 is NOT included */
348 if (++trans->hmp->namekey_iterator == 0)
349 ++trans->hmp->namekey_iterator;
351 record->rec.entry.base.base.obj_id = dip->obj_id;
352 record->rec.entry.base.base.key =
353 hammer_directory_namekey(ncp->nc_name, bytes);
354 record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
355 record->rec.entry.base.base.create_tid = trans->tid;
356 record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
357 record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
358 record->rec.entry.obj_id = ip->obj_id;
359 if (bytes <= sizeof(record->rec.entry.den_name)) {
360 record->data = (void *)record->rec.entry.den_name;
361 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
363 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
364 record->flags |= HAMMER_RECF_ALLOCDATA;
366 bcopy(ncp->nc_name, record->data, bytes);
367 record->rec.entry.base.data_len = bytes;
368 ++ip->ino_rec.ino_nlinks;
369 hammer_modify_inode(trans, ip,
370 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
371 error = hammer_mem_add(trans, record);
376 * Delete the directory entry and update the inode link count. The
377 * cursor must be seeked to the directory entry record being deleted.
379 * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag.
382 hammer_ip_del_directory(struct hammer_transaction *trans,
383 hammer_cursor_t cursor, struct hammer_inode *dip,
384 struct hammer_inode *ip)
388 error = hammer_ip_delete_record(cursor, trans->tid);
391 * One less link. The file may still be open in the OS even after
392 * all links have gone away so we don't destroy the inode's data
396 --ip->ino_rec.ino_nlinks;
397 hammer_modify_inode(trans, ip,
398 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
399 if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
400 hammer_sync_inode(ip, MNT_NOWAIT, 1);
407 * Sync data from a buffer cache buffer (typically) to the filesystem. This
408 * is called via the strategy called from a cached data source. This code
409 * is responsible for actually writing a data record out to the disk.
412 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
413 int64_t offset, void *data, int bytes,
414 struct hammer_cursor **spike)
416 struct hammer_cursor cursor;
417 hammer_record_ondisk_t rec;
418 union hammer_btree_elm elm;
422 error = hammer_init_cursor_ip(&cursor, ip);
425 cursor.key_beg.obj_id = ip->obj_id;
426 cursor.key_beg.key = offset + bytes;
427 cursor.key_beg.create_tid = trans->tid;
428 cursor.key_beg.delete_tid = 0;
429 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
430 cursor.flags = HAMMER_CURSOR_INSERT;
433 * Issue a lookup to position the cursor and locate the cluster
435 error = hammer_btree_lookup(&cursor);
437 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
439 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
440 HAMMER_BTREE_TYPE_LEAF, cursor.index);
447 * Allocate record and data space now that we know which cluster
448 * the B-Tree node ended up in.
450 bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
451 &cursor.data_buffer);
454 rec = hammer_alloc_record(cursor.node->cluster, &error,
455 &cursor.record_buffer);
460 * Fill everything in and insert our B-Tree node.
462 rec->base.base = cursor.key_beg;
463 rec->base.data_crc = crc32(data, bytes);
464 rec->base.rec_id = 0; /* XXX */
465 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
466 rec->base.data_len = bytes;
467 hammer_modify_buffer(cursor.record_buffer);
469 bcopy(data, bdata, bytes);
470 hammer_modify_buffer(cursor.data_buffer);
472 elm.leaf.base = cursor.key_beg;
473 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
474 elm.leaf.data_offset = rec->base.data_offset;
475 elm.leaf.data_len = bytes;
476 elm.leaf.data_crc = rec->base.data_crc;
478 error = hammer_btree_insert(&cursor, &elm);
482 hammer_free_record_ptr(cursor.record_buffer, rec);
484 hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
487 * If ENOSPC in cluster fill in the spike structure and return
491 hammer_load_spike(&cursor, spike);
492 hammer_done_cursor(&cursor);
497 * Sync an in-memory record to the disk. this is typically called via fsync
498 * from a cached record source. This code is responsible for actually
499 * writing a record out to the disk.
502 hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
504 struct hammer_cursor cursor;
505 hammer_record_ondisk_t rec;
506 union hammer_btree_elm elm;
510 error = hammer_init_cursor_ip(&cursor, record->ip);
513 cursor.key_beg = record->rec.base.base;
514 cursor.flags = HAMMER_CURSOR_INSERT;
517 * Issue a lookup to position the cursor and locate the cluster. The
518 * target key should not exist.
520 * If we run out of space trying to adjust the B-Tree for the
521 * insert, re-lookup without the insert flag so the cursor
522 * is properly positioned for the spike.
524 error = hammer_btree_lookup(&cursor);
526 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
527 record->rec.base.base.key);
534 * Allocate record and data space now that we know which cluster
535 * the B-Tree node ended up in.
537 if (record->data == NULL ||
538 (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
539 bdata = record->data;
541 bdata = hammer_alloc_data(cursor.node->cluster,
542 record->rec.base.data_len, &error,
543 &cursor.data_buffer);
547 rec = hammer_alloc_record(cursor.node->cluster, &error,
548 &cursor.record_buffer);
553 * Fill everything in and insert our B-Tree node.
555 * XXX assign rec_id here
559 rec->base.data_crc = crc32(record->data,
560 record->rec.base.data_len);
561 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
563 * Data embedded in record
565 rec->base.data_offset = ((char *)bdata -
566 (char *)&record->rec);
567 KKASSERT(rec->base.data_offset >= 0 &&
568 rec->base.data_offset + rec->base.data_len <
570 rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
573 * Data separate from record
575 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
576 bcopy(record->data, bdata, rec->base.data_len);
577 hammer_modify_buffer(cursor.data_buffer);
580 rec->base.rec_id = 0; /* XXX */
582 hammer_modify_buffer(cursor.record_buffer);
584 elm.leaf.base = cursor.key_beg;
585 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
586 elm.leaf.data_offset = rec->base.data_offset;
587 elm.leaf.data_len = rec->base.data_len;
588 elm.leaf.data_crc = rec->base.data_crc;
590 error = hammer_btree_insert(&cursor, &elm);
594 hammer_free_record_ptr(cursor.record_buffer, rec);
596 if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
597 hammer_free_data_ptr(cursor.data_buffer, bdata,
598 record->rec.base.data_len);
602 * If ENOSPC in cluster fill in the spike structure and return
606 hammer_load_spike(&cursor, spike);
607 hammer_done_cursor(&cursor);
612 * Write out a record using the specified cursor. The caller does not have
613 * to seek the cursor. The flags are used to determine whether the data
614 * (if any) is embedded in the record or not.
616 * The target cursor will be modified by this call. Note in particular
617 * that HAMMER_CURSOR_INSERT is set.
620 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
621 void *data, int cursor_flags)
623 union hammer_btree_elm elm;
624 hammer_record_ondisk_t nrec;
628 cursor->key_beg = orec->base.base;
629 cursor->flags |= HAMMER_CURSOR_INSERT;
632 * Issue a lookup to position the cursor and locate the cluster. The
633 * target key should not exist.
635 * If we run out of space trying to adjust the B-Tree for the
636 * insert, re-lookup without the insert flag so the cursor
637 * is properly positioned for the spike.
639 error = hammer_btree_lookup(cursor);
641 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
642 orec->base.base.key);
649 * Allocate record and data space now that we know which cluster
650 * the B-Tree node ended up in.
653 (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
656 bdata = hammer_alloc_data(cursor->node->cluster,
657 orec->base.data_len, &error,
658 &cursor->data_buffer);
662 nrec = hammer_alloc_record(cursor->node->cluster, &error,
663 &cursor->record_buffer);
668 * Fill everything in and insert our B-Tree node.
670 * XXX assign rec_id here
673 nrec->base.data_offset = 0;
675 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
676 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
678 * Data embedded in record
680 nrec->base.data_offset = ((char *)bdata - (char *)orec);
681 KKASSERT(nrec->base.data_offset >= 0 &&
682 nrec->base.data_offset + nrec->base.data_len <
684 nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
687 * Data separate from record
689 nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
690 bcopy(data, bdata, nrec->base.data_len);
691 hammer_modify_buffer(cursor->data_buffer);
694 nrec->base.rec_id = 0; /* XXX */
696 hammer_modify_buffer(cursor->record_buffer);
698 elm.leaf.base = nrec->base.base;
699 elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
700 elm.leaf.data_offset = nrec->base.data_offset;
701 elm.leaf.data_len = nrec->base.data_len;
702 elm.leaf.data_crc = nrec->base.data_crc;
704 error = hammer_btree_insert(cursor, &elm);
708 hammer_free_record_ptr(cursor->record_buffer, nrec);
710 if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
711 hammer_free_data_ptr(cursor->data_buffer, bdata,
712 orec->base.data_len);
715 /* leave cursor intact */
720 * Add the record to the inode's rec_tree. The low 32 bits of a directory
721 * entry's key is used to deal with hash collisions in the upper 32 bits.
722 * A unique 64 bit key is generated in-memory and may be regenerated a
723 * second time when the directory record is flushed to the on-disk B-Tree.
725 * A locked and referenced record is passed to this function. This function
726 * eats the lock and reference.
730 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
732 while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
733 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
734 hammer_drop_mem_record(record, 1);
737 if (++trans->hmp->namekey_iterator == 0)
738 ++trans->hmp->namekey_iterator;
739 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
740 record->rec.base.base.key |= trans->hmp->namekey_iterator;
742 record->flags |= HAMMER_RECF_ONRBTREE;
743 hammer_drop_mem_record(record, 0);
/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */
756 * Locate a particular record either in-memory or on-disk.
758 * NOTE: This is basically a standalone routine, hammer_ip_next() may
759 * NOT be called to iterate results.
762 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
767 * If the element is in-memory return it without searching the
770 error = hammer_mem_lookup(cursor, ip);
772 cursor->record = &cursor->iprec->rec;
779 * If the inode has on-disk components search the on-disk B-Tree.
781 if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
783 error = hammer_btree_lookup(cursor);
785 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
790 * Locate the first record within the cursor's key_beg/key_end range,
791 * restricted to a particular inode. 0 is returned on success, ENOENT
792 * if no records matched the requested range, or some other error.
794 * When 0 is returned hammer_ip_next() may be used to iterate additional
795 * records within the requested range.
798 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
803 * Clean up fields and setup for merged scan
805 cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
806 cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
807 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
809 hammer_rel_mem_record(&cursor->iprec);
812 * Search the on-disk B-Tree. hammer_btree_lookup() only does an
813 * exact lookup so if we get ENOENT we have to call the iterate
814 * function to validate the first record after the begin key.
816 * The ATEDISK flag is used by hammer_btree_iterate to determine
817 * whether it must index forwards or not. It is also used here
818 * to select the next record from in-memory or on-disk.
820 if (ip->flags & HAMMER_INODE_ONDISK) {
821 error = hammer_btree_lookup(cursor);
822 if (error == ENOENT) {
823 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
824 error = hammer_btree_iterate(cursor);
826 if (error && error != ENOENT)
829 cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
830 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
832 cursor->flags |= HAMMER_CURSOR_ATEDISK;
837 * Search the in-memory record list (Red-Black tree). Unlike the
838 * B-Tree search, mem_first checks for records in the range.
840 error = hammer_mem_first(cursor, ip);
841 if (error && error != ENOENT)
844 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
845 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
849 * This will return the first matching record.
851 return(hammer_ip_next(cursor));
855 * Retrieve the next record in a merged iteration within the bounds of the
856 * cursor. This call may be made multiple times after the cursor has been
857 * initially searched with hammer_ip_first().
859 * 0 is returned on success, ENOENT if no further records match the
860 * requested range, or some other error code is returned.
863 hammer_ip_next(hammer_cursor_t cursor)
865 hammer_btree_elm_t elm;
871 * Load the current on-disk and in-memory record. If we ate any
872 * records we have to get the next one.
874 * If we deleted the last on-disk record we had scanned ATEDISK will
875 * be clear and DELBTREE will be set, forcing a call to iterate. The
876 * fact that ATEDISK is clear causes iterate to re-test the 'current'
877 * element. If ATEDISK is set, iterate will skip the 'current'
880 * Get the next on-disk record
882 if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
883 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
884 error = hammer_btree_iterate(cursor);
/* On iterate success the disk side has a fresh current element ... */
886 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
/* ... otherwise the disk side is exhausted for this scan. */
888 cursor->flags |= HAMMER_CURSOR_DISKEOF |
889 HAMMER_CURSOR_ATEDISK;
894 * Get the next in-memory record. The record can be ripped out
895 * of the RB tree so we maintain a scan_info structure to track
898 * hammer_rec_scan_cmp: Is the record still in our general range,
899 * (non-inclusive of snapshot exclusions)?
900 * hammer_rec_scan_callback: Is the record in our snapshot?
902 if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
903 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
904 hammer_rel_mem_record(&cursor->iprec);
905 rec = cursor->scan.node; /* next node */
907 if (hammer_rec_scan_cmp(rec, cursor) != 0)
909 if (hammer_rec_scan_callback(rec, cursor) != 0)
911 rec = hammer_rec_rb_tree_RB_NEXT(rec);
914 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
915 hammer_ref(&cursor->iprec->lock);
917 hammer_rec_rb_tree_RB_NEXT(rec);
919 cursor->flags |= HAMMER_CURSOR_MEMEOF;
925 * Extract either the disk or memory record depending on their
/*
 * NOTE(review): switch is on which sides have already been consumed.
 * The unlabeled first section (neither ATEDISK nor ATEMEM set) compares
 * the current disk element against the current in-memory record to pick
 * the lower-sorting one; the fall-through below handles the memory side.
 */
929 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
934 elm = &cursor->node->ondisk->elms[cursor->index];
935 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
937 error = hammer_btree_extract(cursor,
938 HAMMER_CURSOR_GET_RECORD);
939 cursor->flags |= HAMMER_CURSOR_ATEDISK;
942 /* fall through to the memory entry */
943 case HAMMER_CURSOR_ATEDISK:
945 * Only the memory entry is valid
947 cursor->record = &cursor->iprec->rec;
948 cursor->flags |= HAMMER_CURSOR_ATEMEM;
950 case HAMMER_CURSOR_ATEMEM:
952 * Only the disk entry is valid
954 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
955 cursor->flags |= HAMMER_CURSOR_ATEDISK;
959 * Neither entry is valid
961 * XXX error not set properly
963 cursor->record = NULL;
971 * Resolve the cursor->data pointer for the current cursor position in
972 * a merged iteration.
975 hammer_ip_resolve_data(hammer_cursor_t cursor)
979 if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
980 cursor->data = cursor->iprec->data;
983 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
989 * Delete all records within the specified range for inode ip.
991 * NOTE: An unaligned range will cause new records to be added to cover
992 * the edge cases. (XXX not implemented yet).
994 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
996 * NOTE: Record keys for regular file data have to be special-cased since
997 * they indicate the end of the range (key = base + bytes).
999 * NOTE: The spike structure must be filled in if we return ENOSPC.
1002 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1003 int64_t ran_beg, int64_t ran_end,
1004 struct hammer_cursor **spike)
1006 struct hammer_cursor cursor;
1007 hammer_record_ondisk_t rec;
1008 hammer_base_elm_t base;
1013 hammer_init_cursor_ip(&cursor, ip);
1015 cursor.key_beg.obj_id = ip->obj_id;
1016 cursor.key_beg.create_tid = ip->obj_asof;
1017 cursor.key_beg.delete_tid = 0;
1018 cursor.key_beg.obj_type = 0;
/* DB files are keyed directly; regular files key the END of each record */
1020 cursor.key_end = cursor.key_beg;
1021 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1022 cursor.key_beg.key = ran_beg;
1023 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1024 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1025 cursor.key_end.key = ran_end;
1029 * The key in the B-Tree is (base+bytes), so the first possible
1030 * matching key is ran_beg + 1.
1034 cursor.key_beg.key = ran_beg + 1;
1035 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1036 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
/* Overflow check: clamp the end key if ran_end + MAXPHYS + 1 wraps */
1038 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */
1039 if (tmp64 < ran_end)
1040 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1042 cursor.key_end.key = ran_end + MAXPHYS + 1;
1045 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1047 error = hammer_ip_first(&cursor, ip);
1050 * Iterate through matching records and mark them as deleted.
1052 while (error == 0) {
1053 rec = cursor.record;
1054 base = &rec->base.base;
/* Records already carrying a delete_tid must not reach this loop */
1056 KKASSERT(base->delete_tid == 0);
1059 * There may be overlap cases for regular file data. Also
1060 * remember the key for a regular file record is the offset
1061 * of the last byte of the record (base + len - 1), NOT the
1065 kprintf("delete_range rec_type %02x\n", base->rec_type);
1067 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1069 kprintf("delete_range loop key %016llx\n",
1070 base->key - rec->base.data_len);
1072 off = base->key - rec->base.data_len;
1074 * Check the left edge case. We currently do not
1075 * split existing records.
1077 if (off < ran_beg) {
1078 panic("hammer left edge case %016llx %d\n",
1079 base->key, rec->base.data_len);
1083 * Check the right edge case. Note that the
1084 * record can be completely out of bounds, which
1085 * terminates the search.
1087 * base->key is exclusive of the right edge while
1088 * ran_end is inclusive of the right edge. The
1089 * (key - data_len) left boundary is inclusive.
1091 * XXX theory-check this test at some point, are
1092 * we missing a + 1 somewhere? Note that ran_end
1095 if (base->key > ran_end) {
1096 if (base->key - rec->base.data_len > ran_end) {
1097 kprintf("right edge OOB\n");
1100 panic("hammer right edge case\n");
1105 * Mark the record and B-Tree entry as deleted. This will
1106 * also physically delete the B-Tree entry, record, and
1107 * data if the retention policy dictates. The function
1108 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1109 * uses to perform a fixup.
1111 error = hammer_ip_delete_record(&cursor, trans->tid);
1114 error = hammer_ip_next(&cursor);
1116 hammer_done_cursor(&cursor);
/* Running off the end of the range is a normal termination, not an error */
1117 if (error == ENOENT)
1123 * Delete the record at the current cursor
1126 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1128 hammer_btree_elm_t elm;
1133 * In-memory (unsynchronized) records can simply be freed.
1135 cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1136 if (cursor->record == &cursor->iprec->rec) {
1137 hammer_free_mem_record(cursor->iprec); /* XXX */
1142 * On-disk records are marked as deleted by updating their delete_tid.
1144 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1146 hmp = cursor->node->cluster->volume->hmp;
1149 elm = &cursor->node->ondisk->elms[cursor->index];
1150 cursor->record->base.base.delete_tid = tid;
1151 elm->leaf.base.delete_tid = tid;
1152 hammer_modify_buffer(cursor->record_buffer);
1153 hammer_modify_node(cursor->node);
1157 * If we were mounted with the nohistory option, we physically
1158 * delete the record.
1160 if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1162 int32_t data_offset;
1164 hammer_cluster_t cluster;
1166 rec_offset = elm->leaf.rec_offset;
1167 data_offset = elm->leaf.data_offset;
1168 data_len = elm->leaf.data_len;
1170 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1171 rec_offset, data_offset, data_len);
1173 cluster = cursor->node->cluster;
1174 hammer_ref_cluster(cluster);
1176 error = hammer_btree_delete(cursor);
1179 * This forces a fixup for the iteration because
1180 * the cursor is now either sitting at the 'next'
1181 * element or sitting at the end of a leaf.
1183 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1184 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1185 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1187 hammer_free_record(cluster, rec_offset);
1188 if (data_offset && (data_offset - rec_offset < 0 ||
1189 data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1190 hammer_free_data(cluster, data_offset,data_len);
1193 hammer_rel_cluster(cluster, 0);
1195 kprintf("hammer_ip_delete_record: unable to physically delete the record!\n");