2 * Copyright (c) 2007 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.11 2007/12/30 00:47:22 dillon Exp $
/* Forward declarations for the static in-memory record helpers below. */
39 static int hammer_mem_add(hammer_transaction_t trans,
40 hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43 static void hammer_free_mem_record(hammer_record_t record);
46 * Red-black tree support.
/*
 * RB-tree ordering for two in-memory records: compare rec_type first,
 * then key, then create_tid.
 *
 * NOTE(review): this listing elides the return statements and braces
 * (original line numbering jumps) -- verify against the full source.
 */
49 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
51 if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
53 if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
56 if (rec1->rec.base.base.key < rec2->rec.base.base.key)
58 if (rec1->rec.base.base.key > rec2->rec.base.base.key)
61 if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
63 if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
/*
 * Compare a lookup key (info) against an in-memory record: rec_type,
 * then key, then as-of visibility via create_tid/delete_tid.
 */
69 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
71 if (info->rec_type < rec->rec.base.base.rec_type)
73 if (info->rec_type > rec->rec.base.base.rec_type)
76 if (info->key < rec->rec.base.base.key)
78 if (info->key > rec->rec.base.base.key)
82 * This test has a number of special cases. create_tid in key1 is
83 * the as-of transaction id, and delete_tid in key1 is NOT USED.
85 * A key1->create_tid of 0 matches any record regardless of when
86 * it was created or destroyed. 0xFFFFFFFFFFFFFFFFULL should be
87 * used to search for the most current state of the object.
89 * key2->create_tid is a HAMMER record and will never be
90 * 0. key2->delete_tid is the deletion transaction id or 0 if
91 * the record has not yet been deleted.
93 if (info->create_tid) {
94 if (info->create_tid < rec->rec.base.base.create_tid)
96 if (rec->rec.base.base.delete_tid &&
97 info->create_tid >= rec->rec.base.base.delete_tid) {
105 * RB_SCAN comparison code for hammer_mem_first(). The argument order
106 * is reversed so the comparison result has to be negated. key_beg and
107 * key_end are both range-inclusive.
109 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
110 * These do not stop the scan.
112 * Localized deletions are not cached in-memory.
/*
 * RB_SCAN range test callback.  The argument order is reversed relative
 * to hammer_rec_compare() (record vs cursor key), so the comparison
 * results must be interpreted in negated form by the caller logic.
 */
116 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
118 hammer_cursor_t cursor = data;
121 r = hammer_rec_compare(&cursor->key_beg, rec);
126 r = hammer_rec_compare(&cursor->key_end, rec);
/*
 * Generate the red-black tree support functions plus the INFO-keyed
 * lookup variant (keyed by hammer_base_elm_t via hammer_rec_compare).
 */
132 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
133 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
134 hammer_rec_compare, hammer_base_elm_t);
137 * Allocate a record for the caller to finish filling in. The record is
138 * returned referenced and locked.
/*
 * Allocate a zeroed in-memory record.  It is returned to the caller
 * referenced and exclusively locked, ready to be filled in.
 */
141 hammer_alloc_mem_record(hammer_inode_t ip)
143 hammer_record_t record;
145 record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
147 hammer_ref(&record->lock);
148 hammer_lock_ex(&record->lock);
153 * Release a memory record. If the record was marked for defered deletion,
154 * and no references remain, the record is physically destroyed.
/*
 * Drop a reference on *recordp.  When the last reference goes away and
 * the record was marked HAMMER_RECF_DELETED, physically destroy it.
 */
157 hammer_rel_mem_record(struct hammer_record **recordp)
161 if ((rec = *recordp) != NULL) {
162 hammer_unref(&rec->lock);
163 if (rec->lock.refs == 0) {
164 if (rec->flags & HAMMER_RECF_DELETED)
165 hammer_free_mem_record(rec);
172 * Drop a locked hammer in-memory record. This function unlocks and
173 * dereferences the record. If delete != 0 the record is marked for
174 * deletion. Physical deletion only occurs when the last reference goes
/*
 * Unlock and dereference a locked in-memory record.  When delete != 0
 * the record is flagged for deferred deletion (freed once the last
 * reference is released via hammer_rel_mem_record()).
 */
178 hammer_drop_mem_record(hammer_record_t rec, int delete)
181 rec->flags |= HAMMER_RECF_DELETED;
182 hammer_unlock(&rec->lock);
183 hammer_rel_mem_record(&rec);
187 * Free a record. Clean the structure up even though we are throwing it
188 * away as a sanity check. The actual free operation is delayed while
189 * the record is referenced. However, the record is removed from the RB
/*
 * Physically free a record: unlink it from the inode's RB-tree first,
 * defer the actual free while references remain (marking DELETED),
 * release any separately-allocated data, then free the structure.
 */
193 hammer_free_mem_record(hammer_record_t record)
195 if (record->flags & HAMMER_RECF_ONRBTREE) {
196 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
197 record->flags &= ~HAMMER_RECF_ONRBTREE;
199 if (record->lock.refs) {
200 record->flags |= HAMMER_RECF_DELETED;
203 if (record->flags & HAMMER_RECF_ALLOCDATA) {
204 kfree(record->data, M_HAMMER);
205 record->flags &= ~HAMMER_RECF_ALLOCDATA;
208 kfree(record, M_HAMMER);
212 * Lookup an in-memory record given the key specified in the cursor. Works
213 * just like hammer_btree_lookup() but operates on an inode's in-memory
216 * The lookup must fail if the record is marked for deferred deletion.
/*
 * Exact-key lookup in the inode's in-memory RB-tree (analogous to
 * hammer_btree_lookup()).  Any previously held iprec / scan state is
 * released first; the found record is referenced for the cursor.
 */
220 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
225 hammer_rel_mem_record(&cursor->iprec);
227 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
228 &cursor->ip->rec_tree);
231 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
232 cursor->scan.node = NULL;
233 cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
234 &ip->rec_tree, &cursor->key_beg);
235 if (cursor->iprec == NULL) {
238 hammer_ref(&cursor->iprec->lock);
245 * hammer_mem_first() - locate the first in-memory record matching the
248 * The RB_SCAN function we use is designed as a callback. We terminate it
249 * (return -1) as soon as we get a match.
/*
 * RB_SCAN callback for hammer_mem_first(): skip records not visible at
 * the cursor's as-of TID, then capture the first visible match in
 * cursor->iprec (referenced) and terminate the scan.
 */
253 hammer_rec_scan_callback(hammer_record_t rec, void *data)
255 hammer_cursor_t cursor = data;
258 * Skip if not visible due to our as-of TID
260 if (cursor->key_beg.create_tid) {
261 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
263 if (rec->rec.base.base.delete_tid &&
264 cursor->key_beg.create_tid >=
265 rec->rec.base.base.delete_tid) {
271 * Return the first matching record and stop the scan
273 if (cursor->iprec == NULL) {
275 hammer_ref(&rec->lock);
/*
 * Position the cursor at the first in-memory record within the key
 * range.  The RB successor is cached in scan.node (kept linked into the
 * tree) so iteration survives concurrent RB-tree modifications.
 */
283 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
286 hammer_rel_mem_record(&cursor->iprec);
288 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
289 &cursor->ip->rec_tree);
292 hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
293 cursor->scan.node = NULL;
294 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
295 hammer_rec_scan_callback, cursor);
298 * Adjust scan.node and keep it linked into the RB-tree so we can
299 * hold the cursor through third party modifications of the RB-tree.
302 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
/* Tear down the in-memory scan state and release any held record. */
309 hammer_mem_done(hammer_cursor_t cursor)
312 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
313 &cursor->ip->rec_tree);
317 hammer_rel_mem_record(&cursor->iprec);
320 /************************************************************************
321 * HAMMER IN-MEMORY RECORD FUNCTIONS *
322 ************************************************************************
324 * These functions manipulate in-memory records. Such records typically
325 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
329 * Add a directory entry (dip,ncp) which references inode (ip).
331 * Note that the low 32 bits of the namekey are set temporarily to create
332 * a unique in-memory record, and may be modified a second time when the
333 * record is synchronized to disk. In particular, the low 32 bits cannot be
334 * all 0's when synching to disk, which is not handled here.
/*
 * Build an in-memory DIRENTRY record naming inode ip from directory dip,
 * bump ip's link count, and queue the record via hammer_mem_add().
 * The namekey's low 32 bits come from a per-mount iterator (skipping 0)
 * to de-collide the name hash in-memory.
 */
337 hammer_ip_add_directory(struct hammer_transaction *trans,
338 struct hammer_inode *dip, struct namecache *ncp,
339 struct hammer_inode *ip)
341 hammer_record_t record;
345 record = hammer_alloc_mem_record(dip);
347 bytes = ncp->nc_nlen; /* NOTE: terminating \0 is NOT included */
348 if (++trans->hmp->namekey_iterator == 0)
349 ++trans->hmp->namekey_iterator;
351 record->rec.entry.base.base.obj_id = dip->obj_id;
352 record->rec.entry.base.base.key =
353 hammer_directory_namekey(ncp->nc_name, bytes);
354 record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
355 record->rec.entry.base.base.create_tid = trans->tid;
356 record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
357 record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
358 record->rec.entry.obj_id = ip->obj_id;
/* Short names embed directly in the record; longer names get a kmalloc'd copy. */
359 if (bytes <= sizeof(record->rec.entry.den_name)) {
360 record->data = (void *)record->rec.entry.den_name;
361 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
363 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
364 record->flags |= HAMMER_RECF_ALLOCDATA;
366 bcopy(ncp->nc_name, record->data, bytes);
367 record->rec.entry.base.data_len = bytes;
/* The target inode gains a hard link; mark it dirty. */
368 ++ip->ino_rec.ino_nlinks;
369 hammer_modify_inode(trans, ip,
370 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
371 error = hammer_mem_add(trans, record);
376 * Delete the directory entry and update the inode link count. The
377 * cursor must be seeked to the directory entry record being deleted.
379 * NOTE: HAMMER_CURSOR_DELETE may not have been set. XXX remove flag.
/*
 * Delete the directory entry at the cursor position and drop the target
 * inode's link count.  If the vnode is gone or inactive the inode is
 * synced immediately (non-blocking).
 */
382 hammer_ip_del_directory(struct hammer_transaction *trans,
383 hammer_cursor_t cursor, struct hammer_inode *dip,
384 struct hammer_inode *ip)
388 error = hammer_ip_delete_record(cursor, trans->tid);
391 * One less link. The file may still be open in the OS even after
392 * all links have gone away so we don't destroy the inode's data
396 --ip->ino_rec.ino_nlinks;
397 hammer_modify_inode(trans, ip,
398 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
399 if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
400 hammer_sync_inode(ip, MNT_NOWAIT, 1);
407 * Add a record to an inode.
409 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
410 * initialize the following additional fields:
412 * record->rec.entry.base.base.key
413 * record->rec.entry.base.base.rec_type
414 * record->rec.entry.base.base.data_len
415 * record->data (a copy will be kmalloc'd if not embedded)
/*
 * Finish initializing a caller-built record (obj_id, create_tid,
 * obj_type) and add it to the inode's in-memory tree.  If record->data
 * points outside the record structure a private copy is kmalloc'd,
 * otherwise it is flagged as embedded.
 */
418 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
420 hammer_inode_t ip = record->ip;
425 record->rec.base.base.obj_id = ip->obj_id;
426 record->rec.base.base.create_tid = trans->tid;
427 record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
428 bytes = record->rec.base.data_len;
/* data outside [&record->rec, &record->rec + 1) => not embedded; copy it. */
431 if ((char *)record->data < (char *)&record->rec ||
432 (char *)record->data >= (char *)(&record->rec + 1)) {
433 data = kmalloc(bytes, M_HAMMER, M_WAITOK);
434 record->flags |= HAMMER_RECF_ALLOCDATA;
435 bcopy(record->data, data, bytes);
438 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
441 hammer_modify_inode(trans, ip,
442 HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
443 error = hammer_mem_add(trans, record);
448 * Sync data from a buffer cache buffer (typically) to the filesystem. This
449 * is called via the strategy called from a cached data source. This code
450 * is responsible for actually writing a data record out to the disk.
/*
 * Write a DATA record covering [offset, offset+bytes) for inode ip to
 * disk.  The B-Tree key is the END of the range (offset + bytes).  On
 * cluster ENOSPC the spike structure is loaded for the caller to
 * continue the operation in another cluster.
 */
453 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
454 int64_t offset, void *data, int bytes,
455 struct hammer_cursor **spike)
457 struct hammer_cursor cursor;
458 hammer_record_ondisk_t rec;
459 union hammer_btree_elm elm;
463 error = hammer_init_cursor_ip(&cursor, ip);
466 cursor.key_beg.obj_id = ip->obj_id;
467 cursor.key_beg.key = offset + bytes;
468 cursor.key_beg.create_tid = trans->tid;
469 cursor.key_beg.delete_tid = 0;
470 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
471 cursor.flags = HAMMER_CURSOR_INSERT;
474 * Issue a lookup to position the cursor and locate the cluster
476 error = hammer_btree_lookup(&cursor);
478 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
480 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
481 HAMMER_BTREE_TYPE_LEAF, cursor.index);
488 * Allocate record and data space now that we know which cluster
489 * the B-Tree node ended up in.
491 bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
492 &cursor.data_buffer);
495 rec = hammer_alloc_record(cursor.node->cluster, &error,
496 &cursor.record_buffer);
501 * Fill everything in and insert our B-Tree node.
503 rec->base.base = cursor.key_beg;
504 rec->base.data_crc = crc32(data, bytes);
505 rec->base.rec_id = 0; /* XXX */
506 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
507 rec->base.data_len = bytes;
508 hammer_modify_buffer(cursor.record_buffer);
510 bcopy(data, bdata, bytes);
511 hammer_modify_buffer(cursor.data_buffer);
513 elm.leaf.base = cursor.key_beg;
514 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
515 elm.leaf.data_offset = rec->base.data_offset;
516 elm.leaf.data_len = bytes;
517 elm.leaf.data_crc = rec->base.data_crc;
519 error = hammer_btree_insert(&cursor, &elm);
/* Failure path: release the allocated record and data space. */
523 hammer_free_record_ptr(cursor.record_buffer, rec);
525 hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
528 * If ENOSPC in cluster fill in the spike structure and return
532 hammer_load_spike(&cursor, spike);
533 hammer_done_cursor(&cursor);
538 * Sync an in-memory record to the disk. this is typically called via fsync
539 * from a cached record source. This code is responsible for actually
540 * writing a record out to the disk.
/*
 * Flush an in-memory record to the on-disk B-Tree.  Data embedded in
 * the record is stored inside the allocated on-disk record (data_offset
 * relative to the record), otherwise separate data space is allocated
 * and the data copied out.  On cluster ENOSPC the spike is loaded.
 */
543 hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
545 struct hammer_cursor cursor;
546 hammer_record_ondisk_t rec;
547 union hammer_btree_elm elm;
551 error = hammer_init_cursor_ip(&cursor, record->ip);
554 cursor.key_beg = record->rec.base.base;
555 cursor.flags = HAMMER_CURSOR_INSERT;
558 * Issue a lookup to position the cursor and locate the cluster. The
559 * target key should not exist.
561 * If we run out of space trying to adjust the B-Tree for the
562 * insert, re-lookup without the insert flag so the cursor
563 * is properly positioned for the spike.
565 error = hammer_btree_lookup(&cursor);
567 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
568 record->rec.base.base.key);
575 * Allocate record and data space now that we know which cluster
576 * the B-Tree node ended up in.
578 if (record->data == NULL ||
579 (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
580 bdata = record->data;
582 bdata = hammer_alloc_data(cursor.node->cluster,
583 record->rec.base.data_len, &error,
584 &cursor.data_buffer);
588 rec = hammer_alloc_record(cursor.node->cluster, &error,
589 &cursor.record_buffer);
594 * Fill everything in and insert our B-Tree node.
596 * XXX assign rec_id here
600 rec->base.data_crc = crc32(record->data,
601 record->rec.base.data_len);
602 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
604 * Data embedded in record
606 rec->base.data_offset = ((char *)bdata -
607 (char *)&record->rec);
608 KKASSERT(rec->base.data_offset >= 0 &&
609 rec->base.data_offset + rec->base.data_len <=
611 rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
614 * Data separate from record
616 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
617 bcopy(record->data, bdata, rec->base.data_len);
618 hammer_modify_buffer(cursor.data_buffer);
621 rec->base.rec_id = 0; /* XXX */
623 hammer_modify_buffer(cursor.record_buffer);
625 elm.leaf.base = cursor.key_beg;
626 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
627 elm.leaf.data_offset = rec->base.data_offset;
628 elm.leaf.data_len = rec->base.data_len;
629 elm.leaf.data_crc = rec->base.data_crc;
631 error = hammer_btree_insert(&cursor, &elm);
/* Failure path: release record space, and data space if separately allocated. */
635 hammer_free_record_ptr(cursor.record_buffer, rec);
637 if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
638 hammer_free_data_ptr(cursor.data_buffer, bdata,
639 record->rec.base.data_len);
643 * If ENOSPC in cluster fill in the spike structure and return
647 hammer_load_spike(&cursor, spike);
648 hammer_done_cursor(&cursor);
653 * Write out a record using the specified cursor. The caller does not have
654 * to seek the cursor. The flags are used to determine whether the data
655 * (if any) is embedded in the record or not.
657 * The target cursor will be modified by this call. Note in particular
658 * that HAMMER_CURSOR_INSERT is set.
/*
 * Write an on-disk record image (orec) plus optional data via the given
 * cursor; structurally parallels hammer_ip_sync_record() but sources
 * from an ondisk record template instead of an in-memory record.
 * cursor_flags selects embedded vs separate data placement.  The cursor
 * is left positioned (not terminated) on return.
 */
661 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
662 void *data, int cursor_flags)
664 union hammer_btree_elm elm;
665 hammer_record_ondisk_t nrec;
669 cursor->key_beg = orec->base.base;
670 cursor->flags |= HAMMER_CURSOR_INSERT;
673 * Issue a lookup to position the cursor and locate the cluster. The
674 * target key should not exist.
676 * If we run out of space trying to adjust the B-Tree for the
677 * insert, re-lookup without the insert flag so the cursor
678 * is properly positioned for the spike.
680 error = hammer_btree_lookup(cursor);
682 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
683 orec->base.base.key);
690 * Allocate record and data space now that we know which cluster
691 * the B-Tree node ended up in.
694 (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
697 bdata = hammer_alloc_data(cursor->node->cluster,
698 orec->base.data_len, &error,
699 &cursor->data_buffer);
703 nrec = hammer_alloc_record(cursor->node->cluster, &error,
704 &cursor->record_buffer);
709 * Fill everything in and insert our B-Tree node.
711 * XXX assign rec_id here
714 nrec->base.data_offset = 0;
716 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
717 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
719 * Data embedded in record
721 nrec->base.data_offset = ((char *)bdata - (char *)orec);
722 KKASSERT(nrec->base.data_offset >= 0 &&
723 nrec->base.data_offset + nrec->base.data_len <
725 nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
728 * Data separate from record
730 nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
731 bcopy(data, bdata, nrec->base.data_len);
732 hammer_modify_buffer(cursor->data_buffer);
735 nrec->base.rec_id = 0; /* XXX */
737 hammer_modify_buffer(cursor->record_buffer);
739 elm.leaf.base = nrec->base.base;
740 elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
741 elm.leaf.data_offset = nrec->base.data_offset;
742 elm.leaf.data_len = nrec->base.data_len;
743 elm.leaf.data_crc = nrec->base.data_crc;
745 error = hammer_btree_insert(cursor, &elm);
/* Failure path: release record space, and data space if separately allocated. */
749 hammer_free_record_ptr(cursor->record_buffer, nrec);
751 if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
752 hammer_free_data_ptr(cursor->data_buffer, bdata,
753 orec->base.data_len);
756 /* leave cursor intact */
761 * Add the record to the inode's rec_tree. The low 32 bits of a directory
762 * entry's key is used to deal with hash collisions in the upper 32 bits.
763 * A unique 64 bit key is generated in-memory and may be regenerated a
764 * second time when the directory record is flushed to the on-disk B-Tree.
766 * A locked and referenced record is passed to this function. This function
767 * eats the lock and reference.
/*
 * Insert a locked+referenced record into the inode's RB-tree, retrying
 * directory-entry key collisions by regenerating the low 32 bits of the
 * key from the per-mount iterator (never 0).  Non-directory collisions
 * drop the record as deleted.  Eats the caller's lock and reference.
 */
771 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
773 while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
774 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
775 hammer_drop_mem_record(record, 1);
778 if (++trans->hmp->namekey_iterator == 0)
779 ++trans->hmp->namekey_iterator;
780 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
781 record->rec.base.base.key |= trans->hmp->namekey_iterator;
783 record->flags |= HAMMER_RECF_ONRBTREE;
784 hammer_drop_mem_record(record, 0);
788 /************************************************************************
789 * HAMMER INODE MERGED-RECORD FUNCTIONS *
790 ************************************************************************
792 * These functions augment the B-Tree scanning functions in hammer_btree.c
793 * by merging in-memory records with on-disk records.
797 * Locate a particular record either in-memory or on-disk.
799 * NOTE: This is basically a standalone routine, hammer_ip_next() may
800 * NOT be called to iterate results.
/*
 * Standalone lookup: try the in-memory tree first; fall back to the
 * on-disk B-Tree only if the inode has on-disk components.  Not valid
 * as a starting point for hammer_ip_next() iteration.
 */
803 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
808 * If the element is in-memory return it without searching the
811 error = hammer_mem_lookup(cursor, ip);
813 cursor->record = &cursor->iprec->rec;
820 * If the inode has on-disk components search the on-disk B-Tree.
822 if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
824 error = hammer_btree_lookup(cursor);
826 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
831 * Locate the first record within the cursor's key_beg/key_end range,
832 * restricted to a particular inode. 0 is returned on success, ENOENT
833 * if no records matched the requested range, or some other error.
835 * When 0 is returned hammer_ip_next() may be used to iterate additional
836 * records within the requested range.
/*
 * Begin a merged (on-disk + in-memory) range scan for inode ip.  Seeds
 * the disk side (lookup, falling back to iterate on ENOENT) and the
 * memory side (hammer_mem_first), maintaining the ATEDISK/ATEMEM and
 * DISKEOF/MEMEOF flags, then delegates to hammer_ip_next() for the
 * first result.
 */
839 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
844 * Clean up fields and setup for merged scan
846 cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
847 cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
848 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
850 hammer_rel_mem_record(&cursor->iprec);
853 * Search the on-disk B-Tree. hammer_btree_lookup() only does an
854 * exact lookup so if we get ENOENT we have to call the iterate
855 * function to validate the first record after the begin key.
857 * The ATEDISK flag is used by hammer_btree_iterate to determine
858 * whether it must index forwards or not. It is also used here
859 * to select the next record from in-memory or on-disk.
861 if (ip->flags & HAMMER_INODE_ONDISK) {
862 error = hammer_btree_lookup(cursor);
863 if (error == ENOENT) {
864 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
865 error = hammer_btree_iterate(cursor);
867 if (error && error != ENOENT)
870 cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
871 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
873 cursor->flags |= HAMMER_CURSOR_ATEDISK;
878 * Search the in-memory record list (Red-Black tree). Unlike the
879 * B-Tree search, mem_first checks for records in the range.
881 error = hammer_mem_first(cursor, ip);
882 if (error && error != ENOENT)
885 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
886 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
890 * This will return the first matching record.
892 return(hammer_ip_next(cursor));
896 * Retrieve the next record in a merged iteration within the bounds of the
897 * cursor. This call may be made multiple times after the cursor has been
898 * initially searched with hammer_ip_first().
900 * 0 is returned on success, ENOENT if no further records match the
901 * requested range, or some other error code is returned.
/*
 * Advance the merged iteration: refresh whichever side (disk / memory)
 * was consumed, then select the smaller of the two candidates via
 * hammer_btree_cmp(), setting ATEDISK/ATEMEM for the side returned.
 */
904 hammer_ip_next(hammer_cursor_t cursor)
906 hammer_btree_elm_t elm;
912 * Load the current on-disk and in-memory record. If we ate any
913 * records we have to get the next one.
915 * If we deleted the last on-disk record we had scanned ATEDISK will
916 * be clear and DELBTREE will be set, forcing a call to iterate. The
917 * fact that ATEDISK is clear causes iterate to re-test the 'current'
918 * element. If ATEDISK is set, iterate will skip the 'current'
921 * Get the next on-disk record
923 if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
924 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
925 error = hammer_btree_iterate(cursor);
927 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
929 cursor->flags |= HAMMER_CURSOR_DISKEOF |
930 HAMMER_CURSOR_ATEDISK;
935 * Get the next in-memory record. The record can be ripped out
936 * of the RB tree so we maintain a scan_info structure to track
939 * hammer_rec_scan_cmp: Is the record still in our general range,
940 * (non-inclusive of snapshot exclusions)?
941 * hammer_rec_scan_callback: Is the record in our snapshot?
943 if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
944 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
945 hammer_rel_mem_record(&cursor->iprec);
946 rec = cursor->scan.node; /* next node */
/* Walk forward to the next record that is in-range and in-snapshot. */
948 if (hammer_rec_scan_cmp(rec, cursor) != 0)
950 if (hammer_rec_scan_callback(rec, cursor) != 0)
952 rec = hammer_rec_rb_tree_RB_NEXT(rec);
955 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
956 hammer_ref(&cursor->iprec->lock);
958 hammer_rec_rb_tree_RB_NEXT(rec);
960 cursor->flags |= HAMMER_CURSOR_MEMEOF;
966 * Extract either the disk or memory record depending on their
970 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
/* Both candidates valid: compare B-Tree element against iprec key. */
975 elm = &cursor->node->ondisk->elms[cursor->index];
976 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
978 error = hammer_btree_extract(cursor,
979 HAMMER_CURSOR_GET_RECORD);
980 cursor->flags |= HAMMER_CURSOR_ATEDISK;
983 /* fall through to the memory entry */
984 case HAMMER_CURSOR_ATEDISK:
986 * Only the memory entry is valid
988 cursor->record = &cursor->iprec->rec;
989 cursor->flags |= HAMMER_CURSOR_ATEMEM;
991 case HAMMER_CURSOR_ATEMEM:
993 * Only the disk entry is valid
995 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
996 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1000 * Neither entry is valid
1002 * XXX error not set properly
1004 cursor->record = NULL;
1012 * Resolve the cursor->data pointer for the current cursor position in
1013 * a merged iteration.
/*
 * Resolve cursor->data for the current merged-iteration position:
 * in-memory records supply iprec->data directly, otherwise the data is
 * extracted from the on-disk B-Tree element.
 */
1016 hammer_ip_resolve_data(hammer_cursor_t cursor)
1020 if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1021 cursor->data = cursor->iprec->data;
1024 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1030 * Delete all records within the specified range for inode ip.
1032 * NOTE: An unaligned range will cause new records to be added to cover
1033 * the edge cases. (XXX not implemented yet).
1035 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1037 * NOTE: Record keys for regular file data have to be special-cased since
1038 * they indicate the end of the range (key = base + bytes).
1040 * NOTE: The spike structure must be filled in if we return ENOSPC.
/*
 * Delete all records overlapping [ran_beg, ran_end] (inclusive) for
 * inode ip.  DB-file records are keyed directly; regular-file DATA
 * records are keyed by the END of their range, so the scan window is
 * shifted (ran_beg+1 .. ran_end+MAXPHYS+1) and edge overlap is checked
 * per record.  Unaligned edge splitting is not implemented (panics).
 */
1043 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1044 int64_t ran_beg, int64_t ran_end,
1045 struct hammer_cursor **spike)
1047 struct hammer_cursor cursor;
1048 hammer_record_ondisk_t rec;
1049 hammer_base_elm_t base;
1053 hammer_init_cursor_ip(&cursor, ip);
1055 cursor.key_beg.obj_id = ip->obj_id;
1056 cursor.key_beg.create_tid = ip->obj_asof;
1057 cursor.key_beg.delete_tid = 0;
1058 cursor.key_beg.obj_type = 0;
1060 cursor.key_end = cursor.key_beg;
1061 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1062 cursor.key_beg.key = ran_beg;
1063 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1064 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1065 cursor.key_end.key = ran_end;
1068 * The key in the B-Tree is (base+bytes), so the first possible
1069 * matching key is ran_beg + 1.
1073 cursor.key_beg.key = ran_beg + 1;
1074 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1075 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
/* Guard against signed overflow of ran_end + MAXPHYS + 1; clamp to max key. */
1077 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */
1078 if (tmp64 < ran_end)
1079 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1081 cursor.key_end.key = ran_end + MAXPHYS + 1;
1083 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1085 error = hammer_ip_first(&cursor, ip);
1088 * Iterate through matching records and mark them as deleted.
1090 while (error == 0) {
1091 rec = cursor.record;
1092 base = &rec->base.base;
1094 KKASSERT(base->delete_tid == 0);
1097 * There may be overlap cases for regular file data. Also
1098 * remember the key for a regular file record is the offset
1099 * of the last byte of the record (base + len - 1), NOT the
1103 kprintf("delete_range rec_type %02x\n", base->rec_type);
1105 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1107 kprintf("delete_range loop key %016llx\n",
1108 base->key - rec->base.data_len);
1110 off = base->key - rec->base.data_len;
1112 * Check the left edge case. We currently do not
1113 * split existing records.
1115 if (off < ran_beg) {
1116 panic("hammer left edge case %016llx %d\n",
1117 base->key, rec->base.data_len);
1121 * Check the right edge case. Note that the
1122 * record can be completely out of bounds, which
1123 * terminates the search.
1125 * base->key is exclusive of the right edge while
1126 * ran_end is inclusive of the right edge. The
1127 * (key - data_len) left boundary is inclusive.
1129 * XXX theory-check this test at some point, are
1130 * we missing a + 1 somewhere? Note that ran_end
1133 if (base->key > ran_end) {
1134 if (base->key - rec->base.data_len > ran_end) {
1135 kprintf("right edge OOB\n");
1138 panic("hammer right edge case\n");
1143 * Mark the record and B-Tree entry as deleted. This will
1144 * also physically delete the B-Tree entry, record, and
1145 * data if the retention policy dictates. The function
1146 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1147 * uses to perform a fixup.
1149 error = hammer_ip_delete_record(&cursor, trans->tid);
1152 error = hammer_ip_next(&cursor);
1154 hammer_done_cursor(&cursor);
1155 if (error == ENOENT)
/*
 * Delete ALL records belonging to inode ip (full key range, all record
 * types) as-of the inode's obj_asof, via a merged scan marking each
 * record deleted with hammer_ip_delete_record().  ENOENT at the end of
 * the scan is the normal termination condition.
 */
1161 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1163 struct hammer_cursor cursor;
1164 hammer_record_ondisk_t rec;
1165 hammer_base_elm_t base;
1168 hammer_init_cursor_ip(&cursor, ip);
1170 cursor.key_beg.obj_id = ip->obj_id;
1171 cursor.key_beg.create_tid = ip->obj_asof;
1172 cursor.key_beg.delete_tid = 0;
1173 cursor.key_beg.obj_type = 0;
1174 cursor.key_beg.rec_type = 0;
1175 cursor.key_beg.key = HAMMER_MIN_KEY;
1177 cursor.key_end = cursor.key_beg;
1178 cursor.key_end.rec_type = 0xFFFF;
1179 cursor.key_end.key = HAMMER_MAX_KEY;
1181 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1183 error = hammer_ip_first(&cursor, ip);
1186 * Iterate through matching records and mark them as deleted.
1188 while (error == 0) {
1189 rec = cursor.record;
1190 base = &rec->base.base;
1192 KKASSERT(base->delete_tid == 0);
1195 * Mark the record and B-Tree entry as deleted. This will
1196 * also physically delete the B-Tree entry, record, and
1197 * data if the retention policy dictates. The function
1198 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1199 * uses to perform a fixup.
1201 error = hammer_ip_delete_record(&cursor, trans->tid);
1204 error = hammer_ip_next(&cursor);
1206 hammer_done_cursor(&cursor);
1207 if (error == ENOENT)
1213 * Delete the record at the current cursor
1216 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1218 hammer_btree_elm_t elm;
1223 * In-memory (unsynchronized) records can simply be freed.
1225 cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1226 if (cursor->record == &cursor->iprec->rec) {
1227 hammer_free_mem_record(cursor->iprec); /* XXX */
1232 * On-disk records are marked as deleted by updating their delete_tid.
1234 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1236 hmp = cursor->node->cluster->volume->hmp;
1239 elm = &cursor->node->ondisk->elms[cursor->index];
1240 cursor->record->base.base.delete_tid = tid;
1241 elm->leaf.base.delete_tid = tid;
1242 hammer_modify_buffer(cursor->record_buffer);
1243 hammer_modify_node(cursor->node);
1247 * If we were mounted with the nohistory option, we physically
1248 * delete the record.
1250 if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1252 int32_t data_offset;
1254 hammer_cluster_t cluster;
1256 rec_offset = elm->leaf.rec_offset;
1257 data_offset = elm->leaf.data_offset;
1258 data_len = elm->leaf.data_len;
1260 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1261 rec_offset, data_offset, data_len);
1263 cluster = cursor->node->cluster;
1264 hammer_ref_cluster(cluster);
1266 error = hammer_btree_delete(cursor);
1269 * This forces a fixup for the iteration because
1270 * the cursor is now either sitting at the 'next'
1271 * element or sitting at the end of a leaf.
1273 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1274 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1275 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1277 hammer_free_record(cluster, rec_offset);
1278 if (data_offset && (data_offset - rec_offset < 0 ||
1279 data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1280 hammer_free_data(cluster, data_offset,data_len);
1283 hammer_rel_cluster(cluster, 0);
1285 kprintf("hammer_ip_delete_record: unable to physically delete the record!\n");