HAMMER 5/many - in-memory cache and more vnops.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.4 2007/11/20 22:55:40 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61                 return(-1);
62         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63                 return(1);
64         return(0);
65 }
66
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         /*
71          * A key1->rec_type of 0 matches any record type.
72          */
73         if (info->rec_type) {
74                 if (info->rec_type < rec->rec.base.base.rec_type)
75                         return(-3);
76                 if (info->rec_type > rec->rec.base.base.rec_type)
77                         return(3);
78         }
79
80         /*
81          * There is no special case for key.  0 means 0.
82          */
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87
88         /*
89          * This test has a number of special cases.  create_tid in key1 is
90          * the as-of transction id, and delete_tid in key1 is NOT USED.
91          *
92          * A key1->create_tid of 0 matches any record regardles of when
93          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
94          * used to search for the most current state of the object.
95          *
96          * key2->create_tid is a HAMMER record and will never be
97          * 0.   key2->delete_tid is the deletion transaction id or 0 if
98          * the record has not yet been deleted.
99          */
100         if (info->create_tid) {
101                 if (info->create_tid < rec->rec.base.base.create_tid)
102                         return(-1);
103                 if (rec->rec.base.base.delete_tid &&
104                     info->create_tid >= rec->rec.base.base.delete_tid) {
105                         return(1);
106                 }
107         }
108         return(0);
109 }
110
111 /*
112  * RB_SCAN comparison code for hammer_mem_search().  The argument order
113  * is reversed so the comparison result has to be negated.  key_beg and
114  * key_end are both inclusive boundaries.
115  */
116 static
117 int
118 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
119 {
120         hammer_cursor_t cursor = data;
121         int r;
122
123         r = hammer_rec_compare(&cursor->key_beg, rec);
124         if (r > 0)
125                 return(-1);
126         if (r == 0)
127                 return(0);
128         r = hammer_rec_compare(&cursor->key_end, rec);
129         if (r <= 0)
130                 return(1);
131         return(0);
132 }
133
134 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
135 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
136                     hammer_rec_compare, hammer_base_elm_t);
137
138 /*
139  * Allocate a record for the caller to finish filling in
140  */
141 hammer_record_t
142 hammer_alloc_mem_record(struct hammer_transaction *trans, hammer_inode_t ip)
143 {
144         hammer_record_t record;
145
146         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
147         record->ip = ip;
148         return (record);
149 }
150
151 /*
152  * Release a memory record.  If the record is marked for defered deletion,
153  * destroy the record when the last reference goes away.
154  */
155 void
156 hammer_rel_mem_record(struct hammer_record **recordp)
157 {
158         hammer_record_t rec;
159
160         if ((rec = *recordp) != NULL) {
161                 if (hammer_islastref(&rec->lock)) {
162                         hammer_unref(&rec->lock);
163                         if (rec->flags & HAMMER_RECF_DELETED)
164                                 hammer_free_mem_record(rec);
165                 } else {
166                         hammer_unref(&rec->lock);
167                 }
168                 *recordp = NULL;
169         }
170 }
171
172 /*
173  * Free a record.  Clean the structure up even though we are throwing it
174  * away as a sanity check.  The actual free operation is delayed while
175  * the record is referenced.  However, the record is removed from the RB
176  * tree immediately.
177  */
178 void
179 hammer_free_mem_record(hammer_record_t record)
180 {
181         if (record->flags & HAMMER_RECF_ONRBTREE) {
182                 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
183                 record->flags &= ~HAMMER_RECF_ONRBTREE;
184         }
185         if (record->lock.refs) {
186                 record->flags |= HAMMER_RECF_DELETED;
187                 return;
188         }
189         if (record->flags & HAMMER_RECF_ALLOCDATA) {
190                 kfree(record->data, M_HAMMER);
191                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
192         }
193         record->data = NULL;
194         kfree(record, M_HAMMER);
195 }
196
197 /*
198  * Lookup an in-memory record given the key specified in the cursor.  Works
199  * just like hammer_btree_lookup() but operates on an inode's in-memory
200  * record list.
201  *
202  * The lookup must fail if the record is marked for deferred deletion.
203  */
204 static
205 int
206 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
207 {
208         int error;
209
210         if (cursor->iprec)
211                 hammer_rel_mem_record(&cursor->iprec);
212         if (cursor->ip) {
213                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
214                                                   &cursor->ip->rec_tree);
215         }
216         cursor->ip = ip;
217         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
218         cursor->scan.node = NULL;
219         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
220                                 &ip->rec_tree, &cursor->key_beg);
221         if (cursor->iprec == NULL) {
222                 error = ENOENT;
223         } else {
224                 hammer_ref(&cursor->iprec->lock);
225                 error = 0;
226         }
227         return(error);
228 }
229
230 /*
231  * hammer_mem_search() - locate the first in-memory record matching the
232  * cursor.
233  *
234  * The RB_SCAN function we use is designed as a callback.  We terminate it
235  * (return -1) as soon as we get a match.
236  */
237 static
238 int
239 hammer_rec_scan_callback(hammer_record_t rec, void *data)
240 {
241         hammer_cursor_t cursor = data;
242
243         if (cursor->iprec == NULL) {
244                 cursor->iprec = rec;
245                 hammer_ref(&rec->lock);
246                 return(-1);
247         }
248         return(0);
249 }
250
251 static
252 int
253 hammer_mem_search(hammer_cursor_t cursor, hammer_inode_t ip)
254 {
255         if (cursor->iprec)
256                 hammer_rel_mem_record(&cursor->iprec);
257         if (cursor->ip) {
258                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
259                                                   &cursor->ip->rec_tree);
260         }
261         cursor->ip = ip;
262         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
263         cursor->scan.node = NULL;
264         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
265                                    hammer_rec_scan_callback, cursor);
266         if (cursor->iprec) {
267                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
268                 return(0);
269         }
270         return(ENOENT);
271 }
272
273 void
274 hammer_mem_done(hammer_cursor_t cursor)
275 {
276         if (cursor->ip) {
277                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
278                                                   &cursor->ip->rec_tree);
279                 cursor->ip = NULL;
280         }
281         if (cursor->iprec)
282                 hammer_rel_mem_record(&cursor->iprec);
283 }
284
285 /************************************************************************
286  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
287  ************************************************************************
288  *
289  * These functions manipulate in-memory records.  Such records typically
290  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
291  */
292
293 /*
294  * Add a directory entry (dip,ncp) which references inode (ip).
295  *
296  * Note that the low 32 bits of the namekey are set temporarily to create
297  * a unique in-memory record, and may be modified a second time when the
298  * record is synchronized to disk.  In particular, the low 32 bits cannot be
299  * all 0's when synching to disk, which is not handled here.
300  */
301 int
302 hammer_ip_add_directory(struct hammer_transaction *trans,
303                      struct hammer_inode *dip, struct namecache *ncp,
304                      struct hammer_inode *ip)
305 {
306         hammer_record_t record;
307         int error;
308         int bytes;
309
310         record = hammer_alloc_mem_record(trans, dip);
311
312         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
313         if (++trans->hmp->namekey_iterator == 0)
314                 ++trans->hmp->namekey_iterator;
315
316         record->rec.entry.base.base.obj_id = dip->obj_id;
317         record->rec.entry.base.base.key =
318                 hammer_directory_namekey(ncp->nc_name, bytes);
319         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
320         record->rec.entry.base.base.create_tid = trans->tid;
321         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
322         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
323         record->rec.entry.obj_id = ip->obj_id;
324         if (bytes <= sizeof(record->rec.entry.den_name)) {
325                 record->data = (void *)record->rec.entry.den_name;
326         } else {
327                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
328                 record->flags |= HAMMER_RECF_ALLOCDATA;
329         }
330         bcopy(ncp->nc_name, record->data, bytes);
331         record->rec.entry.base.data_len = bytes;
332         ++ip->ino_rec.ino_nlinks;
333         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
334         error = hammer_mem_add(trans, record);
335         return(error);
336 }
337
338 /*
339  * Delete the directory entry and update the inode link count.  The
340  * cursor must be seeked to the directory entry record being deleted.
341  *
342  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
343  */
344 int
345 hammer_ip_del_directory(struct hammer_transaction *trans,
346                      hammer_cursor_t cursor, struct hammer_inode *dip,
347                      struct hammer_inode *ip)
348 {
349         int error;
350
351         if (cursor->record == &cursor->iprec->rec) {
352                 /*
353                  * The directory entry was in-memory, just scrap the
354                  * record.
355                  */
356                 hammer_free_mem_record(cursor->iprec);
357                 error = 0;
358         } else {
359                 /*
360                  * The directory entry was on-disk, mark the record and
361                  * B-Tree entry as deleted.  The B-Tree entry does not
362                  * have to be reindexed because a 'current' delete transid
363                  * will wind up in the same position as the live record.
364                  */
365                 KKASSERT(ip->flags & HAMMER_INODE_ONDISK);
366                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
367                 if (error == 0) {
368                         cursor->node->ondisk->elms[cursor->index].base.delete_tid = trans->tid;
369                         cursor->record->base.base.delete_tid = trans->tid;
370                         hammer_modify_node(cursor->node);
371                         hammer_modify_buffer(cursor->record_buffer);
372
373                 }
374         }
375
376         /*
377          * One less link.  Mark the inode and all of its records as deleted
378          * when the last link goes away.  The inode will be automatically
379          * flushed when its last reference goes away.
380          */
381         if (error == 0) {
382                 --ip->ino_rec.ino_nlinks;
383                 if (ip->ino_rec.ino_nlinks == 0)
384                         ip->ino_rec.base.base.delete_tid = trans->tid;
385                 error = hammer_ip_delete_range(trans, ip,
386                                                HAMMER_MIN_KEY, HAMMER_MAX_KEY);
387                 KKASSERT(RB_EMPTY(&ip->rec_tree));
388                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
389         }
390         return(error);
391 }
392
393 /*
394  * Add a data record to the filesystem.
395  *
396  * This is called via the strategy code, typically when the kernel wants to
397  * flush a buffer cache buffer, so this operation goes directly to the disk.
398  */
399 int
400 hammer_ip_add_data(hammer_transaction_t trans, hammer_inode_t ip,
401                        int64_t offset, void *data, int bytes)
402 {
403         panic("hammer_ip_add_data");
404 }
405
406 /*
407  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
408  * entry's key is used to deal with hash collisions in the upper 32 bits.
409  * A unique 64 bit key is generated in-memory and may be regenerated a
410  * second time when the directory record is flushed to the on-disk B-Tree.
411  */
412 static
413 int
414 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
415 {
416         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
417                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
418                         hammer_free_mem_record(record);
419                         return (EEXIST);
420                 }
421                 if (++trans->hmp->namekey_iterator == 0)
422                         ++trans->hmp->namekey_iterator;
423                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
424                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
425         }
426         record->flags |= HAMMER_RECF_ONRBTREE;
427         return(0);
428 }
429
430 /************************************************************************
431  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
432  ************************************************************************
433  *
434  * These functions augment the B-Tree scanning functions in hammer_btree.c
435  * by merging in-memory records with on-disk records.
436  */
437
438 /*
439  * Locate a particular record either in-memory or on-disk.
440  *
441  * NOTE: This is basically a standalone routine, hammer_ip_next() may
442  * NOT be called to iterate results.
443  */
444 int
445 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
446 {
447         int error;
448
449         /*
450          * If the element is in-memory return it without searching the
451          * on-disk B-Tree
452          */
453         error = hammer_mem_lookup(cursor, ip);
454         if (error == 0) {
455                 cursor->record = &cursor->iprec->rec;
456                 return(error);
457         }
458         if (error != ENOENT)
459                 return(error);
460
461         /*
462          * If the inode has on-disk components search the on-disk B-Tree.
463          */
464         if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
465                 return(error);
466         error = hammer_btree_lookup(cursor);
467         if (error == 0)
468                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
469         return(error);
470 }
471
472 /*
473  * Locate the first record within the cursor's key_beg/key_end range,
474  * restricted to a particular inode.  0 is returned on success, ENOENT
475  * if no records matched the requested range, or some other error.
476  *
477  * When 0 is returned hammer_ip_next() may be used to iterate additional
478  * records within the requested range.
479  */
480 int
481 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
482 {
483         int error;
484
485         /*
486          * Clean up fields and setup for merged scan
487          */
488         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
489         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
490         if (cursor->iprec)
491                 hammer_rel_mem_record(&cursor->iprec);
492
493         /*
494          * Search the on-disk B-Tree
495          */
496         if (ip->flags & HAMMER_INODE_ONDISK) {
497                 error = hammer_btree_lookup(cursor);
498                 if (error && error != ENOENT)
499                         return(error);
500                 if (error == 0) {
501                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF ;
502                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK ;
503                 }
504         }
505
506         /*
507          * Search the in-memory record list (Red-Black tree)
508          */
509         error = hammer_mem_search(cursor, ip);
510         if (error && error != ENOENT)
511                 return(error);
512         if (error == 0) {
513                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
514                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
515         }
516
517         /*
518          * This will return the first matching record.
519          */
520         return(hammer_ip_next(cursor));
521 }
522
523 /*
524  * Retrieve the next record in a merged iteration within the bounds of the
525  * cursor.  This call may be made multiple times after the cursor has been
526  * initially searched with hammer_ip_first().
527  *
528  * 0 is returned on success, ENOENT if no further records match the
529  * requested range, or some other error code is returned.
530  */
531 int
532 hammer_ip_next(hammer_cursor_t cursor)
533 {
534         hammer_btree_elm_t elm;
535         hammer_record_t rec;
536         int error;
537         int r;
538
539         /*
540          * Load the current on-disk and in-memory record.  If we ate any
541          * records we have to get the next one. 
542          *
543          * Get the next on-disk record
544          */
545         if (cursor->flags & HAMMER_CURSOR_ATEDISK) {
546                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
547                         error = hammer_btree_iterate(cursor);
548                         if (error == 0)
549                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
550                         else
551                                 cursor->flags |= HAMMER_CURSOR_DISKEOF;
552                 }
553         }
554
555         /*
556          * Get the next in-memory record.  The record can be ripped out
557          * of the RB tree so we maintain a scan_info structure to track
558          * the next node.
559          */
560         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
561                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
562                         rec = cursor->scan.node;        /* next node */
563                         if (rec) {
564                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
565                                 hammer_ref(&rec->lock);
566                                 cursor->scan.node =
567                                         hammer_rec_rb_tree_RB_NEXT(rec);
568                         } else {
569                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
570                         }
571                         hammer_rel_mem_record(&cursor->iprec);
572                         cursor->iprec = rec;
573                 }
574         }
575
576         /*
577          * Extract either the disk or memory record depending on their
578          * relative position.
579          */
580         error = 0;
581         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
582         case 0:
583                 /*
584                  * Both entries valid
585                  */
586                 elm = &cursor->node->ondisk->elms[cursor->index];
587                 r = hammer_btree_cmp(&elm->base,
588                                      &cursor->iprec->rec.base.base);
589                 if (r < 0) {
590                         error = hammer_btree_extract(cursor,
591                                                      HAMMER_CURSOR_GET_RECORD);
592                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
593                         break;
594                 }
595                 /* fall through to the memory entry */
596         case HAMMER_CURSOR_ATEDISK:
597                 /*
598                  * Only the memory entry is valid
599                  */
600                 cursor->record = &cursor->iprec->rec;
601                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
602                 break;
603         case HAMMER_CURSOR_ATEMEM:
604                 /*
605                  * Only the disk entry is valid
606                  */
607                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
608                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
609                 break;
610         default:
611                 /*
612                  * Neither entry is valid
613                  *
614                  * XXX error not set properly
615                  */
616                 cursor->record = NULL;
617                 error = ENOENT;
618                 break;
619         }
620         return(error);
621 }
622
623 /*
624  * Resolve the cursor->data pointer for the current cursor position in
625  * a merged iteration.
626  */
627 int
628 hammer_ip_resolve_data(hammer_cursor_t cursor)
629 {
630         int error;
631
632         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
633                 cursor->data = cursor->iprec->data;
634                 error = 0;
635         } else {
636                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
637         }
638         return(error);
639 }
640
641 /*
642  * Delete all records within the specified range for inode ip.
643  *
644  * NOTE: An unaligned range will cause new records to be added to cover
645  * the edge cases.
646  *
647  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
648  */
649 int
650 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
651                        int64_t ran_beg, int64_t ran_end)
652 {
653         struct hammer_cursor cursor;
654         hammer_record_ondisk_t rec;
655         hammer_base_elm_t base;
656         int error;
657         int64_t off;
658
659         hammer_init_cursor_ip(&cursor, ip);
660
661         cursor.key_beg.obj_id = ip->obj_id;
662         cursor.key_beg.create_tid = ip->obj_asof;
663         cursor.key_beg.delete_tid = 0;
664         cursor.key_beg.obj_type = 0;
665         cursor.key_beg.key = ran_beg;
666         cursor.key_end = cursor.key_beg;
667         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
668                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
669                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
670                 cursor.key_end.key = ran_end;
671         } else {
672                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
673                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
674                 if (ran_end + MAXPHYS < ran_end)
675                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
676                 else
677                         cursor.key_end.key = ran_end + MAXPHYS;
678         }
679
680         error = hammer_ip_first(&cursor, ip);
681
682         /*
683          * Iterate through matching records and mark them as deleted.
684          */
685         while (error == 0) {
686                 rec = cursor.record;
687                 base = &rec->base.base;
688
689                 KKASSERT(base->delete_tid == 0);
690
691                 /*
692                  * There may be overlap cases for regular file data.  Also
693                  * remember the key for a regular file record is the offset
694                  * of the last byte of the record (base + len - 1), NOT the
695                  * base offset.
696                  */
697                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
698                         off = base->key - rec->base.data_len + 1;
699                         /*
700                          * Check the left edge case
701                          */
702                         if (off < ran_beg) {
703                                 panic("hammer left edge case\n");
704                         }
705
706                         /*
707                          * Check the right edge case.  Note that the
708                          * record can be completely out of bounds, which
709                          * terminates the search.
710                          *
711                          * base->key is (base_offset + bytes - 1), ran_end
712                          * works the same way.
713                          */
714                         if (base->key > ran_end) {
715                                 if (base->key - rec->base.data_len + 1 > ran_end) {
716                                         kprintf("right edge OOB\n");
717                                         break;
718                                 }
719                                 panic("hammer right edge case\n");
720                         }
721                 }
722
723                 /*
724                  * Mark the record and B-Tree entry as deleted
725                  */
726                 if (cursor.record == &cursor.iprec->rec) {
727                         hammer_free_mem_record(cursor.iprec);
728
729                 } else {
730                         cursor.node->ondisk->elms[cursor.index].base.delete_tid = trans->tid;
731                         cursor.record->base.base.delete_tid = trans->tid;
732                         hammer_modify_node(cursor.node);
733                         hammer_modify_buffer(cursor.record_buffer);
734                 }
735                 error = hammer_ip_next(&cursor);
736         }
737         hammer_done_cursor(&cursor);
738         if (error == ENOENT)
739                 error = 0;
740         return(error);
741 }
742