HAMMER 9/many - btree removal cases, mount nohistory
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.8 2007/11/30 00:16:56 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61                 return(-1);
62         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63                 return(1);
64         return(0);
65 }
66
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         /*
71          * A key1->rec_type of 0 matches any record type.
72          */
73         if (info->rec_type) {
74                 if (info->rec_type < rec->rec.base.base.rec_type)
75                         return(-3);
76                 if (info->rec_type > rec->rec.base.base.rec_type)
77                         return(3);
78         }
79
80         /*
81          * There is no special case for key.  0 means 0.
82          */
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87
88         /*
89          * This test has a number of special cases.  create_tid in key1 is
90          * the as-of transaction id, and delete_tid in key1 is NOT USED.
91          *
92          * A key1->create_tid of 0 matches any record regardless of when
93          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
94          * used to search for the most current state of the object.
95          *
96          * key2 comes from an actual HAMMER record, so its create_tid will
97          * never be 0.  key2->delete_tid is the deletion transaction id or 0 if
98          * the record has not yet been deleted.
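            *
            * For example, a record with create_tid 100 and delete_tid 200 is
            * visible to an as-of create_tid of 100 through 199 inclusive.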
99          */
100         if (info->create_tid) {
101                 if (info->create_tid < rec->rec.base.base.create_tid)
102                         return(-1);
103                 if (rec->rec.base.base.delete_tid &&
104                     info->create_tid >= rec->rec.base.base.delete_tid) {
105                         return(1);
106                 }
107         }
108         return(0);
109 }
110
111 /*
112  * RB_SCAN comparison code for hammer_mem_first().  The argument order
113  * is reversed so the comparison result has to be negated.  key_beg and
114  * key_end are both range-inclusive.
115  *
116  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
117  * These do not stop the scan.
118  *
119  * Localized deletions are not cached in-memory.
120  */
121 static
122 int
123 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
124 {
125         hammer_cursor_t cursor = data;
126         int r;
127
128         r = hammer_rec_compare(&cursor->key_beg, rec);
129         if (r > 1)
130                 return(-1);     /* record sorts before the start of range */
131         if (r == 0)
132                 return(0);      /* record matches key_beg, in range */
133         r = hammer_rec_compare(&cursor->key_end, rec);
134         if (r < -1)
135                 return(1);      /* record sorts past the end of the range */
136         return(0);              /* record is within the range */
137 }
138
139 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
140 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
141                     hammer_rec_compare, hammer_base_elm_t);
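
/*
 * Together these generate the RB-tree support used below:
 * hammer_mem_lookup() resolves an exact key via
 * hammer_rec_rb_tree_RB_LOOKUP_INFO() (keyed on a hammer_base_elm_t),
 * while hammer_mem_first() drives hammer_rec_rb_tree_RB_SCAN() with
 * hammer_rec_scan_cmp() and hammer_rec_scan_callback().
 */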
142
143 /*
144  * Allocate a record for the caller to finish filling in
145  */
146 hammer_record_t
147 hammer_alloc_mem_record(hammer_inode_t ip)
148 {
149         hammer_record_t record;
150
151         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
152         record->ip = ip;
153         return (record);
154 }
155
156 /*
157  * Release a memory record.  If the record is marked for deferred deletion,
158  * destroy the record when the last reference goes away.
159  */
160 void
161 hammer_rel_mem_record(struct hammer_record **recordp)
162 {
163         hammer_record_t rec;
164
165         if ((rec = *recordp) != NULL) {
166                 if (hammer_islastref(&rec->lock)) {
167                         hammer_unref(&rec->lock);
168                         if (rec->flags & HAMMER_RECF_DELETED)
169                                 hammer_free_mem_record(rec);
170                 } else {
171                         hammer_unref(&rec->lock);
172                 }
173                 *recordp = NULL;
174         }
175 }
176
177 /*
178  * Free a record.  As a sanity check, clean the structure up even though
179  * we are throwing it away.  The actual free operation is delayed while
180  * the record is referenced.  However, the record is removed from the RB
181  * tree immediately.
182  */
183 void
184 hammer_free_mem_record(hammer_record_t record)
185 {
186         if (record->flags & HAMMER_RECF_ONRBTREE) {
187                 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
188                 record->flags &= ~HAMMER_RECF_ONRBTREE;
189         }
190         if (record->lock.refs) {
191                 record->flags |= HAMMER_RECF_DELETED;
192                 return;
193         }
194         if (record->flags & HAMMER_RECF_ALLOCDATA) {
195                 kfree(record->data, M_HAMMER);
196                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
197         }
198         record->data = NULL;
199         kfree(record, M_HAMMER);
200 }
201
202 /*
203  * Lookup an in-memory record given the key specified in the cursor.  Works
204  * just like hammer_btree_lookup() but operates on an inode's in-memory
205  * record list.
206  *
207  * The lookup must fail if the record is marked for deferred deletion.
208  */
209 static
210 int
211 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
212 {
213         int error;
214
215         if (cursor->iprec)
216                 hammer_rel_mem_record(&cursor->iprec);
217         if (cursor->ip) {
218                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
219                                                   &cursor->ip->rec_tree);
220         }
221         cursor->ip = ip;
222         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
223         cursor->scan.node = NULL;
224         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
225                                 &ip->rec_tree, &cursor->key_beg);
226         if (cursor->iprec == NULL) {
227                 error = ENOENT;
228         } else {
229                 hammer_ref(&cursor->iprec->lock);
230                 error = 0;
231         }
232         return(error);
233 }
234
235 /*
236  * hammer_mem_first() - locate the first in-memory record matching the
237  * cursor.
238  *
239  * The RB_SCAN function we use is designed as a callback.  We terminate it
240  * (return -1) as soon as we get a match.
241  */
242 static
243 int
244 hammer_rec_scan_callback(hammer_record_t rec, void *data)
245 {
246         hammer_cursor_t cursor = data;
247
248         /*
249          * Skip if not visible due to our as-of TID
250          */
251         if (cursor->key_beg.create_tid) {
252                 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
253                         return(0);
254                 if (rec->rec.base.base.delete_tid &&
255                     cursor->key_beg.create_tid >=
256                      rec->rec.base.base.delete_tid) {
257                         return(0);
258                 }
259         }
260
261         /*
262          * Return the first matching record and stop the scan
263          */
264         if (cursor->iprec == NULL) {
265                 cursor->iprec = rec;
266                 hammer_ref(&rec->lock);
267                 return(-1);
268         }
269         return(0);
270 }
271
272 static
273 int
274 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
275 {
276         if (cursor->iprec)
277                 hammer_rel_mem_record(&cursor->iprec);
278         if (cursor->ip) {
279                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
280                                                   &cursor->ip->rec_tree);
281         }
282         cursor->ip = ip;
283         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
284         cursor->scan.node = NULL;
285         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
286                                    hammer_rec_scan_callback, cursor);
287
288         /*
289          * Adjust scan.node and keep it linked into the RB-tree so we can
290          * hold the cursor through third party modifications of the RB-tree.
291          */
292         if (cursor->iprec) {
293                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
294                 return(0);
295         }
296         return(ENOENT);
297 }
298
299 void
300 hammer_mem_done(hammer_cursor_t cursor)
301 {
302         if (cursor->ip) {
303                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
304                                                   &cursor->ip->rec_tree);
305                 cursor->ip = NULL;
306         }
307         if (cursor->iprec)
308                 hammer_rel_mem_record(&cursor->iprec);
309 }
310
311 /************************************************************************
312  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
313  ************************************************************************
314  *
315  * These functions manipulate in-memory records.  Such records typically
316  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
317  */
318
319 /*
320  * Add a directory entry (dip,ncp) which references inode (ip).
321  *
322  * Note that the low 32 bits of the namekey are set temporarily to create
323  * a unique in-memory record, and may be modified a second time when the
324  * record is synchronized to disk.  In particular, the low 32 bits cannot be
325  * all 0's when synching to disk, which is not handled here.
326  */
327 int
328 hammer_ip_add_directory(struct hammer_transaction *trans,
329                      struct hammer_inode *dip, struct namecache *ncp,
330                      struct hammer_inode *ip)
331 {
332         hammer_record_t record;
333         int error;
334         int bytes;
335
336         record = hammer_alloc_mem_record(dip);
337
338         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
339         if (++trans->hmp->namekey_iterator == 0)
340                 ++trans->hmp->namekey_iterator;
341
342         record->rec.entry.base.base.obj_id = dip->obj_id;
343         record->rec.entry.base.base.key =
344                 hammer_directory_namekey(ncp->nc_name, bytes);
345         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
346         record->rec.entry.base.base.create_tid = trans->tid;
347         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
348         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
349         record->rec.entry.obj_id = ip->obj_id;
350         if (bytes <= sizeof(record->rec.entry.den_name)) {
351                 record->data = (void *)record->rec.entry.den_name;
352                 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
353         } else {
354                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
355                 record->flags |= HAMMER_RECF_ALLOCDATA;
356         }
357         bcopy(ncp->nc_name, record->data, bytes);
358         record->rec.entry.base.data_len = bytes;
359         ++ip->ino_rec.ino_nlinks;
360         hammer_modify_inode(trans, ip,
361                             HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
362         error = hammer_mem_add(trans, record);
363         return(error);
364 }
365
366 /*
367  * Delete the directory entry and update the inode link count.  The
368  * cursor must be positioned on the directory entry record being deleted.
369  *
370  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
371  */
372 int
373 hammer_ip_del_directory(struct hammer_transaction *trans,
374                      hammer_cursor_t cursor, struct hammer_inode *dip,
375                      struct hammer_inode *ip)
376 {
377         int error;
378
379         error = hammer_ip_delete_record(cursor, trans->tid);
380
381         /*
382          * One less link.  The file may still be open in the OS even after
383          * all links have gone away so we don't destroy the inode's data
384          * here.
385          */
386         if (error == 0) {
387                 --ip->ino_rec.ino_nlinks;
388                 hammer_modify_inode(trans, ip,
389                                     HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
390                 if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
391                         hammer_sync_inode(ip, MNT_NOWAIT, 1);
392
393         }
394         return(error);
395 }
396
397 /*
398  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
399  * is called via the strategy code from a cached data source.  This code
400  * is responsible for actually writing a data record out to the disk.
401  */
402 int
403 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
404                        int64_t offset, void *data, int bytes)
405 {
406         struct hammer_cursor cursor;
407         hammer_record_ondisk_t rec;
408         union hammer_btree_elm elm;
409         void *bdata;
410         int error;
411
412         error = hammer_init_cursor_ip(&cursor, ip);
413         if (error)
414                 return(error);
415         cursor.key_beg.obj_id = ip->obj_id;
416         cursor.key_beg.key = offset + bytes;
417         cursor.key_beg.create_tid = trans->tid;
418         cursor.key_beg.delete_tid = 0;
419         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
420         cursor.flags = HAMMER_CURSOR_INSERT;
421
422         /*
423          * Issue a lookup to position the cursor and locate the cluster
424          */
425         error = hammer_btree_lookup(&cursor);
426         if (error == 0) {
427                 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
428                         offset, bytes);
429                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
430                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
431                 error = EIO;
432         }
433         if (error != ENOENT)
434                 goto done;
435
436         /*
437          * Allocate record and data space now that we know which cluster
438          * the B-Tree node ended up in.
439          */
440         bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
441                                   &cursor.data_buffer);
442         if (bdata == NULL)
443                 goto done;
444         rec = hammer_alloc_record(cursor.node->cluster, &error,
445                                   &cursor.record_buffer);
446         if (rec == NULL)
447                 goto fail1;
448
449         /*
450          * Fill everything in and insert our B-Tree node.
451          */
452         rec->base.base = cursor.key_beg;
453         rec->base.data_crc = crc32(data, bytes);
454         rec->base.rec_id = 0;   /* XXX */
455         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
456         rec->base.data_len = bytes;
457         hammer_modify_buffer(cursor.record_buffer);
458
459         bcopy(data, bdata, bytes);
460         hammer_modify_buffer(cursor.data_buffer);
461
462         elm.leaf.base = cursor.key_beg;
463         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
464         elm.leaf.data_offset = rec->base.data_offset;
465         elm.leaf.data_len = bytes;
466         elm.leaf.data_crc = rec->base.data_crc;
467
468         error = hammer_btree_insert(&cursor, &elm);
469         if (error == 0)
470                 goto done;
471
472         hammer_free_record_ptr(cursor.record_buffer, rec);
473 fail1:
474         hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
475 done:
476         hammer_done_cursor(&cursor);
477         return(error);
478 }
479
480 /*
481  * Sync an in-memory record to the disk.  This is typically called via fsync
482  * from a cached record source.  This code is responsible for actually
483  * writing a record out to the disk.
484  */
485 int
486 hammer_ip_sync_record(hammer_record_t record)
487 {
488         struct hammer_cursor cursor;
489         hammer_record_ondisk_t rec;
490         union hammer_btree_elm elm;
491         void *bdata;
492         int error;
493
494         error = hammer_init_cursor_ip(&cursor, record->ip);
495         if (error)
496                 return(error);
497         cursor.key_beg = record->rec.base.base;
498         cursor.flags = HAMMER_CURSOR_INSERT;
499
500         /*
501          * Issue a lookup to position the cursor and locate the cluster
502          */
503         error = hammer_btree_lookup(&cursor);
504         if (error == 0) {
505                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
506                         record->rec.base.base.key);
507                 error = EIO;
508         }
509         if (error != ENOENT)
510                 goto done;
511
512         /*
513          * Allocate record and data space now that we know which cluster
514          * the B-Tree node ended up in.
515          */
516         if (record->data == NULL ||
517             (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
518                 bdata = record->data;
519         } else {
520                 bdata = hammer_alloc_data(cursor.node->cluster,
521                                           record->rec.base.data_len, &error,
522                                           &cursor.data_buffer);
523                 if (bdata == NULL)
524                         goto done;
525         }
526         rec = hammer_alloc_record(cursor.node->cluster, &error,
527                                   &cursor.record_buffer);
528         if (rec == NULL)
529                 goto fail1;
530
531         /*
532          * Fill everything in and insert our B-Tree node.
533          *
534          * XXX assign rec_id here
535          */
536         *rec = record->rec;
537         if (bdata) {
538                 rec->base.data_crc = crc32(record->data,
539                                            record->rec.base.data_len);
540                 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
541                         /*
542                          * Data embedded in record
543                          */
544                         rec->base.data_offset = ((char *)bdata -
545                                                  (char *)&record->rec);
546                         KKASSERT(rec->base.data_offset >= 0 && 
547                                  rec->base.data_offset + rec->base.data_len <
548                                   sizeof(*rec));
549                         rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
550                 } else {
551                         /*
552                          * Data separate from record
553                          */
554                         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
555                         bcopy(record->data, bdata, rec->base.data_len);
556                         hammer_modify_buffer(cursor.data_buffer);
557                 }
558         }
559         rec->base.rec_id = 0;   /* XXX */
560
561         hammer_modify_buffer(cursor.record_buffer);
562
563         elm.leaf.base = cursor.key_beg;
564         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
565         elm.leaf.data_offset = rec->base.data_offset;
566         elm.leaf.data_len = rec->base.data_len;
567         elm.leaf.data_crc = rec->base.data_crc;
568
569         error = hammer_btree_insert(&cursor, &elm);
570         if (error == 0)
571                 goto done;
572
573         hammer_free_record_ptr(cursor.record_buffer, rec);
574 fail1:
575         if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
576                 hammer_free_data_ptr(cursor.data_buffer, bdata,
577                                      rec->base.data_len);
578         }
579 done:
580         hammer_done_cursor(&cursor);
581         return(error);
582 }
583
584
585 /*
586  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
587  * entry's key are used to deal with hash collisions in the upper 32 bits.
588  * A unique 64 bit key is generated in-memory and may be regenerated a
589  * second time when the directory record is flushed to the on-disk B-Tree.
590  */
591 static
592 int
593 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
594 {
595         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
596                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
597                         hammer_free_mem_record(record);
598                         return (EEXIST);
599                 }
600                 if (++trans->hmp->namekey_iterator == 0)
601                         ++trans->hmp->namekey_iterator;
602                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
603                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
604         }
605         record->flags |= HAMMER_RECF_ONRBTREE;
606         return(0);
607 }
608
609 /************************************************************************
610  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
611  ************************************************************************
612  *
613  * These functions augment the B-Tree scanning functions in hammer_btree.c
614  * by merging in-memory records with on-disk records.
615  */
616
617 /*
618  * Locate a particular record either in-memory or on-disk.
619  *
620  * NOTE: This is basically a standalone routine, hammer_ip_next() may
621  * NOT be called to iterate results.
622  */
623 int
624 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
625 {
626         int error;
627
628         /*
629          * If the element is in-memory return it without searching the
630          * on-disk B-Tree
631          */
632         error = hammer_mem_lookup(cursor, ip);
633         if (error == 0) {
634                 cursor->record = &cursor->iprec->rec;
635                 return(error);
636         }
637         if (error != ENOENT)
638                 return(error);
639
640         /*
641          * If the inode has on-disk components search the on-disk B-Tree.
642          */
643         if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
644                 return(error);
645         error = hammer_btree_lookup(cursor);
646         if (error == 0)
647                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
648         return(error);
649 }
650
651 /*
652  * Locate the first record within the cursor's key_beg/key_end range,
653  * restricted to a particular inode.  0 is returned on success, ENOENT
654  * if no records matched the requested range, or some other error.
655  *
656  * When 0 is returned hammer_ip_next() may be used to iterate additional
657  * records within the requested range.
658  */
659 int
660 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
661 {
662         int error;
663
664         /*
665          * Clean up fields and set up for the merged scan
666          */
667         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
668         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
669         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
670         if (cursor->iprec)
671                 hammer_rel_mem_record(&cursor->iprec);
672
673         /*
674          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
675          * exact lookup so if we get ENOENT we have to call the iterate
676          * function to validate the first record after the begin key.
677          *
678          * The ATEDISK flag is used by hammer_btree_iterate to determine
679          * whether it must index forwards or not.
680          */
681         if (ip->flags & HAMMER_INODE_ONDISK) {
682                 error = hammer_btree_lookup(cursor);
683                 if (error == ENOENT) {
684                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
685                         error = hammer_btree_iterate(cursor);
686                 }
687                 if (error && error != ENOENT) 
688                         return(error);
689                 if (error == 0) {
690                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
691                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
692                 } else {
693                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
694                 }
695         }
696
697         /*
698          * Search the in-memory record list (Red-Black tree).  Unlike the
699          * B-Tree search, mem_first checks for records in the range.
700          */
701         error = hammer_mem_first(cursor, ip);
702         if (error && error != ENOENT)
703                 return(error);
704         if (error == 0) {
705                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
706                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
707         }
708
709         /*
710          * This will return the first matching record.
711          */
712         return(hammer_ip_next(cursor));
713 }
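
/*
 * The merged iteration is typically driven as in the sketch below.  This
 * mirrors the pattern used by hammer_ip_delete_range() later in this file
 * (key setup and error handling trimmed):
 *
 *      struct hammer_cursor cursor;
 *      int error;
 *
 *      hammer_init_cursor_ip(&cursor, ip);
 *      ... fill in cursor.key_beg and cursor.key_end ...
 *      error = hammer_ip_first(&cursor, ip);
 *      while (error == 0) {
 *              ... inspect cursor.record, calling hammer_ip_resolve_data()
 *              ... if the record's data is also needed ...
 *              error = hammer_ip_next(&cursor);
 *      }
 *      hammer_done_cursor(&cursor);
 */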
714
715 /*
716  * Retrieve the next record in a merged iteration within the bounds of the
717  * cursor.  This call may be made multiple times after the cursor has been
718  * initially searched with hammer_ip_first().
719  *
720  * 0 is returned on success, ENOENT if no further records match the
721  * requested range, or some other error code is returned.
722  */
723 int
724 hammer_ip_next(hammer_cursor_t cursor)
725 {
726         hammer_btree_elm_t elm;
727         hammer_record_t rec;
728         int error;
729         int r;
730
731         /*
732          * Load the current on-disk and in-memory record.  If we ate any
733          * records we have to get the next one. 
734          *
735          * If we deleted the last on-disk record we had scanned ATEDISK will
736          * be clear and DELBTREE will be set, forcing a call to iterate. The
737          * fact that ATEDISK is clear causes iterate to re-test the 'current'
738          * element.  If ATEDISK is set, iterate will skip the 'current'
739          * element.
740          *
741          * Get the next on-disk record
742          */
743         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
744                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
745                         error = hammer_btree_iterate(cursor);
746                         if (error == 0)
747                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
748                         else
749                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
750                                                  HAMMER_CURSOR_ATEDISK;
751                 }
752         }
753
754         /*
755          * Get the next in-memory record.  The record can be ripped out
756          * of the RB tree so we maintain a scan_info structure to track
757          * the next node.
758          *
759          * hammer_rec_scan_cmp:  Is the record still in our general range,
760          *                       (non-inclusive of snapshot exclusions)?
761          * hammer_rec_scan_callback: Is the record in our snapshot?
762          */
763         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
764                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
765                         hammer_rel_mem_record(&cursor->iprec);
766                         rec = cursor->scan.node;        /* next node */
767                         while (rec) {
768                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
769                                         break;
770                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
771                                         break;
772                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
773                         }
774                         if (cursor->iprec) {
775                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
776                                 hammer_ref(&cursor->iprec->lock);
777                                 cursor->scan.node =
778                                         hammer_rec_rb_tree_RB_NEXT(rec);
779                         } else {
780                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
781                         }
782                 }
783         }
784
785         /*
786          * Extract either the disk or memory record depending on their
787          * relative position.
788          */
789         error = 0;
790         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
791         case 0:
792                 /*
793                  * Both entries valid
794                  */
795                 elm = &cursor->node->ondisk->elms[cursor->index];
796                 r = hammer_btree_cmp(&elm->base,
797                                      &cursor->iprec->rec.base.base);
798                 if (r < 0) {
799                         error = hammer_btree_extract(cursor,
800                                                      HAMMER_CURSOR_GET_RECORD);
801                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
802                         break;
803                 }
804                 /* fall through to the memory entry */
805         case HAMMER_CURSOR_ATEDISK:
806                 /*
807                  * Only the memory entry is valid
808                  */
809                 cursor->record = &cursor->iprec->rec;
810                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
811                 break;
812         case HAMMER_CURSOR_ATEMEM:
813                 /*
814                  * Only the disk entry is valid
815                  */
816                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
817                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
818                 break;
819         default:
820                 /*
821                  * Neither entry is valid
822                  *
823                  * XXX error not set properly
824                  */
825                 cursor->record = NULL;
826                 error = ENOENT;
827                 break;
828         }
829         return(error);
830 }
831
832 /*
833  * Resolve the cursor->data pointer for the current cursor position in
834  * a merged iteration.
835  */
836 int
837 hammer_ip_resolve_data(hammer_cursor_t cursor)
838 {
839         int error;
840
841         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
842                 cursor->data = cursor->iprec->data;
843                 error = 0;
844         } else {
845                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
846         }
847         return(error);
848 }
849
850 /*
851  * Delete all records within the specified range for inode ip.
852  *
853  * NOTE: An unaligned range will cause new records to be added to cover
854  * the edge cases. (XXX not implemented yet).
855  *
856  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
857  *
858  * NOTE: Record keys for regular file data have to be special-cased since
859  * they indicate the end of the range (key = base + bytes).
860  */
861 int
862 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
863                        int64_t ran_beg, int64_t ran_end)
864 {
865         struct hammer_cursor cursor;
866         hammer_record_ondisk_t rec;
867         hammer_base_elm_t base;
868         int error;
869         int isregfile;
870         int64_t off;
871
872         hammer_init_cursor_ip(&cursor, ip);
873
874         cursor.key_beg.obj_id = ip->obj_id;
875         cursor.key_beg.create_tid = ip->obj_asof;
876         cursor.key_beg.delete_tid = 0;
877         cursor.key_beg.obj_type = 0;
878
879         cursor.key_end = cursor.key_beg;
880         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
881                 cursor.key_beg.key = ran_beg;
882                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
883                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
884                 cursor.key_end.key = ran_end;
885                 isregfile = 0;
886         } else {
887                 /*
888                  * The key in the B-Tree is (base+bytes), so the first possible
889                  * matching key is ran_beg + 1.
890                  */
891                 int64_t tmp64;
892
893                 cursor.key_beg.key = ran_beg + 1;
894                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
895                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
896
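                /*
                 * key_end is pushed out past ran_end because a regular file
                 * data record is keyed at (base + len); a record overlapping
                 * the right edge can therefore carry a key beyond ran_end.
                 * MAXPHYS + 1 appears to be used as a safe bound on that
                 * overshoot; records entirely out of range are rejected in
                 * the scan loop below.
                 */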
897                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
898                 if (tmp64 < ran_end)
899                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
900                 else
901                         cursor.key_end.key = ran_end + MAXPHYS + 1;
902                 isregfile = 1;
903         }
904
905         error = hammer_ip_first(&cursor, ip);
906
907         /*
908          * Iterate through matching records and mark them as deleted.
909          */
910         while (error == 0) {
911                 rec = cursor.record;
912                 base = &rec->base.base;
913
914                 KKASSERT(base->delete_tid == 0);
915
916                 /*
917                  * There may be overlap cases for regular file data.  Also
918          * remember that the key for a regular file record is the offset
919          * one past the last byte of the record (base + len), NOT the
920          * base offset.
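             *
             * For example, a 16384 byte data record at file offset 32768
             * carries key 49152 and covers offsets 32768-49151 inclusive.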
921                  */
922                 kprintf("delete_range rec_type %02x\n", base->rec_type);
923                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
924                         kprintf("delete_range loop key %016llx\n", base->key - rec->base.data_len);
925                         off = base->key - rec->base.data_len;
926                         /*
927                          * Check the left edge case.  We currently do not
928                          * split existing records.
929                          */
930                         if (off < ran_beg) {
931                                 panic("hammer left edge case %016llx %d\n",
932                                         base->key, rec->base.data_len);
933                         }
934
935                         /*
936                          * Check the right edge case.  Note that the
937                          * record can be completely out of bounds, which
938                          * terminates the search.
939                          *
940                          * base->key is exclusive of the right edge while
941                          * ran_end is inclusive of the right edge.  The
942                          * (key - data_len) left boundary is inclusive.
943                          *
944                          * XXX theory-check this test at some point, are
945                          * we missing a + 1 somewhere?  Note that ran_end
946                          * could overflow.
947                          */
948                         if (base->key > ran_end) {
949                                 if (base->key - rec->base.data_len > ran_end) {
950                                         kprintf("right edge OOB\n");
951                                         break;
952                                 }
953                                 panic("hammer right edge case\n");
954                         }
955                 }
956
957                 /*
958                  * Mark the record and B-Tree entry as deleted.  This will
959                  * also physically delete the B-Tree entry, record, and
960                  * data if the retention policy dictates.  The function
961                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
962                  * uses to perform a fixup.
963                  */
964                 error = hammer_ip_delete_record(&cursor, trans->tid);
965                 if (error)
966                         break;
967                 error = hammer_ip_next(&cursor);
968         }
969         hammer_done_cursor(&cursor);
970         if (error == ENOENT)
971                 error = 0;
972         return(error);
973 }
974
975 /*
976  * Delete the record at the current cursor
977  */
978 int
979 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
980 {
981         hammer_btree_elm_t elm;
982         hammer_mount_t hmp;
983         int error;
984
985         /*
986          * In-memory (unsynchronized) records can simply be freed.
987          */
988         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
989         if (cursor->record == &cursor->iprec->rec) {
990                 hammer_free_mem_record(cursor->iprec);
991                 return(0);
992         }
993
994         /*
995          * On-disk records are marked as deleted by updating their delete_tid.
996          */
997         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
998         elm = NULL;
999         hmp = cursor->node->cluster->volume->hmp;
1000
1001         if (error == 0) {
1002                 elm = &cursor->node->ondisk->elms[cursor->index];
1003                 cursor->record->base.base.delete_tid = tid;
1004                 elm->leaf.base.delete_tid = tid;
1005                 hammer_modify_buffer(cursor->record_buffer);
1006                 hammer_modify_node(cursor->node);
1007         }
1008
1009         /*
1010          * If we were mounted with the nohistory option, we physically
1011          * delete the record.
1012          */
1013         if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1014                 int32_t rec_offset;
1015                 int32_t data_offset;
1016                 int32_t data_len;
1017                 hammer_cluster_t cluster;
1018
1019                 rec_offset = elm->leaf.rec_offset;
1020                 data_offset = elm->leaf.data_offset;
1021                 data_len = elm->leaf.data_len;
1022                 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1023                         rec_offset, data_offset, data_len);
1024                 cluster = cursor->node->cluster;
1025                 hammer_ref_cluster(cluster);
1026
1027                 error = hammer_btree_delete(cursor);
1028                 if (error == 0) {
1029                         /*
1030                          * This forces a fixup for the iteration because
1031                          * the cursor is now either sitting at the 'next'
1032                          * element or sitting at the end of a leaf.
1033                          */
1034                         if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1035                                 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1036                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1037                         }
1038                         hammer_free_record(cluster, rec_offset);
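                        /*
                         * If the data was stored outside the record itself
                         * (i.e. not embedded within the record's own
                         * HAMMER_RECORD_SIZE area) it was allocated
                         * separately and must also be freed separately.
                         */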
1039                         if (data_offset - rec_offset < 0 ||
1040                             data_offset - rec_offset >= HAMMER_RECORD_SIZE) {
1041                                 hammer_free_data(cluster, data_offset,data_len);
1042                         }
1043                 }
1044                 hammer_rel_cluster(cluster, 0);
1045                 if (error) {
1046                         kprintf("hammer_ip_delete_record: unable to physically delete the record!\n");
1047                         error = 0;
1048                 }
1049         }
1050         return(error);
1051 }
1052