HAMMER 8/many - A-list, B-Tree fixes. As-of queries
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.7 2007/11/27 07:48:52 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61                 return(-1);
62         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63                 return(1);
64         return(0);
65 }
66
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         /*
71          * A key1->rec_type of 0 matches any record type.
72          */
73         if (info->rec_type) {
74                 if (info->rec_type < rec->rec.base.base.rec_type)
75                         return(-3);
76                 if (info->rec_type > rec->rec.base.base.rec_type)
77                         return(3);
78         }
79
80         /*
81          * There is no special case for key.  0 means 0.
82          */
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87
88         /*
89          * This test has a number of special cases.  create_tid in key1 is
90          * the as-of transction id, and delete_tid in key1 is NOT USED.
91          *
92          * A key1->create_tid of 0 matches any record regardles of when
93          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
94          * used to search for the most current state of the object.
95          *
96          * key2->create_tid is a HAMMER record and will never be
97          * 0.   key2->delete_tid is the deletion transaction id or 0 if
98          * the record has not yet been deleted.
99          */
100         if (info->create_tid) {
101                 if (info->create_tid < rec->rec.base.base.create_tid)
102                         return(-1);
103                 if (rec->rec.base.base.delete_tid &&
104                     info->create_tid >= rec->rec.base.base.delete_tid) {
105                         return(1);
106                 }
107         }
108         return(0);
109 }
110
111 /*
112  * RB_SCAN comparison code for hammer_mem_first().  The argument order
113  * is reversed so the comparison result has to be negated.  key_beg and
114  * key_end are both range-inclusive.
115  *
116  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
117  * These do not stop the scan.
118  *
119  * Localized deletions are not cached in-memory.
120  */
121 static
122 int
123 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
124 {
125         hammer_cursor_t cursor = data;
126         int r;
127
128         r = hammer_rec_compare(&cursor->key_beg, rec);
129         if (r > 1)
130                 return(-1);
131         if (r == 0)
132                 return(0);
133         r = hammer_rec_compare(&cursor->key_end, rec);
134         if (r < -1)
135                 return(1);
136         return(0);
137 }
138
139 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
140 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
141                     hammer_rec_compare, hammer_base_elm_t);
142
143 /*
144  * Allocate a record for the caller to finish filling in
145  */
146 hammer_record_t
147 hammer_alloc_mem_record(hammer_inode_t ip)
148 {
149         hammer_record_t record;
150
151         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
152         record->ip = ip;
153         return (record);
154 }
155
156 /*
157  * Release a memory record.  If the record is marked for defered deletion,
158  * destroy the record when the last reference goes away.
159  */
160 void
161 hammer_rel_mem_record(struct hammer_record **recordp)
162 {
163         hammer_record_t rec;
164
165         if ((rec = *recordp) != NULL) {
166                 if (hammer_islastref(&rec->lock)) {
167                         hammer_unref(&rec->lock);
168                         if (rec->flags & HAMMER_RECF_DELETED)
169                                 hammer_free_mem_record(rec);
170                 } else {
171                         hammer_unref(&rec->lock);
172                 }
173                 *recordp = NULL;
174         }
175 }
176
177 /*
178  * Free a record.  Clean the structure up even though we are throwing it
179  * away as a sanity check.  The actual free operation is delayed while
180  * the record is referenced.  However, the record is removed from the RB
181  * tree immediately.
182  */
183 void
184 hammer_free_mem_record(hammer_record_t record)
185 {
186         if (record->flags & HAMMER_RECF_ONRBTREE) {
187                 RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
188                 record->flags &= ~HAMMER_RECF_ONRBTREE;
189         }
190         if (record->lock.refs) {
191                 record->flags |= HAMMER_RECF_DELETED;
192                 return;
193         }
194         if (record->flags & HAMMER_RECF_ALLOCDATA) {
195                 kfree(record->data, M_HAMMER);
196                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
197         }
198         record->data = NULL;
199         kfree(record, M_HAMMER);
200 }
201
202 /*
203  * Lookup an in-memory record given the key specified in the cursor.  Works
204  * just like hammer_btree_lookup() but operates on an inode's in-memory
205  * record list.
206  *
207  * The lookup must fail if the record is marked for deferred deletion.
208  */
209 static
210 int
211 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
212 {
213         int error;
214
215         if (cursor->iprec)
216                 hammer_rel_mem_record(&cursor->iprec);
217         if (cursor->ip) {
218                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
219                                                   &cursor->ip->rec_tree);
220         }
221         cursor->ip = ip;
222         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
223         cursor->scan.node = NULL;
224         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
225                                 &ip->rec_tree, &cursor->key_beg);
226         if (cursor->iprec == NULL) {
227                 error = ENOENT;
228         } else {
229                 hammer_ref(&cursor->iprec->lock);
230                 error = 0;
231         }
232         return(error);
233 }
234
235 /*
236  * hammer_mem_first() - locate the first in-memory record matching the
237  * cursor.
238  *
239  * The RB_SCAN function we use is designed as a callback.  We terminate it
240  * (return -1) as soon as we get a match.
241  */
242 static
243 int
244 hammer_rec_scan_callback(hammer_record_t rec, void *data)
245 {
246         hammer_cursor_t cursor = data;
247
248         /*
249          * Skip if not visible due to our as-of TID
250          */
251         if (cursor->key_beg.create_tid) {
252                 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
253                         return(0);
254                 if (rec->rec.base.base.delete_tid &&
255                     cursor->key_beg.create_tid >=
256                      rec->rec.base.base.delete_tid) {
257                         return(0);
258                 }
259         }
260
261         /*
262          * Return the first matching record and stop the scan
263          */
264         if (cursor->iprec == NULL) {
265                 cursor->iprec = rec;
266                 hammer_ref(&rec->lock);
267                 return(-1);
268         }
269         return(0);
270 }
271
272 static
273 int
274 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
275 {
276         if (cursor->iprec)
277                 hammer_rel_mem_record(&cursor->iprec);
278         if (cursor->ip) {
279                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
280                                                   &cursor->ip->rec_tree);
281         }
282         cursor->ip = ip;
283         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
284         cursor->scan.node = NULL;
285         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
286                                    hammer_rec_scan_callback, cursor);
287
288         /*
289          * Adjust scan.node and keep it linked into the RB-tree so we can
290          * hold the cursor through third party modifications of the RB-tree.
291          */
292         if (cursor->iprec) {
293                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
294                 return(0);
295         }
296         return(ENOENT);
297 }
298
299 void
300 hammer_mem_done(hammer_cursor_t cursor)
301 {
302         if (cursor->ip) {
303                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
304                                                   &cursor->ip->rec_tree);
305                 cursor->ip = NULL;
306         }
307         if (cursor->iprec)
308                 hammer_rel_mem_record(&cursor->iprec);
309 }
310
311 /************************************************************************
312  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
313  ************************************************************************
314  *
315  * These functions manipulate in-memory records.  Such records typically
316  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
317  */
318
319 /*
320  * Add a directory entry (dip,ncp) which references inode (ip).
321  *
322  * Note that the low 32 bits of the namekey are set temporarily to create
323  * a unique in-memory record, and may be modified a second time when the
324  * record is synchronized to disk.  In particular, the low 32 bits cannot be
325  * all 0's when synching to disk, which is not handled here.
326  */
327 int
328 hammer_ip_add_directory(struct hammer_transaction *trans,
329                      struct hammer_inode *dip, struct namecache *ncp,
330                      struct hammer_inode *ip)
331 {
332         hammer_record_t record;
333         int error;
334         int bytes;
335
336         record = hammer_alloc_mem_record(dip);
337
338         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
339         if (++trans->hmp->namekey_iterator == 0)
340                 ++trans->hmp->namekey_iterator;
341
342         record->rec.entry.base.base.obj_id = dip->obj_id;
343         record->rec.entry.base.base.key =
344                 hammer_directory_namekey(ncp->nc_name, bytes);
345         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
346         record->rec.entry.base.base.create_tid = trans->tid;
347         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
348         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
349         record->rec.entry.obj_id = ip->obj_id;
350         if (bytes <= sizeof(record->rec.entry.den_name)) {
351                 record->data = (void *)record->rec.entry.den_name;
352                 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
353         } else {
354                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
355                 record->flags |= HAMMER_RECF_ALLOCDATA;
356         }
357         bcopy(ncp->nc_name, record->data, bytes);
358         record->rec.entry.base.data_len = bytes;
359         ++ip->ino_rec.ino_nlinks;
360         hammer_modify_inode(trans, ip,
361                             HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
362         error = hammer_mem_add(trans, record);
363         return(error);
364 }
365
366 /*
367  * Delete the directory entry and update the inode link count.  The
368  * cursor must be seeked to the directory entry record being deleted.
369  *
370  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
371  */
372 int
373 hammer_ip_del_directory(struct hammer_transaction *trans,
374                      hammer_cursor_t cursor, struct hammer_inode *dip,
375                      struct hammer_inode *ip)
376 {
377         int error;
378
379         if (cursor->record == &cursor->iprec->rec) {
380                 /*
381                  * The directory entry was in-memory, just scrap the
382                  * record.
383                  */
384                 kprintf("del directory mem record\n");
385                 hammer_free_mem_record(cursor->iprec);
386                 error = 0;
387         } else {
388                 /*
389                  * The directory entry was on-disk, mark the record and
390                  * B-Tree entry as deleted.  The B-Tree entry does not
391                  * have to be reindexed because a 'current' delete transid
392                  * will wind up in the same position as the live record.
393                  */
394                 KKASSERT(ip->flags & HAMMER_INODE_ONDISK);
395                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
396                 if (error == 0) {
397                         cursor->node->ondisk->elms[cursor->index].base.delete_tid = trans->tid;
398                         cursor->record->base.base.delete_tid = trans->tid;
399                         hammer_modify_node(cursor->node);
400                         hammer_modify_buffer(cursor->record_buffer);
401                 }
402         }
403
404         /*
405          * One less link.  The file may still be open in the OS even after
406          * all links have gone away so we don't destroy the inode's data
407          * here.
408          */
409         if (error == 0) {
410                 --ip->ino_rec.ino_nlinks;
411                 hammer_modify_inode(trans, ip,
412                                     HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
413                 if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
414                         hammer_sync_inode(ip, MNT_NOWAIT, 1);
415
416         }
417         return(error);
418 }
419
420 /*
421  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
422  * is called via the strategy called from a cached data source.  This code
423  * is responsible for actually writing a data record out to the disk.
424  */
425 int
426 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
427                        int64_t offset, void *data, int bytes)
428 {
429         struct hammer_cursor cursor;
430         hammer_record_ondisk_t rec;
431         union hammer_btree_elm elm;
432         void *bdata;
433         int error;
434
435         error = hammer_init_cursor_ip(&cursor, ip);
436         if (error)
437                 return(error);
438         cursor.key_beg.obj_id = ip->obj_id;
439         cursor.key_beg.key = offset + bytes;
440         cursor.key_beg.create_tid = trans->tid;
441         cursor.key_beg.delete_tid = 0;
442         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
443         cursor.flags = HAMMER_CURSOR_INSERT;
444
445         /*
446          * Issue a lookup to position the cursor and locate the cluster
447          */
448         error = hammer_btree_lookup(&cursor);
449         if (error == 0) {
450                 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
451                         offset, bytes);
452                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
453                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
454                 error = EIO;
455         }
456         if (error != ENOENT)
457                 goto done;
458
459         /*
460          * Allocate record and data space now that we know which cluster
461          * the B-Tree node ended up in.
462          */
463         bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
464                                   &cursor.data_buffer);
465         if (bdata == NULL)
466                 goto done;
467         rec = hammer_alloc_record(cursor.node->cluster, &error,
468                                   &cursor.record_buffer);
469         if (rec == NULL)
470                 goto fail1;
471
472         /*
473          * Fill everything in and insert our B-Tree node.
474          */
475         rec->base.base = cursor.key_beg;
476         rec->base.data_crc = crc32(data, bytes);
477         rec->base.rec_id = 0;   /* XXX */
478         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
479         rec->base.data_len = bytes;
480         hammer_modify_buffer(cursor.record_buffer);
481
482         bcopy(data, bdata, bytes);
483         hammer_modify_buffer(cursor.data_buffer);
484
485         elm.leaf.base = cursor.key_beg;
486         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
487         elm.leaf.data_offset = rec->base.data_offset;
488         elm.leaf.data_len = bytes;
489         elm.leaf.data_crc = rec->base.data_crc;
490
491         error = hammer_btree_insert(&cursor, &elm);
492         if (error == 0)
493                 goto done;
494
495         hammer_free_record_ptr(cursor.record_buffer, rec);
496 fail1:
497         hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
498 done:
499         hammer_done_cursor(&cursor);
500         return(error);
501 }
502
503 /*
504  * Sync an in-memory record to the disk.  this is typically called via fsync
505  * from a cached record source.  This code is responsible for actually
506  * writing a record out to the disk.
507  */
508 int
509 hammer_ip_sync_record(hammer_record_t record)
510 {
511         struct hammer_cursor cursor;
512         hammer_record_ondisk_t rec;
513         union hammer_btree_elm elm;
514         void *bdata;
515         int error;
516
517         error = hammer_init_cursor_ip(&cursor, record->ip);
518         if (error)
519                 return(error);
520         cursor.key_beg = record->rec.base.base;
521         cursor.flags = HAMMER_CURSOR_INSERT;
522
523         /*
524          * Issue a lookup to position the cursor and locate the cluster
525          */
526         error = hammer_btree_lookup(&cursor);
527         if (error == 0) {
528                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
529                         record->rec.base.base.key);
530                 error = EIO;
531         }
532         if (error != ENOENT)
533                 goto done;
534
535         /*
536          * Allocate record and data space now that we know which cluster
537          * the B-Tree node ended up in.
538          */
539         if (record->data == NULL ||
540             (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
541                 bdata = record->data;
542         } else {
543                 bdata = hammer_alloc_data(cursor.node->cluster,
544                                           record->rec.base.data_len, &error,
545                                           &cursor.data_buffer);
546                 if (bdata == NULL)
547                         goto done;
548         }
549         rec = hammer_alloc_record(cursor.node->cluster, &error,
550                                   &cursor.record_buffer);
551         if (rec == NULL)
552                 goto fail1;
553
554         /*
555          * Fill everything in and insert our B-Tree node.
556          *
557          * XXX assign rec_id here
558          */
559         *rec = record->rec;
560         if (bdata) {
561                 rec->base.data_crc = crc32(record->data,
562                                            record->rec.base.data_len);
563                 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
564                         /*
565                          * Data embedded in record
566                          */
567                         rec->base.data_offset = ((char *)bdata -
568                                                  (char *)&record->rec);
569                         KKASSERT(rec->base.data_offset >= 0 && 
570                                  rec->base.data_offset + rec->base.data_len <
571                                   sizeof(*rec));
572                         rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
573                 } else {
574                         /*
575                          * Data separate from record
576                          */
577                         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
578                         bcopy(record->data, bdata, rec->base.data_len);
579                         hammer_modify_buffer(cursor.data_buffer);
580                 }
581         }
582         rec->base.rec_id = 0;   /* XXX */
583
584         hammer_modify_buffer(cursor.record_buffer);
585
586         elm.leaf.base = cursor.key_beg;
587         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
588         elm.leaf.data_offset = rec->base.data_offset;
589         elm.leaf.data_len = rec->base.data_len;
590         elm.leaf.data_crc = rec->base.data_crc;
591
592         error = hammer_btree_insert(&cursor, &elm);
593         if (error == 0)
594                 goto done;
595
596         hammer_free_record_ptr(cursor.record_buffer, rec);
597 fail1:
598         if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
599                 hammer_free_data_ptr(cursor.data_buffer, bdata,
600                                      rec->base.data_len);
601         }
602 done:
603         hammer_done_cursor(&cursor);
604         return(error);
605 }
606
607
608 /*
609  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
610  * entry's key is used to deal with hash collisions in the upper 32 bits.
611  * A unique 64 bit key is generated in-memory and may be regenerated a
612  * second time when the directory record is flushed to the on-disk B-Tree.
613  */
614 static
615 int
616 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
617 {
618         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
619                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
620                         hammer_free_mem_record(record);
621                         return (EEXIST);
622                 }
623                 if (++trans->hmp->namekey_iterator == 0)
624                         ++trans->hmp->namekey_iterator;
625                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
626                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
627         }
628         record->flags |= HAMMER_RECF_ONRBTREE;
629         return(0);
630 }
631
632 /************************************************************************
633  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
634  ************************************************************************
635  *
636  * These functions augment the B-Tree scanning functions in hammer_btree.c
637  * by merging in-memory records with on-disk records.
638  */
639
640 /*
641  * Locate a particular record either in-memory or on-disk.
642  *
643  * NOTE: This is basically a standalone routine, hammer_ip_next() may
644  * NOT be called to iterate results.
645  */
646 int
647 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
648 {
649         int error;
650
651         /*
652          * If the element is in-memory return it without searching the
653          * on-disk B-Tree
654          */
655         error = hammer_mem_lookup(cursor, ip);
656         if (error == 0) {
657                 cursor->record = &cursor->iprec->rec;
658                 return(error);
659         }
660         if (error != ENOENT)
661                 return(error);
662
663         /*
664          * If the inode has on-disk components search the on-disk B-Tree.
665          */
666         if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
667                 return(error);
668         error = hammer_btree_lookup(cursor);
669         if (error == 0)
670                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
671         return(error);
672 }
673
674 /*
675  * Locate the first record within the cursor's key_beg/key_end range,
676  * restricted to a particular inode.  0 is returned on success, ENOENT
677  * if no records matched the requested range, or some other error.
678  *
679  * When 0 is returned hammer_ip_next() may be used to iterate additional
680  * records within the requested range.
681  */
682 int
683 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
684 {
685         int error;
686
687         /*
688          * Clean up fields and setup for merged scan
689          */
690         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
691         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
692         if (cursor->iprec)
693                 hammer_rel_mem_record(&cursor->iprec);
694
695         /*
696          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
697          * exact lookup so if we get ENOENT we have to call the iterate
698          * function to validate the first record after the begin key.
699          *
700          * The ATEDISK flag is used by hammer_btree_iterate to determine
701          * whether it must index forwards or not.
702          */
703         if (ip->flags & HAMMER_INODE_ONDISK) {
704                 error = hammer_btree_lookup(cursor);
705                 if (error == ENOENT) {
706                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
707                         error = hammer_btree_iterate(cursor);
708                 }
709                 if (error && error != ENOENT) 
710                         return(error);
711                 if (error == 0) {
712                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
713                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
714                 } else {
715                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
716                 }
717         }
718
719         /*
720          * Search the in-memory record list (Red-Black tree).  Unlike the
721          * B-Tree search, mem_first checks for records in the range.
722          */
723         error = hammer_mem_first(cursor, ip);
724         if (error && error != ENOENT)
725                 return(error);
726         if (error == 0) {
727                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
728                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
729         }
730
731         /*
732          * This will return the first matching record.
733          */
734         return(hammer_ip_next(cursor));
735 }
736
737 /*
738  * Retrieve the next record in a merged iteration within the bounds of the
739  * cursor.  This call may be made multiple times after the cursor has been
740  * initially searched with hammer_ip_first().
741  *
742  * 0 is returned on success, ENOENT if no further records match the
743  * requested range, or some other error code is returned.
744  */
745 int
746 hammer_ip_next(hammer_cursor_t cursor)
747 {
748         hammer_btree_elm_t elm;
749         hammer_record_t rec;
750         int error;
751         int r;
752
753         /*
754          * Load the current on-disk and in-memory record.  If we ate any
755          * records we have to get the next one. 
756          *
757          * Get the next on-disk record
758          */
759         if (cursor->flags & HAMMER_CURSOR_ATEDISK) {
760                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
761                         error = hammer_btree_iterate(cursor);
762                         if (error == 0)
763                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
764                         else
765                                 cursor->flags |= HAMMER_CURSOR_DISKEOF;
766                 }
767         }
768
769         /*
770          * Get the next in-memory record.  The record can be ripped out
771          * of the RB tree so we maintain a scan_info structure to track
772          * the next node.
773          *
774          * hammer_rec_scan_cmp:  Is the record still in our general range,
775          *                       (non-inclusive of snapshot exclusions)?
776          * hammer_rec_scan_callback: Is the record in our snapshot?
777          */
778         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
779                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
780                         hammer_rel_mem_record(&cursor->iprec);
781                         rec = cursor->scan.node;        /* next node */
782                         while (rec) {
783                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
784                                         break;
785                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
786                                         break;
787                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
788                         }
789                         if (cursor->iprec) {
790                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
791                                 hammer_ref(&cursor->iprec->lock);
792                                 cursor->scan.node =
793                                         hammer_rec_rb_tree_RB_NEXT(rec);
794                         } else {
795                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
796                         }
797                 }
798         }
799
800         /*
801          * Extract either the disk or memory record depending on their
802          * relative position.
803          */
804         error = 0;
805         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
806         case 0:
807                 /*
808                  * Both entries valid
809                  */
810                 elm = &cursor->node->ondisk->elms[cursor->index];
811                 r = hammer_btree_cmp(&elm->base,
812                                      &cursor->iprec->rec.base.base);
813                 if (r < 0) {
814                         error = hammer_btree_extract(cursor,
815                                                      HAMMER_CURSOR_GET_RECORD);
816                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
817                         break;
818                 }
819                 /* fall through to the memory entry */
820         case HAMMER_CURSOR_ATEDISK:
821                 /*
822                  * Only the memory entry is valid
823                  */
824                 cursor->record = &cursor->iprec->rec;
825                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
826                 break;
827         case HAMMER_CURSOR_ATEMEM:
828                 /*
829                  * Only the disk entry is valid
830                  */
831                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
832                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
833                 break;
834         default:
835                 /*
836                  * Neither entry is valid
837                  *
838                  * XXX error not set properly
839                  */
840                 cursor->record = NULL;
841                 error = ENOENT;
842                 break;
843         }
844         return(error);
845 }
846
847 /*
848  * Resolve the cursor->data pointer for the current cursor position in
849  * a merged iteration.
850  */
851 int
852 hammer_ip_resolve_data(hammer_cursor_t cursor)
853 {
854         int error;
855
856         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
857                 cursor->data = cursor->iprec->data;
858                 error = 0;
859         } else {
860                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
861         }
862         return(error);
863 }
864
865 /*
866  * Delete all records within the specified range for inode ip.
867  *
868  * NOTE: An unaligned range will cause new records to be added to cover
869  * the edge cases. (XXX not implemented yet).
870  *
871  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
872  *
873  * NOTE: Record keys for regular file data have to be special-cased since
874  * they indicate the end of the range (key = base + bytes).
875  */
876 int
877 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
878                        int64_t ran_beg, int64_t ran_end)
879 {
880         struct hammer_cursor cursor;
881         hammer_record_ondisk_t rec;
882         hammer_base_elm_t base;
883         int error;
884         int isregfile;
885         int64_t off;
886
887         hammer_init_cursor_ip(&cursor, ip);
888
889         cursor.key_beg.obj_id = ip->obj_id;
890         cursor.key_beg.create_tid = ip->obj_asof;
891         cursor.key_beg.delete_tid = 0;
892         cursor.key_beg.obj_type = 0;
893
894         /*
895          * The key in the B-Tree is (base+bytes), so the first possible
896          * matching key is ran_beg + 1.
897          */
898         cursor.key_beg.key = ran_beg + 1;
899         cursor.key_end = cursor.key_beg;
900         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
901                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
902                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
903                 cursor.key_end.key = ran_end;
904                 isregfile = 0;
905         } else {
906                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
907                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
908                 if (ran_end + MAXPHYS + 1 < ran_end)
909                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
910                 else
911                         cursor.key_end.key = ran_end + MAXPHYS + 1;
912                 isregfile = 1;
913         }
914
915         error = hammer_ip_first(&cursor, ip);
916
917         /*
918          * Iterate through matching records and mark them as deleted.
919          */
920         while (error == 0) {
921                 rec = cursor.record;
922                 base = &rec->base.base;
923
924                 KKASSERT(base->delete_tid == 0);
925
926                 /*
927                  * There may be overlap cases for regular file data.  Also
928                  * remember the key for a regular file record is the offset
929                  * of the last byte of the record (base + len - 1), NOT the
930                  * base offset.
931                  */
932                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
933                         off = base->key - rec->base.data_len;
934                         /*
935                          * Check the left edge case.  We currently do not
936                          * split existing records.
937                          */
938                         if (off < ran_beg) {
939                                 panic("hammer left edge case %016llx %d\n",
940                                         base->key, rec->base.data_len);
941                         }
942
943                         /*
944                          * Check the right edge case.  Note that the
945                          * record can be completely out of bounds, which
946                          * terminates the search.
947                          *
948                          * base->key is exclusive of the right edge while
949                          * ran_end is inclusive of the right edge.  The
950                          * (key - data_len) left boundary is inclusive.
951                          */
952                         if (base->key > ran_end + 1) {
953                                 if (base->key - rec->base.data_len > ran_end) {
954                                         kprintf("right edge OOB\n");
955                                         break;
956                                 }
957                                 panic("hammer right edge case\n");
958                         }
959                 }
960
961                 /*
962                  * Mark the record and B-Tree entry as deleted
963                  */
964                 if (cursor.record == &cursor.iprec->rec) {
965                         hammer_free_mem_record(cursor.iprec);
966                 } else {
967                         cursor.node->ondisk->
968                             elms[cursor.index].base.delete_tid = trans->tid;
969                         cursor.record->base.base.delete_tid = trans->tid;
970                         hammer_modify_node(cursor.node);
971                         hammer_modify_buffer(cursor.record_buffer);
972                 }
973                 error = hammer_ip_next(&cursor);
974         }
975         hammer_done_cursor(&cursor);
976         if (error == ENOENT)
977                 error = 0;
978         return(error);
979 }
980