HAMMER 20B/many: New spike topology, simplify the B-Tree code.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.20 2008/01/17 05:06:09 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.delete_tid == 0) {
61                 if (rec2->rec.base.base.delete_tid == 0)
62                         return(0);
63                 return(1);
64         }
65         if (rec2->rec.base.base.delete_tid == 0)
66                 return(-1);
67
68         if (rec1->rec.base.base.delete_tid < rec2->rec.base.base.delete_tid)
69                 return(-1);
70         if (rec1->rec.base.base.delete_tid > rec2->rec.base.base.delete_tid)
71                 return(1);
72         return(0);
73 }
74
75 static int
76 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
77 {
78         if (info->rec_type < rec->rec.base.base.rec_type)
79                 return(-3);
80         if (info->rec_type > rec->rec.base.base.rec_type)
81                 return(3);
82
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87
88         if (info->delete_tid == 0) {
89                 if (rec->rec.base.base.delete_tid == 0)
90                         return(0);
91                 return(1);
92         }
93         if (rec->rec.base.base.delete_tid == 0)
94                 return(-1);
95         if (info->delete_tid < rec->rec.base.base.delete_tid)
96                 return(-1);
97         if (info->delete_tid > rec->rec.base.base.delete_tid)
98                 return(1);
99         return(0);
100 }
101
102 /*
103  * RB_SCAN comparison code for hammer_mem_first().  The argument order
104  * is reversed so the comparison result has to be negated.  key_beg and
105  * key_end are both range-inclusive.
106  *
107  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
108  * These do not stop the scan.
109  *
110  * Localized deletions are not cached in-memory.
111  */
112 static
113 int
114 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
115 {
116         hammer_cursor_t cursor = data;
117         int r;
118
119         r = hammer_rec_compare(&cursor->key_beg, rec);
120         if (r > 1)
121                 return(-1);
122         r = hammer_rec_compare(&cursor->key_end, rec);
123         if (r < -1)
124                 return(1);
125         return(0);
126 }
127
128 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
129 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
130                     hammer_rec_compare, hammer_base_elm_t);
131
132 /*
133  * Allocate a record for the caller to finish filling in.  The record is
134  * returned referenced.
135  */
136 hammer_record_t
137 hammer_alloc_mem_record(hammer_inode_t ip)
138 {
139         hammer_record_t record;
140
141         ++hammer_count_records;
142         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
143         record->ip = ip;
144         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
145         hammer_ref(&record->lock);
146         return (record);
147 }
148
149 /*
150  * Release a memory record.  Records marked for deletion are immediately
151  * removed from the RB-Tree but otherwise left intact until the last ref
152  * goes away.
153  */
154 void
155 hammer_rel_mem_record(struct hammer_record *record)
156 {
157         hammer_unref(&record->lock);
158         if (record->flags & HAMMER_RECF_DELETED) {
159                 if (record->flags & HAMMER_RECF_ONRBTREE) {
160                         RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
161                                   record);
162                         record->flags &= ~HAMMER_RECF_ONRBTREE;
163                 }
164                 if (record->lock.refs == 0) {
165                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
166                                 --hammer_count_record_datas;
167                                 kfree(record->data, M_HAMMER);
168                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
169                         }
170                         record->data = NULL;
171                         --hammer_count_records;
172                         kfree(record, M_HAMMER);
173                 }
174         }
175 }
176
177 /*
178  * Lookup an in-memory record given the key specified in the cursor.  Works
179  * just like hammer_btree_lookup() but operates on an inode's in-memory
180  * record list.
181  *
182  * The lookup must fail if the record is marked for deferred deletion.
183  */
184 static
185 int
186 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
187 {
188         int error;
189
190         if (cursor->iprec) {
191                 hammer_rel_mem_record(cursor->iprec);
192                 cursor->iprec = NULL;
193         }
194         if (cursor->ip) {
195                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
196                                                   &cursor->ip->rec_tree);
197         }
198         cursor->ip = ip;
199         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
200         cursor->scan.node = NULL;
201         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
202                                 &ip->rec_tree, &cursor->key_beg);
203         if (cursor->iprec == NULL) {
204                 error = ENOENT;
205         } else {
206                 hammer_ref(&cursor->iprec->lock);
207                 error = 0;
208         }
209         return(error);
210 }
211
212 /*
213  * hammer_mem_first() - locate the first in-memory record matching the
214  * cursor.
215  *
216  * The RB_SCAN function we use is designed as a callback.  We terminate it
217  * (return -1) as soon as we get a match.
218  */
219 static
220 int
221 hammer_rec_scan_callback(hammer_record_t rec, void *data)
222 {
223         hammer_cursor_t cursor = data;
224
225         /*
226          * Skip if not visible due to our as-of TID
227          */
228         if (cursor->flags & HAMMER_CURSOR_ASOF) {
229                 if (cursor->asof < rec->rec.base.base.create_tid)
230                         return(0);
231                 if (rec->rec.base.base.delete_tid &&
232                     cursor->asof >= rec->rec.base.base.delete_tid) {
233                         return(0);
234                 }
235         }
236
237         /*
238          * Return the first matching record and stop the scan
239          */
240         if (cursor->iprec == NULL) {
241                 cursor->iprec = rec;
242                 hammer_ref(&rec->lock);
243                 return(-1);
244         }
245         return(0);
246 }
247
248 static
249 int
250 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
251 {
252         if (cursor->iprec) {
253                 hammer_rel_mem_record(cursor->iprec);
254                 cursor->iprec = NULL;
255         }
256         if (cursor->ip) {
257                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
258                                                   &cursor->ip->rec_tree);
259         }
260         cursor->ip = ip;
261         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
262
263         cursor->scan.node = NULL;
264         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
265                                    hammer_rec_scan_callback, cursor);
266
267         /*
268          * Adjust scan.node and keep it linked into the RB-tree so we can
269          * hold the cursor through third party modifications of the RB-tree.
270          */
271         if (cursor->iprec) {
272                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
273                 return(0);
274         }
275         return(ENOENT);
276 }
277
278 void
279 hammer_mem_done(hammer_cursor_t cursor)
280 {
281         if (cursor->ip) {
282                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
283                                                   &cursor->ip->rec_tree);
284                 cursor->ip = NULL;
285         }
286         if (cursor->iprec) {
287                 hammer_rel_mem_record(cursor->iprec);
288                 cursor->iprec = NULL;
289         }
290 }
291
292 /************************************************************************
293  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
294  ************************************************************************
295  *
296  * These functions manipulate in-memory records.  Such records typically
297  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
298  */
299
300 /*
301  * Add a directory entry (dip,ncp) which references inode (ip).
302  *
303  * Note that the low 32 bits of the namekey are set temporarily to create
304  * a unique in-memory record, and may be modified a second time when the
305  * record is synchronized to disk.  In particular, the low 32 bits cannot be
306  * all 0's when synching to disk, which is not handled here.
307  */
308 int
309 hammer_ip_add_directory(struct hammer_transaction *trans,
310                      struct hammer_inode *dip, struct namecache *ncp,
311                      struct hammer_inode *ip)
312 {
313         hammer_record_t record;
314         int error;
315         int bytes;
316
317         record = hammer_alloc_mem_record(dip);
318
319         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
320         if (++trans->hmp->namekey_iterator == 0)
321                 ++trans->hmp->namekey_iterator;
322
323         record->rec.entry.base.base.obj_id = dip->obj_id;
324         record->rec.entry.base.base.key =
325                 hammer_directory_namekey(ncp->nc_name, bytes);
326         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
327         record->rec.entry.base.base.create_tid = trans->tid;
328         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
329         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
330         record->rec.entry.obj_id = ip->obj_id;
331         if (bytes <= sizeof(record->rec.entry.den_name)) {
332                 record->data = (void *)record->rec.entry.den_name;
333                 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
334         } else {
335                 ++hammer_count_record_datas;
336                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
337                 record->flags |= HAMMER_RECF_ALLOCDATA;
338         }
339         bcopy(ncp->nc_name, record->data, bytes);
340         record->rec.entry.base.data_len = bytes;
341         ++ip->ino_rec.ino_nlinks;
342         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
343         error = hammer_mem_add(trans, record);
344         return(error);
345 }
346
347 /*
348  * Delete the directory entry and update the inode link count.  The
349  * cursor must be seeked to the directory entry record being deleted.
350  *
351  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
352  */
353 int
354 hammer_ip_del_directory(struct hammer_transaction *trans,
355                      hammer_cursor_t cursor, struct hammer_inode *dip,
356                      struct hammer_inode *ip)
357 {
358         int error;
359
360         error = hammer_ip_delete_record(cursor, trans->tid);
361
362         /*
363          * One less link.  The file may still be open in the OS even after
364          * all links have gone away so we only try to sync if the OS has
365          * no references and nlinks falls to 0.
366          */
367         if (error == 0) {
368                 --ip->ino_rec.ino_nlinks;
369                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
370                 if (ip->ino_rec.ino_nlinks == 0 &&
371                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
372                         hammer_sync_inode(ip, MNT_NOWAIT, 1);
373                 }
374
375         }
376         return(error);
377 }
378
379 /*
380  * Add a record to an inode.
381  *
382  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
383  * initialize the following additional fields:
384  *
385  * record->rec.entry.base.base.key
386  * record->rec.entry.base.base.rec_type
387  * record->rec.entry.base.base.data_len
388  * record->data         (a copy will be kmalloc'd if not embedded)
389  */
390 int
391 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
392 {
393         hammer_inode_t ip = record->ip;
394         int error;
395         int bytes;
396         void *data;
397
398         record->rec.base.base.obj_id = ip->obj_id;
399         record->rec.base.base.create_tid = trans->tid;
400         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
401         bytes = record->rec.base.data_len;
402
403         if (record->data) {
404                 if ((char *)record->data < (char *)&record->rec ||
405                     (char *)record->data >= (char *)(&record->rec + 1)) {
406                         ++hammer_count_record_datas;
407                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
408                         record->flags |= HAMMER_RECF_ALLOCDATA;
409                         bcopy(record->data, data, bytes);
410                         record->data = data;
411                 } else {
412                         record->flags |= HAMMER_RECF_EMBEDDED_DATA;
413                 }
414         }
415         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
416         error = hammer_mem_add(trans, record);
417         return(error);
418 }
419
420 /*
421  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
422  * is called via the strategy called from a cached data source.  This code
423  * is responsible for actually writing a data record out to the disk.
424  */
425 int
426 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
427                        int64_t offset, void *data, int bytes,
428                        struct hammer_cursor **spike)
429 {
430         struct hammer_cursor cursor;
431         hammer_record_ondisk_t rec;
432         union hammer_btree_elm elm;
433         void *bdata;
434         int error;
435
436         error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
437         if (error)
438                 return(error);
439         cursor.key_beg.obj_id = ip->obj_id;
440         cursor.key_beg.key = offset + bytes;
441         cursor.key_beg.create_tid = 0;
442         cursor.key_beg.delete_tid = 0;
443         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
444         cursor.asof = trans->tid;
445         cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_ASOF;
446
447         /*
448          * Issue a lookup to position the cursor and locate the cluster
449          */
450         error = hammer_btree_lookup(&cursor);
451         if (error == 0) {
452                 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
453                         offset, bytes);
454                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
455                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
456                 error = EIO;
457         }
458         if (error != ENOENT)
459                 goto done;
460
461         /*
462          * Allocate record and data space now that we know which cluster
463          * the B-Tree node ended up in.
464          */
465         bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
466                                   &cursor.data_buffer);
467         if (bdata == NULL)
468                 goto done;
469         rec = hammer_alloc_record(cursor.node->cluster, &error,
470                                   &cursor.record_buffer);
471         if (rec == NULL)
472                 goto fail1;
473
474         /*
475          * Fill everything in and insert our B-Tree node.
476          */
477         hammer_modify_buffer(cursor.record_buffer);
478         rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
479         rec->base.base.obj_id = ip->obj_id;
480         rec->base.base.key = offset + bytes;
481         rec->base.base.create_tid = trans->tid;
482         rec->base.base.delete_tid = 0;
483         rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
484         rec->base.data_crc = crc32(data, bytes);
485         rec->base.rec_id = 0;   /* XXX */
486         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
487         rec->base.data_len = bytes;
488
489         hammer_modify_buffer(cursor.data_buffer);
490         bcopy(data, bdata, bytes);
491
492         elm.leaf.base = rec->base.base;
493         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
494         elm.leaf.data_offset = rec->base.data_offset;
495         elm.leaf.data_len = bytes;
496         elm.leaf.data_crc = rec->base.data_crc;
497
498         /*
499          * Data records can wind up on-disk before the inode itself is
500          * on-disk.  One must assume data records may be on-disk if either
501          * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
502          */
503         ip->flags |= HAMMER_INODE_DONDISK;
504
505         error = hammer_btree_insert(&cursor, &elm);
506         if (error == 0) {
507                 hammer_update_syncid(cursor.record_buffer->cluster, trans->tid);
508                 goto done;
509         }
510
511         hammer_free_record_ptr(cursor.record_buffer, rec);
512 fail1:
513         hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
514 done:
515         /*
516          * If ENOSPC in cluster fill in the spike structure and return
517          * ENOSPC.
518          */
519         if (error == ENOSPC)
520                 hammer_load_spike(&cursor, spike);
521         hammer_done_cursor(&cursor);
522         return(error);
523 }
524
525 /*
526  * Sync an in-memory record to the disk.  this is typically called via fsync
527  * from a cached record source.  This code is responsible for actually
528  * writing a record out to the disk.
529  */
530 int
531 hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
532 {
533         struct hammer_cursor cursor;
534         hammer_record_ondisk_t rec;
535         hammer_mount_t hmp;
536         union hammer_btree_elm elm;
537         void *bdata;
538         int error;
539
540         error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
541                                        record->ip->hmp);
542         if (error)
543                 return(error);
544         cursor.key_beg = record->rec.base.base;
545         cursor.flags |= HAMMER_CURSOR_INSERT;
546
547         /*
548          * Issue a lookup to position the cursor and locate the cluster.  The
549          * target key should not exist.  If we are creating a directory entry
550          * we may have to iterate the low 32 bits of the key to find an unused
551          * key.
552          *
553          * If we run out of space trying to adjust the B-Tree for the
554          * insert, re-lookup without the insert flag so the cursor
555          * is properly positioned for the spike.
556          */
557 again:
558         error = hammer_btree_lookup(&cursor);
559         if (error == 0) {
560                 if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
561                         hmp = cursor.node->cluster->volume->hmp;
562                         if (++hmp->namekey_iterator == 0)
563                                 ++hmp->namekey_iterator;
564                         record->rec.base.base.key &= ~(0xFFFFFFFFLL);
565                         record->rec.base.base.key |= hmp->namekey_iterator;
566                         cursor.key_beg.key = record->rec.base.base.key;
567                         goto again;
568                 }
569                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
570                         record->rec.base.base.key);
571                 Debugger("duplicate record1");
572                 error = EIO;
573         }
574         if (error != ENOENT)
575                 goto done;
576
577         /*
578          * Mark the record as undergoing synchronization.  Our cursor is
579          * holding a locked B-Tree node for the insertion which interlocks
580          * anyone trying to access this record.
581          *
582          * XXX There is still a race present related to iterations.  An
583          * iteration may process the record, a sync may occur, and then
584          * later process the B-Tree element for the same record.
585          *
586          * We do not try to synchronize a deleted record.
587          */
588         if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING)) {
589                 error = 0;
590                 goto done;
591         }
592         record->flags |= HAMMER_RECF_SYNCING;
593
594         /*
595          * Allocate record and data space now that we know which cluster
596          * the B-Tree node ended up in.
597          */
598         if (record->data == NULL ||
599             (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
600                 bdata = record->data;
601         } else {
602                 bdata = hammer_alloc_data(cursor.node->cluster,
603                                           record->rec.base.data_len, &error,
604                                           &cursor.data_buffer);
605                 if (bdata == NULL)
606                         goto fail2;
607         }
608         rec = hammer_alloc_record(cursor.node->cluster, &error,
609                                   &cursor.record_buffer);
610         if (rec == NULL)
611                 goto fail1;
612
613         /*
614          * Fill everything in and insert our B-Tree node.
615          *
616          * XXX assign rec_id here
617          */
618         hammer_modify_buffer(cursor.record_buffer);
619         *rec = record->rec;
620         if (bdata) {
621                 rec->base.data_crc = crc32(record->data,
622                                            record->rec.base.data_len);
623                 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
624                         /*
625                          * Data embedded in record
626                          */
627                         rec->base.data_offset = ((char *)bdata -
628                                                  (char *)&record->rec);
629                         KKASSERT(rec->base.data_offset >= 0 && 
630                                  rec->base.data_offset + rec->base.data_len <=
631                                   sizeof(*rec));
632                         rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
633                 } else {
634                         /*
635                          * Data separate from record
636                          */
637                         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
638                         hammer_modify_buffer(cursor.data_buffer);
639                         bcopy(record->data, bdata, rec->base.data_len);
640                 }
641         }
642         rec->base.rec_id = 0;   /* XXX */
643
644         elm.leaf.base = record->rec.base.base;
645         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
646         elm.leaf.data_offset = rec->base.data_offset;
647         elm.leaf.data_len = rec->base.data_len;
648         elm.leaf.data_crc = rec->base.data_crc;
649
650         error = hammer_btree_insert(&cursor, &elm);
651
652         /*
653          * Clean up on success, or fall through on error.
654          */
655         if (error == 0) {
656                 record->flags |= HAMMER_RECF_DELETED;
657                 record->flags &= ~HAMMER_RECF_SYNCING;
658                 hammer_update_syncid(cursor.record_buffer->cluster,
659                                      record->rec.base.base.create_tid);
660                 goto done;
661         }
662
663         hammer_free_record_ptr(cursor.record_buffer, rec);
664 fail1:
665         if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
666                 hammer_free_data_ptr(cursor.data_buffer, bdata,
667                                      record->rec.base.data_len);
668         }
669 fail2:
670         record->flags &= ~HAMMER_RECF_SYNCING;
671 done:
672         /*
673          * If ENOSPC in cluster fill in the spike structure and return
674          * ENOSPC.
675          */
676         if (error == ENOSPC)
677                 hammer_load_spike(&cursor, spike);
678         hammer_done_cursor(&cursor);
679         return(error);
680 }
681
682 /*
683  * Write out a record using the specified cursor.  The caller does not have
684  * to seek the cursor.  The flags are used to determine whether the data
685  * (if any) is embedded in the record or not.
686  *
687  * The target cursor will be modified by this call.  Note in particular
688  * that HAMMER_CURSOR_INSERT is set.
689  */
690 int
691 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
692                     void *data, int cursor_flags)
693 {
694         union hammer_btree_elm elm;
695         hammer_record_ondisk_t nrec;
696         void *bdata;
697         int error;
698
699         cursor->key_beg = orec->base.base;
700         cursor->flags |= HAMMER_CURSOR_INSERT;
701
702         /*
703          * Issue a lookup to position the cursor and locate the cluster.  The
704          * target key should not exist.
705          *
706          * If we run out of space trying to adjust the B-Tree for the
707          * insert, re-lookup without the insert flag so the cursor
708          * is properly positioned for the spike.
709          */
710         error = hammer_btree_lookup(cursor);
711         if (error == 0) {
712                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
713                         orec->base.base.key);
714                 Debugger("duplicate record2");
715                 error = EIO;
716         }
717         if (error != ENOENT)
718                 goto done;
719
720         /*
721          * Allocate record and data space now that we know which cluster
722          * the B-Tree node ended up in.
723          */
724         if (data == NULL ||
725             (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
726                 bdata = data;
727         } else {
728                 bdata = hammer_alloc_data(cursor->node->cluster,
729                                           orec->base.data_len, &error,
730                                           &cursor->data_buffer);
731                 if (bdata == NULL)
732                         goto done;
733         }
734         nrec = hammer_alloc_record(cursor->node->cluster, &error,
735                                   &cursor->record_buffer);
736         if (nrec == NULL)
737                 goto fail1;
738
739         /*
740          * Fill everything in and insert our B-Tree node.
741          *
742          * XXX assign rec_id here
743          */
744         hammer_modify_buffer(cursor->record_buffer);
745         *nrec = *orec;
746         nrec->base.data_offset = 0;
747         if (bdata) {
748                 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
749                 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
750                         /*
751                          * Data embedded in record
752                          */
753                         nrec->base.data_offset = ((char *)bdata - (char *)orec);
754                         KKASSERT(nrec->base.data_offset >= 0 && 
755                                  nrec->base.data_offset + nrec->base.data_len <
756                                   sizeof(*nrec));
757                         nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
758                 } else {
759                         /*
760                          * Data separate from record
761                          */
762                         nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
763                         hammer_modify_buffer(cursor->data_buffer);
764                         bcopy(data, bdata, nrec->base.data_len);
765                 }
766         }
767         nrec->base.rec_id = 0;  /* XXX */
768
769         elm.leaf.base = nrec->base.base;
770         elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
771         elm.leaf.data_offset = nrec->base.data_offset;
772         elm.leaf.data_len = nrec->base.data_len;
773         elm.leaf.data_crc = nrec->base.data_crc;
774
775         error = hammer_btree_insert(cursor, &elm);
776         if (error == 0) {
777                 hammer_update_syncid(cursor->record_buffer->cluster,
778                                      nrec->base.base.create_tid);
779                 goto done;
780         }
781
782         hammer_free_record_ptr(cursor->record_buffer, nrec);
783 fail1:
784         if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
785                 hammer_free_data_ptr(cursor->data_buffer, bdata,
786                                      orec->base.data_len);
787         }
788 done:
789         /* leave cursor intact */
790         return(error);
791 }
792
793 /*
794  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
795  * entry's key is used to deal with hash collisions in the upper 32 bits.
796  * A unique 64 bit key is generated in-memory and may be regenerated a
797  * second time when the directory record is flushed to the on-disk B-Tree.
798  *
799  * A referenced record is passed to this function.  This function
800  * eats the reference.  If an error occurs the record will be deleted.
801  */
802 static
803 int
804 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
805 {
806         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
807                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
808                         record->flags |= HAMMER_RECF_DELETED;
809                         hammer_rel_mem_record(record);
810                         return (EEXIST);
811                 }
812                 if (++trans->hmp->namekey_iterator == 0)
813                         ++trans->hmp->namekey_iterator;
814                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
815                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
816         }
817         record->flags |= HAMMER_RECF_ONRBTREE;
818         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
819         hammer_rel_mem_record(record);
820         return(0);
821 }
822
823 /************************************************************************
824  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
825  ************************************************************************
826  *
827  * These functions augment the B-Tree scanning functions in hammer_btree.c
828  * by merging in-memory records with on-disk records.
829  */
830
831 /*
832  * Locate a particular record either in-memory or on-disk.
833  *
834  * NOTE: This is basically a standalone routine, hammer_ip_next() may
835  * NOT be called to iterate results.
836  */
837 int
838 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
839 {
840         int error;
841
842         /*
843          * If the element is in-memory return it without searching the
844          * on-disk B-Tree
845          */
846         error = hammer_mem_lookup(cursor, ip);
847         if (error == 0) {
848                 cursor->record = &cursor->iprec->rec;
849                 return(error);
850         }
851         if (error != ENOENT)
852                 return(error);
853
854         /*
855          * If the inode has on-disk components search the on-disk B-Tree.
856          */
857         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
858                 return(error);
859         error = hammer_btree_lookup(cursor);
860         if (error == 0)
861                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
862         return(error);
863 }
864
865 /*
866  * Locate the first record within the cursor's key_beg/key_end range,
867  * restricted to a particular inode.  0 is returned on success, ENOENT
868  * if no records matched the requested range, or some other error.
869  *
870  * When 0 is returned hammer_ip_next() may be used to iterate additional
871  * records within the requested range.
872  */
873 int
874 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
875 {
876         int error;
877
878         /*
879          * Clean up fields and setup for merged scan
880          */
881         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
882         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
883         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
884         if (cursor->iprec) {
885                 hammer_rel_mem_record(cursor->iprec);
886                 cursor->iprec = NULL;
887         }
888
889         /*
890          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
891          * exact lookup so if we get ENOENT we have to call the iterate
892          * function to validate the first record after the begin key.
893          *
894          * The ATEDISK flag is used by hammer_btree_iterate to determine
895          * whether it must index forwards or not.  It is also used here
896          * to select the next record from in-memory or on-disk.
897          */
898         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
899                 error = hammer_btree_lookup(cursor);
900                 if (error == ENOENT) {
901                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
902                         error = hammer_btree_iterate(cursor);
903                 }
904                 if (error && error != ENOENT) 
905                         return(error);
906                 if (error == 0) {
907                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
908                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
909                 } else {
910                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
911                 }
912         }
913
914         /*
915          * Search the in-memory record list (Red-Black tree).  Unlike the
916          * B-Tree search, mem_first checks for records in the range.
917          */
918         error = hammer_mem_first(cursor, ip);
919         if (error && error != ENOENT)
920                 return(error);
921         if (error == 0) {
922                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
923                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
924         }
925
926         /*
927          * This will return the first matching record.
928          */
929         return(hammer_ip_next(cursor));
930 }
931
932 /*
933  * Retrieve the next record in a merged iteration within the bounds of the
934  * cursor.  This call may be made multiple times after the cursor has been
935  * initially searched with hammer_ip_first().
936  *
937  * 0 is returned on success, ENOENT if no further records match the
938  * requested range, or some other error code is returned.
939  */
940 int
941 hammer_ip_next(hammer_cursor_t cursor)
942 {
943         hammer_btree_elm_t elm;
944         hammer_record_t rec;
945         int error;
946         int r;
947
948         /*
949          * Load the current on-disk and in-memory record.  If we ate any
950          * records we have to get the next one. 
951          *
952          * If we deleted the last on-disk record we had scanned ATEDISK will
953          * be clear and DELBTREE will be set, forcing a call to iterate. The
954          * fact that ATEDISK is clear causes iterate to re-test the 'current'
955          * element.  If ATEDISK is set, iterate will skip the 'current'
956          * element.
957          *
958          * Get the next on-disk record
959          */
960         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
961                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
962                         error = hammer_btree_iterate(cursor);
963                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
964                         if (error == 0)
965                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
966                         else
967                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
968                                                  HAMMER_CURSOR_ATEDISK;
969                 }
970         }
971
972         /*
973          * Get the next in-memory record.  The record can be ripped out
974          * of the RB tree so we maintain a scan_info structure to track
975          * the next node.
976          *
977          * hammer_rec_scan_cmp:  Is the record still in our general range,
978          *                       (non-inclusive of snapshot exclusions)?
979          * hammer_rec_scan_callback: Is the record in our snapshot?
980          */
981         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
982                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
983                         if (cursor->iprec) {
984                                 hammer_rel_mem_record(cursor->iprec);
985                                 cursor->iprec = NULL;
986                         }
987                         rec = cursor->scan.node;        /* next node */
988                         while (rec) {
989                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
990                                         break;
991                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
992                                         break;
993                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
994                         }
995                         if (cursor->iprec) {
996                                 KKASSERT(cursor->iprec == rec);
997                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
998                                 cursor->scan.node =
999                                         hammer_rec_rb_tree_RB_NEXT(rec);
1000                         } else {
1001                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1002                         }
1003                 }
1004         }
1005
1006         /*
1007          * Extract either the disk or memory record depending on their
1008          * relative position.
1009          */
1010         error = 0;
1011         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1012         case 0:
1013                 /*
1014                  * Both entries valid
1015                  */
1016                 elm = &cursor->node->ondisk->elms[cursor->index];
1017                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1018                 if (r < 0) {
1019                         error = hammer_btree_extract(cursor,
1020                                                      HAMMER_CURSOR_GET_RECORD);
1021                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1022                         break;
1023                 }
1024                 /* fall through to the memory entry */
1025         case HAMMER_CURSOR_ATEDISK:
1026                 /*
1027                  * Only the memory entry is valid
1028                  */
1029                 cursor->record = &cursor->iprec->rec;
1030                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1031                 break;
1032         case HAMMER_CURSOR_ATEMEM:
1033                 /*
1034                  * Only the disk entry is valid
1035                  */
1036                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1037                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1038                 break;
1039         default:
1040                 /*
1041                  * Neither entry is valid
1042                  *
1043                  * XXX error not set properly
1044                  */
1045                 cursor->record = NULL;
1046                 error = ENOENT;
1047                 break;
1048         }
1049         return(error);
1050 }
1051
1052 /*
1053  * Resolve the cursor->data pointer for the current cursor position in
1054  * a merged iteration.
1055  */
1056 int
1057 hammer_ip_resolve_data(hammer_cursor_t cursor)
1058 {
1059         int error;
1060
1061         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1062                 cursor->data = cursor->iprec->data;
1063                 error = 0;
1064         } else {
1065                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1066         }
1067         return(error);
1068 }
1069
1070 /*
1071  * Delete all records within the specified range for inode ip.
1072  *
1073  * NOTE: An unaligned range will cause new records to be added to cover
1074  * the edge cases. (XXX not implemented yet).
1075  *
1076  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1077  *
1078  * NOTE: Record keys for regular file data have to be special-cased since
1079  * they indicate the end of the range (key = base + bytes).
1080  *
1081  * NOTE: The spike structure must be filled in if we return ENOSPC.
1082  */
1083 int
1084 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1085                        int64_t ran_beg, int64_t ran_end,
1086                        struct hammer_cursor **spike)
1087 {
1088         struct hammer_cursor cursor;
1089         hammer_record_ondisk_t rec;
1090         hammer_base_elm_t base;
1091         int error;
1092         int64_t off;
1093
1094         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1095
1096         cursor.key_beg.obj_id = ip->obj_id;
1097         cursor.key_beg.create_tid = 0;
1098         cursor.key_beg.delete_tid = 0;
1099         cursor.key_beg.obj_type = 0;
1100         cursor.asof = ip->obj_asof;
1101         cursor.flags |= HAMMER_CURSOR_ASOF;
1102
1103         cursor.key_end = cursor.key_beg;
1104         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1105                 cursor.key_beg.key = ran_beg;
1106                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1107                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1108                 cursor.key_end.key = ran_end;
1109         } else {
1110                 /*
1111                  * The key in the B-Tree is (base+bytes), so the first possible
1112                  * matching key is ran_beg + 1.
1113                  */
1114                 int64_t tmp64;
1115
1116                 cursor.key_beg.key = ran_beg + 1;
1117                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1118                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1119
1120                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1121                 if (tmp64 < ran_end)
1122                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1123                 else
1124                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1125         }
1126         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1127
1128         error = hammer_ip_first(&cursor, ip);
1129
1130         /*
1131          * Iterate through matching records and mark them as deleted.
1132          */
1133         while (error == 0) {
1134                 rec = cursor.record;
1135                 base = &rec->base.base;
1136
1137                 KKASSERT(base->delete_tid == 0);
1138
1139                 /*
1140                  * There may be overlap cases for regular file data.  Also
1141                  * remember the key for a regular file record is the offset
1142                  * of the last byte of the record (base + len - 1), NOT the
1143                  * base offset.
1144                  */
1145 #if 0
1146                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1147 #endif
1148                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1149 #if 0
1150                         kprintf("delete_range loop key %016llx\n",
1151                                 base->key - rec->base.data_len);
1152 #endif
1153                         off = base->key - rec->base.data_len;
1154                         /*
1155                          * Check the left edge case.  We currently do not
1156                          * split existing records.
1157                          */
1158                         if (off < ran_beg) {
1159                                 panic("hammer left edge case %016llx %d\n",
1160                                         base->key, rec->base.data_len);
1161                         }
1162
1163                         /*
1164                          * Check the right edge case.  Note that the
1165                          * record can be completely out of bounds, which
1166                          * terminates the search.
1167                          *
1168                          * base->key is exclusive of the right edge while
1169                          * ran_end is inclusive of the right edge.  The
1170                          * (key - data_len) left boundary is inclusive.
1171                          *
1172                          * XXX theory-check this test at some point, are
1173                          * we missing a + 1 somewhere?  Note that ran_end
1174                          * could overflow.
1175                          */
1176                         if (base->key - 1 > ran_end) {
1177                                 if (base->key - rec->base.data_len > ran_end)
1178                                         break;
1179                                 panic("hammer right edge case\n");
1180                         }
1181                 }
1182
1183                 /*
1184                  * Mark the record and B-Tree entry as deleted.  This will
1185                  * also physically delete the B-Tree entry, record, and
1186                  * data if the retention policy dictates.  The function
1187                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1188                  * uses to perform a fixup.
1189                  */
1190                 error = hammer_ip_delete_record(&cursor, trans->tid);
1191                 if (error)
1192                         break;
1193                 error = hammer_ip_next(&cursor);
1194         }
1195         hammer_done_cursor(&cursor);
1196         if (error == ENOENT)
1197                 error = 0;
1198         return(error);
1199 }
1200
1201 /*
1202  * Delete all records associated with an inode except the inode record
1203  * itself.
1204  */
1205 int
1206 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1207 {
1208         struct hammer_cursor cursor;
1209         hammer_record_ondisk_t rec;
1210         hammer_base_elm_t base;
1211         int error;
1212
1213         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1214
1215         cursor.key_beg.obj_id = ip->obj_id;
1216         cursor.key_beg.create_tid = 0;
1217         cursor.key_beg.delete_tid = 0;
1218         cursor.key_beg.obj_type = 0;
1219         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1220         cursor.key_beg.key = HAMMER_MIN_KEY;
1221
1222         cursor.key_end = cursor.key_beg;
1223         cursor.key_end.rec_type = 0xFFFF;
1224         cursor.key_end.key = HAMMER_MAX_KEY;
1225
1226         cursor.asof = ip->obj_asof;
1227         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1228
1229         error = hammer_ip_first(&cursor, ip);
1230
1231         /*
1232          * Iterate through matching records and mark them as deleted.
1233          */
1234         while (error == 0) {
1235                 rec = cursor.record;
1236                 base = &rec->base.base;
1237
1238                 KKASSERT(base->delete_tid == 0);
1239
1240                 /*
1241                  * Mark the record and B-Tree entry as deleted.  This will
1242                  * also physically delete the B-Tree entry, record, and
1243                  * data if the retention policy dictates.  The function
1244                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1245                  * uses to perform a fixup.
1246                  */
1247                 error = hammer_ip_delete_record(&cursor, trans->tid);
1248                 if (error)
1249                         break;
1250                 error = hammer_ip_next(&cursor);
1251         }
1252         hammer_done_cursor(&cursor);
1253         if (error == ENOENT)
1254                 error = 0;
1255         return(error);
1256 }
1257
1258 /*
1259  * Delete the record at the current cursor
1260  */
1261 int
1262 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1263 {
1264         hammer_btree_elm_t elm;
1265         hammer_mount_t hmp;
1266         int error;
1267
1268         /*
1269          * In-memory (unsynchronized) records can simply be freed.
1270          */
1271         if (cursor->record == &cursor->iprec->rec) {
1272                 cursor->iprec->flags |= HAMMER_RECF_DELETED;
1273                 return(0);
1274         }
1275
1276         /*
1277          * On-disk records are marked as deleted by updating their delete_tid.
1278          */
1279         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1280         elm = NULL;
1281         hmp = cursor->node->cluster->volume->hmp;
1282
1283         if (error == 0) {
1284                 hammer_modify_buffer(cursor->record_buffer);
1285                 cursor->record->base.base.delete_tid = tid;
1286
1287                 hammer_modify_node(cursor->node);
1288                 elm = &cursor->node->ondisk->elms[cursor->index];
1289                 elm->leaf.base.delete_tid = tid;
1290                 hammer_update_syncid(cursor->record_buffer->cluster, tid);
1291         }
1292
1293         /*
1294          * If we were mounted with the nohistory option, we physically
1295          * delete the record.
1296          */
1297         if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1298                 int32_t rec_offset;
1299                 int32_t data_offset;
1300                 int32_t data_len;
1301                 hammer_cluster_t cluster;
1302
1303                 rec_offset = elm->leaf.rec_offset;
1304                 data_offset = elm->leaf.data_offset;
1305                 data_len = elm->leaf.data_len;
1306 #if 0
1307                 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1308                         rec_offset, data_offset, data_len);
1309 #endif
1310                 cluster = cursor->node->cluster;
1311                 hammer_ref_cluster(cluster);
1312
1313                 error = hammer_btree_delete(cursor);
1314                 if (error == 0) {
1315                         /*
1316                          * This forces a fixup for the iteration because
1317                          * the cursor is now either sitting at the 'next'
1318                          * element or sitting at the end of a leaf.
1319                          */
1320                         if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1321                                 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1322                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1323                         }
1324                         hammer_free_record(cluster, rec_offset);
1325                         if (data_offset && (data_offset - rec_offset < 0 ||
1326                             data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1327                                 hammer_free_data(cluster, data_offset,data_len);
1328                         }
1329                 }
1330                 hammer_rel_cluster(cluster, 0);
1331                 if (error) {
1332                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1333                         error = 0;
1334                 }
1335         }
1336         return(error);
1337 }
1338
1339 /*
1340  * Determine whether a directory is empty or not.  Returns 0 if the directory
1341  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1342  */
1343 int
1344 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1345 {
1346         struct hammer_cursor cursor;
1347         int error;
1348
1349         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1350
1351         cursor.key_beg.obj_id = ip->obj_id;
1352         cursor.key_beg.create_tid = 0;
1353         cursor.key_beg.delete_tid = 0;
1354         cursor.key_beg.obj_type = 0;
1355         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1356         cursor.key_beg.key = HAMMER_MIN_KEY;
1357
1358         cursor.key_end = cursor.key_beg;
1359         cursor.key_end.rec_type = 0xFFFF;
1360         cursor.key_end.key = HAMMER_MAX_KEY;
1361
1362         cursor.asof = ip->obj_asof;
1363         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1364
1365         error = hammer_ip_first(&cursor, ip);
1366         if (error == ENOENT)
1367                 error = 0;
1368         else if (error == 0)
1369                 error = ENOTEMPTY;
1370         hammer_done_cursor(&cursor);
1371         return(error);
1372 }
1373