HAMMER 16B/many: Fix data overwrite case.
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.17 2008/01/09 04:05:37 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61                 return(-1);
62         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63                 return(1);
64         return(0);
65 }
66
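/*
 * Compare a lookup key (info) against an in-memory record.  A return value
 * of 0 indicates a match; negative values sort the key before the record
 * and positive values sort it after.  The magnitude encodes which field
 * differed (3: rec_type, 2: key, 1: as-of create_tid visibility).
 */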
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         if (info->rec_type < rec->rec.base.base.rec_type)
71                 return(-3);
72         if (info->rec_type > rec->rec.base.base.rec_type)
73                 return(3);
74
75         if (info->key < rec->rec.base.base.key)
76                 return(-2);
77         if (info->key > rec->rec.base.base.key)
78                 return(2);
79
80         /*
81          * This test has a number of special cases.  create_tid in key1 is
82          * the as-of transaction id, and delete_tid in key1 is NOT USED.
83          *
84          * A key1->create_tid of 0 matches any record regardless of when
85          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
86          * used to search for the most current state of the object.
87          *
88          * key2->create_tid is a HAMMER record and will never be
89          * 0.   key2->delete_tid is the deletion transaction id or 0 if
90          * the record has not yet been deleted.
91          */
92         if (info->create_tid) {
93                 if (info->create_tid < rec->rec.base.base.create_tid)
94                         return(-1);
95                 if (rec->rec.base.base.delete_tid &&
96                     info->create_tid >= rec->rec.base.base.delete_tid) {
97                         return(1);
98                 }
99         }
100         return(0);
101 }
102
103 /*
104  * RB_SCAN comparison code for hammer_mem_first().  The argument order
105  * is reversed so the comparison result has to be negated.  key_beg and
106  * key_end are both range-inclusive.
107  *
108  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
109  * These do not stop the scan.
110  *
111  * Localized deletions are not cached in-memory.
112  */
113 static
114 int
115 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
116 {
117         hammer_cursor_t cursor = data;
118         int r;
119
120         r = hammer_rec_compare(&cursor->key_beg, rec);
121         if (r > 1)
122                 return(-1);
123         if (r == 0)
124                 return(0);
125         r = hammer_rec_compare(&cursor->key_end, rec);
126         if (r < -1)
127                 return(1);
128         return(0);
129 }
130
131 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
132 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
133                     hammer_rec_compare, hammer_base_elm_t);
134
135 /*
136  * Allocate a record for the caller to finish filling in.  The record is
137  * returned referenced.
138  */
139 hammer_record_t
140 hammer_alloc_mem_record(hammer_inode_t ip)
141 {
142         hammer_record_t record;
143
144         ++hammer_count_records;
145         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
146         record->ip = ip;
147         hammer_ref(&record->lock);
148         return (record);
149 }
150
151 /*
152  * Release a memory record.  Records marked for deletion are immediately
153  * removed from the RB-Tree but otherwise left intact until the last ref
154  * goes away.
155  */
156 void
157 hammer_rel_mem_record(struct hammer_record *record)
158 {
159         hammer_unref(&record->lock);
160         if (record->flags & HAMMER_RECF_DELETED) {
161                 if (record->flags & HAMMER_RECF_ONRBTREE) {
162                         RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
163                                   record);
164                         record->flags &= ~HAMMER_RECF_ONRBTREE;
165                 }
166                 if (record->lock.refs == 0) {
167                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
168                                 --hammer_count_record_datas;
169                                 kfree(record->data, M_HAMMER);
170                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
171                         }
172                         record->data = NULL;
173                         --hammer_count_records;
174                         kfree(record, M_HAMMER);
175                 }
176         }
177 }
178
179 /*
180  * Lookup an in-memory record given the key specified in the cursor.  Works
181  * just like hammer_btree_lookup() but operates on an inode's in-memory
182  * record list.
183  *
184  * The lookup must fail if the record is marked for deferred deletion.
185  */
186 static
187 int
188 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
189 {
190         int error;
191
192         if (cursor->iprec) {
193                 hammer_rel_mem_record(cursor->iprec);
194                 cursor->iprec = NULL;
195         }
196         if (cursor->ip) {
197                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
198                                                   &cursor->ip->rec_tree);
199         }
200         cursor->ip = ip;
201         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
202         cursor->scan.node = NULL;
203         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
204                                 &ip->rec_tree, &cursor->key_beg);
205         if (cursor->iprec == NULL) {
206                 error = ENOENT;
207         } else {
208                 hammer_ref(&cursor->iprec->lock);
209                 error = 0;
210         }
211         return(error);
212 }
213
214 /*
215  * hammer_mem_first() - locate the first in-memory record matching the
216  * cursor.
217  *
218  * The RB_SCAN function we use is designed as a callback.  We terminate it
219  * (return -1) as soon as we get a match.
220  */
221 static
222 int
223 hammer_rec_scan_callback(hammer_record_t rec, void *data)
224 {
225         hammer_cursor_t cursor = data;
226
227         /*
228          * Skip if not visible due to our as-of TID
229          */
230         if (cursor->key_beg.create_tid) {
231                 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
232                         return(0);
233                 if (rec->rec.base.base.delete_tid &&
234                     cursor->key_beg.create_tid >=
235                      rec->rec.base.base.delete_tid) {
236                         return(0);
237                 }
238         }
239
240         /*
241          * Return the first matching record and stop the scan
242          */
243         if (cursor->iprec == NULL) {
244                 cursor->iprec = rec;
245                 hammer_ref(&rec->lock);
246                 return(-1);
247         }
248         return(0);
249 }
250
251 static
252 int
253 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
254 {
255         if (cursor->iprec) {
256                 hammer_rel_mem_record(cursor->iprec);
257                 cursor->iprec = NULL;
258         }
259         if (cursor->ip) {
260                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
261                                                   &cursor->ip->rec_tree);
262         }
263         cursor->ip = ip;
264         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
265
266         cursor->scan.node = NULL;
267         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
268                                    hammer_rec_scan_callback, cursor);
269
270         /*
271          * Adjust scan.node and keep it linked into the RB-tree so we can
272          * hold the cursor through third party modifications of the RB-tree.
273          */
274         if (cursor->iprec) {
275                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
276                 return(0);
277         }
278         return(ENOENT);
279 }
280
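/*
 * Clean up the in-memory scan state held by a cursor: unlink the RB-tree
 * scan_info and release any referenced in-memory record.
 */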
281 void
282 hammer_mem_done(hammer_cursor_t cursor)
283 {
284         if (cursor->ip) {
285                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
286                                                   &cursor->ip->rec_tree);
287                 cursor->ip = NULL;
288         }
289         if (cursor->iprec) {
290                 hammer_rel_mem_record(cursor->iprec);
291                 cursor->iprec = NULL;
292         }
293 }
294
295 /************************************************************************
296  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
297  ************************************************************************
298  *
299  * These functions manipulate in-memory records.  Such records typically
300  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
301  */
302
303 /*
304  * Add a directory entry (dip,ncp) which references inode (ip).
305  *
306  * Note that the low 32 bits of the namekey are set temporarily to create
307  * a unique in-memory record, and may be modified a second time when the
308  * record is synchronized to disk.  In particular, the low 32 bits cannot be
309  * all 0's when synching to disk, which is not handled here.
310  */
311 int
312 hammer_ip_add_directory(struct hammer_transaction *trans,
313                      struct hammer_inode *dip, struct namecache *ncp,
314                      struct hammer_inode *ip)
315 {
316         hammer_record_t record;
317         int error;
318         int bytes;
319
320         record = hammer_alloc_mem_record(dip);
321
322         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
323         if (++trans->hmp->namekey_iterator == 0)
324                 ++trans->hmp->namekey_iterator;
325
326         record->rec.entry.base.base.obj_id = dip->obj_id;
327         record->rec.entry.base.base.key =
328                 hammer_directory_namekey(ncp->nc_name, bytes);
329         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
330         record->rec.entry.base.base.create_tid = trans->tid;
331         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
332         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
333         record->rec.entry.obj_id = ip->obj_id;
334         if (bytes <= sizeof(record->rec.entry.den_name)) {
335                 record->data = (void *)record->rec.entry.den_name;
336                 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
337         } else {
338                 ++hammer_count_record_datas;
339                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
340                 record->flags |= HAMMER_RECF_ALLOCDATA;
341         }
342         bcopy(ncp->nc_name, record->data, bytes);
343         record->rec.entry.base.data_len = bytes;
344         ++ip->ino_rec.ino_nlinks;
345         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
346         error = hammer_mem_add(trans, record);
347         return(error);
348 }
349
350 /*
351  * Delete the directory entry and update the inode link count.  The
352  * cursor must be positioned at the directory entry record being deleted.
353  *
354  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
355  */
356 int
357 hammer_ip_del_directory(struct hammer_transaction *trans,
358                      hammer_cursor_t cursor, struct hammer_inode *dip,
359                      struct hammer_inode *ip)
360 {
361         int error;
362
363         error = hammer_ip_delete_record(cursor, trans->tid);
364
365         /*
366          * One less link.  The file may still be open in the OS even after
367          * all links have gone away so we only try to sync if the OS has
368          * no references and nlinks falls to 0.
369          */
370         if (error == 0) {
371                 --ip->ino_rec.ino_nlinks;
372                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
373                 if (ip->ino_rec.ino_nlinks == 0 &&
374                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
375                         hammer_sync_inode(ip, MNT_NOWAIT, 1);
376                 }
377
378         }
379         return(error);
380 }
381
382 /*
383  * Add a record to an inode.
384  *
385  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
386  * initialize the following additional fields:
387  *
388  * record->rec.entry.base.base.key
389  * record->rec.entry.base.base.rec_type
390  * record->rec.entry.base.base.data_len
391  * record->data         (a copy will be kmalloc'd if not embedded)
392  */
393 int
394 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
395 {
396         hammer_inode_t ip = record->ip;
397         int error;
398         int bytes;
399         void *data;
400
401         record->rec.base.base.obj_id = ip->obj_id;
402         record->rec.base.base.create_tid = trans->tid;
403         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
404         bytes = record->rec.base.data_len;
405
406         if (record->data) {
407                 if ((char *)record->data < (char *)&record->rec ||
408                     (char *)record->data >= (char *)(&record->rec + 1)) {
409                         ++hammer_count_record_datas;
410                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
411                         record->flags |= HAMMER_RECF_ALLOCDATA;
412                         bcopy(record->data, data, bytes);
413                         record->data = data;
414                 } else {
415                         record->flags |= HAMMER_RECF_EMBEDDED_DATA;
416                 }
417         }
418         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
419         error = hammer_mem_add(trans, record);
420         return(error);
421 }
422
423 /*
424  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
425  * is called via the strategy code from a cached data source.  This code
426  * is responsible for actually writing a data record out to the disk.
427  */
428 int
429 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
430                        int64_t offset, void *data, int bytes,
431                        struct hammer_cursor **spike)
432 {
433         struct hammer_cursor cursor;
434         hammer_record_ondisk_t rec;
435         union hammer_btree_elm elm;
436         void *bdata;
437         int error;
438
439         error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
440         if (error)
441                 return(error);
442         cursor.key_beg.obj_id = ip->obj_id;
443         cursor.key_beg.key = offset + bytes;
444         cursor.key_beg.create_tid = trans->tid;
445         cursor.key_beg.delete_tid = 0;
446         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
447         cursor.flags = HAMMER_CURSOR_INSERT;
448
449         /*
450          * Issue a lookup to position the cursor and locate the cluster
451          */
452         error = hammer_btree_lookup(&cursor);
453         if (error == 0) {
454                 kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
455                         offset, bytes);
456                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
457                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
458                 error = EIO;
459         }
460         if (error != ENOENT)
461                 goto done;
462
463         /*
464          * Allocate record and data space now that we know which cluster
465          * the B-Tree node ended up in.
466          */
467         bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
468                                   &cursor.data_buffer);
469         if (bdata == NULL)
470                 goto done;
471         rec = hammer_alloc_record(cursor.node->cluster, &error,
472                                   &cursor.record_buffer);
473         if (rec == NULL)
474                 goto fail1;
475
476         /*
477          * Fill everything in and insert our B-Tree node.
478          */
479         hammer_modify_buffer(cursor.record_buffer);
480         rec->base.base = cursor.key_beg;
481         rec->base.data_crc = crc32(data, bytes);
482         rec->base.rec_id = 0;   /* XXX */
483         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
484         rec->base.data_len = bytes;
485         hammer_modify_buffer_done(cursor.record_buffer);
486
487         hammer_modify_buffer(cursor.data_buffer);
488         bcopy(data, bdata, bytes);
489         hammer_modify_buffer_done(cursor.data_buffer);
490
491         elm.leaf.base = cursor.key_beg;
492         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
493         elm.leaf.data_offset = rec->base.data_offset;
494         elm.leaf.data_len = bytes;
495         elm.leaf.data_crc = rec->base.data_crc;
496
497         /*
498          * Data records can wind up on-disk before the inode itself is
499          * on-disk.  One must assume data records may be on-disk if either
500  * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set.
501          */
502         ip->flags |= HAMMER_INODE_DONDISK;
503
504         error = hammer_btree_insert(&cursor, &elm);
505         if (error == 0) {
506                 hammer_update_syncid(cursor.record_buffer->cluster, trans->tid);
507                 goto done;
508         }
509
510         hammer_free_record_ptr(cursor.record_buffer, rec);
511 fail1:
512         hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
513 done:
514         /*
515          * If ENOSPC in cluster fill in the spike structure and return
516          * ENOSPC.
517          */
518         if (error == ENOSPC)
519                 hammer_load_spike(&cursor, spike);
520         hammer_done_cursor(&cursor);
521         return(error);
522 }
523
524 /*
525  * Sync an in-memory record to the disk.  This is typically called via fsync
526  * from a cached record source.  This code is responsible for actually
527  * writing a record out to the disk.
528  */
529 int
530 hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
531 {
532         struct hammer_cursor cursor;
533         hammer_record_ondisk_t rec;
534         hammer_mount_t hmp;
535         union hammer_btree_elm elm;
536         void *bdata;
537         int error;
538
539         error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
540                                        record->ip->hmp);
541         if (error)
542                 return(error);
543         cursor.key_beg = record->rec.base.base;
544         cursor.flags = HAMMER_CURSOR_INSERT;
545
546         /*
547          * Issue a lookup to position the cursor and locate the cluster.  The
548          * target key should not exist.  If we are creating a directory entry
549          * we may have to iterate the low 32 bits of the key to find an unused
550          * key.
551          *
552          * If we run out of space trying to adjust the B-Tree for the
553          * insert, re-lookup without the insert flag so the cursor
554          * is properly positioned for the spike.
555          */
556 again:
557         error = hammer_btree_lookup(&cursor);
558         if (error == 0) {
559                 if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
560                         hmp = cursor.node->cluster->volume->hmp;
561                         if (++hmp->namekey_iterator == 0)
562                                 ++hmp->namekey_iterator;
563                         record->rec.base.base.key &= ~(0xFFFFFFFFLL);
564                         record->rec.base.base.key |= hmp->namekey_iterator;
565                         goto again;
566                 }
567                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
568                         record->rec.base.base.key);
569                 Debugger("duplicate record1");
570                 error = EIO;
571         }
572         if (error != ENOENT)
573                 goto done;
574
575         /*
576          * Mark the record as undergoing synchronization.  Our cursor is
577          * holding a locked B-Tree node for the insertion which interlocks
578          * anyone trying to access this record.
579          *
580          * XXX There is still a race present related to iterations.  An
581          * iteration may process the record, a sync may occur, and then
582          * later process the B-Tree element for the same record.
583          *
584          * We do not try to synchronize a deleted record.
585          */
586         if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING)) {
587                 error = 0;
588                 goto done;
589         }
590         record->flags |= HAMMER_RECF_SYNCING;
591
592         /*
593          * Allocate record and data space now that we know which cluster
594          * the B-Tree node ended up in.
595          */
596         if (record->data == NULL ||
597             (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
598                 bdata = record->data;
599         } else {
600                 bdata = hammer_alloc_data(cursor.node->cluster,
601                                           record->rec.base.data_len, &error,
602                                           &cursor.data_buffer);
603                 if (bdata == NULL)
604                         goto fail2;
605         }
606         rec = hammer_alloc_record(cursor.node->cluster, &error,
607                                   &cursor.record_buffer);
608         if (rec == NULL)
609                 goto fail1;
610
611         /*
612          * Fill everything in and insert our B-Tree node.
613          *
614          * XXX assign rec_id here
615          */
616         hammer_modify_buffer(cursor.record_buffer);
617         *rec = record->rec;
618         if (bdata) {
619                 rec->base.data_crc = crc32(record->data,
620                                            record->rec.base.data_len);
621                 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
622                         /*
623                          * Data embedded in record
624                          */
625                         rec->base.data_offset = ((char *)bdata -
626                                                  (char *)&record->rec);
627                         KKASSERT(rec->base.data_offset >= 0 && 
628                                  rec->base.data_offset + rec->base.data_len <=
629                                   sizeof(*rec));
630                         rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
631                 } else {
632                         /*
633                          * Data separate from record
634                          */
635                         rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
636                         hammer_modify_buffer(cursor.data_buffer);
637                         bcopy(record->data, bdata, rec->base.data_len);
638                         hammer_modify_buffer_done(cursor.data_buffer);
639                 }
640         }
641         rec->base.rec_id = 0;   /* XXX */
642         hammer_modify_buffer_done(cursor.record_buffer);
643
644         elm.leaf.base = cursor.key_beg;
645         elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
646         elm.leaf.data_offset = rec->base.data_offset;
647         elm.leaf.data_len = rec->base.data_len;
648         elm.leaf.data_crc = rec->base.data_crc;
649
650         error = hammer_btree_insert(&cursor, &elm);
651
652         /*
653          * Clean up on success, or fall through on error.
654          */
655         if (error == 0) {
656                 record->flags |= HAMMER_RECF_DELETED;
657                 record->flags &= ~HAMMER_RECF_SYNCING;
658                 hammer_update_syncid(cursor.record_buffer->cluster,
659                                      record->rec.base.base.create_tid);
660                 goto done;
661         }
662
663         hammer_free_record_ptr(cursor.record_buffer, rec);
664 fail1:
665         if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
666                 hammer_free_data_ptr(cursor.data_buffer, bdata,
667                                      record->rec.base.data_len);
668         }
669 fail2:
670         record->flags &= ~HAMMER_RECF_SYNCING;
671 done:
672         /*
673          * If ENOSPC in cluster fill in the spike structure and return
674          * ENOSPC.
675          */
676         if (error == ENOSPC)
677                 hammer_load_spike(&cursor, spike);
678         hammer_done_cursor(&cursor);
679         return(error);
680 }
681
682 /*
683  * Write out a record using the specified cursor.  The caller does not have
684  * to seek the cursor.  The flags are used to determine whether the data
685  * (if any) is embedded in the record or not.
686  *
687  * The target cursor will be modified by this call.  Note in particular
688  * that HAMMER_CURSOR_INSERT is set.
689  */
690 int
691 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
692                     void *data, int cursor_flags)
693 {
694         union hammer_btree_elm elm;
695         hammer_record_ondisk_t nrec;
696         void *bdata;
697         int error;
698
699         cursor->key_beg = orec->base.base;
700         cursor->flags |= HAMMER_CURSOR_INSERT;
701
702         /*
703          * Issue a lookup to position the cursor and locate the cluster.  The
704          * target key should not exist.
705          *
706          * If we run out of space trying to adjust the B-Tree for the
707          * insert, re-lookup without the insert flag so the cursor
708          * is properly positioned for the spike.
709          */
710         error = hammer_btree_lookup(cursor);
711         if (error == 0) {
712                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
713                         orec->base.base.key);
714                 Debugger("duplicate record2");
715                 error = EIO;
716         }
717         if (error != ENOENT)
718                 goto done;
719
720         /*
721          * Allocate record and data space now that we know which cluster
722          * the B-Tree node ended up in.
723          */
724         if (data == NULL ||
725             (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
726                 bdata = data;
727         } else {
728                 bdata = hammer_alloc_data(cursor->node->cluster,
729                                           orec->base.data_len, &error,
730                                           &cursor->data_buffer);
731                 if (bdata == NULL)
732                         goto done;
733         }
734         nrec = hammer_alloc_record(cursor->node->cluster, &error,
735                                   &cursor->record_buffer);
736         if (nrec == NULL)
737                 goto fail1;
738
739         /*
740          * Fill everything in and insert our B-Tree node.
741          *
742          * XXX assign rec_id here
743          */
744         hammer_modify_buffer(cursor->record_buffer);
745         *nrec = *orec;
746         nrec->base.data_offset = 0;
747         if (bdata) {
748                 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
749                 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
750                         /*
751                          * Data embedded in record
752                          */
753                         nrec->base.data_offset = ((char *)bdata - (char *)orec);
754                         KKASSERT(nrec->base.data_offset >= 0 && 
755                                  nrec->base.data_offset + nrec->base.data_len <
756                                   sizeof(*nrec));
757                         nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
758                 } else {
759                         /*
760                          * Data separate from record
761                          */
762                         nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
763                         hammer_modify_buffer(cursor->data_buffer);
764                         bcopy(data, bdata, nrec->base.data_len);
765                         hammer_modify_buffer_done(cursor->data_buffer);
766                 }
767         }
768         nrec->base.rec_id = 0;  /* XXX */
769         hammer_modify_buffer_done(cursor->record_buffer);
770
771         elm.leaf.base = nrec->base.base;
772         elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
773         elm.leaf.data_offset = nrec->base.data_offset;
774         elm.leaf.data_len = nrec->base.data_len;
775         elm.leaf.data_crc = nrec->base.data_crc;
776
777         error = hammer_btree_insert(cursor, &elm);
778         if (error == 0) {
779                 hammer_update_syncid(cursor->record_buffer->cluster,
780                                      nrec->base.base.create_tid);
781                 goto done;
782         }
783
784         hammer_free_record_ptr(cursor->record_buffer, nrec);
785 fail1:
786         if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
787                 hammer_free_data_ptr(cursor->data_buffer, bdata,
788                                      orec->base.data_len);
789         }
790 done:
791         /* leave cursor intact */
792         return(error);
793 }
794
795 /*
796  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
797  * entry's key are used to deal with hash collisions in the upper 32 bits.
798  * A unique 64 bit key is generated in-memory and may be regenerated a
799  * second time when the directory record is flushed to the on-disk B-Tree.
800  *
801  * A referenced record is passed to this function.  This function
802  * eats the reference.  If an error occurs the record will be deleted.
803  */
804 static
805 int
806 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
807 {
808         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
809                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
810                         record->flags |= HAMMER_RECF_DELETED;
811                         hammer_rel_mem_record(record);
812                         return (EEXIST);
813                 }
814                 if (++trans->hmp->namekey_iterator == 0)
815                         ++trans->hmp->namekey_iterator;
816                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
817                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
818         }
819         record->flags |= HAMMER_RECF_ONRBTREE;
820         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
821         hammer_rel_mem_record(record);
822         return(0);
823 }
824
825 /************************************************************************
826  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
827  ************************************************************************
828  *
829  * These functions augment the B-Tree scanning functions in hammer_btree.c
830  * by merging in-memory records with on-disk records.
831  */
832
833 /*
834  * Locate a particular record either in-memory or on-disk.
835  *
836  * NOTE: This is basically a standalone routine; hammer_ip_next() may
837  * NOT be called to iterate results.
838  */
839 int
840 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
841 {
842         int error;
843
844         /*
845          * If the element is in-memory return it without searching the
846          * on-disk B-Tree
847          */
848         error = hammer_mem_lookup(cursor, ip);
849         if (error == 0) {
850                 cursor->record = &cursor->iprec->rec;
851                 return(error);
852         }
853         if (error != ENOENT)
854                 return(error);
855
856         /*
857          * If the inode has on-disk components search the on-disk B-Tree.
858          */
859         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
860                 return(error);
861         error = hammer_btree_lookup(cursor);
862         if (error == 0)
863                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
864         return(error);
865 }
866
867 /*
868  * Locate the first record within the cursor's key_beg/key_end range,
869  * restricted to a particular inode.  0 is returned on success, ENOENT
870  * if no records matched the requested range, or some other error.
871  *
872  * When 0 is returned hammer_ip_next() may be used to iterate additional
873  * records within the requested range.
874  */
875 int
876 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
877 {
878         int error;
879
880         /*
881          * Clean up fields and setup for merged scan
882          */
883         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
884         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
885         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
886         if (cursor->iprec) {
887                 hammer_rel_mem_record(cursor->iprec);
888                 cursor->iprec = NULL;
889         }
890
891         /*
892          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
893          * exact lookup so if we get ENOENT we have to call the iterate
894          * function to validate the first record after the begin key.
895          *
896          * The ATEDISK flag is used by hammer_btree_iterate to determine
897          * whether it must index forwards or not.  It is also used here
898          * to select the next record from in-memory or on-disk.
899          */
900         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
901                 error = hammer_btree_lookup(cursor);
902                 if (error == ENOENT) {
903                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
904                         error = hammer_btree_iterate(cursor);
905                 }
906                 if (error && error != ENOENT) 
907                         return(error);
908                 if (error == 0) {
909                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
910                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
911                 } else {
912                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
913                 }
914         }
915
916         /*
917          * Search the in-memory record list (Red-Black tree).  Unlike the
918          * B-Tree search, mem_first checks for records in the range.
919          */
920         error = hammer_mem_first(cursor, ip);
921         if (error && error != ENOENT)
922                 return(error);
923         if (error == 0) {
924                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
925                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
926         }
927
928         /*
929          * This will return the first matching record.
930          */
931         return(hammer_ip_next(cursor));
932 }
933
934 /*
935  * Retrieve the next record in a merged iteration within the bounds of the
936  * cursor.  This call may be made multiple times after the cursor has been
937  * initially searched with hammer_ip_first().
938  *
939  * 0 is returned on success, ENOENT if no further records match the
940  * requested range, or some other error.
941  */
942 int
943 hammer_ip_next(hammer_cursor_t cursor)
944 {
945         hammer_btree_elm_t elm;
946         hammer_record_t rec;
947         int error;
948         int r;
949
950         /*
951          * Load the current on-disk and in-memory record.  If we ate any
952          * records we have to get the next one. 
953          *
954          * If we deleted the last on-disk record we had scanned ATEDISK will
955          * be clear and DELBTREE will be set, forcing a call to iterate. The
956          * fact that ATEDISK is clear causes iterate to re-test the 'current'
957          * element.  If ATEDISK is set, iterate will skip the 'current'
958          * element.
959          *
960          * Get the next on-disk record
961          */
962         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
963                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
964                         error = hammer_btree_iterate(cursor);
965                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
966                         if (error == 0)
967                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
968                         else
969                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
970                                                  HAMMER_CURSOR_ATEDISK;
971                 }
972         }
973
974         /*
975          * Get the next in-memory record.  The record can be ripped out
976          * of the RB tree so we maintain a scan_info structure to track
977          * the next node.
978          *
979          * hammer_rec_scan_cmp:  Is the record still in our general range,
980          *                       (non-inclusive of snapshot exclusions)?
981          * hammer_rec_scan_callback: Is the record in our snapshot?
982          */
983         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
984                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
985                         if (cursor->iprec) {
986                                 hammer_rel_mem_record(cursor->iprec);
987                                 cursor->iprec = NULL;
988                         }
989                         rec = cursor->scan.node;        /* next node */
990                         while (rec) {
991                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
992                                         break;
993                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
994                                         break;
995                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
996                         }
997                         if (cursor->iprec) {
998                                 KKASSERT(cursor->iprec == rec);
999                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1000                                 cursor->scan.node =
1001                                         hammer_rec_rb_tree_RB_NEXT(rec);
1002                         } else {
1003                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1004                         }
1005                 }
1006         }
1007
1008         /*
1009          * Extract either the disk or memory record depending on their
1010          * relative position.
1011          */
1012         error = 0;
1013         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1014         case 0:
1015                 /*
1016                  * Both entries valid
1017                  */
1018                 elm = &cursor->node->ondisk->elms[cursor->index];
1019                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1020                 if (r < 0) {
1021                         error = hammer_btree_extract(cursor,
1022                                                      HAMMER_CURSOR_GET_RECORD);
1023                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1024                         break;
1025                 }
1026                 /* fall through to the memory entry */
1027         case HAMMER_CURSOR_ATEDISK:
1028                 /*
1029                  * Only the memory entry is valid
1030                  */
1031                 cursor->record = &cursor->iprec->rec;
1032                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1033                 break;
1034         case HAMMER_CURSOR_ATEMEM:
1035                 /*
1036                  * Only the disk entry is valid
1037                  */
1038                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1039                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1040                 break;
1041         default:
1042                 /*
1043                  * Neither entry is valid
1044                  *
1045                  * XXX error not set properly
1046                  */
1047                 cursor->record = NULL;
1048                 error = ENOENT;
1049                 break;
1050         }
1051         return(error);
1052 }
1053
1054 /*
1055  * Resolve the cursor->data pointer for the current cursor position in
1056  * a merged iteration.
1057  */
1058 int
1059 hammer_ip_resolve_data(hammer_cursor_t cursor)
1060 {
1061         int error;
1062
1063         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1064                 cursor->data = cursor->iprec->data;
1065                 error = 0;
1066         } else {
1067                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1068         }
1069         return(error);
1070 }
1071
1072 /*
1073  * Delete all records within the specified range for inode ip.
1074  *
1075  * NOTE: An unaligned range will cause new records to be added to cover
1076  * the edge cases. (XXX not implemented yet).
1077  *
1078  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1079  *
1080  * NOTE: Record keys for regular file data have to be special-cased since
1081  * they indicate the end of the range (key = base + bytes).
1082  *
1083  * NOTE: The spike structure must be filled in if we return ENOSPC.
1084  */
1085 int
1086 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1087                        int64_t ran_beg, int64_t ran_end,
1088                        struct hammer_cursor **spike)
1089 {
1090         struct hammer_cursor cursor;
1091         hammer_record_ondisk_t rec;
1092         hammer_base_elm_t base;
1093         int error;
1094         int64_t off;
1095
1096         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1097
1098         cursor.key_beg.obj_id = ip->obj_id;
1099         cursor.key_beg.create_tid = ip->obj_asof;
1100         cursor.key_beg.delete_tid = 0;
1101         cursor.key_beg.obj_type = 0;
1102
1103         cursor.key_end = cursor.key_beg;
1104         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1105                 cursor.key_beg.key = ran_beg;
1106                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1107                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1108                 cursor.key_end.key = ran_end;
1109         } else {
1110                 /*
1111                  * The key in the B-Tree is (base+bytes), so the first possible
1112                  * matching key is ran_beg + 1.
1113                  */
1114                 int64_t tmp64;
1115
1116                 cursor.key_beg.key = ran_beg + 1;
1117                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1118                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1119
1120                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1121                 if (tmp64 < ran_end)
1122                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1123                 else
1124                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1125         }
1126         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1127
1128         error = hammer_ip_first(&cursor, ip);
1129
1130         /*
1131          * Iterate through matching records and mark them as deleted.
1132          */
1133         while (error == 0) {
1134                 rec = cursor.record;
1135                 base = &rec->base.base;
1136
1137                 KKASSERT(base->delete_tid == 0);
1138
1139                 /*
1140                  * There may be overlap cases for regular file data.  Also
1141                  * remember the key for a regular file record is the offset
1142                  * of the last byte of the record (base + len - 1), NOT the
1143                  * base offset.
1144                  */
1145 #if 0
1146                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1147 #endif
1148                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1149 #if 0
1150                         kprintf("delete_range loop key %016llx\n",
1151                                 base->key - rec->base.data_len);
1152 #endif
1153                         off = base->key - rec->base.data_len;
1154                         /*
1155                          * Check the left edge case.  We currently do not
1156                          * split existing records.
1157                          */
1158                         if (off < ran_beg) {
1159                                 panic("hammer left edge case %016llx %d\n",
1160                                         base->key, rec->base.data_len);
1161                         }
1162
1163                         /*
1164                          * Check the right edge case.  Note that the
1165                          * record can be completely out of bounds, which
1166                          * terminates the search.
1167                          *
1168                          * base->key is exclusive of the right edge while
1169                          * ran_end is inclusive of the right edge.  The
1170                          * (key - data_len) left boundary is inclusive.
1171                          *
1172                          * XXX theory-check this test at some point, are
1173                          * we missing a + 1 somewhere?  Note that ran_end
1174                          * could overflow.
1175                          */
1176                         if (base->key - 1 > ran_end) {
1177                                 if (base->key - rec->base.data_len > ran_end)
1178                                         break;
1179                                 panic("hammer right edge case\n");
1180                         }
1181                 }
1182
1183                 /*
1184                  * Mark the record and B-Tree entry as deleted.  This will
1185                  * also physically delete the B-Tree entry, record, and
1186                  * data if the retention policy dictates.  The function
1187                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1188                  * uses to perform a fixup.
1189                  */
1190                 error = hammer_ip_delete_record(&cursor, trans->tid);
1191                 if (error)
1192                         break;
1193                 error = hammer_ip_next(&cursor);
1194         }
1195         hammer_done_cursor(&cursor);
1196         if (error == ENOENT)
1197                 error = 0;
1198         return(error);
1199 }
1200
1201 /*
1202  * Delete all records associated with an inode except the inode record
1203  * itself.
1204  */
1205 int
1206 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1207 {
1208         struct hammer_cursor cursor;
1209         hammer_record_ondisk_t rec;
1210         hammer_base_elm_t base;
1211         int error;
1212
1213         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1214
1215         cursor.key_beg.obj_id = ip->obj_id;
1216         cursor.key_beg.create_tid = ip->obj_asof;
1217         cursor.key_beg.delete_tid = 0;
1218         cursor.key_beg.obj_type = 0;
1219         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1220         cursor.key_beg.key = HAMMER_MIN_KEY;
1221
1222         cursor.key_end = cursor.key_beg;
1223         cursor.key_end.rec_type = 0xFFFF;
1224         cursor.key_end.key = HAMMER_MAX_KEY;
1225
1226         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1227
1228         error = hammer_ip_first(&cursor, ip);
1229
1230         /*
1231          * Iterate through matching records and mark them as deleted.
1232          */
1233         while (error == 0) {
1234                 rec = cursor.record;
1235                 base = &rec->base.base;
1236
1237                 KKASSERT(base->delete_tid == 0);
1238
1239                 /*
1240                  * Mark the record and B-Tree entry as deleted.  This will
1241                  * also physically delete the B-Tree entry, record, and
1242                  * data if the retention policy dictates.  The function
1243                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1244                  * uses to perform a fixup.
1245                  */
1246                 error = hammer_ip_delete_record(&cursor, trans->tid);
1247                 if (error)
1248                         break;
1249                 error = hammer_ip_next(&cursor);
1250         }
1251         hammer_done_cursor(&cursor);
1252         if (error == ENOENT)
1253                 error = 0;
1254         return(error);
1255 }
1256
1257 /*
1258  * Delete the record at the current cursor
1259  */
1260 int
1261 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1262 {
1263         hammer_btree_elm_t elm;
1264         hammer_mount_t hmp;
1265         int error;
1266
1267         /*
1268          * In-memory (unsynchronized) records can simply be freed.
1269          */
1270         if (cursor->record == &cursor->iprec->rec) {
1271                 cursor->iprec->flags |= HAMMER_RECF_DELETED;
1272                 return(0);
1273         }
1274
1275         /*
1276          * On-disk records are marked as deleted by updating their delete_tid.
1277          */
1278         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1279         elm = NULL;
1280         hmp = cursor->node->cluster->volume->hmp;
1281
1282         if (error == 0) {
1283                 hammer_modify_buffer(cursor->record_buffer);
1284                 cursor->record->base.base.delete_tid = tid;
1285
1286                 hammer_modify_buffer_done(cursor->record_buffer);
1287                 hammer_modify_node(cursor->node);
1288                 elm = &cursor->node->ondisk->elms[cursor->index];
1289                 elm->leaf.base.delete_tid = tid;
1290                 hammer_modify_node_done(cursor->node);
1291                 hammer_update_syncid(cursor->record_buffer->cluster, tid);
1292         }
1293
1294         /*
1295          * If we were mounted with the nohistory option, we physically
1296          * delete the record.
1297          */
1298         if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1299                 int32_t rec_offset;
1300                 int32_t data_offset;
1301                 int32_t data_len;
1302                 hammer_cluster_t cluster;
1303
1304                 rec_offset = elm->leaf.rec_offset;
1305                 data_offset = elm->leaf.data_offset;
1306                 data_len = elm->leaf.data_len;
1307 #if 0
1308                 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1309                         rec_offset, data_offset, data_len);
1310 #endif
1311                 cluster = cursor->node->cluster;
1312                 hammer_ref_cluster(cluster);
1313
1314                 error = hammer_btree_delete(cursor);
1315                 if (error == 0) {
1316                         /*
1317                          * This forces a fixup for the iteration because
1318                          * the cursor is now either sitting at the 'next'
1319                          * element or sitting at the end of a leaf.
1320                          */
1321                         if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1322                                 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1323                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1324                         }
1325                         hammer_free_record(cluster, rec_offset);
1326                         if (data_offset && (data_offset - rec_offset < 0 ||
1327                             data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1328                                 hammer_free_data(cluster, data_offset,data_len);
1329                         }
1330                 }
1331                 hammer_rel_cluster(cluster, 0);
1332                 if (error) {
1333                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1334                         error = 0;
1335                 }
1336         }
1337         return(error);
1338 }
1339
1340 /*
1341  * Determine whether a directory is empty or not.  Returns 0 if the directory
1342  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1343  */
1344 int
1345 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1346 {
1347         struct hammer_cursor cursor;
1348         int error;
1349
1350         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1351
1352         cursor.key_beg.obj_id = ip->obj_id;
1353         cursor.key_beg.create_tid = ip->obj_asof;
1354         cursor.key_beg.delete_tid = 0;
1355         cursor.key_beg.obj_type = 0;
1356         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1357         cursor.key_beg.key = HAMMER_MIN_KEY;
1358
1359         cursor.key_end = cursor.key_beg;
1360         cursor.key_end.rec_type = 0xFFFF;
1361         cursor.key_end.key = HAMMER_MAX_KEY;
1362
1363         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1364
1365         error = hammer_ip_first(&cursor, ip);
1366         if (error == ENOENT)
1367                 error = 0;
1368         else if (error == 0)
1369                 error = ENOTEMPTY;
1370         hammer_done_cursor(&cursor);
1371         return(error);
1372 }
1373