HAMMER 16/many - Recovery infrastructure, misc bug fixes
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.16 2008/01/09 00:46:22 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
61                 return(-1);
62         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
63                 return(1);
64         return(0);
65 }
66
67 static int
68 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
69 {
70         if (info->rec_type < rec->rec.base.base.rec_type)
71                 return(-3);
72         if (info->rec_type > rec->rec.base.base.rec_type)
73                 return(3);
74
75         if (info->key < rec->rec.base.base.key)
76                 return(-2);
77         if (info->key > rec->rec.base.base.key)
78                 return(2);
79
80         /*
81          * This test has a number of special cases.  create_tid in key1 is
82          * the as-of transction id, and delete_tid in key1 is NOT USED.
83          *
84          * A key1->create_tid of 0 matches any record regardles of when
85          * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
86          * used to search for the most current state of the object.
87          *
88          * key2->create_tid is a HAMMER record and will never be
89          * 0.   key2->delete_tid is the deletion transaction id or 0 if
90          * the record has not yet been deleted.
91          */
92         if (info->create_tid) {
93                 if (info->create_tid < rec->rec.base.base.create_tid)
94                         return(-1);
95                 if (rec->rec.base.base.delete_tid &&
96                     info->create_tid >= rec->rec.base.base.delete_tid) {
97                         return(1);
98                 }
99         }
100         return(0);
101 }
102
103 /*
104  * RB_SCAN comparison code for hammer_mem_first().  The argument order
105  * is reversed so the comparison result has to be negated.  key_beg and
106  * key_end are both range-inclusive.
107  *
108  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
109  * These do not stop the scan.
110  *
111  * Localized deletions are not cached in-memory.
112  */
113 static
114 int
115 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
116 {
117         hammer_cursor_t cursor = data;
118         int r;
119
120         r = hammer_rec_compare(&cursor->key_beg, rec);
121         if (r > 1)
122                 return(-1);
123         if (r == 0)
124                 return(0);
125         r = hammer_rec_compare(&cursor->key_end, rec);
126         if (r < -1)
127                 return(1);
128         return(0);
129 }
130
/*
 * Instantiate the RB-tree support functions for the per-inode in-memory
 * record tree, plus the INFO-keyed extended lookup variant which looks
 * records up by hammer_base_elm_t via hammer_rec_compare().
 */
RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
                    hammer_rec_compare, hammer_base_elm_t);
134
135 /*
136  * Allocate a record for the caller to finish filling in.  The record is
137  * returned referenced.
138  */
139 hammer_record_t
140 hammer_alloc_mem_record(hammer_inode_t ip)
141 {
142         hammer_record_t record;
143
144         ++hammer_count_records;
145         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
146         record->ip = ip;
147         hammer_ref(&record->lock);
148         return (record);
149 }
150
/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
        hammer_unref(&record->lock);
        if (record->flags & HAMMER_RECF_DELETED) {
                /*
                 * Unlink a deleted record from the inode's record tree
                 * right away, even if other references remain.
                 */
                if (record->flags & HAMMER_RECF_ONRBTREE) {
                        RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
                                  record);
                        record->flags &= ~HAMMER_RECF_ONRBTREE;
                }
                if (record->lock.refs == 0) {
                        /*
                         * Last reference is gone.  Free a separately
                         * allocated data payload before freeing the
                         * record itself, adjusting the global counters.
                         */
                        if (record->flags & HAMMER_RECF_ALLOCDATA) {
                                --hammer_count_record_datas;
                                kfree(record->data, M_HAMMER);
                                record->flags &= ~HAMMER_RECF_ALLOCDATA;
                        }
                        record->data = NULL;
                        --hammer_count_records;
                        kfree(record, M_HAMMER);
                }
        }
}
178
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 *
 * Returns 0 with cursor->iprec referenced on a hit, ENOENT otherwise.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
        int error;

        /*
         * Drop any record held over from a prior operation and unlink
         * the cursor's scan info from the previously scanned inode
         * before re-targeting the cursor at (ip).
         */
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        if (cursor->ip) {
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
        }
        cursor->ip = ip;
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
        cursor->scan.node = NULL;
        /*
         * Exact lookup on key_beg via the INFO-keyed RB lookup; the
         * match, if any, is held referenced in cursor->iprec.
         */
        cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
                                &ip->rec_tree, &cursor->key_beg);
        if (cursor->iprec == NULL) {
                error = ENOENT;
        } else {
                hammer_ref(&cursor->iprec->lock);
                error = 0;
        }
        return(error);
}
213
214 /*
215  * hammer_mem_first() - locate the first in-memory record matching the
216  * cursor.
217  *
218  * The RB_SCAN function we use is designed as a callback.  We terminate it
219  * (return -1) as soon as we get a match.
220  */
221 static
222 int
223 hammer_rec_scan_callback(hammer_record_t rec, void *data)
224 {
225         hammer_cursor_t cursor = data;
226
227         /*
228          * Skip if not visible due to our as-of TID
229          */
230         if (cursor->key_beg.create_tid) {
231                 if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
232                         return(0);
233                 if (rec->rec.base.base.delete_tid &&
234                     cursor->key_beg.create_tid >=
235                      rec->rec.base.base.delete_tid) {
236                         return(0);
237                 }
238         }
239
240         /*
241          * Return the first matching record and stop the scan
242          */
243         if (cursor->iprec == NULL) {
244                 cursor->iprec = rec;
245                 hammer_ref(&rec->lock);
246                 return(-1);
247         }
248         return(0);
249 }
250
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
        /*
         * Release any record and scan linkage left over from a previous
         * operation before re-targeting the cursor at (ip).
         */
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        if (cursor->ip) {
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
        }
        cursor->ip = ip;
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);

        /*
         * Range-scan [key_beg, key_end]; the callback records the first
         * visible match in cursor->iprec (referenced) and aborts the scan.
         */
        cursor->scan.node = NULL;
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
                                   hammer_rec_scan_callback, cursor);

        /*
         * Adjust scan.node and keep it linked into the RB-tree so we can
         * hold the cursor through third party modifications of the RB-tree.
         */
        if (cursor->iprec) {
                cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
                return(0);
        }
        return(ENOENT);
}
280
281 void
282 hammer_mem_done(hammer_cursor_t cursor)
283 {
284         if (cursor->ip) {
285                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
286                                                   &cursor->ip->rec_tree);
287                 cursor->ip = NULL;
288         }
289         if (cursor->iprec) {
290                 hammer_rel_mem_record(cursor->iprec);
291                 cursor->iprec = NULL;
292         }
293 }
294
295 /************************************************************************
296  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
297  ************************************************************************
298  *
299  * These functions manipulate in-memory records.  Such records typically
300  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
301  */
302
/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
                     struct hammer_inode *dip, struct namecache *ncp,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        int error;
        int bytes;

        record = hammer_alloc_mem_record(dip);

        bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
        /*
         * Advance the per-mount namekey iterator, skipping 0 so the low
         * 32 bits of the key never end up all-zero.
         */
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;

        record->rec.entry.base.base.obj_id = dip->obj_id;
        record->rec.entry.base.base.key =
                hammer_directory_namekey(ncp->nc_name, bytes);
        record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
        record->rec.entry.base.base.create_tid = trans->tid;
        record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
        record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
        record->rec.entry.obj_id = ip->obj_id;
        /*
         * Embed the name in the record when it fits, otherwise allocate
         * a separate data buffer for it.
         */
        if (bytes <= sizeof(record->rec.entry.den_name)) {
                record->data = (void *)record->rec.entry.den_name;
                record->flags |= HAMMER_RECF_EMBEDDED_DATA;
        } else {
                ++hammer_count_record_datas;
                record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
                record->flags |= HAMMER_RECF_ALLOCDATA;
        }
        bcopy(ncp->nc_name, record->data, bytes);
        record->rec.entry.base.data_len = bytes;
        /*
         * The new directory entry adds a link to the target inode.
         */
        ++ip->ino_rec.ino_nlinks;
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
        error = hammer_mem_add(trans, record);
        return(error);
}
349
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
 *
 * NOTE(review): the dip argument is not referenced by this function
 * as written -- confirm whether it is kept for interface symmetry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
                     hammer_cursor_t cursor, struct hammer_inode *dip,
                     struct hammer_inode *ip)
{
        int error;

        error = hammer_ip_delete_record(cursor, trans->tid);

        /*
         * One less link.  The file may still be open in the OS even after
         * all links have gone away so we only try to sync if the OS has
         * no references and nlinks falls to 0.
         */
        if (error == 0) {
                --ip->ino_rec.ino_nlinks;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
                if (ip->ino_rec.ino_nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                        hammer_sync_inode(ip, MNT_NOWAIT, 1);
                }

        }
        return(error);
}
381
/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data         (a copy will be kmalloc'd if not embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int error;
        int bytes;
        void *data;

        record->rec.base.base.obj_id = ip->obj_id;
        record->rec.base.base.create_tid = trans->tid;
        record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
        bytes = record->rec.base.data_len;

        if (record->data) {
                /*
                 * If the caller's data pointer falls outside the record
                 * structure itself, take a private kmalloc'd copy so the
                 * record owns its payload; a pointer inside the record
                 * means the data is embedded.
                 */
                if ((char *)record->data < (char *)&record->rec ||
                    (char *)record->data >= (char *)(&record->rec + 1)) {
                        ++hammer_count_record_datas;
                        data = kmalloc(bytes, M_HAMMER, M_WAITOK);
                        record->flags |= HAMMER_RECF_ALLOCDATA;
                        bcopy(record->data, data, bytes);
                        record->data = data;
                } else {
                        record->flags |= HAMMER_RECF_EMBEDDED_DATA;
                }
        }
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
        error = hammer_mem_add(trans, record);
        return(error);
}
422
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * Returns 0 on success, ENOSPC with *spike loaded when the cluster is
 * full, or another errno on failure.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
                       int64_t offset, void *data, int bytes,
                       struct hammer_cursor **spike)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
        void *bdata;
        int error;

        error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
        if (error)
                return(error);
        /*
         * The data record is keyed at (offset + bytes), i.e. the byte
         * offset just past the end of the data.
         */
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.key = offset + bytes;
        cursor.key_beg.create_tid = trans->tid;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
        cursor.flags = HAMMER_CURSOR_INSERT;

        /*
         * Issue a lookup to position the cursor and locate the cluster.
         * The key must not already exist; a hit is reported and turned
         * into EIO.  ENOENT is the expected outcome.
         */
        error = hammer_btree_lookup(&cursor);
        if (error == 0) {
                kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
                        offset, bytes);
                hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
                                       HAMMER_BTREE_TYPE_LEAF, cursor.index);
                error = EIO;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Allocate record and data space now that we know which cluster
         * the B-Tree node ended up in.
         */
        bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
                                  &cursor.data_buffer);
        if (bdata == NULL)
                goto done;
        rec = hammer_alloc_record(cursor.node->cluster, &error,
                                  &cursor.record_buffer);
        if (rec == NULL)
                goto fail1;

        /*
         * Fill everything in and insert our B-Tree node.
         */
        hammer_modify_buffer(cursor.record_buffer);
        rec->base.base = cursor.key_beg;
        rec->base.data_crc = crc32(data, bytes);
        rec->base.rec_id = 0;   /* XXX */
        rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
        rec->base.data_len = bytes;
        hammer_modify_buffer_done(cursor.record_buffer);

        hammer_modify_buffer(cursor.data_buffer);
        bcopy(data, bdata, bytes);
        hammer_modify_buffer_done(cursor.data_buffer);

        elm.leaf.base = cursor.key_beg;
        elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
        elm.leaf.data_offset = rec->base.data_offset;
        elm.leaf.data_len = bytes;
        elm.leaf.data_crc = rec->base.data_crc;

        error = hammer_btree_insert(&cursor, &elm);
        if (error == 0) {
                hammer_update_syncid(cursor.record_buffer->cluster, trans->tid);
                goto done;
        }

        /*
         * Insert failed: back out the record and data allocations in
         * reverse order.
         */
        hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
        hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
        /*
         * If ENOSPC in cluster fill in the spike structure and return
         * ENOSPC.
         */
        if (error == ENOSPC)
                hammer_load_spike(&cursor, spike);
        hammer_done_cursor(&cursor);
        return(error);
}
516
/*
 * Sync an in-memory record to the disk.  This is typically called via fsync
 * from a cached record source.  This code is responsible for actually
 * writing a record out to the disk.
 *
 * Returns 0 on success (or if the record was already deleted/syncing),
 * ENOSPC with *spike loaded when the cluster is full, or another errno.
 */
int
hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
{
        struct hammer_cursor cursor;
        hammer_record_ondisk_t rec;
        hammer_mount_t hmp;
        union hammer_btree_elm elm;
        void *bdata;
        int error;

        error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
                                       record->ip->hmp);
        if (error)
                return(error);
        cursor.key_beg = record->rec.base.base;
        cursor.flags = HAMMER_CURSOR_INSERT;

        /*
         * Issue a lookup to position the cursor and locate the cluster.  The
         * target key should not exist.  If we are creating a directory entry
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         *
         * If we run out of space trying to adjust the B-Tree for the
         * insert, re-lookup without the insert flag so the cursor
         * is properly positioned for the spike.
         */
again:
        error = hammer_btree_lookup(&cursor);
        if (error == 0) {
                if (record->rec.base.base.rec_type == HAMMER_RECTYPE_DIRENTRY) {
                        /*
                         * Directory-entry key collision: advance the
                         * per-mount namekey iterator (skipping 0), fold
                         * it into the low 32 bits of the key and retry.
                         */
                        hmp = cursor.node->cluster->volume->hmp;
                        if (++hmp->namekey_iterator == 0)
                                ++hmp->namekey_iterator;
                        record->rec.base.base.key &= ~(0xFFFFFFFFLL);
                        record->rec.base.base.key |= hmp->namekey_iterator;
                        goto again;
                }
                kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
                        record->rec.base.base.key);
                Debugger("duplicate record1");
                error = EIO;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Mark the record as undergoing synchronization.  Our cursor is
         * holding a locked B-Tree node for the insertion which interlocks
         * anyone trying to access this record.
         *
         * XXX There is still a race present related to iterations.  An
         * iteration may process the record, a sync may occur, and then
         * later process the B-Tree element for the same record.
         *
         * We do not try to synchronize a deleted record.
         */
        if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING)) {
                error = 0;
                goto done;
        }
        record->flags |= HAMMER_RECF_SYNCING;

        /*
         * Allocate record and data space now that we know which cluster
         * the B-Tree node ended up in.  Embedded data travels inside the
         * record and needs no separate allocation.
         */
        if (record->data == NULL ||
            (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
                bdata = record->data;
        } else {
                bdata = hammer_alloc_data(cursor.node->cluster,
                                          record->rec.base.data_len, &error,
                                          &cursor.data_buffer);
                if (bdata == NULL)
                        goto fail2;
        }
        rec = hammer_alloc_record(cursor.node->cluster, &error,
                                  &cursor.record_buffer);
        if (rec == NULL)
                goto fail1;

        /*
         * Fill everything in and insert our B-Tree node.
         *
         * XXX assign rec_id here
         */
        hammer_modify_buffer(cursor.record_buffer);
        *rec = record->rec;
        if (bdata) {
                rec->base.data_crc = crc32(record->data,
                                           record->rec.base.data_len);
                if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
                        /*
                         * Data embedded in record: data_offset starts as
                         * the payload's offset within the record struct,
                         * then is rebased via hammer_bclu_offset().
                         */
                        rec->base.data_offset = ((char *)bdata -
                                                 (char *)&record->rec);
                        KKASSERT(rec->base.data_offset >= 0 && 
                                 rec->base.data_offset + rec->base.data_len <=
                                  sizeof(*rec));
                        rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
                } else {
                        /*
                         * Data separate from record
                         */
                        rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
                        hammer_modify_buffer(cursor.data_buffer);
                        bcopy(record->data, bdata, rec->base.data_len);
                        hammer_modify_buffer_done(cursor.data_buffer);
                }
        }
        rec->base.rec_id = 0;   /* XXX */
        hammer_modify_buffer_done(cursor.record_buffer);

        elm.leaf.base = cursor.key_beg;
        elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
        elm.leaf.data_offset = rec->base.data_offset;
        elm.leaf.data_len = rec->base.data_len;
        elm.leaf.data_crc = rec->base.data_crc;

        error = hammer_btree_insert(&cursor, &elm);

        /*
         * Clean up on success, or fall through on error.  On success the
         * in-memory record is marked DELETED -- the on-disk copy now
         * supersedes it and it will be reaped when its last ref drops.
         */
        if (error == 0) {
                record->flags |= HAMMER_RECF_DELETED;
                record->flags &= ~HAMMER_RECF_SYNCING;
                hammer_update_syncid(cursor.record_buffer->cluster,
                                     record->rec.base.base.create_tid);
                goto done;
        }

        hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
        /*
         * Back out the separately allocated data buffer, if any.
         */
        if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
                hammer_free_data_ptr(cursor.data_buffer, bdata,
                                     record->rec.base.data_len);
        }
fail2:
        record->flags &= ~HAMMER_RECF_SYNCING;
done:
        /*
         * If ENOSPC in cluster fill in the spike structure and return
         * ENOSPC.
         */
        if (error == ENOSPC)
                hammer_load_spike(&cursor, spike);
        hammer_done_cursor(&cursor);
        return(error);
}
674
675 /*
676  * Write out a record using the specified cursor.  The caller does not have
677  * to seek the cursor.  The flags are used to determine whether the data
678  * (if any) is embedded in the record or not.
679  *
680  * The target cursor will be modified by this call.  Note in particular
681  * that HAMMER_CURSOR_INSERT is set.
682  */
683 int
684 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
685                     void *data, int cursor_flags)
686 {
687         union hammer_btree_elm elm;
688         hammer_record_ondisk_t nrec;
689         void *bdata;
690         int error;
691
692         cursor->key_beg = orec->base.base;
693         cursor->flags |= HAMMER_CURSOR_INSERT;
694
695         /*
696          * Issue a lookup to position the cursor and locate the cluster.  The
697          * target key should not exist.
698          *
699          * If we run out of space trying to adjust the B-Tree for the
700          * insert, re-lookup without the insert flag so the cursor
701          * is properly positioned for the spike.
702          */
703         error = hammer_btree_lookup(cursor);
704         if (error == 0) {
705                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
706                         orec->base.base.key);
707                 Debugger("duplicate record2");
708                 error = EIO;
709         }
710         if (error != ENOENT)
711                 goto done;
712
713         /*
714          * Allocate record and data space now that we know which cluster
715          * the B-Tree node ended up in.
716          */
717         if (data == NULL ||
718             (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
719                 bdata = data;
720         } else {
721                 bdata = hammer_alloc_data(cursor->node->cluster,
722                                           orec->base.data_len, &error,
723                                           &cursor->data_buffer);
724                 if (bdata == NULL)
725                         goto done;
726         }
727         nrec = hammer_alloc_record(cursor->node->cluster, &error,
728                                   &cursor->record_buffer);
729         if (nrec == NULL)
730                 goto fail1;
731
732         /*
733          * Fill everything in and insert our B-Tree node.
734          *
735          * XXX assign rec_id here
736          */
737         hammer_modify_buffer(cursor->record_buffer);
738         *nrec = *orec;
739         nrec->base.data_offset = 0;
740         if (bdata) {
741                 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
742                 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
743                         /*
744                          * Data embedded in record
745                          */
746                         nrec->base.data_offset = ((char *)bdata - (char *)orec);
747                         KKASSERT(nrec->base.data_offset >= 0 && 
748                                  nrec->base.data_offset + nrec->base.data_len <
749                                   sizeof(*nrec));
750                         nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
751                 } else {
752                         /*
753                          * Data separate from record
754                          */
755                         nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
756                         hammer_modify_buffer(cursor->data_buffer);
757                         bcopy(data, bdata, nrec->base.data_len);
758                         hammer_modify_buffer_done(cursor->data_buffer);
759                 }
760         }
761         nrec->base.rec_id = 0;  /* XXX */
762         hammer_modify_buffer_done(cursor->record_buffer);
763
764         elm.leaf.base = nrec->base.base;
765         elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
766         elm.leaf.data_offset = nrec->base.data_offset;
767         elm.leaf.data_len = nrec->base.data_len;
768         elm.leaf.data_crc = nrec->base.data_crc;
769
770         error = hammer_btree_insert(cursor, &elm);
771         if (error == 0) {
772                 hammer_update_syncid(cursor->record_buffer->cluster,
773                                      nrec->base.base.create_tid);
774                 goto done;
775         }
776
777         hammer_free_record_ptr(cursor->record_buffer, nrec);
778 fail1:
779         if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
780                 hammer_free_data_ptr(cursor->data_buffer, bdata,
781                                      orec->base.data_len);
782         }
783 done:
784         /* leave cursor intact */
785         return(error);
786 }
787
788 /*
789  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
790  * entry's key is used to deal with hash collisions in the upper 32 bits.
791  * A unique 64 bit key is generated in-memory and may be regenerated a
792  * second time when the directory record is flushed to the on-disk B-Tree.
793  *
794  * A referenced record is passed to this function.  This function
795  * eats the reference.  If an error occurs the record will be deleted.
796  */
797 static
798 int
799 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
800 {
801         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
802                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
803                         record->flags |= HAMMER_RECF_DELETED;
804                         hammer_rel_mem_record(record);
805                         return (EEXIST);
806                 }
807                 if (++trans->hmp->namekey_iterator == 0)
808                         ++trans->hmp->namekey_iterator;
809                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
810                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
811         }
812         record->flags |= HAMMER_RECF_ONRBTREE;
813         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
814         hammer_rel_mem_record(record);
815         return(0);
816 }
817
818 /************************************************************************
819  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
820  ************************************************************************
821  *
822  * These functions augment the B-Tree scanning functions in hammer_btree.c
823  * by merging in-memory records with on-disk records.
824  */
825
826 /*
827  * Locate a particular record either in-memory or on-disk.
828  *
829  * NOTE: This is basically a standalone routine, hammer_ip_next() may
830  * NOT be called to iterate results.
831  */
832 int
833 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
834 {
835         int error;
836
837         /*
838          * If the element is in-memory return it without searching the
839          * on-disk B-Tree
840          */
841         error = hammer_mem_lookup(cursor, ip);
842         if (error == 0) {
843                 cursor->record = &cursor->iprec->rec;
844                 return(error);
845         }
846         if (error != ENOENT)
847                 return(error);
848
849         /*
850          * If the inode has on-disk components search the on-disk B-Tree.
851          */
852         if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
853                 return(error);
854         error = hammer_btree_lookup(cursor);
855         if (error == 0)
856                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
857         return(error);
858 }
859
860 /*
861  * Locate the first record within the cursor's key_beg/key_end range,
862  * restricted to a particular inode.  0 is returned on success, ENOENT
863  * if no records matched the requested range, or some other error.
864  *
865  * When 0 is returned hammer_ip_next() may be used to iterate additional
866  * records within the requested range.
867  */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.  Both sources start
	 * out flagged as eaten-and-at-EOF; the searches below clear those
	 * flags for whichever source actually produces a record.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		/* drop any in-memory record held over from a prior scan */
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 */
	if (ip->flags & HAMMER_INODE_ONDISK) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT) {
			/* re-test the element at the cursor position */
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			/* a disk record is available and not yet consumed */
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		/* an in-memory record is available and not yet consumed */
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.  ENOENT from either
	 * source above is absorbed here; hammer_ip_next() reports ENOENT
	 * itself only when both sources are exhausted.
	 */
	return(hammer_ip_next(cursor));
}
926
927 /*
928  * Retrieve the next record in a merged iteration within the bounds of the
929  * cursor.  This call may be made multiple times after the cursor has been
930  * initially searched with hammer_ip_first().
931  *
932  * 0 is returned on success, ENOENT if no further records match the
933  * requested range, or some other error code is returned.
934  */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec;
	int error;
	int r;

	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one.
	 *
	 * If we deleted the last on-disk record we had scanned ATEDISK will
	 * be clear and DELBTREE will be set, forcing a call to iterate. The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
			if (error == 0)
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			else
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Get the next in-memory record.  The record can be ripped out
	 * of the RB tree so we maintain a scan_info structure to track
	 * the next node.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			if (cursor->iprec) {
				/* release the consumed record */
				hammer_rel_mem_record(cursor->iprec);
				cursor->iprec = NULL;
			}
			rec = cursor->scan.node;	/* next node */
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (cursor->iprec) {
				/* the callback accepted (and referenced) rec */
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
				cursor->scan.node =
					hammer_rec_rb_tree_RB_NEXT(rec);
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.  Return whichever sorts first; on a
		 * tie (r == 0 or r > 0) the in-memory record wins via the
		 * fall-through below.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_RECORD);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid
		 */
		cursor->record = &cursor->iprec->rec;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->record = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}
1046
1047 /*
1048  * Resolve the cursor->data pointer for the current cursor position in
1049  * a merged iteration.
1050  */
1051 int
1052 hammer_ip_resolve_data(hammer_cursor_t cursor)
1053 {
1054         int error;
1055
1056         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1057                 cursor->data = cursor->iprec->data;
1058                 error = 0;
1059         } else {
1060                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1061         }
1062         return(error);
1063 }
1064
1065 /*
1066  * Delete all records within the specified range for inode ip.
1067  *
1068  * NOTE: An unaligned range will cause new records to be added to cover
1069  * the edge cases. (XXX not implemented yet).
1070  *
1071  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1072  *
1073  * NOTE: Record keys for regular file data have to be special-cased since
1074  * they indicate the end of the range (key = base + bytes).
1075  *
1076  * NOTE: The spike structure must be filled in if we return ENOSPC.
1077  */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end,
		       struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int64_t off;

	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = ip->obj_asof;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		/* DB records key on the base offset directly */
		cursor.key_beg.key = ran_beg;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = ran_end;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		int64_t tmp64;

		cursor.key_beg.key = ran_beg + 1;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

		/*
		 * Pad the end key by a maximal record size so a record
		 * straddling ran_end is still found; clamp on overflow.
		 */
		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		/* the iteration should never return already-deleted records */
		KKASSERT(base->delete_tid == 0);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is the offset
		 * of the last byte of the record (base + len - 1), NOT the
		 * base offset.
		 */
#if 0
		kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
		if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
			kprintf("delete_range loop key %016llx\n",
				base->key - rec->base.data_len);
#endif
			off = base->key - rec->base.data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg) {
				panic("hammer left edge case %016llx %d\n",
					base->key, rec->base.data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (base->key - 1 > ran_end) {
				if (base->key - rec->base.data_len > ran_end) {
					kprintf("right edge OOB\n");
					break;
				}
				panic("hammer right edge case\n");
			}
		}

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	/* ENOENT simply means the iteration ran to completion */
	if (error == ENOENT)
		error = 0;
	return(error);
}
1195
1196 /*
1197  * Delete all records associated with an inode except the inode record
1198  * itself.
1199  */
1200 int
1201 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1202 {
1203         struct hammer_cursor cursor;
1204         hammer_record_ondisk_t rec;
1205         hammer_base_elm_t base;
1206         int error;
1207
1208         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1209
1210         cursor.key_beg.obj_id = ip->obj_id;
1211         cursor.key_beg.create_tid = ip->obj_asof;
1212         cursor.key_beg.delete_tid = 0;
1213         cursor.key_beg.obj_type = 0;
1214         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1215         cursor.key_beg.key = HAMMER_MIN_KEY;
1216
1217         cursor.key_end = cursor.key_beg;
1218         cursor.key_end.rec_type = 0xFFFF;
1219         cursor.key_end.key = HAMMER_MAX_KEY;
1220
1221         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1222
1223         error = hammer_ip_first(&cursor, ip);
1224
1225         /*
1226          * Iterate through matching records and mark them as deleted.
1227          */
1228         while (error == 0) {
1229                 rec = cursor.record;
1230                 base = &rec->base.base;
1231
1232                 KKASSERT(base->delete_tid == 0);
1233
1234                 /*
1235                  * Mark the record and B-Tree entry as deleted.  This will
1236                  * also physically delete the B-Tree entry, record, and
1237                  * data if the retention policy dictates.  The function
1238                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1239                  * uses to perform a fixup.
1240                  */
1241                 error = hammer_ip_delete_record(&cursor, trans->tid);
1242                 if (error)
1243                         break;
1244                 error = hammer_ip_next(&cursor);
1245         }
1246         hammer_done_cursor(&cursor);
1247         if (error == ENOENT)
1248                 error = 0;
1249         return(error);
1250 }
1251
1252 /*
1253  * Delete the record at the current cursor
1254  */
1255 int
1256 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1257 {
1258         hammer_btree_elm_t elm;
1259         hammer_mount_t hmp;
1260         int error;
1261
1262         /*
1263          * In-memory (unsynchronized) records can simply be freed.
1264          */
1265         if (cursor->record == &cursor->iprec->rec) {
1266                 cursor->iprec->flags |= HAMMER_RECF_DELETED;
1267                 return(0);
1268         }
1269
1270         /*
1271          * On-disk records are marked as deleted by updating their delete_tid.
1272          */
1273         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1274         elm = NULL;
1275         hmp = cursor->node->cluster->volume->hmp;
1276
1277         if (error == 0) {
1278                 hammer_modify_buffer(cursor->record_buffer);
1279                 cursor->record->base.base.delete_tid = tid;
1280
1281                 hammer_modify_buffer_done(cursor->record_buffer);
1282                 hammer_modify_node(cursor->node);
1283                 elm = &cursor->node->ondisk->elms[cursor->index];
1284                 elm->leaf.base.delete_tid = tid;
1285                 hammer_modify_node_done(cursor->node);
1286                 hammer_update_syncid(cursor->record_buffer->cluster, tid);
1287         }
1288
1289         /*
1290          * If we were mounted with the nohistory option, we physically
1291          * delete the record.
1292          */
1293         if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1294                 int32_t rec_offset;
1295                 int32_t data_offset;
1296                 int32_t data_len;
1297                 hammer_cluster_t cluster;
1298
1299                 rec_offset = elm->leaf.rec_offset;
1300                 data_offset = elm->leaf.data_offset;
1301                 data_len = elm->leaf.data_len;
1302 #if 0
1303                 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1304                         rec_offset, data_offset, data_len);
1305 #endif
1306                 cluster = cursor->node->cluster;
1307                 hammer_ref_cluster(cluster);
1308
1309                 error = hammer_btree_delete(cursor);
1310                 if (error == 0) {
1311                         /*
1312                          * This forces a fixup for the iteration because
1313                          * the cursor is now either sitting at the 'next'
1314                          * element or sitting at the end of a leaf.
1315                          */
1316                         if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1317                                 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1318                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1319                         }
1320                         hammer_free_record(cluster, rec_offset);
1321                         if (data_offset && (data_offset - rec_offset < 0 ||
1322                             data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1323                                 hammer_free_data(cluster, data_offset,data_len);
1324                         }
1325                 }
1326                 hammer_rel_cluster(cluster, 0);
1327                 if (error) {
1328                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1329                         error = 0;
1330                 }
1331         }
1332         return(error);
1333 }
1334
1335 /*
1336  * Determine whether a directory is empty or not.  Returns 0 if the directory
1337  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1338  */
1368