HAMMER 21/many: B-Tree node locking finalization.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.21 2008/01/18 07:02:41 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans,
40                              hammer_record_t record);
41 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
42 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
43
44 /*
45  * Red-black tree support.
46  */
47 static int
48 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
49 {
50         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
51                 return(-1);
52         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
53                 return(1);
54
55         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
56                 return(-1);
57         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
58                 return(1);
59
60         if (rec1->rec.base.base.delete_tid == 0) {
61                 if (rec2->rec.base.base.delete_tid == 0)
62                         return(0);
63                 return(1);
64         }
65         if (rec2->rec.base.base.delete_tid == 0)
66                 return(-1);
67
68         if (rec1->rec.base.base.delete_tid < rec2->rec.base.base.delete_tid)
69                 return(-1);
70         if (rec1->rec.base.base.delete_tid > rec2->rec.base.base.delete_tid)
71                 return(1);
72         return(0);
73 }
74
75 static int
76 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
77 {
78         if (info->rec_type < rec->rec.base.base.rec_type)
79                 return(-3);
80         if (info->rec_type > rec->rec.base.base.rec_type)
81                 return(3);
82
83         if (info->key < rec->rec.base.base.key)
84                 return(-2);
85         if (info->key > rec->rec.base.base.key)
86                 return(2);
87
88         if (info->delete_tid == 0) {
89                 if (rec->rec.base.base.delete_tid == 0)
90                         return(0);
91                 return(1);
92         }
93         if (rec->rec.base.base.delete_tid == 0)
94                 return(-1);
95         if (info->delete_tid < rec->rec.base.base.delete_tid)
96                 return(-1);
97         if (info->delete_tid > rec->rec.base.base.delete_tid)
98                 return(1);
99         return(0);
100 }
101
102 /*
103  * RB_SCAN comparison code for hammer_mem_first().  The argument order
104  * is reversed so the comparison result has to be negated.  key_beg and
105  * key_end are both range-inclusive.
106  *
107  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
108  * These do not stop the scan.
109  *
110  * Localized deletions are not cached in-memory.
111  */
112 static
113 int
114 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
115 {
116         hammer_cursor_t cursor = data;
117         int r;
118
119         r = hammer_rec_compare(&cursor->key_beg, rec);
120         if (r > 1)
121                 return(-1);
122         r = hammer_rec_compare(&cursor->key_end, rec);
123         if (r < -1)
124                 return(1);
125         return(0);
126 }
127
128 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
129 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
130                     hammer_rec_compare, hammer_base_elm_t);
131
132 /*
133  * Allocate a record for the caller to finish filling in.  The record is
134  * returned referenced.
135  */
136 hammer_record_t
137 hammer_alloc_mem_record(hammer_inode_t ip)
138 {
139         hammer_record_t record;
140
141         ++hammer_count_records;
142         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
143         record->ip = ip;
144         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
145         hammer_ref(&record->lock);
146         return (record);
147 }
148
149 /*
150  * Release a memory record.  Records marked for deletion are immediately
151  * removed from the RB-Tree but otherwise left intact until the last ref
152  * goes away.
153  */
154 void
155 hammer_rel_mem_record(struct hammer_record *record)
156 {
157         hammer_unref(&record->lock);
158         if (record->flags & HAMMER_RECF_DELETED) {
159                 if (record->flags & HAMMER_RECF_ONRBTREE) {
160                         RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree,
161                                   record);
162                         record->flags &= ~HAMMER_RECF_ONRBTREE;
163                 }
164                 if (record->lock.refs == 0) {
165                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
166                                 --hammer_count_record_datas;
167                                 kfree(record->data, M_HAMMER);
168                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
169                         }
170                         record->data = NULL;
171                         --hammer_count_records;
172                         kfree(record, M_HAMMER);
173                 }
174         }
175 }
176
177 /*
178  * Lookup an in-memory record given the key specified in the cursor.  Works
179  * just like hammer_btree_lookup() but operates on an inode's in-memory
180  * record list.
181  *
182  * The lookup must fail if the record is marked for deferred deletion.
183  */
184 static
185 int
186 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
187 {
188         int error;
189
190         if (cursor->iprec) {
191                 hammer_rel_mem_record(cursor->iprec);
192                 cursor->iprec = NULL;
193         }
194         if (cursor->ip) {
195                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
196                                                   &cursor->ip->rec_tree);
197         }
198         cursor->ip = ip;
199         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
200         cursor->scan.node = NULL;
201         cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
202                                 &ip->rec_tree, &cursor->key_beg);
203         if (cursor->iprec == NULL) {
204                 error = ENOENT;
205         } else {
206                 hammer_ref(&cursor->iprec->lock);
207                 error = 0;
208         }
209         return(error);
210 }
211
212 /*
213  * hammer_mem_first() - locate the first in-memory record matching the
214  * cursor.
215  *
216  * The RB_SCAN function we use is designed as a callback.  We terminate it
217  * (return -1) as soon as we get a match.
218  */
219 static
220 int
221 hammer_rec_scan_callback(hammer_record_t rec, void *data)
222 {
223         hammer_cursor_t cursor = data;
224
225         /*
226          * Skip if not visible due to our as-of TID
227          */
228         if (cursor->flags & HAMMER_CURSOR_ASOF) {
229                 if (cursor->asof < rec->rec.base.base.create_tid)
230                         return(0);
231                 if (rec->rec.base.base.delete_tid &&
232                     cursor->asof >= rec->rec.base.base.delete_tid) {
233                         return(0);
234                 }
235         }
236
237         /*
238          * Return the first matching record and stop the scan
239          */
240         if (cursor->iprec == NULL) {
241                 cursor->iprec = rec;
242                 hammer_ref(&rec->lock);
243                 return(-1);
244         }
245         return(0);
246 }
247
248 static
249 int
250 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
251 {
252         if (cursor->iprec) {
253                 hammer_rel_mem_record(cursor->iprec);
254                 cursor->iprec = NULL;
255         }
256         if (cursor->ip) {
257                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
258                                                   &cursor->ip->rec_tree);
259         }
260         cursor->ip = ip;
261         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
262
263         cursor->scan.node = NULL;
264         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
265                                    hammer_rec_scan_callback, cursor);
266
267         /*
268          * Adjust scan.node and keep it linked into the RB-tree so we can
269          * hold the cursor through third party modifications of the RB-tree.
270          */
271         if (cursor->iprec) {
272                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
273                 return(0);
274         }
275         return(ENOENT);
276 }
277
278 void
279 hammer_mem_done(hammer_cursor_t cursor)
280 {
281         if (cursor->ip) {
282                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
283                                                   &cursor->ip->rec_tree);
284                 cursor->ip = NULL;
285         }
286         if (cursor->iprec) {
287                 hammer_rel_mem_record(cursor->iprec);
288                 cursor->iprec = NULL;
289         }
290 }
291
292 /************************************************************************
293  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
294  ************************************************************************
295  *
296  * These functions manipulate in-memory records.  Such records typically
297  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
298  */
299
300 /*
301  * Add a directory entry (dip,ncp) which references inode (ip).
302  *
303  * Note that the low 32 bits of the namekey are set temporarily to create
304  * a unique in-memory record, and may be modified a second time when the
305  * record is synchronized to disk.  In particular, the low 32 bits cannot be
306  * all 0's when synching to disk, which is not handled here.
307  */
308 int
309 hammer_ip_add_directory(struct hammer_transaction *trans,
310                      struct hammer_inode *dip, struct namecache *ncp,
311                      struct hammer_inode *ip)
312 {
313         hammer_record_t record;
314         int error;
315         int bytes;
316
317         record = hammer_alloc_mem_record(dip);
318
319         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
320         if (++trans->hmp->namekey_iterator == 0)
321                 ++trans->hmp->namekey_iterator;
322
323         record->rec.entry.base.base.obj_id = dip->obj_id;
324         record->rec.entry.base.base.key =
325                 hammer_directory_namekey(ncp->nc_name, bytes);
326         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
327         record->rec.entry.base.base.create_tid = trans->tid;
328         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
329         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
330         record->rec.entry.obj_id = ip->obj_id;
331         if (bytes <= sizeof(record->rec.entry.den_name)) {
332                 record->data = (void *)record->rec.entry.den_name;
333                 record->flags |= HAMMER_RECF_EMBEDDED_DATA;
334         } else {
335                 ++hammer_count_record_datas;
336                 record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
337                 record->flags |= HAMMER_RECF_ALLOCDATA;
338         }
339         bcopy(ncp->nc_name, record->data, bytes);
340         record->rec.entry.base.data_len = bytes;
341         ++ip->ino_rec.ino_nlinks;
342         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
343         error = hammer_mem_add(trans, record);
344         return(error);
345 }
346
347 /*
348  * Delete the directory entry and update the inode link count.  The
349  * cursor must be seeked to the directory entry record being deleted.
350  *
351  * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
352  *
353  * This function can return EDEADLK requiring the caller to terminate
354  * the cursor and retry.
355  */
356 int
357 hammer_ip_del_directory(struct hammer_transaction *trans,
358                      hammer_cursor_t cursor, struct hammer_inode *dip,
359                      struct hammer_inode *ip)
360 {
361         int error;
362
363         error = hammer_ip_delete_record(cursor, trans->tid);
364
365         /*
366          * One less link.  The file may still be open in the OS even after
367          * all links have gone away so we only try to sync if the OS has
368          * no references and nlinks falls to 0.
369          *
370          * We have to terminate the cursor before syncing the inode to
371          * avoid deadlocking against ourselves.
372          */
373         if (error == 0) {
374                 --ip->ino_rec.ino_nlinks;
375                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
376                 if (ip->ino_rec.ino_nlinks == 0 &&
377                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
378                         hammer_done_cursor(cursor);
379                         hammer_sync_inode(ip, MNT_NOWAIT, 1);
380                 }
381
382         }
383         return(error);
384 }
385
386 /*
387  * Add a record to an inode.
388  *
389  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
390  * initialize the following additional fields:
391  *
392  * record->rec.entry.base.base.key
393  * record->rec.entry.base.base.rec_type
394  * record->rec.entry.base.base.data_len
395  * record->data         (a copy will be kmalloc'd if not embedded)
396  */
397 int
398 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
399 {
400         hammer_inode_t ip = record->ip;
401         int error;
402         int bytes;
403         void *data;
404
405         record->rec.base.base.obj_id = ip->obj_id;
406         record->rec.base.base.create_tid = trans->tid;
407         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
408         bytes = record->rec.base.data_len;
409
410         if (record->data) {
411                 if ((char *)record->data < (char *)&record->rec ||
412                     (char *)record->data >= (char *)(&record->rec + 1)) {
413                         ++hammer_count_record_datas;
414                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
415                         record->flags |= HAMMER_RECF_ALLOCDATA;
416                         bcopy(record->data, data, bytes);
417                         record->data = data;
418                 } else {
419                         record->flags |= HAMMER_RECF_EMBEDDED_DATA;
420                 }
421         }
422         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
423         error = hammer_mem_add(trans, record);
424         return(error);
425 }
426
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * Returns 0 on success, EIO if a duplicate data record already exists,
 * ENOSPC with *spike loaded if the cluster is full, or another error.
 * EDEADLK from the B-Tree layer is handled internally by retrying.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes,
		       struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

retry:
	error = hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
	if (error)
		return(error);
	/*
	 * Data records are keyed by the ENDING offset of the chunk
	 * (offset + bytes).
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.asof = trans->tid;
	cursor.flags |= HAMMER_CURSOR_INSERT | HAMMER_CURSOR_ASOF;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.
	 * The target key must NOT already exist; a hit is a fatal
	 * duplicate (ENOENT is the expected outcome).
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
			offset, bytes);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
				  &cursor.data_buffer);
	if (bdata == NULL)
		goto done;
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	hammer_modify_buffer(cursor.record_buffer);
	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	rec->base.base.obj_id = ip->obj_id;
	rec->base.base.key = offset + bytes;
	rec->base.base.create_tid = trans->tid;
	rec->base.base.delete_tid = 0;
	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
	rec->base.data_crc = crc32(data, bytes);
	rec->base.rec_id = 0;	/* XXX */
	rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
	rec->base.data_len = bytes;

	hammer_modify_buffer(cursor.data_buffer);
	bcopy(data, bdata, bytes);

	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	/*
	 * Data records can wind up on-disk before the inode itself is
	 * on-disk.  One must assume data records may be on-disk if either
	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
	 */
	ip->flags |= HAMMER_INODE_DONDISK;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0) {
		hammer_update_syncid(cursor.record_buffer->cluster, trans->tid);
		goto done;
	}

	/*
	 * Insert failed: back out the allocations in reverse order.
	 */
	hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
	hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}
534
/*
 * Sync an in-memory record to the disk.  this is typically called via fsync
 * from a cached record source.  This code is responsible for actually
 * writing a record out to the disk.
 *
 * Returns 0 on success (the in-memory record is then marked DELETED and
 * the on-disk copy becomes authoritative), EIO on a non-directory
 * duplicate, or ENOSPC with *spike loaded if the cluster is full.
 * EDEADLK is handled internally by retrying.
 */
int
hammer_ip_sync_record(hammer_record_t record, struct hammer_cursor **spike)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_mount_t hmp;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

retry:
	error = hammer_init_cursor_hmp(&cursor, &record->ip->cache[0],
				       record->ip->hmp);
	if (error)
		return(error);
	cursor.key_beg = record->rec.base.base;
	cursor.flags |= HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.  The
	 * target key should not exist.  If we are creating a directory entry
	 * we may have to iterate the low 32 bits of the key to find an unused
	 * key.
	 *
	 * If we run out of space trying to adjust the B-Tree for the
	 * insert, re-lookup without the insert flag so the cursor
	 * is properly positioned for the spike.
	 */
	for (;;) {
		error = hammer_btree_lookup(&cursor);
		if (error)
			break;
		/*
		 * Lookup succeeded == the key already exists.  Only
		 * directory entries may retry with a fresh low-32-bit
		 * iterator value; anything else is a fatal duplicate.
		 */
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
			kprintf("hammer_ip_sync_record: duplicate rec "
				"at (%016llx)\n", record->rec.base.base.key);
			Debugger("duplicate record1");
			error = EIO;
			break;
		}
		hmp = cursor.node->cluster->volume->hmp;
		if (++hmp->namekey_iterator == 0)
			++hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= hmp->namekey_iterator;
		cursor.key_beg.key = record->rec.base.base.key;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Mark the record as undergoing synchronization.  Our cursor is
	 * holding a locked B-Tree node for the insertion which interlocks
	 * anyone trying to access this record.
	 *
	 * XXX There is still a race present related to iterations.  An
	 * iteration may process the record, a sync may occur, and then
	 * later process the B-Tree element for the same record.
	 *
	 * We do not try to synchronize a deleted record.
	 */
	if (record->flags & (HAMMER_RECF_DELETED | HAMMER_RECF_SYNCING)) {
		error = 0;
		goto done;
	}
	record->flags |= HAMMER_RECF_SYNCING;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.  Embedded data travels inside the
	 * record itself and needs no separate allocation.
	 */
	if (record->data == NULL ||
	    (record->flags & HAMMER_RECF_EMBEDDED_DATA)) {
		bdata = record->data;
	} else {
		bdata = hammer_alloc_data(cursor.node->cluster,
					  record->rec.base.data_len, &error,
					  &cursor.data_buffer);
		if (bdata == NULL)
			goto fail2;
	}
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 *
	 * XXX assign rec_id here
	 */
	hammer_modify_buffer(cursor.record_buffer);
	*rec = record->rec;
	if (bdata) {
		rec->base.data_crc = crc32(record->data,
					   record->rec.base.data_len);
		if (record->flags & HAMMER_RECF_EMBEDDED_DATA) {
			/*
			 * Data embedded in record: data_offset becomes the
			 * cluster-relative offset of the embedded bytes in
			 * the on-disk copy of the record.
			 */
			rec->base.data_offset = ((char *)bdata -
						 (char *)&record->rec);
			KKASSERT(rec->base.data_offset >= 0 && 
				 rec->base.data_offset + rec->base.data_len <=
				  sizeof(*rec));
			rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec);
		} else {
			/*
			 * Data separate from record
			 */
			rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata);
			hammer_modify_buffer(cursor.data_buffer);
			bcopy(record->data, bdata, rec->base.data_len);
		}
	}
	rec->base.rec_id = 0;	/* XXX */

	elm.leaf.base = record->rec.base.base;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = rec->base.data_len;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);

	/*
	 * Clean up on success, or fall through on error.  On success the
	 * in-memory record is marked DELETED so it is discarded when the
	 * last reference goes away; the on-disk copy is now authoritative.
	 */
	if (error == 0) {
		record->flags |= HAMMER_RECF_DELETED;
		record->flags &= ~HAMMER_RECF_SYNCING;
		hammer_update_syncid(cursor.record_buffer->cluster,
				     record->rec.base.base.create_tid);
		goto done;
	}

	hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
	/* only separately-allocated data was carved out of the cluster */
	if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
		hammer_free_data_ptr(cursor.data_buffer, bdata,
				     record->rec.base.data_len);
	}
fail2:
	record->flags &= ~HAMMER_RECF_SYNCING;
done:
	/*
	 * If ENOSPC in cluster fill in the spike structure and return
	 * ENOSPC.
	 */
	if (error == ENOSPC)
		hammer_load_spike(&cursor, spike);
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}
695
696 /*
697  * Write out a record using the specified cursor.  The caller does not have
698  * to seek the cursor.  The flags are used to determine whether the data
699  * (if any) is embedded in the record or not.
700  *
701  * The target cursor will be modified by this call.  Note in particular
702  * that HAMMER_CURSOR_INSERT is set.
703  *
704  * NOTE: This can return EDEADLK, requiring the caller to release its cursor
705  * and retry the operation.
706  */
707 int
708 hammer_write_record(hammer_cursor_t cursor, hammer_record_ondisk_t orec,
709                     void *data, int cursor_flags)
710 {
711         union hammer_btree_elm elm;
712         hammer_record_ondisk_t nrec;
713         void *bdata;
714         int error;
715
716         cursor->key_beg = orec->base.base;
717         cursor->flags |= HAMMER_CURSOR_INSERT;
718
719         /*
720          * Issue a lookup to position the cursor and locate the cluster.  The
721          * target key should not exist.
722          *
723          * If we run out of space trying to adjust the B-Tree for the
724          * insert, re-lookup without the insert flag so the cursor
725          * is properly positioned for the spike.
726          */
727         error = hammer_btree_lookup(cursor);
728         if (error == 0) {
729                 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n",
730                         orec->base.base.key);
731                 Debugger("duplicate record2");
732                 error = EIO;
733         }
734         if (error != ENOENT)
735                 goto done;
736
737         /*
738          * Allocate record and data space now that we know which cluster
739          * the B-Tree node ended up in.
740          */
741         if (data == NULL ||
742             (cursor_flags & HAMMER_RECF_EMBEDDED_DATA)) {
743                 bdata = data;
744         } else {
745                 bdata = hammer_alloc_data(cursor->node->cluster,
746                                           orec->base.data_len, &error,
747                                           &cursor->data_buffer);
748                 if (bdata == NULL)
749                         goto done;
750         }
751         nrec = hammer_alloc_record(cursor->node->cluster, &error,
752                                   &cursor->record_buffer);
753         if (nrec == NULL)
754                 goto fail1;
755
756         /*
757          * Fill everything in and insert our B-Tree node.
758          *
759          * XXX assign rec_id here
760          */
761         hammer_modify_buffer(cursor->record_buffer);
762         *nrec = *orec;
763         nrec->base.data_offset = 0;
764         if (bdata) {
765                 nrec->base.data_crc = crc32(bdata, nrec->base.data_len);
766                 if (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) {
767                         /*
768                          * Data embedded in record
769                          */
770                         nrec->base.data_offset = ((char *)bdata - (char *)orec);
771                         KKASSERT(nrec->base.data_offset >= 0 && 
772                                  nrec->base.data_offset + nrec->base.data_len <
773                                   sizeof(*nrec));
774                         nrec->base.data_offset += hammer_bclu_offset(cursor->record_buffer, nrec);
775                 } else {
776                         /*
777                          * Data separate from record
778                          */
779                         nrec->base.data_offset = hammer_bclu_offset(cursor->data_buffer, bdata);
780                         hammer_modify_buffer(cursor->data_buffer);
781                         bcopy(data, bdata, nrec->base.data_len);
782                 }
783         }
784         nrec->base.rec_id = 0;  /* XXX */
785
786         elm.leaf.base = nrec->base.base;
787         elm.leaf.rec_offset = hammer_bclu_offset(cursor->record_buffer, nrec);
788         elm.leaf.data_offset = nrec->base.data_offset;
789         elm.leaf.data_len = nrec->base.data_len;
790         elm.leaf.data_crc = nrec->base.data_crc;
791
792         error = hammer_btree_insert(cursor, &elm);
793         if (error == 0) {
794                 hammer_update_syncid(cursor->record_buffer->cluster,
795                                      nrec->base.base.create_tid);
796                 goto done;
797         }
798
799         hammer_free_record_ptr(cursor->record_buffer, nrec);
800 fail1:
801         if (data && (cursor_flags & HAMMER_RECF_EMBEDDED_DATA) == 0) {
802                 hammer_free_data_ptr(cursor->data_buffer, bdata,
803                                      orec->base.data_len);
804         }
805 done:
806         /* leave cursor intact */
807         return(error);
808 }
809
810 /*
811  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
812  * entry's key is used to deal with hash collisions in the upper 32 bits.
813  * A unique 64 bit key is generated in-memory and may be regenerated a
814  * second time when the directory record is flushed to the on-disk B-Tree.
815  *
816  * A referenced record is passed to this function.  This function
817  * eats the reference.  If an error occurs the record will be deleted.
818  */
819 static
820 int
821 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
822 {
823         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
824                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
825                         record->flags |= HAMMER_RECF_DELETED;
826                         hammer_rel_mem_record(record);
827                         return (EEXIST);
828                 }
829                 if (++trans->hmp->namekey_iterator == 0)
830                         ++trans->hmp->namekey_iterator;
831                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
832                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
833         }
834         record->flags |= HAMMER_RECF_ONRBTREE;
835         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
836         hammer_rel_mem_record(record);
837         return(0);
838 }
839
840 /************************************************************************
841  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
842  ************************************************************************
843  *
844  * These functions augment the B-Tree scanning functions in hammer_btree.c
845  * by merging in-memory records with on-disk records.
846  */
847
848 /*
849  * Locate a particular record either in-memory or on-disk.
850  *
851  * NOTE: This is basically a standalone routine, hammer_ip_next() may
852  * NOT be called to iterate results.
853  */
854 int
855 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
856 {
857         int error;
858
859         /*
860          * If the element is in-memory return it without searching the
861          * on-disk B-Tree
862          */
863         error = hammer_mem_lookup(cursor, ip);
864         if (error == 0) {
865                 cursor->record = &cursor->iprec->rec;
866                 return(error);
867         }
868         if (error != ENOENT)
869                 return(error);
870
871         /*
872          * If the inode has on-disk components search the on-disk B-Tree.
873          */
874         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
875                 return(error);
876         error = hammer_btree_lookup(cursor);
877         if (error == 0)
878                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
879         return(error);
880 }
881
882 /*
883  * Locate the first record within the cursor's key_beg/key_end range,
884  * restricted to a particular inode.  0 is returned on success, ENOENT
885  * if no records matched the requested range, or some other error.
886  *
887  * When 0 is returned hammer_ip_next() may be used to iterate additional
888  * records within the requested range.
889  *
890  * This function can return EDEADLK, requiring the caller to terminate
891  * the cursor and try again.
892  */
893 int
894 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
895 {
896         int error;
897
898         /*
899          * Clean up fields and setup for merged scan
900          */
901         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
902         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
903         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
904         if (cursor->iprec) {
905                 hammer_rel_mem_record(cursor->iprec);
906                 cursor->iprec = NULL;
907         }
908
909         /*
910          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
911          * exact lookup so if we get ENOENT we have to call the iterate
912          * function to validate the first record after the begin key.
913          *
914          * The ATEDISK flag is used by hammer_btree_iterate to determine
915          * whether it must index forwards or not.  It is also used here
916          * to select the next record from in-memory or on-disk.
917          *
918          * EDEADLK can only occur if the lookup hit an empty internal
919          * element and couldn't delete it.  Since this could only occur
920          * in-range, we can just iterate from the failure point.
921          */
922         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
923                 error = hammer_btree_lookup(cursor);
924                 if (error == ENOENT || error == EDEADLK) {
925                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
926                         error = hammer_btree_iterate(cursor);
927                 }
928                 if (error && error != ENOENT) 
929                         return(error);
930                 if (error == 0) {
931                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
932                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
933                 } else {
934                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
935                 }
936         }
937
938         /*
939          * Search the in-memory record list (Red-Black tree).  Unlike the
940          * B-Tree search, mem_first checks for records in the range.
941          */
942         error = hammer_mem_first(cursor, ip);
943         if (error && error != ENOENT)
944                 return(error);
945         if (error == 0) {
946                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
947                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
948         }
949
950         /*
951          * This will return the first matching record.
952          */
953         return(hammer_ip_next(cursor));
954 }
955
956 /*
957  * Retrieve the next record in a merged iteration within the bounds of the
958  * cursor.  This call may be made multiple times after the cursor has been
959  * initially searched with hammer_ip_first().
960  *
961  * 0 is returned on success, ENOENT if no further records match the
962  * requested range, or some other error code is returned.
963  */
964 int
965 hammer_ip_next(hammer_cursor_t cursor)
966 {
967         hammer_btree_elm_t elm;
968         hammer_record_t rec;
969         int error;
970         int r;
971
972         /*
973          * Load the current on-disk and in-memory record.  If we ate any
974          * records we have to get the next one. 
975          *
976          * If we deleted the last on-disk record we had scanned ATEDISK will
977          * be clear and DELBTREE will be set, forcing a call to iterate. The
978          * fact that ATEDISK is clear causes iterate to re-test the 'current'
979          * element.  If ATEDISK is set, iterate will skip the 'current'
980          * element.
981          *
982          * Get the next on-disk record
983          */
984         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
985                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
986                         error = hammer_btree_iterate(cursor);
987                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
988                         if (error == 0)
989                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
990                         else
991                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
992                                                  HAMMER_CURSOR_ATEDISK;
993                 }
994         }
995
996         /*
997          * Get the next in-memory record.  The record can be ripped out
998          * of the RB tree so we maintain a scan_info structure to track
999          * the next node.
1000          *
1001          * hammer_rec_scan_cmp:  Is the record still in our general range,
1002          *                       (non-inclusive of snapshot exclusions)?
1003          * hammer_rec_scan_callback: Is the record in our snapshot?
1004          */
1005         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1006                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1007                         if (cursor->iprec) {
1008                                 hammer_rel_mem_record(cursor->iprec);
1009                                 cursor->iprec = NULL;
1010                         }
1011                         rec = cursor->scan.node;        /* next node */
1012                         while (rec) {
1013                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
1014                                         break;
1015                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
1016                                         break;
1017                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1018                         }
1019                         if (cursor->iprec) {
1020                                 KKASSERT(cursor->iprec == rec);
1021                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1022                                 cursor->scan.node =
1023                                         hammer_rec_rb_tree_RB_NEXT(rec);
1024                         } else {
1025                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1026                         }
1027                 }
1028         }
1029
1030         /*
1031          * Extract either the disk or memory record depending on their
1032          * relative position.
1033          */
1034         error = 0;
1035         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1036         case 0:
1037                 /*
1038                  * Both entries valid
1039                  */
1040                 elm = &cursor->node->ondisk->elms[cursor->index];
1041                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1042                 if (r < 0) {
1043                         error = hammer_btree_extract(cursor,
1044                                                      HAMMER_CURSOR_GET_RECORD);
1045                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1046                         break;
1047                 }
1048                 /* fall through to the memory entry */
1049         case HAMMER_CURSOR_ATEDISK:
1050                 /*
1051                  * Only the memory entry is valid
1052                  */
1053                 cursor->record = &cursor->iprec->rec;
1054                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1055                 break;
1056         case HAMMER_CURSOR_ATEMEM:
1057                 /*
1058                  * Only the disk entry is valid
1059                  */
1060                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1061                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1062                 break;
1063         default:
1064                 /*
1065                  * Neither entry is valid
1066                  *
1067                  * XXX error not set properly
1068                  */
1069                 cursor->record = NULL;
1070                 error = ENOENT;
1071                 break;
1072         }
1073         return(error);
1074 }
1075
1076 /*
1077  * Resolve the cursor->data pointer for the current cursor position in
1078  * a merged iteration.
1079  */
1080 int
1081 hammer_ip_resolve_data(hammer_cursor_t cursor)
1082 {
1083         int error;
1084
1085         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1086                 cursor->data = cursor->iprec->data;
1087                 error = 0;
1088         } else {
1089                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1090         }
1091         return(error);
1092 }
1093
1094 /*
1095  * Delete all records within the specified range for inode ip.
1096  *
1097  * NOTE: An unaligned range will cause new records to be added to cover
1098  * the edge cases. (XXX not implemented yet).
1099  *
1100  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1101  *
1102  * NOTE: Record keys for regular file data have to be special-cased since
1103  * they indicate the end of the range (key = base + bytes).
1104  *
1105  * NOTE: The spike structure must be filled in if we return ENOSPC.
1106  */
1107 int
1108 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1109                        int64_t ran_beg, int64_t ran_end,
1110                        struct hammer_cursor **spike)
1111 {
1112         struct hammer_cursor cursor;
1113         hammer_record_ondisk_t rec;
1114         hammer_base_elm_t base;
1115         int error;
1116         int64_t off;
1117
1118 retry:
1119         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1120
1121         cursor.key_beg.obj_id = ip->obj_id;
1122         cursor.key_beg.create_tid = 0;
1123         cursor.key_beg.delete_tid = 0;
1124         cursor.key_beg.obj_type = 0;
1125         cursor.asof = ip->obj_asof;
1126         cursor.flags |= HAMMER_CURSOR_ASOF;
1127
1128         cursor.key_end = cursor.key_beg;
1129         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1130                 cursor.key_beg.key = ran_beg;
1131                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1132                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1133                 cursor.key_end.key = ran_end;
1134         } else {
1135                 /*
1136                  * The key in the B-Tree is (base+bytes), so the first possible
1137                  * matching key is ran_beg + 1.
1138                  */
1139                 int64_t tmp64;
1140
1141                 cursor.key_beg.key = ran_beg + 1;
1142                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1143                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1144
1145                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1146                 if (tmp64 < ran_end)
1147                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1148                 else
1149                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1150         }
1151         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1152
1153         error = hammer_ip_first(&cursor, ip);
1154
1155         /*
1156          * Iterate through matching records and mark them as deleted.
1157          */
1158         while (error == 0) {
1159                 rec = cursor.record;
1160                 base = &rec->base.base;
1161
1162                 KKASSERT(base->delete_tid == 0);
1163
1164                 /*
1165                  * There may be overlap cases for regular file data.  Also
1166                  * remember the key for a regular file record is the offset
1167                  * of the last byte of the record (base + len - 1), NOT the
1168                  * base offset.
1169                  */
1170 #if 0
1171                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1172 #endif
1173                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1174 #if 0
1175                         kprintf("delete_range loop key %016llx\n",
1176                                 base->key - rec->base.data_len);
1177 #endif
1178                         off = base->key - rec->base.data_len;
1179                         /*
1180                          * Check the left edge case.  We currently do not
1181                          * split existing records.
1182                          */
1183                         if (off < ran_beg) {
1184                                 panic("hammer left edge case %016llx %d\n",
1185                                         base->key, rec->base.data_len);
1186                         }
1187
1188                         /*
1189                          * Check the right edge case.  Note that the
1190                          * record can be completely out of bounds, which
1191                          * terminates the search.
1192                          *
1193                          * base->key is exclusive of the right edge while
1194                          * ran_end is inclusive of the right edge.  The
1195                          * (key - data_len) left boundary is inclusive.
1196                          *
1197                          * XXX theory-check this test at some point, are
1198                          * we missing a + 1 somewhere?  Note that ran_end
1199                          * could overflow.
1200                          */
1201                         if (base->key - 1 > ran_end) {
1202                                 if (base->key - rec->base.data_len > ran_end)
1203                                         break;
1204                                 panic("hammer right edge case\n");
1205                         }
1206                 }
1207
1208                 /*
1209                  * Mark the record and B-Tree entry as deleted.  This will
1210                  * also physically delete the B-Tree entry, record, and
1211                  * data if the retention policy dictates.  The function
1212                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1213                  * uses to perform a fixup.
1214                  */
1215                 error = hammer_ip_delete_record(&cursor, trans->tid);
1216                 if (error)
1217                         break;
1218                 error = hammer_ip_next(&cursor);
1219         }
1220         hammer_done_cursor(&cursor);
1221         if (error == EDEADLK)
1222                 goto retry;
1223         if (error == ENOENT)
1224                 error = 0;
1225         return(error);
1226 }
1227
1228 /*
1229  * Delete all records associated with an inode except the inode record
1230  * itself.
1231  */
1232 int
1233 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1234 {
1235         struct hammer_cursor cursor;
1236         hammer_record_ondisk_t rec;
1237         hammer_base_elm_t base;
1238         int error;
1239
1240 retry:
1241         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1242
1243         cursor.key_beg.obj_id = ip->obj_id;
1244         cursor.key_beg.create_tid = 0;
1245         cursor.key_beg.delete_tid = 0;
1246         cursor.key_beg.obj_type = 0;
1247         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1248         cursor.key_beg.key = HAMMER_MIN_KEY;
1249
1250         cursor.key_end = cursor.key_beg;
1251         cursor.key_end.rec_type = 0xFFFF;
1252         cursor.key_end.key = HAMMER_MAX_KEY;
1253
1254         cursor.asof = ip->obj_asof;
1255         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1256
1257         error = hammer_ip_first(&cursor, ip);
1258
1259         /*
1260          * Iterate through matching records and mark them as deleted.
1261          */
1262         while (error == 0) {
1263                 rec = cursor.record;
1264                 base = &rec->base.base;
1265
1266                 KKASSERT(base->delete_tid == 0);
1267
1268                 /*
1269                  * Mark the record and B-Tree entry as deleted.  This will
1270                  * also physically delete the B-Tree entry, record, and
1271                  * data if the retention policy dictates.  The function
1272                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1273                  * uses to perform a fixup.
1274                  */
1275                 error = hammer_ip_delete_record(&cursor, trans->tid);
1276                 if (error)
1277                         break;
1278                 error = hammer_ip_next(&cursor);
1279         }
1280         hammer_done_cursor(&cursor);
1281         if (error == EDEADLK)
1282                 goto retry;
1283         if (error == ENOENT)
1284                 error = 0;
1285         return(error);
1286 }
1287
1288 /*
1289  * Delete the record at the current cursor.
1290  *
1291  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1292  * cursor and retry.
1293  */
1294 int
1295 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1296 {
1297         hammer_btree_elm_t elm;
1298         hammer_mount_t hmp;
1299         int error;
1300
1301         /*
1302          * In-memory (unsynchronized) records can simply be freed.
1303          */
1304         if (cursor->record == &cursor->iprec->rec) {
1305                 cursor->iprec->flags |= HAMMER_RECF_DELETED;
1306                 return(0);
1307         }
1308
1309         /*
1310          * On-disk records are marked as deleted by updating their delete_tid.
1311          */
1312         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1313         elm = NULL;
1314         hmp = cursor->node->cluster->volume->hmp;
1315
1316         if (error == 0) {
1317                 hammer_modify_buffer(cursor->record_buffer);
1318                 cursor->record->base.base.delete_tid = tid;
1319
1320                 error = hammer_cursor_upgrade(cursor);
1321                 if (error == 0) {
1322                         hammer_modify_node(cursor->node);
1323                         elm = &cursor->node->ondisk->elms[cursor->index];
1324                         elm->leaf.base.delete_tid = tid;
1325                         hammer_update_syncid(cursor->record_buffer->cluster,
1326                                              tid);
1327                 }
1328         }
1329
1330         /*
1331          * If we were mounted with the nohistory option, we physically
1332          * delete the record.
1333          */
1334         if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) {
1335                 int32_t rec_offset;
1336                 int32_t data_offset;
1337                 int32_t data_len;
1338                 hammer_cluster_t cluster;
1339
1340                 rec_offset = elm->leaf.rec_offset;
1341                 data_offset = elm->leaf.data_offset;
1342                 data_len = elm->leaf.data_len;
1343 #if 0
1344                 kprintf("hammer_ip_delete_record: %08x %08x/%d\n",
1345                         rec_offset, data_offset, data_len);
1346 #endif
1347                 cluster = cursor->node->cluster;
1348                 hammer_ref_cluster(cluster);
1349
1350                 error = hammer_btree_delete(cursor);
1351                 if (error == 0) {
1352                         /*
1353                          * This forces a fixup for the iteration because
1354                          * the cursor is now either sitting at the 'next'
1355                          * element or sitting at the end of a leaf.
1356                          */
1357                         if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1358                                 cursor->flags |= HAMMER_CURSOR_DELBTREE;
1359                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1360                         }
1361                         hammer_free_record(cluster, rec_offset);
1362                         if (data_offset && (data_offset - rec_offset < 0 ||
1363                             data_offset - rec_offset >= HAMMER_RECORD_SIZE)) {
1364                                 hammer_free_data(cluster, data_offset,data_len);
1365                         }
1366                 }
1367                 hammer_rel_cluster(cluster, 0);
1368                 if (error) {
1369                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1370                         error = 0;
1371                 }
1372         }
1373         return(error);
1374 }
1375
1376 /*
1377  * Determine whether a directory is empty or not.  Returns 0 if the directory
1378  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1379  */
1380 int
1381 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1382 {
1383         struct hammer_cursor cursor;
1384         int error;
1385
1386         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1387
1388         cursor.key_beg.obj_id = ip->obj_id;
1389         cursor.key_beg.create_tid = 0;
1390         cursor.key_beg.delete_tid = 0;
1391         cursor.key_beg.obj_type = 0;
1392         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1393         cursor.key_beg.key = HAMMER_MIN_KEY;
1394
1395         cursor.key_end = cursor.key_beg;
1396         cursor.key_end.rec_type = 0xFFFF;
1397         cursor.key_end.key = HAMMER_MAX_KEY;
1398
1399         cursor.asof = ip->obj_asof;
1400         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1401
1402         error = hammer_ip_first(&cursor, ip);
1403         if (error == ENOENT)
1404                 error = 0;
1405         else if (error == 0)
1406                 error = ENOTEMPTY;
1407         hammer_done_cursor(&cursor);
1408         return(error);
1409 }
1410