/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.86 2008/07/11 01:22:29 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_add(hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor);
static int hammer_mem_first(hammer_cursor_t cursor);
static int hammer_frontend_trunc_callback(hammer_record_t record,
                                void *data __unused);
static int hammer_record_needs_overwrite_delete(hammer_record_t record);
static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
                      hammer_btree_leaf_elm_t leaf);

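/*
 * Scan arguments for hammer_rec_trunc_cmp(), used by the frontend
 * truncation scan in hammer_ip_frontend_trunc() below.
 */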
struct rec_trunc_info {
        u_int16_t       rec_type;
        int64_t         trunc_off;
};

/*
 * Red-black tree support.  Comparison code for insertion.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
        if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type)
                return(-1);
        if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type)
                return(1);

        if (rec1->leaf.base.key < rec2->leaf.base.key)
                return(-1);
        if (rec1->leaf.base.key > rec2->leaf.base.key)
                return(1);

        /*
         * Never match against an item deleted by the front-end.
         *
         * rec1 is greater than rec2 if rec1 is marked deleted.
         * rec1 is less than rec2 if rec2 is marked deleted.
         *
         * Multiple deleted records may be present; do not return 0
         * if both are marked deleted.
         */
        if (rec1->flags & HAMMER_RECF_DELETED_FE)
                return(1);
        if (rec2->flags & HAMMER_RECF_DELETED_FE)
                return(-1);

        return(0);
}

/*
 * Basic record comparison code similar to hammer_btree_cmp().
 */
static int
hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
{
        if (elm->rec_type < rec->leaf.base.rec_type)
                return(-3);
        if (elm->rec_type > rec->leaf.base.rec_type)
                return(3);

        if (elm->key < rec->leaf.base.key)
                return(-2);
        if (elm->key > rec->leaf.base.key)
                return(2);

        /*
         * Never match against an item deleted by the front-end.
         * elm is less than rec if rec is marked deleted.
         */
        if (rec->flags & HAMMER_RECF_DELETED_FE)
                return(-1);
        return(0);
}

/*
 * Special LOOKUP_INFO to locate an overlapping record.  This is used by
 * the reservation code to implement small-block records (whose keys will
 * be different depending on data_len, when representing the same base
 * offset).
 *
 * NOTE: The base file offset of a data record is (key - data_len), not (key).
 */
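/*
 * For example (illustrative values): a DATA record covering file offsets
 * [0, 16384) carries base.key == 16384 and data_len == 16384, so its base
 * offset is key - data_len == 0.  Two records representing the same base
 * offset with different lengths therefore have different keys, which is
 * why an overlap compare is needed here.
 */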
static int
hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec)
{
        if (leaf->base.rec_type < rec->leaf.base.rec_type)
                return(-3);
        if (leaf->base.rec_type > rec->leaf.base.rec_type)
                return(3);

        /*
         * Overlap compare
         */
        if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
                /* leaf_end <= rec_beg */
                if (leaf->base.key <= rec->leaf.base.key - rec->leaf.data_len)
                        return(-2);
                /* leaf_beg >= rec_end */
                if (leaf->base.key - leaf->data_len >= rec->leaf.base.key)
                        return(2);
        } else {
                if (leaf->base.key < rec->leaf.base.key)
                        return(-2);
                if (leaf->base.key > rec->leaf.base.key)
                        return(2);
        }

        /*
         * Never match against an item deleted by the front-end.
         * leaf is less than rec if rec is marked deleted.
         *
         * We must still return the proper code for the scan to continue
         * along the correct branches.
         */
        if (rec->flags & HAMMER_RECF_DELETED_FE) {
                if (leaf->base.key < rec->leaf.base.key)
                        return(-2);
                if (leaf->base.key > rec->leaf.base.key)
                        return(2);
                return(-1);
        }
        return(0);
}

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_cmp(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        r = hammer_rec_cmp(&cursor->key_end, rec);
        if (r < -1)
                return(1);
        return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_cmp(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        if (r < -1)
                return(1);
        return(0);
}

/*
 * Locate blocks within the truncation range.  Partial blocks do not count.
 */
static
int
hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
{
        struct rec_trunc_info *info = data;

        if (rec->leaf.base.rec_type < info->rec_type)
                return(-1);
        if (rec->leaf.base.rec_type > info->rec_type)
                return(1);

        switch(rec->leaf.base.rec_type) {
        case HAMMER_RECTYPE_DB:
                /*
                 * DB record key is not beyond the truncation point, retain.
                 */
                if (rec->leaf.base.key < info->trunc_off)
                        return(-1);
                break;
        case HAMMER_RECTYPE_DATA:
                /*
                 * DATA record offset start is not beyond the truncation point,
                 * retain.
                 */
                if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off)
                        return(-1);
                break;
        default:
                panic("hammer_rec_trunc_cmp: unexpected record type");
        }

        /*
         * The record start is >= the truncation point, return match,
         * the record should be destroyed.
         */
        return(0);
}
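
/*
 * Worked example (illustrative): truncating a regular file to 8192, a
 * DATA record covering [0, 16384) has base offset 0 < 8192 and is
 * retained (it is the partial block containing the truncation point),
 * while a record covering [16384, 32768) starts at or beyond the
 * truncation point and matches for destruction.
 */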

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
                    hammer_rec_overlap_compare, hammer_btree_leaf_elm_t);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip, int data_len)
{
        hammer_record_t record;

        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER,
                         M_WAITOK | M_ZERO | M_USE_RESERVE);
        record->flush_state = HAMMER_FST_IDLE;
        record->ip = ip;
        record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
        record->leaf.data_len = data_len;
        hammer_ref(&record->lock);

        if (data_len) {
                record->data = kmalloc(data_len, M_HAMMER, M_WAITOK | M_ZERO);
                record->flags |= HAMMER_RECF_ALLOCDATA;
                ++hammer_count_record_datas;
        }

        return (record);
}

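/*
 * Block until the record leaves the HAMMER_FST_FLUSH state.  The backend
 * wakes us via HAMMER_RECF_WANTED when the flush completes (see
 * hammer_flush_record_done() below).
 */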
void
hammer_wait_mem_record_ident(hammer_record_t record, const char *ident)
{
        while (record->flush_state == HAMMER_FST_FLUSH) {
                record->flags |= HAMMER_RECF_WANTED;
                tsleep(record, 0, ident, 0);
        }
}

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
        hammer_inode_t target_ip;

        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

        if (error) {
                /*
                 * An error occurred, the backend was unable to sync the
                 * record to its media.  Leave the record intact.
                 */
                Debugger("flush_record_done error");
        }

        if (record->flags & HAMMER_RECF_DELETED_BE) {
                if ((target_ip = record->target_ip) != NULL) {
                        TAILQ_REMOVE(&target_ip->target_list, record,
                                     target_entry);
                        record->target_ip = NULL;
                        hammer_test_inode(target_ip);
                }
                record->flush_state = HAMMER_FST_IDLE;
        } else {
                if (record->target_ip) {
                        record->flush_state = HAMMER_FST_SETUP;
                        hammer_test_inode(record->ip);
                        hammer_test_inode(record->target_ip);
                } else {
                        record->flush_state = HAMMER_FST_IDLE;
                }
        }
        record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
        if (record->flags & HAMMER_RECF_WANTED) {
                record->flags &= ~HAMMER_RECF_WANTED;
                wakeup(record);
        }
        hammer_rel_mem_record(record);
}

/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
        hammer_inode_t ip, target_ip;

        hammer_unref(&record->lock);

        if (record->lock.refs == 0) {
                /*
                 * Upon release of the last reference wakeup any waiters.
                 * The record structure may get destroyed so callers will
                 * loop up and do a relookup.
                 *
                 * WARNING!  Record must be removed from RB-TREE before we
                 * might possibly block.  hammer_test_inode() can block!
                 */
                ip = record->ip;

                /*
                 * Upon release of the last reference a record marked deleted
                 * is destroyed.
                 */
                if (record->flags & HAMMER_RECF_DELETED_FE) {
                        KKASSERT(ip->lock.refs > 0);
                        KKASSERT(record->flush_state != HAMMER_FST_FLUSH);

                        /*
                         * target_ip may have zero refs, we have to ref it
                         * to prevent it from being ripped out from under
                         * us.
                         */
                        if ((target_ip = record->target_ip) != NULL) {
                                TAILQ_REMOVE(&target_ip->target_list,
                                             record, target_entry);
                                record->target_ip = NULL;
                                hammer_ref(&target_ip->lock);
                        }

                        if (record->flags & HAMMER_RECF_ONRBTREE) {
                                RB_REMOVE(hammer_rec_rb_tree,
                                          &record->ip->rec_tree,
                                          record);
                                KKASSERT(ip->rsv_recs > 0);
                                --ip->hmp->rsv_recs;
                                --ip->rsv_recs;
                                ip->hmp->rsv_databytes -= record->leaf.data_len;
                                record->flags &= ~HAMMER_RECF_ONRBTREE;

                                if (RB_EMPTY(&record->ip->rec_tree)) {
                                        record->ip->flags &= ~HAMMER_INODE_XDIRTY;
                                        record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
                                        hammer_test_inode(record->ip);
                                }
                        }

                        /*
                         * Do this test after removing record from the B-Tree.
                         */
                        if (target_ip) {
                                hammer_test_inode(target_ip);
                                hammer_rel_inode(target_ip, 0);
                        }

                        if (record->flags & HAMMER_RECF_ALLOCDATA) {
                                --hammer_count_record_datas;
                                kfree(record->data, M_HAMMER);
                                record->flags &= ~HAMMER_RECF_ALLOCDATA;
                        }
                        if (record->resv) {
                                hammer_blockmap_reserve_complete(ip->hmp,
                                                                 record->resv);
                                record->resv = NULL;
                        }
                        record->data = NULL;
                        --hammer_count_records;
                        kfree(record, M_HAMMER);
                }
        }
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.
 *
 * Return non-zero if the record is visible, zero if it isn't or if it is
 * deleted.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
        if (cursor->flags & HAMMER_CURSOR_BACKEND) {
                if (record->flags & HAMMER_RECF_DELETED_BE)
                        return(0);
        } else {
                if (record->flags & HAMMER_RECF_DELETED_FE)
                        return(0);
        }
        return(1);
}

/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;

        /*
         * We terminate on success, so this should be NULL on entry.
         */
        KKASSERT(cursor->iprec == NULL);

        /*
         * Skip if the record was marked deleted.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
                return(0);

        /*
         * Skip if not visible due to our as-of TID
         */
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                if (cursor->asof < rec->leaf.base.create_tid)
                        return(0);
                if (rec->leaf.base.delete_tid &&
                    cursor->asof >= rec->leaf.base.delete_tid) {
                        return(0);
                }
        }

        /*
         * ref the record.  The record is protected from backend B-Tree
         * interactions by virtue of the cursor's IP lock.
         */
        hammer_ref(&rec->lock);

        /*
         * The record may have been deleted while we were blocked.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
                hammer_rel_mem_record(rec);
                return(0);
        }

        /*
         * Set the matching record and stop the scan.
         */
        cursor->iprec = rec;
        return(-1);
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor)
{
        int error;

        KKASSERT(cursor->ip);
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp,
                                   hammer_rec_scan_callback, cursor);

        if (cursor->iprec == NULL)
                error = ENOENT;
        else
                error = 0;
        return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor)
{
        hammer_inode_t ip;

        ip = cursor->ip;
        KKASSERT(ip != NULL);

        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
                                   hammer_rec_scan_callback, cursor);

        /*
         * Adjust scan.node and keep it linked into the RB-tree so we can
         * hold the cursor through third party modifications of the RB-tree.
         */
        if (cursor->iprec)
                return(0);
        return(ENOENT);
}

/************************************************************************
 *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * NOTE: bytes does not include any terminating \0 on name, and name might
 * not be terminated.
 */
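/*
 * Illustrative key layout (hash value hypothetical): if the name hashes
 * to 0x1234ABCD00000000, the in-memory key becomes
 * 0x1234ABCD00000000 + namekey_iterator, i.e. the hash occupies the
 * upper 32 bits and the iterator disambiguates collisions in the lower
 * 32 bits.
 */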
int
hammer_ip_add_directory(struct hammer_transaction *trans,
                     struct hammer_inode *dip, const char *name, int bytes,
                     struct hammer_inode *ip)
{
        struct hammer_cursor cursor;
        hammer_record_t record;
        int error;
        int count;
        u_int32_t iterator;

        record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes));
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;

        record->type = HAMMER_MEM_RECORD_ADD;
        record->leaf.base.localization = dip->obj_localization +
                                         HAMMER_LOCALIZE_MISC;
        record->leaf.base.obj_id = dip->obj_id;
        record->leaf.base.key = hammer_directory_namekey(name, bytes);
        record->leaf.base.key += trans->hmp->namekey_iterator;
        record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
        record->data->entry.obj_id = ip->obj_id;
        record->data->entry.localization = ip->obj_localization;
        bcopy(name, record->data->entry.name, bytes);

        ++ip->ino_data.nlinks;
        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);

        /*
         * Find an unused namekey.  Both the in-memory record tree and
         * the B-Tree are checked.  Exact matches also match create_tid
         * so use an ASOF search to (mostly) ignore it.
         *
         * delete-visibility is set so pending deletions do not give us
         * a false-negative on our ability to use an iterator.
         */
        hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg = record->leaf.base;
        cursor.flags |= HAMMER_CURSOR_ASOF;
        cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
        cursor.asof = ip->obj_asof;

        count = 0;
        while (hammer_ip_lookup(&cursor) == 0) {
                iterator = (u_int32_t)record->leaf.base.key + 1;
                if (iterator == 0)
                        iterator = 1;
                record->leaf.base.key &= ~0xFFFFFFFFLL;
                record->leaf.base.key |= iterator;
                cursor.key_beg.key = record->leaf.base.key;
                if (++count == 1000000000) {
                        hammer_rel_mem_record(record);
                        error = ENOSPC;
                        goto failed;
                }
        }

        /*
         * The target inode and the directory entry are bound together.
         */
        record->target_ip = ip;
        record->flush_state = HAMMER_FST_SETUP;
        TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

        /*
         * The inode now has a dependency and must be taken out of the idle
         * state.  An inode not in an idle state is given an extra reference.
         */
        if (ip->flush_state == HAMMER_FST_IDLE) {
                hammer_ref(&ip->lock);
                ip->flush_state = HAMMER_FST_SETUP;
        }
        error = hammer_mem_add(record);
failed:
        hammer_done_cursor(&cursor);
        return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
                     hammer_cursor_t cursor, struct hammer_inode *dip,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        int error;

        if (hammer_cursor_inmem(cursor)) {
                /*
                 * In-memory (unsynchronized) records can simply be freed.
                 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
                 * by the backend, we must still avoid races against the
                 * backend potentially syncing the record to the media.
                 *
                 * We cannot call hammer_ip_delete_record(), that routine may
                 * only be called from the backend.
                 */
                record = cursor->iprec;
                if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
                        KKASSERT(cursor->deadlk_rec == NULL);
                        hammer_ref(&record->lock);
                        cursor->deadlk_rec = record;
                        error = EDEADLK;
                } else {
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        error = 0;
                }
        } else {
                /*
                 * If the record is on-disk we have to queue the deletion by
                 * the record's key.  This also causes lookups to skip the
                 * record.
                 */
                KKASSERT(dip->flags &
                         (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
                record = hammer_alloc_mem_record(dip, 0);
                record->type = HAMMER_MEM_RECORD_DEL;
                record->leaf.base = cursor->leaf->base;

                record->target_ip = ip;
                record->flush_state = HAMMER_FST_SETUP;
                TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

                /*
                 * The inode now has a dependency and must be taken out of
                 * the idle state.  An inode not in an idle state is given
                 * an extra reference.
                 */
                if (ip->flush_state == HAMMER_FST_IDLE) {
                        hammer_ref(&ip->lock);
                        ip->flush_state = HAMMER_FST_SETUP;
                }

                error = hammer_mem_add(record);
        }

        /*
         * One less link.  The file may still be open in the OS even after
         * all links have gone away.
         *
         * We have to terminate the cursor before syncing the inode to
         * avoid deadlocking against ourselves.  XXX this may no longer
         * be true.
         *
         * If nlinks drops to zero and the vnode is inactive (or there is
         * no vnode), call hammer_inode_unloadable_check() to zonk the
         * inode.  If we don't do this here the inode will not be destroyed
         * on-media until we unmount.
         */
        if (error == 0) {
                --ip->ino_data.nlinks;
                hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
                if (ip->ino_data.nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                        hammer_done_cursor(cursor);
                        hammer_inode_unloadable_check(ip, 1);
                        hammer_flush_inode(ip, 0);
                }
        }
        return(error);
}

/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * record->leaf.base.localization
 * record->leaf.base.key
 * record->leaf.base.rec_type
 * record->leaf.data_len
 * record->data         (a copy will be kmalloc'd if it cannot be embedded)
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int error;

        KKASSERT(record->leaf.base.localization != 0);
        record->leaf.base.obj_id = ip->obj_id;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
        error = hammer_mem_add(record);
        return(error);
}
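
/*
 * Illustrative caller sketch (field values hypothetical; see the list
 * above for what must be initialized):
 *
 *      record = hammer_alloc_mem_record(ip, len);
 *      record->leaf.base.localization = ip->obj_localization +
 *                                       HAMMER_LOCALIZE_MISC;
 *      record->leaf.base.key = key;
 *      record->leaf.base.rec_type = rec_type;
 *      bcopy(src, record->data, len);
 *      error = hammer_ip_add_record(trans, record);
 */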

/*
 * Locate a bulk record in-memory.  Bulk records allow disk space to be
 * reserved so the front-end can flush large data writes without having
 * to queue the BIO to the flusher.  Only the related record gets queued
 * to the flusher.
 */
static hammer_record_t
hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
{
        hammer_record_t record;
        struct hammer_btree_leaf_elm leaf;

        bzero(&leaf, sizeof(leaf));
        leaf.base.obj_id = ip->obj_id;
        leaf.base.key = file_offset + bytes;
        leaf.base.create_tid = 0;
        leaf.base.delete_tid = 0;
        leaf.base.rec_type = HAMMER_RECTYPE_DATA;
        leaf.base.obj_type = 0;                 /* unused */
        leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;     /* unused */
        leaf.base.localization = ip->obj_localization + HAMMER_LOCALIZE_MISC;
        leaf.data_len = bytes;

        record = hammer_rec_rb_tree_RB_LOOKUP_INFO(&ip->rec_tree, &leaf);
        if (record)
                hammer_ref(&record->lock);
        return(record);
}
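
/*
 * Note that the lookup goes through hammer_rec_overlap_compare(), so a
 * call such as hammer_ip_get_bulk(ip, 0, 16384) (values illustrative)
 * returns any in-memory DATA record overlapping file range [0, 16384),
 * not just an exact key match.
 */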

/*
 * Reserve blockmap space placemarked with an in-memory record.
 *
 * This routine is called by the frontend in order to be able to directly
 * flush a buffer cache buffer.  The frontend has locked the related buffer
 * cache buffers and we should be able to manipulate any overlapping
 * in-memory records.
 */
hammer_record_t
hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
                   int *errorp)
{
        hammer_record_t record;
        hammer_record_t conflict;
        int zone;
        int flags;

        /*
         * Deal with conflicting in-memory records.  We cannot have multiple
         * in-memory records for the same offset without seriously confusing
         * the backend, including but not limited to the backend issuing
         * delete-create-delete sequences and asserting on the delete_tid
         * being the same as the create_tid.
         *
         * If we encounter a record with the backend interlock set we cannot
         * immediately delete it without confusing the backend.
         */
        while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) != NULL) {
                if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
                        conflict->flags |= HAMMER_RECF_WANTED;
                        tsleep(conflict, 0, "hmrrc3", 0);
                } else {
                        conflict->flags |= HAMMER_RECF_DELETED_FE;
                }
                hammer_rel_mem_record(conflict);
        }

        /*
         * Create a record to cover the direct write.  This is called with
         * the related BIO locked so there should be no possible conflict.
         *
         * The backend is responsible for finalizing the space reserved in
         * this record.
         *
         * XXX bytes not aligned, depend on the reservation code to
         * align the reservation.
         */
        record = hammer_alloc_mem_record(ip, 0);
        zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
                                           HAMMER_ZONE_SMALL_DATA_INDEX;
        record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
                                               &record->leaf.data_offset,
                                               errorp);
        if (record->resv == NULL) {
                kprintf("hammer_ip_add_bulk: reservation failed\n");
                hammer_rel_mem_record(record);
                return(NULL);
        }
        record->type = HAMMER_MEM_RECORD_DATA;
        record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
        record->leaf.base.obj_id = ip->obj_id;
        record->leaf.base.key = file_offset + bytes;
        record->leaf.base.localization = ip->obj_localization +
                                         HAMMER_LOCALIZE_MISC;
        record->leaf.data_len = bytes;
        hammer_crc_set_leaf(data, &record->leaf);
        flags = record->flags;

        hammer_ref(&record->lock);      /* mem_add eats a reference */
        *errorp = hammer_mem_add(record);
        if (*errorp) {
                conflict = hammer_ip_get_bulk(ip, file_offset, bytes);
                kprintf("hammer_ip_add_bulk: error %d conflict %p "
                        "file_offset %lld bytes %d\n",
                        *errorp, conflict, file_offset, bytes);
                if (conflict) {
                        kprintf("conflict %lld %d\n",
                                conflict->leaf.base.key,
                                conflict->leaf.data_len);
                        hammer_rel_mem_record(conflict);
                }
        }
        KKASSERT(*errorp == 0);
        conflict = hammer_ip_get_bulk(ip, file_offset, bytes);
        if (conflict != record) {
                kprintf("conflict mismatch %p %p %08x\n",
                        conflict, record, record->flags);
                if (conflict) {
                        kprintf("conflict mismatch %lld/%d %lld/%d\n",
                                conflict->leaf.base.key,
                                conflict->leaf.data_len,
                                record->leaf.base.key,
                                record->leaf.data_len);
                }
        }
        KKASSERT(conflict == record);
        hammer_rel_mem_record(conflict);

        return (record);
}
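
/*
 * Typical frontend usage (sketch only; the surrounding buffer cache
 * locking described above is assumed and not shown):
 *
 *      record = hammer_ip_add_bulk(ip, bio_offset, bp->b_data,
 *                                  bp->b_bufsize, &error);
 *      if (record == NULL)
 *              (fall back or fail with error)
 *
 * The record is then handed to the flusher; the backend finalizes the
 * reservation when it syncs the record.
 */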

/*
 * Frontend truncation code.  Scan in-memory records only.  On-disk records
 * and records in a flushing state are handled by the backend.  The vnops
 * setattr code will handle the block containing the truncation point.
 *
 * Partial blocks are not deleted.
 */
int
hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size)
{
        struct rec_trunc_info info;

        switch(ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_REGFILE:
                info.rec_type = HAMMER_RECTYPE_DATA;
                break;
        case HAMMER_OBJTYPE_DBFILE:
                info.rec_type = HAMMER_RECTYPE_DB;
                break;
        default:
                return(EINVAL);
        }
        info.trunc_off = file_size;
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp,
                                   hammer_frontend_trunc_callback, &info);
        return(0);
}
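
/*
 * Illustrative call site (hypothetical): the setattr path, when
 * shrinking a file, would invoke
 *
 *      error = hammer_ip_frontend_trunc(ip, vap->va_size);
 *
 * leaving the partial block at the truncation point for the vnops code
 * to handle, per the comment above.
 */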

static int
hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused)
{
        if (record->flags & HAMMER_RECF_DELETED_FE)
                return(0);
        if (record->flush_state == HAMMER_FST_FLUSH)
                return(0);
        KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
        hammer_ref(&record->lock);
        record->flags |= HAMMER_RECF_DELETED_FE;
        hammer_rel_mem_record(record);
        return(0);
}

/*
 * Return 1 if the caller must check for and delete existing records
 * before writing out a new data record.
 *
 * Return 0 if the caller can just insert the record into the B-Tree without
 * checking.
 */
static int
hammer_record_needs_overwrite_delete(hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int64_t file_offset;
        int r;

        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
                file_offset = record->leaf.base.key;
        else
                file_offset = record->leaf.base.key - record->leaf.data_len;
        r = (file_offset < ip->save_trunc_off);
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
                if (ip->save_trunc_off <= record->leaf.base.key)
                        ip->save_trunc_off = record->leaf.base.key + 1;
        } else {
                if (ip->save_trunc_off < record->leaf.base.key)
                        ip->save_trunc_off = record->leaf.base.key;
        }
        return(r);
}
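
/*
 * Example (values illustrative): if save_trunc_off is 16384 after a
 * previous write of [0, 16384), a record rewriting [0, 16384) has a base
 * offset below save_trunc_off and returns 1 (an on-disk version may
 * exist and must be deleted first), while a record extending the file at
 * [16384, 32768) returns 0 and advances save_trunc_off to 32768, so
 * sequential extension skips the delete pass.
 */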

/*
 * Backend code.  Sync a record to the media.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
        hammer_transaction_t trans = cursor->trans;
        int64_t file_offset;
        int bytes;
        void *bdata;
        int error;
        int doprop;

        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
        KKASSERT(record->leaf.base.localization != 0);

        /*
         * If this is a bulk-data record placemarker there may be an existing
         * record on-disk, indicating a data overwrite.  If there is, the
         * on-disk record must be deleted before we can insert our new record.
         *
         * We've synthesized this record and do not know what the create_tid
         * on-disk is, nor how much data it represents.
         *
         * Keep in mind that (key) for data records is (base_offset + len),
         * not (base_offset).  Also, we only want to get rid of on-disk
         * records since we are trying to sync our in-memory record, so call
         * hammer_ip_delete_range() with truncating set to 1 to make sure
         * it skips in-memory records.
         *
         * It is ok for the lookup to return ENOENT.
         *
         * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
         * to call hammer_ip_delete_range() or not.  This also means we must
         * update sync_trunc_off as we write.
         */
        if (record->type == HAMMER_MEM_RECORD_DATA &&
            hammer_record_needs_overwrite_delete(record)) {
                file_offset = record->leaf.base.key - record->leaf.data_len;
                bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
                        ~HAMMER_BUFMASK;
                KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
                error = hammer_ip_delete_range(
                                cursor, record->ip,
                                file_offset, file_offset + bytes - 1,
                                1);
                if (error && error != ENOENT)
                        goto done;
        }

        /*
         * If this is a general record there may be an on-disk version
         * that must be deleted before we can insert the new record.
         */
        if (record->type == HAMMER_MEM_RECORD_GENERAL) {
                error = hammer_delete_general(cursor, record->ip,
                                              &record->leaf);
                if (error && error != ENOENT)
                        goto done;
        }

        /*
         * Setup the cursor.
         */
        hammer_normalize_cursor(cursor);
        cursor->key_beg = record->leaf.base;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_BACKEND;
        cursor->flags &= ~HAMMER_CURSOR_INSERT;

        /*
         * Records can wind up on-media before the inode itself is on-media.
         * Flag the case.
         */
        record->ip->flags |= HAMMER_INODE_DONDISK;

        /*
         * If we are deleting a directory entry an exact match must be
         * found on-disk.
         */
        if (record->type == HAMMER_MEM_RECORD_DEL) {
                error = hammer_btree_lookup(cursor);
                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, record->ip,
                                                        trans->tid);
                        if (error == 0) {
                                record->flags |= HAMMER_RECF_DELETED_FE;
                                record->flags |= HAMMER_RECF_DELETED_BE;
                        }
                }
                goto done;
        }

        /*
         * We are inserting.
         *
         * Issue a lookup to position the cursor and locate the cluster.  The
         * target key should not exist.  If we are creating a directory entry
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         */
        hammer_sync_lock_sh(trans);
        cursor->flags |= HAMMER_CURSOR_INSERT;
        error = hammer_btree_lookup(cursor);
        if (hammer_debug_inode)
                kprintf("DOINSERT LOOKUP %d\n", error);
        if (error == 0) {
                kprintf("hammer_ip_sync_record: duplicate rec "
                        "at (%016llx)\n", record->leaf.base.key);
                Debugger("duplicate record1");
                error = EIO;
        }
#if 0
        if (record->type == HAMMER_MEM_RECORD_DATA)
                kprintf("sync_record  %016llx ---------------- %016llx %d\n",
                        record->leaf.base.key - record->leaf.data_len,
                        record->leaf.data_offset, error);
#endif

        if (error != ENOENT)
                goto done_unlock;

        /*
         * Allocate the record and data.  The result buffers will be
         * marked as being modified and further calls to
         * hammer_modify_buffer() will result in unneeded UNDO records.
         *
         * Support zero-fill records (data == NULL and data_len != 0)
         */
        if (record->type == HAMMER_MEM_RECORD_DATA) {
                /*
                 * The data portion of a bulk-data record has already been
                 * committed to disk, we need only adjust the layer2
                 * statistics in the same transaction as our B-Tree insert.
                 */
                KKASSERT(record->leaf.data_offset != 0);
                hammer_blockmap_finalize(trans, record->leaf.data_offset,
                                         record->leaf.data_len);
                error = 0;
        } else if (record->data && record->leaf.data_len) {
                /*
                 * Wholly cached record, with data.  Allocate the data.
                 */
                bdata = hammer_alloc_data(trans, record->leaf.data_len,
                                          record->leaf.base.rec_type,
                                          &record->leaf.data_offset,
                                          &cursor->data_buffer, &error);
                if (bdata == NULL)
                        goto done_unlock;
                hammer_crc_set_leaf(record->data, &record->leaf);
                hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
                bcopy(record->data, bdata, record->leaf.data_len);
                hammer_modify_buffer_done(cursor->data_buffer);
        } else {
                /*
                 * Wholly cached record, without data.
                 */
                record->leaf.data_offset = 0;
                record->leaf.data_crc = 0;
        }

        error = hammer_btree_insert(cursor, &record->leaf, &doprop);
        if (hammer_debug_inode && error) {
                kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
                        error, cursor->node->node_offset,
                        cursor->index, record->leaf.base.key);
        }

        /*
         * Our record is on-disk, normally mark the in-memory version as
         * deleted.  If the record represented a directory deletion but
         * we had to sync a valid directory entry to disk we must convert
         * the record to a covering delete so the frontend does not have
         * visibility on the synced entry.
         */
        if (error == 0) {
                if (doprop) {
                        hammer_btree_do_propagation(cursor,
                                                    record->ip->pfsm,
                                                    &record->leaf);
                }
                if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags &= ~HAMMER_RECF_DELETED_FE;
                        record->type = HAMMER_MEM_RECORD_DEL;
                        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
                        record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
                        /* hammer_flush_record_done takes care of the rest */
                } else {
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        record->flags |= HAMMER_RECF_DELETED_BE;
                }
        } else {
                if (record->leaf.data_offset) {
                        hammer_blockmap_free(trans, record->leaf.data_offset,
                                             record->leaf.data_len);
                }
        }
done_unlock:
        hammer_sync_unlock(trans);
done:
        return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
static
int
hammer_mem_add(hammer_record_t record)
{
        hammer_mount_t hmp = record->ip->hmp;

        /*
         * Make a private copy of record->data
         */
        if (record->data)
                KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

        /*
         * Insert into the RB tree.  A unique key should have already
         * been selected if this is a directory entry.
         */
        if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
                record->flags |= HAMMER_RECF_DELETED_FE;
                hammer_rel_mem_record(record);
                return (EEXIST);
        }
        ++hmp->count_newrecords;
        ++hmp->rsv_recs;
        ++record->ip->rsv_recs;
        record->ip->hmp->rsv_databytes += record->leaf.data_len;
        record->flags |= HAMMER_RECF_ONRBTREE;
        hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
        hammer_rel_mem_record(record);
        return(0);
}

/************************************************************************
 *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
        int error;

        /*
         * If the element is in-memory return it without searching the
         * on-disk B-Tree.
         */
        KKASSERT(cursor->ip);
        error = hammer_mem_lookup(cursor);
        if (error == 0) {
                cursor->leaf = &cursor->iprec->leaf;
                return(error);
        }
        if (error != ENOENT)
                return(error);

        /*
         * If the inode has on-disk components search the on-disk B-Tree.
         */
        if ((cursor->ip->flags &
             (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK)) == 0) {
                return(error);
        }
        error = hammer_btree_lookup(cursor);
        if (error == 0)
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
        return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor)
{
        hammer_inode_t ip = cursor->ip;
        int error;

        KKASSERT(ip != NULL);

        /*
         * Clean up fields and setup for merged scan
         */
        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
        cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
        cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        /*
         * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
         * exact lookup so if we get ENOENT we have to call the iterate
         * function to validate the first record after the begin key.
         *
         * The ATEDISK flag is used by hammer_btree_iterate to determine
         * whether it must index forwards or not.  It is also used here
         * to select the next record from in-memory or on-disk.
         *
         * EDEADLK can only occur if the lookup hit an empty internal
         * element and couldn't delete it.  Since this could only occur
         * in-range, we can just iterate from the failure point.
         */
        if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
                error = hammer_btree_lookup(cursor);
                if (error == ENOENT || error == EDEADLK) {
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        if (hammer_debug_general & 0x2000) {
                                kprintf("error %d node %p %016llx index %d\n",
                                        error, cursor->node,
                                        cursor->node->node_offset,
                                        cursor->index);
                        }
                        error = hammer_btree_iterate(cursor);
                }
                if (error && error != ENOENT)
                        return(error);
                if (error == 0) {
                        cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                } else {
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                }
        }

        /*
         * Search the in-memory record list (Red-Black tree).  Unlike the
         * B-Tree search, mem_first checks for records in the range.
         */
        error = hammer_mem_first(cursor);
        if (error && error != ENOENT)
                return(error);
        if (error == 0) {
                cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
                if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
                        cursor->flags |= HAMMER_CURSOR_ATEMEM;
        }

        /*
         * This will return the first matching record.
         */
        return(hammer_ip_next(cursor));
}
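
/*
 * Typical merged-scan usage (sketch, error handling abbreviated):
 *
 *      error = hammer_ip_first(cursor);
 *      while (error == 0) {
 *              (process cursor->leaf / cursor->iprec)
 *              error = hammer_ip_next(cursor);
 *      }
 *      if (error == ENOENT)
 *              error = 0;      (the scan ran off the end of the range)
 */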
1338
1339 /*
1340  * Retrieve the next record in a merged iteration within the bounds of the
1341  * cursor.  This call may be made multiple times after the cursor has been
1342  * initially searched with hammer_ip_first().
1343  *
1344  * 0 is returned on success, ENOENT if no further records match the
1345  * requested range, or some other error code is returned.
1346  */
1347 int
1348 hammer_ip_next(hammer_cursor_t cursor)
1349 {
1350         hammer_btree_elm_t elm;
1351         hammer_record_t rec, save;
1352         int error;
1353         int r;
1354
1355 next_btree:
1356         /*
1357          * Load the current on-disk and in-memory record.  If we ate any
1358          * records we have to get the next one. 
1359          *
1360          * If we deleted the last on-disk record we had scanned ATEDISK will
1361          * be clear and DELBTREE will be set, forcing a call to iterate. The
1362          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1363          * element.  If ATEDISK is set, iterate will skip the 'current'
1364          * element.
1365          *
1366          * Get the next on-disk record
1367          */
1368         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1369                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1370                         error = hammer_btree_iterate(cursor);
1371                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1372                         if (error == 0) {
1373                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1374                                 hammer_cache_node(&cursor->ip->cache[1],
1375                                                   cursor->node);
1376                         } else {
1377                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1378                                                  HAMMER_CURSOR_ATEDISK;
1379                         }
1380                 }
1381         }
1382
1383 next_memory:
1384         /*
1385          * Get the next in-memory record.
1386          *
1387          * hammer_rec_scan_cmp:  Is the record still in our general range
1388          *                       (non-inclusive of snapshot exclusions)?
1389          * hammer_rec_scan_callback: Is the record in our snapshot?
1390          */
1391         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1392                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1393                         save = cursor->iprec;
1394                         cursor->iprec = NULL;
1395                         rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1396                         while (rec) {
1397                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
1398                                         break;
1399                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
1400                                         break;
1401                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1402                         }
1403                         if (save)
1404                                 hammer_rel_mem_record(save);
1405                         if (cursor->iprec) {
1406                                 KKASSERT(cursor->iprec == rec);
1407                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1408                         } else {
1409                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1410                         }
1411                 }
1412         }
1413
1414         /*
1415          * The memory record may have become stale while being held in
1416          * cursor->iprec.  We are interlocked against the backend with
1417          * regard to B-Tree entries.
1418          */
1419         if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) {
1420                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) {
1421                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1422                         goto next_memory;
1423                 }
1424         }
1425
1426         /*
1427          * Extract either the disk or memory record depending on their
1428          * relative position.
1429          */
1430         error = 0;
1431         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1432         case 0:
1433                 /*
1434                  * Both entries valid.  Compare the entries and nominally
1435                  * return the first one in the sort order.  Numerous cases
1436                  * require special attention, however.
1437                  */
1438                 elm = &cursor->node->ondisk->elms[cursor->index];
1439                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);
1440
1441                 /*
1442                  * If the two entries differ only by their key (-2/2) or
1443                  * create_tid (-1/1), and are DATA records, we may have a
1444                  * nominal match.  We have to calculate the base file
1445                  * offset of the data.
1446                  */
1447                 if (r <= 2 && r >= -2 && r != 0 &&
1448                     cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
1449                     cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1450                         int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
1451                         int64_t base2 = cursor->iprec->leaf.base.key -
1452                                         cursor->iprec->leaf.data_len;
1453                         if (base1 == base2)
1454                                 r = 0;
1455                 }
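                /*
                 * Illustration (hypothetical sizes): a 16KB on-disk DATA
                 * record covering file offsets [0..16383] has key 16384 and
                 * data_len 16384, so base1 = 0.  An in-memory bulk record
                 * rewriting the first 8KB has key 8192 and data_len 8192,
                 * so base2 = 0 as well and the two are treated as a nominal
                 * match (r forced to 0) even though their keys differ.
                 */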
1456
1457                 if (r < 0) {
1458                         error = hammer_btree_extract(cursor,
1459                                                      HAMMER_CURSOR_GET_LEAF);
1460                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1461                         break;
1462                 }
1463
1464                 /*
1465                  * If the entries match exactly, the memory entry is either
1466                  * an on-disk directory entry deletion or a bulk data
1467                  * overwrite.  If it is a directory entry deletion, we eat
1468                  * both entries.
1469                  *
1470                  * For the bulk-data overwrite case it is possible to have
1471                  * visibility into both, which simply means the syncer
1472                  * hasn't gotten around to doing the delete+insert sequence
1473                  * on the B-Tree.  Use the memory entry and throw away the
1474                  * on-disk entry.
1475                  *
1476                  * If the in-memory record is neither of these, we
1477                  * probably caught the syncer while it was syncing it to
1478                  * the media.  Since we hold a shared lock on the cursor,
1479                  * the in-memory record had better be marked deleted at
1480                  * this point.
1481                  */
1482                 if (r == 0) {
1483                         if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1484                                 if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1485                                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1486                                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1487                                         goto next_btree;
1488                                 }
1489                         } else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
1490                                 if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1491                                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1492                                 }
1493                                 /* fall through to memory entry */
1494                         } else {
1495                                 panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags);
1496                                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1497                                 goto next_memory;
1498                         }
1499                 }
1500                 /* fall through to the memory entry */
1501         case HAMMER_CURSOR_ATEDISK:
1502                 /*
1503                  * Only the memory entry is valid.
1504                  */
1505                 cursor->leaf = &cursor->iprec->leaf;
1506                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1507
1508                 /*
1509                  * If the memory entry is an on-disk deletion we should also
1510                  * have found a B-Tree record.  If the backend beat us to it,
1511                  * it would have interlocked the cursor and we should have
1512                  * seen the in-memory record marked DELETED_FE.
1513                  */
1514                 if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
1515                     (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1516                         panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x", cursor->iprec, cursor->iprec->flags);
1517                 }
1518                 break;
1519         case HAMMER_CURSOR_ATEMEM:
1520                 /*
1521                  * Only the disk entry is valid.
1522                  */
1523                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1524                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1525                 break;
1526         default:
1527                 /*
1528                  * Neither entry is valid.
1529                  *
1530                  * XXX error not set properly
1531                  */
1532                 cursor->leaf = NULL;
1533                 error = ENOENT;
1534                 break;
1535         }
1536         return(error);
1537 }
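
/*
 * Illustrative sketch (not compiled in): the canonical merged-iteration
 * pattern used by callers in this file.  Assumes 'cursor' was set up with
 * hammer_init_cursor() and had its key_beg/key_end range and flags
 * initialized, as hammer_ip_delete_range() does below.
 */
#if 0
static void
example_merged_scan(hammer_cursor_t cursor)
{
        int error;

        error = hammer_ip_first(cursor);        /* position at first match */
        while (error == 0) {
                /* cursor->leaf is valid here, process the record */
                error = hammer_ip_next(cursor); /* advance the merge */
        }
        /* error == ENOENT means the range was exhausted normally */
}
#endif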
1538
1539 /*
1540  * Resolve the cursor->data pointer for the current cursor position in
1541  * a merged iteration.
1542  */
1543 int
1544 hammer_ip_resolve_data(hammer_cursor_t cursor)
1545 {
1546         hammer_record_t record;
1547         int error;
1548
1549         if (hammer_cursor_inmem(cursor)) {
1550                 /*
1551                  * The data associated with an in-memory record is usually
1552                  * kmalloced, but reserve-ahead data records will have an
1553                  * on-disk reference.
1554                  *
1555                  * NOTE: Reserve-ahead data records must be handled in the
1556                  * context of the related high level buffer cache buffer
1557                  * to interlock against async writes.
1558                  */
1559                 record = cursor->iprec;
1560                 cursor->data = record->data;
1561                 error = 0;
1562                 if (cursor->data == NULL) {
1563                         KKASSERT(record->leaf.base.rec_type ==
1564                                  HAMMER_RECTYPE_DATA);
1565                         cursor->data = hammer_bread_ext(cursor->trans->hmp,
1566                                                     record->leaf.data_offset,
1567                                                     record->leaf.data_len,
1568                                                     &error,
1569                                                     &cursor->data_buffer);
1570                 }
1571         } else {
1572                 cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
1573                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1574         }
1575         return(error);
1576 }
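
/*
 * Illustrative sketch (not compiled in): resolving the data for each
 * record during a merged iteration.  After a successful call both
 * cursor->data and cursor->leaf->data_len are valid regardless of
 * whether the record came from memory or from the media.
 */
#if 0
        error = hammer_ip_first(cursor);
        while (error == 0) {
                error = hammer_ip_resolve_data(cursor);
                if (error)
                        break;
                /* consume cursor->leaf->data_len bytes at cursor->data */
                error = hammer_ip_next(cursor);
        }
#endif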
1577
1578 /*
1579  * Backend truncation / record replacement - delete records in range.
1580  *
1581  * Delete all records within the specified range for inode ip.  In-memory
1582  * records still associated with the frontend are ignored. 
1583  *
1584  * If truncating is non-zero, in-memory records associated with the back-end
1585  * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
1586  *
1587  * NOTES:
1588  *
1589  *      * An unaligned range will cause new records to be added to cover
1590  *        the edge cases. (XXX not implemented yet).
1591  *
1592  *      * Replacements via reservations (see hammer_ip_sync_record_cursor())
1593  *        also do not deal with unaligned ranges.
1594  *
1595  *      * ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1596  *
1597  *      * Record keys for regular file data have to be special-cased since
1598  *        they indicate the end of the range (key = base + bytes).
1599  *
1600  *      * This function may be asked to delete ridiculously huge ranges, for
1601  *        example if someone truncates or removes a 1TB regular file.  We
1602  *        must be very careful on restarts and we may have to stop w/
1603  *        EWOULDBLOCK to avoid blowing out the buffer cache.
1604  */
1605 int
1606 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
1607                        int64_t ran_beg, int64_t ran_end, int truncating)
1608 {
1609         hammer_transaction_t trans = cursor->trans;
1610         hammer_btree_leaf_elm_t leaf;
1611         int error;
1612         int64_t off;
1613         int64_t tmp64;
1614
1615 #if 0
1616         kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1617 #endif
1618
1619         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1620 retry:
1621         hammer_normalize_cursor(cursor);
1622         cursor->key_beg.localization = ip->obj_localization +
1623                                        HAMMER_LOCALIZE_MISC;
1624         cursor->key_beg.obj_id = ip->obj_id;
1625         cursor->key_beg.create_tid = 0;
1626         cursor->key_beg.delete_tid = 0;
1627         cursor->key_beg.obj_type = 0;
1628
1629         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1630                 cursor->key_beg.key = ran_beg;
1631                 cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
1632         } else {
1633                 /*
1634                  * The key in the B-Tree is (base+bytes), so the first possible
1635                  * matching key is ran_beg + 1.
1636                  */
1637                 cursor->key_beg.key = ran_beg + 1;
1638                 cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
1639         }
1640
1641         cursor->key_end = cursor->key_beg;
1642         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1643                 cursor->key_end.key = ran_end;
1644         } else {
1645                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1646                 if (tmp64 < ran_end)
1647                         cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1648                 else
1649                         cursor->key_end.key = ran_end + MAXPHYS + 1;
1650         }
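        /*
         * Worked example (hypothetical): truncating a regular file at
         * offset 4096 gives ran_beg = 4096, so the scan begins at key
         * 4097 -- since a DATA record's key is (base offset + bytes),
         * any record containing data at or past offset 4096 has a key
         * of at least 4097.  The MAXPHYS slop on key_end catches records
         * straddling the inclusive right edge; fully out-of-range records
         * are rejected by the edge checks in the loop below.
         */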
1651
1652         cursor->asof = ip->obj_asof;
1653         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1654         cursor->flags |= HAMMER_CURSOR_ASOF;
1655         cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1656         cursor->flags |= HAMMER_CURSOR_BACKEND;
1657         cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;
1658
1659         error = hammer_ip_first(cursor);
1660
1661         /*
1662          * Iterate through matching records and mark them as deleted.
1663          */
1664         while (error == 0) {
1665                 leaf = cursor->leaf;
1666
1667                 KKASSERT(leaf->base.delete_tid == 0);
1668                 KKASSERT(leaf->base.obj_id == ip->obj_id);
1669
1670                 /*
1671                  * There may be overlap cases for regular file data.  Also
1672                  * remember the key for a regular file record is (base + len),
1673                  * NOT (base).
1674                  *
1675                  * Note that due to duplicates (mem & media) allowed by
1676                  * DELETE_VISIBILITY, off can wind up less than ran_beg.
1677                  */
1678                 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
1679                         off = leaf->base.key - leaf->data_len;
1680                         /*
1681                          * Check the left edge case.  We currently do not
1682                          * split existing records.
1683                          */
1684                         if (off < ran_beg && leaf->base.key > ran_beg) {
1685                                 panic("hammer left edge case %016llx %d\n",
1686                                         leaf->base.key, leaf->data_len);
1687                         }
1688
1689                         /*
1690                          * Check the right edge case.  Note that the
1691                          * record can be completely out of bounds, which
1692                          * terminates the search.
1693                          *
1694                          * base->key is exclusive of the right edge while
1695                          * ran_end is inclusive of the right edge.  The
1696                          * (key - data_len) left boundary is inclusive.
1697                          *
1698                          * XXX theory-check this test at some point, are
1699                          * we missing a + 1 somewhere?  Note that ran_end
1700                          * could overflow.
1701                          */
1702                         if (leaf->base.key - 1 > ran_end) {
1703                                 if (leaf->base.key - leaf->data_len > ran_end)
1704                                         break;
1705                                 panic("hammer right edge case\n");
1706                         }
1707                 } else {
1708                         off = leaf->base.key;
1709                 }
1710
1711                 /*
1712                  * Delete the record.  When truncating we do not delete
1713                  * in-memory (data) records because they represent data
1714                  * written after the truncation.
1715                  *
1716                  * This will also physically destroy the B-Tree entry and
1717                  * data if the retention policy dictates.  The function
1718                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1719                  * uses to perform a fixup.
1720                  */
1721                 if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
1722                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1723                         /*
1724                          * If we have built up too many meta-buffers we risk
1725                          * deadlocking the kernel and must stop.  This can
1726                          * occur when deleting ridiculously huge files.
1727                          * sync_trunc_off is updated so the next cycle does
1728                          * not re-iterate records we have already deleted.
1729                          *
1730                          * This is only done with formal truncations.
1731                          */
1732                         if (truncating > 1 && error == 0 &&
1733                             hammer_flusher_meta_limit(ip->hmp)) {
1734                                 ip->sync_trunc_off = off;
1735                                 error = EWOULDBLOCK;
1736                         }
1737                 }
1738                 if (error)
1739                         break;
1740                 ran_beg = off;  /* for restart */
1741                 error = hammer_ip_next(cursor);
1742         }
1743         if (cursor->node)
1744                 hammer_cache_node(&ip->cache[1], cursor->node);
1745
1746         if (error == EDEADLK) {
1747                 hammer_done_cursor(cursor);
1748                 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1749                 if (error == 0)
1750                         goto retry;
1751         }
1752         if (error == ENOENT)
1753                 error = 0;
1754         return(error);
1755 }
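
/*
 * Illustrative sketch (not compiled in, hypothetical caller): a formal
 * truncation (truncating > 1) may be stopped with EWOULDBLOCK when the
 * flusher's meta-buffer limit is reached.  ip->sync_trunc_off was updated
 * by the interrupted pass, so a later pass resumes without re-iterating
 * records that were already deleted.
 */
#if 0
        error = hammer_ip_delete_range(cursor, ip, ip->sync_trunc_off,
                                       0x7FFFFFFFFFFFFFFFLL, 2);
        if (error == EWOULDBLOCK) {
                /* requeue the inode; the flush cycle retries later */
        }
#endif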
1756
1757 /*
1758  * This backend function deletes the specified record on-disk, similar to
1759  * delete_range but for a specific record.  Unlike the exact deletions
1760  * used when deleting a directory entry, this function uses an ASOF search
1761  * like delete_range.
1762  *
1763  * This function may be called with ip->obj_asof set for a slave snapshot,
1764  * so don't use obj_asof here.  We always delete non-historical records only.
1765  */
1766 static int
1767 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
1768                       hammer_btree_leaf_elm_t leaf)
1769 {
1770         hammer_transaction_t trans = cursor->trans;
1771         int error;
1772
1773         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1774 retry:
1775         hammer_normalize_cursor(cursor);
1776         cursor->key_beg = leaf->base;
1777         cursor->asof = HAMMER_MAX_TID;
1778         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1779         cursor->flags |= HAMMER_CURSOR_ASOF;
1780         cursor->flags |= HAMMER_CURSOR_BACKEND;
1781         cursor->flags &= ~HAMMER_CURSOR_INSERT;
1782
1783         error = hammer_btree_lookup(cursor);
1784         if (error == 0) {
1785                 error = hammer_ip_delete_record(cursor, ip, trans->tid);
1786         }
1787         if (error == EDEADLK) {
1788                 hammer_done_cursor(cursor);
1789                 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1790                 if (error == 0)
1791                         goto retry;
1792         }
1793         return(error);
1794 }
1795
1796 /*
1797  * This function deletes remaining auxiliary records when an inode is
1798  * being deleted.  This function explicitly does not delete the
1799  * inode record, directory entry, data, or db records.  Those must be
1800  * properly disposed of prior to this call.
1801  */
1802 int
1803 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp)
1804 {
1805         hammer_transaction_t trans = cursor->trans;
1806         hammer_btree_leaf_elm_t leaf;
1807         int error;
1808
1809         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1810 retry:
1811         hammer_normalize_cursor(cursor);
1812         cursor->key_beg.localization = ip->obj_localization +
1813                                        HAMMER_LOCALIZE_MISC;
1814         cursor->key_beg.obj_id = ip->obj_id;
1815         cursor->key_beg.create_tid = 0;
1816         cursor->key_beg.delete_tid = 0;
1817         cursor->key_beg.obj_type = 0;
1818         cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START;
1819         cursor->key_beg.key = HAMMER_MIN_KEY;
1820
1821         cursor->key_end = cursor->key_beg;
1822         cursor->key_end.rec_type = HAMMER_RECTYPE_MAX;
1823         cursor->key_end.key = HAMMER_MAX_KEY;
1824
1825         cursor->asof = ip->obj_asof;
1826         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1827         cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1828         cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1829         cursor->flags |= HAMMER_CURSOR_BACKEND;
1830
1831         error = hammer_ip_first(cursor);
1832
1833         /*
1834          * Iterate through matching records and mark them as deleted.
1835          */
1836         while (error == 0) {
1837                 leaf = cursor->leaf;
1838
1839                 KKASSERT(leaf->base.delete_tid == 0);
1840
1841                 /*
1842                  * Mark the record and B-Tree entry as deleted.  This will
1843                  * also physically delete the B-Tree entry, record, and
1844                  * data if the retention policy dictates.  The function
1845                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1846                  * uses to perform a fixup.
1847                  *
1848                  * Directory entries (and delete-on-disk directory entries)
1849                  * must be synced and cannot be deleted.
1850                  */
1851                 error = hammer_ip_delete_record(cursor, ip, trans->tid);
1852                 ++*countp;
1853                 if (error)
1854                         break;
1855                 error = hammer_ip_next(cursor);
1856         }
1857         if (cursor->node)
1858                 hammer_cache_node(&ip->cache[1], cursor->node);
1859         if (error == EDEADLK) {
1860                 hammer_done_cursor(cursor);
1861                 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
1862                 if (error == 0)
1863                         goto retry;
1864         }
1865         if (error == ENOENT)
1866                 error = 0;
1867         return(error);
1868 }
1869
1870 /*
1871  * Delete the record at the current cursor.  On success the cursor will
1872  * be positioned appropriately for an iteration but may no longer be at
1873  * a leaf node.
1874  *
1875  * This routine is only called from the backend.
1876  *
1877  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1878  * cursor and retry.
1879  */
1880 int
1881 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
1882                         hammer_tid_t tid)
1883 {
1884         hammer_off_t zone2_offset;
1885         hammer_record_t iprec;
1886         hammer_btree_elm_t elm;
1887         hammer_mount_t hmp;
1888         int error;
1889
1890         KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
1891         KKASSERT(tid != 0);
1892         hmp = cursor->node->hmp;
1893
1894         /*
1895          * In-memory (unsynchronized) records can simply be freed.  This
1896          * only occurs in range iterations since all other records are
1897          * individually synchronized.  Thus there should be no confusion with
1898          * the interlock.
1899          *
1900          * An in-memory record may be deleted before being committed to disk,
1901  * but could have been accessed in the meantime.  The backing store
1902  * may never have been marked allocated, so hammer_blockmap_free() may
1903  * never get called on it.  Because of this we have to make sure that
1904          * we've gotten rid of any related hammer_buffer or buffer cache
1905          * buffer.
1906          */
1907         if (hammer_cursor_inmem(cursor)) {
1908                 iprec = cursor->iprec;
1909                 KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1910                 iprec->flags |= HAMMER_RECF_DELETED_FE;
1911                 iprec->flags |= HAMMER_RECF_DELETED_BE;
1912
1913                 if (iprec->leaf.data_offset && iprec->leaf.data_len) {
1914                         zone2_offset = hammer_blockmap_lookup(hmp, iprec->leaf.data_offset, &error);
1915                         KKASSERT(error == 0);
1916                         hammer_del_buffers(hmp,
1917                                            iprec->leaf.data_offset,
1918                                            zone2_offset,
1919                                            iprec->leaf.data_len);
1920                 }
1921                 return(0);
1922         }
1923
1924         /*
1925          * On-disk records are marked as deleted by updating their delete_tid.
1926  * This does not affect their position in the B-Tree (which is based
1927          * on their create_tid).
1928          */
1929         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
1930         elm = NULL;
1931
1932         if (error == 0) {
1933                 error = hammer_delete_at_cursor(
1934                                 cursor,
1935                                 HAMMER_DELETE_ADJUST | hammer_nohistory(ip),
1936                                 NULL);
1937         }
1938         return(error);
1939 }
1940
1941 /*
1942  * Delete the B-Tree element at the current cursor and do any necessary
1943  * mirror propagation.
1944  *
1945  * The cursor must be properly positioned for an iteration on return but
1946  * may be pointing at an internal element.
1947  */
1948 int
1949 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
1950                         int64_t *stat_bytes)
1951 {
1952         struct hammer_btree_leaf_elm save_leaf;
1953         hammer_btree_leaf_elm_t leaf;
1954         hammer_node_t node;
1955         hammer_btree_elm_t elm;
1956         hammer_off_t data_offset;
1957         int32_t data_len;
1958         u_int16_t rec_type;
1959         int error;
1960         int doprop;
1961
1962         error = hammer_cursor_upgrade(cursor);
1963         if (error)
1964                 return(error);
1965
1966         node = cursor->node;
1967         elm = &node->ondisk->elms[cursor->index];
1968         leaf = &elm->leaf;
1969         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1970
1971         /*
1972          * Adjust the delete_tid.  Update the mirror_tid propagation field
1973          * as well.
1974          */
1975         hammer_sync_lock_sh(cursor->trans);
1976         doprop = 0;
1977         if (delete_flags & HAMMER_DELETE_ADJUST) {
1978                 hammer_modify_node(cursor->trans, node, elm, sizeof(*elm));
1979                 elm->leaf.base.delete_tid = cursor->trans->tid;
1980                 elm->leaf.delete_ts = cursor->trans->time32;
1981                 hammer_modify_node_done(node);
1982
1983                 if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) {
1984                         hammer_modify_node_field(cursor->trans, node, mirror_tid);
1985                         node->ondisk->mirror_tid = elm->leaf.base.delete_tid;
1986                         hammer_modify_node_done(node);
1987                         doprop = 1;
1988                 }
1989
1990                 /*
1991                  * Adjust for the iteration.  We have deleted the current
1992                  * element and want to clear ATEDISK so the iteration does
1993                  * not skip the element after, which now becomes the current
1994                  * element.
1995                  */
1996                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1997                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1998                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1999                 }
2000
2001                 /*
2002                  * An on-disk record cannot have the same delete_tid
2003                  * as its create_tid.  In a chain of record updates
2004                  * this could result in a duplicate record.
2005                  */
2006                 KKASSERT(elm->leaf.base.delete_tid !=
2007                          elm->leaf.base.create_tid);
2008         }
2009
2010         /*
2011          * Destroy the B-Tree element if asked (typically if a nohistory
2012          * file or mount, or when called by the pruning code).
2013          *
2014          * Adjust the ATEDISK flag to properly support iterations.
2015          */
2016         if (delete_flags & HAMMER_DELETE_DESTROY) {
2017                 data_offset = elm->leaf.data_offset;
2018                 data_len = elm->leaf.data_len;
2019                 rec_type = elm->leaf.base.rec_type;
2020                 if (doprop) {
2021                         save_leaf = elm->leaf;
2022                         leaf = &save_leaf;
2023                 }
2024
2025                 error = hammer_btree_delete(cursor);
2026                 if (error == 0) {
2027                         /*
2028                          * This forces a fixup for the iteration because
2029                          * the cursor is now either sitting at the 'next'
2030                          * element or sitting at the end of a leaf.
2031                          */
2032                         if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
2033                                 cursor->flags |= HAMMER_CURSOR_DELBTREE;
2034                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
2035                         }
2036                 }
2037                 if (error == 0) {
2038                         switch(data_offset & HAMMER_OFF_ZONE_MASK) {
2039                         case HAMMER_ZONE_LARGE_DATA:
2040                         case HAMMER_ZONE_SMALL_DATA:
2041                         case HAMMER_ZONE_META:
2042                                 hammer_blockmap_free(cursor->trans,
2043                                                      data_offset, data_len);
2044                                 break;
2045                         default:
2046                                 break;
2047                         }
2048                 }
2049         }
2050
2051         /*
2052          * mirror_tid propagation occurs if the node's mirror_tid had to be
2053          * updated while adjusting the delete_tid.
2054          *
2055  * This occurs even when deleting in nohistory mode, but does not
2056  * occur when pruning an already-deleted node.
2057          */
2058         if (doprop) {
2059                 KKASSERT(cursor->ip != NULL);
2060                 hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf);
2061         }
2062         hammer_sync_unlock(cursor->trans);
2063         return (error);
2064 }
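
/*
 * Illustrative sketch (not compiled in): the two flag combinations used
 * above.  A normal backend deletion only adjusts delete_tid, leaving the
 * record for the retention policy; hammer_nohistory() adds
 * HAMMER_DELETE_DESTROY for nohistory files or mounts so the B-Tree
 * element and its data are physically freed.
 */
#if 0
        /* historical delete: mark deleted as of the current transaction */
        error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST, NULL);

        /* destructive delete: also remove the B-Tree element and data */
        error = hammer_delete_at_cursor(cursor, HAMMER_DELETE_ADJUST |
                                        HAMMER_DELETE_DESTROY, NULL);
#endif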
2065
2066 /*
2067  * Determine whether we can remove a directory.  This routine checks whether
2068  * a directory is empty and enforces flush connectivity.
2069  *
2070  * Flush connectivity requires that we block if the target directory is
2071  * currently flushing; otherwise it may not end up in the same flush group.
2072  *
2073  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
2074  */
2075 int
2076 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
2077 {
2078         struct hammer_cursor cursor;
2079         int error;
2080
2081         /*
2082          * Check directory empty
2083          */
2084         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2085
2086         cursor.key_beg.localization = ip->obj_localization +
2087                                       HAMMER_LOCALIZE_MISC;
2088         cursor.key_beg.obj_id = ip->obj_id;
2089         cursor.key_beg.create_tid = 0;
2090         cursor.key_beg.delete_tid = 0;
2091         cursor.key_beg.obj_type = 0;
2092         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
2093         cursor.key_beg.key = HAMMER_MIN_KEY;
2094
2095         cursor.key_end = cursor.key_beg;
2096         cursor.key_end.rec_type = HAMMER_RECTYPE_MAX;
2097         cursor.key_end.key = HAMMER_MAX_KEY;
2098
2099         cursor.asof = ip->obj_asof;
2100         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2101
2102         error = hammer_ip_first(&cursor);
2103         if (error == ENOENT)
2104                 error = 0;
2105         else if (error == 0)
2106                 error = ENOTEMPTY;
2107         hammer_done_cursor(&cursor);
2108         return(error);
2109 }
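
/*
 * Illustrative sketch (not compiled in, hypothetical caller): the unlink
 * path checks emptiness before queueing a directory entry deletion.
 */
#if 0
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
                error = hammer_ip_check_directory_empty(trans, ip);
                if (error)
                        return(error);  /* ENOTEMPTY, EDEADLK, etc */
        }
#endif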
2110