HAMMER 60H/Many: Stabilization pass
[dragonfly.git] / sys/vfs/hammer/hammer_object.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.84 2008/07/08 04:34:41 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_add(hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor);
static int hammer_mem_first(hammer_cursor_t cursor);
static int hammer_frontend_trunc_callback(hammer_record_t record,
                                void *data __unused);
static int hammer_record_needs_overwrite_delete(hammer_record_t record);
static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
                      hammer_btree_leaf_elm_t leaf);

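/*
 * Parameters for hammer_rec_trunc_cmp().  Identifies in-memory records
 * of the given type whose range lies entirely at or beyond trunc_off.
 */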
struct rec_trunc_info {
        u_int16_t       rec_type;
        int64_t         trunc_off;
};

/*
 * Red-black tree support.  Comparison code for insertion.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
        if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type)
                return(-1);
        if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type)
                return(1);

        if (rec1->leaf.base.key < rec2->leaf.base.key)
                return(-1);
        if (rec1->leaf.base.key > rec2->leaf.base.key)
                return(1);

        /*
         * Never match against an item deleted by the front-end.
         *
         * rec1 is greater than rec2 if rec1 is marked deleted.
         * rec1 is less than rec2 if rec2 is marked deleted.
         *
         * Multiple deleted records may be present, do not return 0
         * if both are marked deleted.
         */
        if (rec1->flags & HAMMER_RECF_DELETED_FE)
                return(1);
        if (rec2->flags & HAMMER_RECF_DELETED_FE)
                return(-1);

        return(0);
}

/*
 * Basic record comparison code similar to hammer_btree_cmp().
 */
static int
hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec)
{
        if (elm->rec_type < rec->leaf.base.rec_type)
                return(-3);
        if (elm->rec_type > rec->leaf.base.rec_type)
                return(3);

        if (elm->key < rec->leaf.base.key)
                return(-2);
        if (elm->key > rec->leaf.base.key)
                return(2);

        /*
         * Never match against an item deleted by the front-end.
         * elm is less than rec if rec is marked deleted.
         */
        if (rec->flags & HAMMER_RECF_DELETED_FE)
                return(-1);
        return(0);
}

/*
 * Special LOOKUP_INFO to locate an overlapping record.  This is used by
 * the reservation code to implement small-block records (whose keys will
 * be different depending on data_len, when representing the same base
 * offset).
 *
 * NOTE: The base file offset of a data record is (key - data_len), not (key).
 */
static int
hammer_rec_overlap_compare(hammer_btree_leaf_elm_t leaf, hammer_record_t rec)
{
        if (leaf->base.rec_type < rec->leaf.base.rec_type)
                return(-3);
        if (leaf->base.rec_type > rec->leaf.base.rec_type)
                return(3);

        /*
         * Overlap compare
         */
        if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
                /* leaf_end <= rec_beg */
                if (leaf->base.key <= rec->leaf.base.key - rec->leaf.data_len)
                        return(-2);
                /* leaf_beg >= rec_end */
                if (leaf->base.key - leaf->data_len >= rec->leaf.base.key)
                        return(2);
        } else {
                if (leaf->base.key < rec->leaf.base.key)
                        return(-2);
                if (leaf->base.key > rec->leaf.base.key)
                        return(2);
        }

        /*
         * Never match against an item deleted by the front-end.
         * leaf is less than rec if rec is marked deleted.
         *
         * We must still return the proper code for the scan to continue
         * along the correct branches.
         */
        if (rec->flags & HAMMER_RECF_DELETED_FE) {
                if (leaf->base.key < rec->leaf.base.key)
                        return(-2);
                if (leaf->base.key > rec->leaf.base.key)
                        return(2);
                return(-1);
        }
        return(0);
}

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_cmp(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        r = hammer_rec_cmp(&cursor->key_end, rec);
        if (r < -1)
                return(1);
        return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;
        int r;

        r = hammer_rec_cmp(&cursor->key_beg, rec);
        if (r > 1)
                return(-1);
        if (r < -1)
                return(1);
        return(0);
}

/*
 * Locate blocks within the truncation range.  Partial blocks do not count.
 */
static
int
hammer_rec_trunc_cmp(hammer_record_t rec, void *data)
{
        struct rec_trunc_info *info = data;

        if (rec->leaf.base.rec_type < info->rec_type)
                return(-1);
        if (rec->leaf.base.rec_type > info->rec_type)
                return(1);

        switch(rec->leaf.base.rec_type) {
        case HAMMER_RECTYPE_DB:
                /*
                 * DB record key is not beyond the truncation point, retain.
                 */
                if (rec->leaf.base.key < info->trunc_off)
                        return(-1);
                break;
        case HAMMER_RECTYPE_DATA:
                /*
                 * DATA record offset start is not beyond the truncation point,
                 * retain.
                 */
                if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off)
                        return(-1);
                break;
        default:
                panic("hammer_rec_trunc_cmp: unexpected record type");
        }

        /*
         * The record start is >= the truncation point, return match,
         * the record should be destroyed.
         */
        return(0);
}

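/*
 * Instantiate the red-black tree operations for the per-inode record
 * tree.  The XLOOKUP variant generates RB_LOOKUP_INFO(), which locates
 * overlapping data records via hammer_rec_overlap_compare().
 */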
RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
                    hammer_rec_overlap_compare, hammer_btree_leaf_elm_t);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip, int data_len)
{
        hammer_record_t record;

        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER,
                         M_WAITOK | M_ZERO | M_USE_RESERVE);
        record->flush_state = HAMMER_FST_IDLE;
        record->ip = ip;
        record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
        record->leaf.data_len = data_len;
        hammer_ref(&record->lock);

        if (data_len) {
                record->data = kmalloc(data_len, M_HAMMER, M_WAITOK | M_ZERO);
                record->flags |= HAMMER_RECF_ALLOCDATA;
                ++hammer_count_record_datas;
        }

        return (record);
}

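/*
 * Wait for a record to leave the HAMMER_FST_FLUSH state.  The caller
 * holds a reference on the record; we set RECF_WANTED so the backend
 * wakes us up when the flush completes.
 */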
void
hammer_wait_mem_record_ident(hammer_record_t record, const char *ident)
{
        while (record->flush_state == HAMMER_FST_FLUSH) {
                record->flags |= HAMMER_RECF_WANTED;
                tsleep(record, 0, ident, 0);
        }
}

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
        hammer_inode_t target_ip;

        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

        if (error) {
                /*
                 * An error occurred, the backend was unable to sync the
                 * record to its media.  Leave the record intact.
                 */
                Debugger("flush_record_done error");
        }

        if (record->flags & HAMMER_RECF_DELETED_BE) {
                if ((target_ip = record->target_ip) != NULL) {
                        TAILQ_REMOVE(&target_ip->target_list, record,
                                     target_entry);
                        record->target_ip = NULL;
                        hammer_test_inode(target_ip);
                }
                record->flush_state = HAMMER_FST_IDLE;
        } else {
                if (record->target_ip) {
                        record->flush_state = HAMMER_FST_SETUP;
                        hammer_test_inode(record->ip);
                        hammer_test_inode(record->target_ip);
                } else {
                        record->flush_state = HAMMER_FST_IDLE;
                }
        }
        record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
        if (record->flags & HAMMER_RECF_WANTED) {
                record->flags &= ~HAMMER_RECF_WANTED;
                wakeup(record);
        }
        hammer_rel_mem_record(record);
}

/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
        hammer_inode_t ip, target_ip;

        hammer_unref(&record->lock);

        if (record->lock.refs == 0) {
                /*
                 * Upon release of the last reference wakeup any waiters.
                 * The record structure may get destroyed so callers will
                 * loop up and do a relookup.
                 *
                 * WARNING!  Record must be removed from RB-TREE before we
                 * might possibly block.  hammer_test_inode() can block!
                 */
                ip = record->ip;

                /*
                 * Upon release of the last reference a record marked deleted
                 * is destroyed.
                 */
                if (record->flags & HAMMER_RECF_DELETED_FE) {
                        KKASSERT(ip->lock.refs > 0);
                        KKASSERT(record->flush_state != HAMMER_FST_FLUSH);

                        /*
                         * target_ip may have zero refs, we have to ref it
                         * to prevent it from being ripped out from under
                         * us.
                         */
                        if ((target_ip = record->target_ip) != NULL) {
                                TAILQ_REMOVE(&target_ip->target_list,
                                             record, target_entry);
                                record->target_ip = NULL;
                                hammer_ref(&target_ip->lock);
                        }

                        if (record->flags & HAMMER_RECF_ONRBTREE) {
                                RB_REMOVE(hammer_rec_rb_tree,
                                          &record->ip->rec_tree,
                                          record);
                                KKASSERT(ip->rsv_recs > 0);
                                --ip->hmp->rsv_recs;
                                --ip->rsv_recs;
                                ip->hmp->rsv_databytes -= record->leaf.data_len;
                                record->flags &= ~HAMMER_RECF_ONRBTREE;

                                if (RB_EMPTY(&record->ip->rec_tree)) {
                                        record->ip->flags &= ~HAMMER_INODE_XDIRTY;
                                        record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
                                        hammer_test_inode(record->ip);
                                }
                        }

                        /*
                         * Do this test after removing record from the B-Tree.
                         */
                        if (target_ip) {
                                hammer_test_inode(target_ip);
                                hammer_rel_inode(target_ip, 0);
                        }

                        if (record->flags & HAMMER_RECF_ALLOCDATA) {
                                --hammer_count_record_datas;
                                kfree(record->data, M_HAMMER);
                                record->flags &= ~HAMMER_RECF_ALLOCDATA;
                        }
                        if (record->resv) {
                                hammer_blockmap_reserve_complete(ip->hmp,
                                                                 record->resv);
                                record->resv = NULL;
                        }
                        record->data = NULL;
                        --hammer_count_records;
                        kfree(record, M_HAMMER);
                }
        }
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.
 *
 * Return non-zero if the record is visible, zero if it isn't or if it is
 * deleted.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
        if (cursor->flags & HAMMER_CURSOR_BACKEND) {
                if (record->flags & HAMMER_RECF_DELETED_BE)
                        return(0);
        } else {
                if (record->flags & HAMMER_RECF_DELETED_FE)
                        return(0);
        }
        return(1);
}

/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;

        /*
         * We terminate on success, so this should be NULL on entry.
         */
        KKASSERT(cursor->iprec == NULL);

        /*
         * Skip if the record was marked deleted.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
                return(0);

        /*
         * Skip if not visible due to our as-of TID
         */
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                if (cursor->asof < rec->leaf.base.create_tid)
                        return(0);
                if (rec->leaf.base.delete_tid &&
                    cursor->asof >= rec->leaf.base.delete_tid) {
                        return(0);
                }
        }

        /*
         * ref the record.  The record is protected from backend B-Tree
         * interactions by virtue of the cursor's IP lock.
         */
        hammer_ref(&rec->lock);

        /*
         * The record may have been deleted while we were blocked.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
                hammer_rel_mem_record(rec);
                return(0);
        }

        /*
         * Set the matching record and stop the scan.
         */
        cursor->iprec = rec;
        return(-1);
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor)
{
        int error;

        KKASSERT(cursor->ip);
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp,
                                   hammer_rec_scan_callback, cursor);

        if (cursor->iprec == NULL)
                error = ENOENT;
        else
                error = 0;
        return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor)
{
        hammer_inode_t ip;

        ip = cursor->ip;
        KKASSERT(ip != NULL);

        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
                                   hammer_rec_scan_callback, cursor);

        /*
         * Adjust scan.node and keep it linked into the RB-tree so we can
         * hold the cursor through third party modifications of the RB-tree.
         */
        if (cursor->iprec)
                return(0);
        return(ENOENT);
}

/************************************************************************
 *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * NOTE: bytes does not include any terminating \0 on name, and name might
 * not be terminated.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
                     struct hammer_inode *dip, const char *name, int bytes,
                     struct hammer_inode *ip)
{
        struct hammer_cursor cursor;
        hammer_record_t record;
        int error;
        int count;
        u_int32_t iterator;

        record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes));
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;

        record->type = HAMMER_MEM_RECORD_ADD;
        record->leaf.base.localization = dip->obj_localization +
                                         HAMMER_LOCALIZE_MISC;
        record->leaf.base.obj_id = dip->obj_id;
        record->leaf.base.key = hammer_directory_namekey(name, bytes);
        record->leaf.base.key += trans->hmp->namekey_iterator;
        record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
        record->data->entry.obj_id = ip->obj_id;
        record->data->entry.localization = ip->obj_localization;
        bcopy(name, record->data->entry.name, bytes);

        ++ip->ino_data.nlinks;
        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);

        /*
         * Find an unused namekey.  Both the in-memory record tree and
         * the B-Tree are checked.  Exact matches also match create_tid
         * so use an ASOF search to (mostly) ignore it.
         *
         * delete-visibility is set so pending deletions do not give us
         * a false-negative on our ability to use an iterator.
         */
        hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg = record->leaf.base;
        cursor.flags |= HAMMER_CURSOR_ASOF;
        cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
        cursor.asof = ip->obj_asof;

        count = 0;
        while (hammer_ip_lookup(&cursor) == 0) {
                iterator = (u_int32_t)record->leaf.base.key + 1;
                if (iterator == 0)
                        iterator = 1;
                record->leaf.base.key &= ~0xFFFFFFFFLL;
                record->leaf.base.key |= iterator;
                cursor.key_beg.key = record->leaf.base.key;
                if (++count == 1000000000) {
                        hammer_rel_mem_record(record);
                        error = ENOSPC;
                        goto failed;
                }
        }

        /*
         * The target inode and the directory entry are bound together.
         */
        record->target_ip = ip;
        record->flush_state = HAMMER_FST_SETUP;
        TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

        /*
         * The inode now has a dependency and must be taken out of the idle
         * state.  An inode not in an idle state is given an extra reference.
         */
        if (ip->flush_state == HAMMER_FST_IDLE) {
                hammer_ref(&ip->lock);
                ip->flush_state = HAMMER_FST_SETUP;
        }
        error = hammer_mem_add(record);
failed:
        hammer_done_cursor(&cursor);
        return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be positioned at the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, release any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
                     hammer_cursor_t cursor, struct hammer_inode *dip,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        int error;

        if (hammer_cursor_inmem(cursor)) {
                /*
                 * In-memory (unsynchronized) records can simply be freed.
                 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
                 * by the backend, we must still avoid races against the
                 * backend potentially syncing the record to the media.
                 *
                 * We cannot call hammer_ip_delete_record(), that routine may
                 * only be called from the backend.
                 */
                record = cursor->iprec;
                if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
                        KKASSERT(cursor->deadlk_rec == NULL);
                        hammer_ref(&record->lock);
                        cursor->deadlk_rec = record;
                        error = EDEADLK;
                } else {
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        error = 0;
                }
        } else {
                /*
                 * If the record is on-disk we have to queue the deletion by
                 * the record's key.  This also causes lookups to skip the
                 * record.
                 */
                KKASSERT(dip->flags &
                         (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK));
                record = hammer_alloc_mem_record(dip, 0);
                record->type = HAMMER_MEM_RECORD_DEL;
                record->leaf.base = cursor->leaf->base;

                record->target_ip = ip;
                record->flush_state = HAMMER_FST_SETUP;
                TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

                /*
                 * The inode now has a dependency and must be taken out of
                 * the idle state.  An inode not in an idle state is given
                 * an extra reference.
                 */
                if (ip->flush_state == HAMMER_FST_IDLE) {
                        hammer_ref(&ip->lock);
                        ip->flush_state = HAMMER_FST_SETUP;
                }

                error = hammer_mem_add(record);
        }

        /*
         * One less link.  The file may still be open in the OS even after
         * all links have gone away.
         *
         * We have to terminate the cursor before syncing the inode to
         * avoid deadlocking against ourselves.  XXX this may no longer
         * be true.
         *
         * If nlinks drops to zero and the vnode is inactive (or there is
         * no vnode), call hammer_inode_unloadable_check() to zonk the
         * inode.  If we don't do this here the inode will not be destroyed
         * on-media until we unmount.
         */
        if (error == 0) {
                --ip->ino_data.nlinks;
                hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
                if (ip->ino_data.nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                        hammer_done_cursor(cursor);
                        hammer_inode_unloadable_check(ip, 1);
                        hammer_flush_inode(ip, 0);
                }
        }
        return(error);
}

/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data         (a copy will be kmalloc'd if it cannot be embedded)
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int error;

        KKASSERT(record->leaf.base.localization != 0);
        record->leaf.base.obj_id = ip->obj_id;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
        error = hammer_mem_add(record);
        return(error);
}

/*
 * Locate a bulk record in-memory.  Bulk records allow disk space to be
 * reserved so the front-end can flush large data writes without having
 * to queue the BIO to the flusher.  Only the related record gets queued
 * to the flusher.
 */
static hammer_record_t
hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes)
{
        hammer_record_t record;
        struct hammer_btree_leaf_elm leaf;

        bzero(&leaf, sizeof(leaf));
        leaf.base.obj_id = ip->obj_id;
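        /* a data record's key is its ENDING offset (base offset + length) */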
        leaf.base.key = file_offset + bytes;
        leaf.base.create_tid = 0;
        leaf.base.delete_tid = 0;
        leaf.base.rec_type = HAMMER_RECTYPE_DATA;
        leaf.base.obj_type = 0;                 /* unused */
        leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;     /* unused */
        leaf.base.localization = ip->obj_localization + HAMMER_LOCALIZE_MISC;
        leaf.data_len = bytes;

        record = hammer_rec_rb_tree_RB_LOOKUP_INFO(&ip->rec_tree, &leaf);
        if (record)
                hammer_ref(&record->lock);
        return(record);
}

/*
 * Reserve blockmap space placemarked with an in-memory record.
 *
 * This routine is called by the frontend in order to be able to directly
 * flush a buffer cache buffer.  The frontend has locked the related buffer
 * cache buffers and we should be able to manipulate any overlapping
 * in-memory records.
 */
hammer_record_t
hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes,
                   int *errorp)
{
        hammer_record_t record;
        hammer_record_t conflict;
        int zone;
        int flags;

        /*
         * Deal with conflicting in-memory records.  We cannot have multiple
         * in-memory records for the same offset without seriously confusing
         * the backend, including but not limited to the backend issuing
         * delete-create-delete sequences and asserting on the delete_tid
         * being the same as the create_tid.
         *
         * If we encounter a record with the backend interlock set we cannot
         * immediately delete it without confusing the backend.
         */
        while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) != NULL) {
                if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) {
                        conflict->flags |= HAMMER_RECF_WANTED;
                        tsleep(conflict, 0, "hmrrc3", 0);
                } else {
                        conflict->flags |= HAMMER_RECF_DELETED_FE;
                }
                hammer_rel_mem_record(conflict);
        }

        /*
         * Create a record to cover the direct write.  This is called with
         * the related BIO locked so there should be no possible conflict.
         *
         * The backend is responsible for finalizing the space reserved in
         * this record.
         *
         * XXX bytes not aligned, depend on the reservation code to
         * align the reservation.
         */
        record = hammer_alloc_mem_record(ip, 0);
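        /* writes of a full buffer or more go to the large-data zone */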
        zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX :
                                           HAMMER_ZONE_SMALL_DATA_INDEX;
        record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes,
                                               &record->leaf.data_offset,
                                               errorp);
        if (record->resv == NULL) {
                kprintf("hammer_ip_add_bulk: reservation failed\n");
                hammer_rel_mem_record(record);
                return(NULL);
        }
        record->type = HAMMER_MEM_RECORD_DATA;
        record->leaf.base.rec_type = HAMMER_RECTYPE_DATA;
        record->leaf.base.obj_type = ip->ino_leaf.base.obj_type;
        record->leaf.base.obj_id = ip->obj_id;
        record->leaf.base.key = file_offset + bytes;
        record->leaf.base.localization = ip->obj_localization +
                                         HAMMER_LOCALIZE_MISC;
        record->leaf.data_len = bytes;
        hammer_crc_set_leaf(data, &record->leaf);
        flags = record->flags;

        hammer_ref(&record->lock);      /* mem_add eats a reference */
        *errorp = hammer_mem_add(record);
        if (*errorp) {
                conflict = hammer_ip_get_bulk(ip, file_offset, bytes);
                kprintf("hammer_ip_add_bulk: error %d conflict %p "
                        "file_offset %lld bytes %d\n",
                        *errorp, conflict, file_offset, bytes);
                if (conflict) {
                        kprintf("conflict %lld %d\n",
                                conflict->leaf.base.key,
                                conflict->leaf.data_len);
                        hammer_rel_mem_record(conflict);
                }
        }
        KKASSERT(*errorp == 0);
        conflict = hammer_ip_get_bulk(ip, file_offset, bytes);
        if (conflict != record) {
                kprintf("conflict mismatch %p %p %08x\n",
                        conflict, record, record->flags);
                if (conflict) {
                        kprintf("conflict mismatch %lld/%d %lld/%d\n",
                                conflict->leaf.base.key,
                                conflict->leaf.data_len,
                                record->leaf.base.key,
                                record->leaf.data_len);
                }
        }
        KKASSERT(conflict == record);
        hammer_rel_mem_record(conflict);

        return (record);
}

/*
 * Frontend truncation code.  Scan in-memory records only.  On-disk records
 * and records in a flushing state are handled by the backend.  The vnops
 * setattr code will handle the block containing the truncation point.
 *
 * Partial blocks are not deleted.
 */
int
hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size)
{
        struct rec_trunc_info info;

        switch(ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_REGFILE:
                info.rec_type = HAMMER_RECTYPE_DATA;
                break;
        case HAMMER_OBJTYPE_DBFILE:
                info.rec_type = HAMMER_RECTYPE_DB;
                break;
        default:
                return(EINVAL);
        }
        info.trunc_off = file_size;
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp,
                                   hammer_frontend_trunc_callback, &info);
        return(0);
}

static int
hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused)
{
        if (record->flags & HAMMER_RECF_DELETED_FE)
                return(0);
        if (record->flush_state == HAMMER_FST_FLUSH)
                return(0);
        KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
        hammer_ref(&record->lock);
        record->flags |= HAMMER_RECF_DELETED_FE;
        hammer_rel_mem_record(record);
        return(0);
}

/*
 * Return 1 if the caller must check for and delete existing records
 * before writing out a new data record.
 *
 * Return 0 if the caller can just insert the record into the B-Tree without
 * checking.
 */
static int
hammer_record_needs_overwrite_delete(hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int64_t file_offset;
        int r;

        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE)
                file_offset = record->leaf.base.key;
        else
                file_offset = record->leaf.base.key - record->leaf.data_len;
        r = (file_offset < ip->save_trunc_off);
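        /*
         * Push save_trunc_off past this record so subsequent records
         * beyond it do not trigger a redundant overwrite-delete check.
         */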
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
                if (ip->save_trunc_off <= record->leaf.base.key)
                        ip->save_trunc_off = record->leaf.base.key + 1;
        } else {
                if (ip->save_trunc_off < record->leaf.base.key)
                        ip->save_trunc_off = record->leaf.base.key;
        }
        return(r);
}

/*
 * Backend code.  Sync a record to the media.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
        hammer_transaction_t trans = cursor->trans;
        int64_t file_offset;
        int bytes;
        void *bdata;
        int error;
        int doprop;

        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
        KKASSERT(record->leaf.base.localization != 0);

        /*
         * If this is a bulk-data record placemarker there may be an existing
         * record on-disk, indicating a data overwrite.  If there is, the
         * on-disk record must be deleted before we can insert our new record.
         *
         * We've synthesized this record and do not know what the create_tid
         * on-disk is, nor how much data it represents.
         *
         * Keep in mind that (key) for data records is (base_offset + len),
         * not (base_offset).  Also, we only want to get rid of on-disk
         * records since we are trying to sync our in-memory record, call
         * hammer_ip_delete_range() with truncating set to 1 to make sure
         * it skips in-memory records.
         *
         * It is ok for the lookup to return ENOENT.
         *
         * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have
         * to call hammer_ip_delete_range() or not.  This also means we must
         * update sync_trunc_off as we write.
         */
        if (record->type == HAMMER_MEM_RECORD_DATA &&
            hammer_record_needs_overwrite_delete(record)) {
                file_offset = record->leaf.base.key - record->leaf.data_len;
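                /* round the deletion range up to a HAMMER buffer boundary */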
                bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
                        ~HAMMER_BUFMASK;
                KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
                error = hammer_ip_delete_range(
                                cursor, record->ip,
                                file_offset, file_offset + bytes - 1,
                                1);
                if (error && error != ENOENT)
                        goto done;
        }

        /*
         * If this is a general record there may be an on-disk version
         * that must be deleted before we can insert the new record.
         */
        if (record->type == HAMMER_MEM_RECORD_GENERAL) {
                error = hammer_delete_general(cursor, record->ip,
                                              &record->leaf);
                if (error && error != ENOENT)
                        goto done;
        }

        /*
         * Setup the cursor.
         */
        hammer_normalize_cursor(cursor);
        cursor->key_beg = record->leaf.base;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_BACKEND;
        cursor->flags &= ~HAMMER_CURSOR_INSERT;

        /*
         * Records can wind up on-media before the inode itself is on-media.
         * Flag the case.
         */
        record->ip->flags |= HAMMER_INODE_DONDISK;

        /*
         * If we are deleting a directory entry an exact match must be
         * found on-disk.
         */
        if (record->type == HAMMER_MEM_RECORD_DEL) {
                error = hammer_btree_lookup(cursor);
                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, record->ip,
                                                        trans->tid);
                        if (error == 0) {
                                record->flags |= HAMMER_RECF_DELETED_FE;
                                record->flags |= HAMMER_RECF_DELETED_BE;
                        }
                }
                goto done;
        }

        /*
         * We are inserting.
         *
         * Issue a lookup to position the cursor and locate the cluster.  The
         * target key should not exist.  If we are creating a directory entry
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         */
        cursor->flags |= HAMMER_CURSOR_INSERT;

        error = hammer_btree_lookup(cursor);
        if (hammer_debug_inode)
                kprintf("DOINSERT LOOKUP %d\n", error);
        if (error == 0) {
                kprintf("hammer_ip_sync_record: duplicate rec "
                        "at (%016llx)\n", record->leaf.base.key);
                Debugger("duplicate record1");
                error = EIO;
        }
#if 0
        if (record->type == HAMMER_MEM_RECORD_DATA)
                kprintf("sync_record  %016llx ---------------- %016llx %d\n",
                        record->leaf.base.key - record->leaf.data_len,
                        record->leaf.data_offset, error);
#endif

        if (error != ENOENT)
                goto done;

        /*
         * Allocate the record and data.  The result buffers will be
         * marked as being modified and further calls to
         * hammer_modify_buffer() will result in unneeded UNDO records.
         *
         * Support zero-fill records (data == NULL and data_len != 0)
         */
        if (record->type == HAMMER_MEM_RECORD_DATA) {
                /*
                 * The data portion of a bulk-data record has already been
                 * committed to disk, we need only adjust the layer2
                 * statistics in the same transaction as our B-Tree insert.
                 */
                KKASSERT(record->leaf.data_offset != 0);
                hammer_blockmap_finalize(trans, record->leaf.data_offset,
                                         record->leaf.data_len);
                error = 0;
        } else if (record->data && record->leaf.data_len) {
                /*
                 * Wholly cached record, with data.  Allocate the data.
                 */
                bdata = hammer_alloc_data(trans, record->leaf.data_len,
                                          record->leaf.base.rec_type,
                                          &record->leaf.data_offset,
                                          &cursor->data_buffer, &error);
                if (bdata == NULL)
                        goto done;
                hammer_crc_set_leaf(record->data, &record->leaf);
                hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
                bcopy(record->data, bdata, record->leaf.data_len);
                hammer_modify_buffer_done(cursor->data_buffer);
        } else {
                /*
                 * Wholly cached record, without data.
                 */
                record->leaf.data_offset = 0;
                record->leaf.data_crc = 0;
        }

        error = hammer_btree_insert(cursor, &record->leaf, &doprop);
        if (hammer_debug_inode && error) {
                kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
                        error, cursor->node->node_offset,
                        cursor->index, record->leaf.base.key);
        }

        /*
         * Our record is on-disk, normally mark the in-memory version as
         * deleted.  If the record represented a directory deletion but
         * we had to sync a valid directory entry to disk we must convert
         * the record to a covering delete so the frontend does not have
         * visibility on the synced entry.
         */
        if (error == 0) {
                if (doprop) {
                        hammer_btree_do_propagation(cursor, record->ip,
                                                    &record->leaf);
                }
                if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags &= ~HAMMER_RECF_DELETED_FE;
                        record->type = HAMMER_MEM_RECORD_DEL;
                        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
                        record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
                        /* hammer_flush_record_done takes care of the rest */
                } else {
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        record->flags |= HAMMER_RECF_DELETED_BE;
                }
        } else {
                if (record->leaf.data_offset) {
                        hammer_blockmap_free(trans, record->leaf.data_offset,
                                             record->leaf.data_len);
                }
        }

done:
        return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * A copy of the temporary record->data pointer provided by the caller
 * will be made.
 */
static
int
hammer_mem_add(hammer_record_t record)
{
        hammer_mount_t hmp = record->ip->hmp;

        /*
         * Make a private copy of record->data
         */
        if (record->data)
                KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

        /*
         * Insert into the RB tree.  A unique key should have already
         * been selected if this is a directory entry.
         */
        if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
                record->flags |= HAMMER_RECF_DELETED_FE;
                hammer_rel_mem_record(record);
                return (EEXIST);
        }
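        /* account for the new record and its reserved data bytes */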
        ++hmp->count_newrecords;
        ++hmp->rsv_recs;
        ++record->ip->rsv_recs;
        record->ip->hmp->rsv_databytes += record->leaf.data_len;
        record->flags |= HAMMER_RECF_ONRBTREE;
        hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
        hammer_rel_mem_record(record);
        return(0);
}

/************************************************************************
 *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
        int error;

        /*
         * If the element is in-memory return it without searching the
         * on-disk B-Tree
         */
        KKASSERT(cursor->ip);
        error = hammer_mem_lookup(cursor);
        if (error == 0) {
                cursor->leaf = &cursor->iprec->leaf;
                return(error);
        }
        if (error != ENOENT)
                return(error);

        /*
         * If the inode has on-disk components search the on-disk B-Tree.
         */
        if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
                return(error);
        error = hammer_btree_lookup(cursor);
        if (error == 0)
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
        return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
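 *
 * Typical merged-scan loop (sketch):
 *
 *      error = hammer_ip_first(cursor);
 *      while (error == 0) {
 *              (examine cursor->leaf)
 *              error = hammer_ip_next(cursor);
 *      }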
 */
int
hammer_ip_first(hammer_cursor_t cursor)
{
        hammer_inode_t ip = cursor->ip;
        int error;

        KKASSERT(ip != NULL);

        /*
         * Clean up fields and setup for merged scan
         */
        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
        cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
        cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        /*
         * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
         * exact lookup so if we get ENOENT we have to call the iterate
         * function to validate the first record after the begin key.
         *
         * The ATEDISK flag is used by hammer_btree_iterate to determine
         * whether it must index forwards or not.  It is also used here
         * to select the next record from in-memory or on-disk.
         *
         * EDEADLK can only occur if the lookup hit an empty internal
         * element and couldn't delete it.  Since this could only occur
         * in-range, we can just iterate from the failure point.
         */
        if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
                error = hammer_btree_lookup(cursor);
                if (error == ENOENT || error == EDEADLK) {
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        if (hammer_debug_general & 0x2000) {
                                kprintf("error %d node %p %016llx index %d\n",
                                        error, cursor->node,
                                        cursor->node->node_offset,
                                        cursor->index);
                        }
                        error = hammer_btree_iterate(cursor);
                }
                if (error && error != ENOENT)
                        return(error);
                if (error == 0) {
                        cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                } else {
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                }
        }

        /*
         * Search the in-memory record list (Red-Black tree).  Unlike the
         * B-Tree search, mem_first checks for records in the range.
         */
        error = hammer_mem_first(cursor);
        if (error && error != ENOENT)
                return(error);
        if (error == 0) {
                cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
                if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
                        cursor->flags |= HAMMER_CURSOR_ATEMEM;
        }

        /*
         * This will return the first matching record.
         */
        return(hammer_ip_next(cursor));
}

1337 /*
1338  * Retrieve the next record in a merged iteration within the bounds of the
1339  * cursor.  This call may be made multiple times after the cursor has been
1340  * initially searched with hammer_ip_first().
1341  *
1342  * 0 is returned on success, ENOENT if no further records match the
1343  * requested range, or some other error code is returned.
1344  */
1345 int
1346 hammer_ip_next(hammer_cursor_t cursor)
1347 {
1348         hammer_btree_elm_t elm;
1349         hammer_record_t rec, save;
1350         int error;
1351         int r;
1352
next_btree:
	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one.
	 *
	 * If we deleted the last on-disk record we had scanned, ATEDISK will
	 * be clear and DELBTREE will be set, forcing a call to iterate.  The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record.
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
			if (error == 0) {
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
				hammer_cache_node(&cursor->ip->cache[1],
						  cursor->node);
			} else {
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
			}
		}
	}

next_memory:
	/*
	 * Get the next in-memory record.
	 *
	 * hammer_rec_scan_cmp:  Is the record still within our general
	 *			 range (exclusive of snapshot visibility)?
	 * hammer_rec_scan_callback: Is the record visible in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			save = cursor->iprec;
			cursor->iprec = NULL;
			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (save)
				hammer_rel_mem_record(save);
			if (cursor->iprec) {
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}

	/*
	 * The memory record may have become stale while being held in
	 * cursor->iprec.  We are interlocked against the backend with
	 * regard to B-Tree entries.
	 */
	if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) {
		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) {
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
			goto next_memory;
		}
	}

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.  Compare the entries and nominally
		 * return the first one in the sort order.  Numerous cases
		 * require special attention, however.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);

		/*
		 * If the two entries differ only by their key (-2/2) or
		 * create_tid (-1/1), and are DATA records, we may have a
		 * nominal match.  We have to calculate the base file
		 * offset of the data.
		 */
		if (r <= 2 && r >= -2 && r != 0 &&
		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
			int64_t base2 = cursor->iprec->leaf.base.key -
					cursor->iprec->leaf.data_len;
			if (base1 == base2)
				r = 0;
		}

		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}

		/*
		 * If the entries match exactly, the memory entry is either
		 * an on-disk directory entry deletion or a bulk data
		 * overwrite.  If it is a directory entry deletion we eat
		 * both entries.
		 *
		 * For the bulk-data overwrite case it is possible to have
		 * visibility into both, which simply means the syncer
		 * hasn't gotten around to doing the delete+insert sequence
		 * on the B-Tree.  Use the memory entry and throw away the
		 * on-disk entry.
		 *
		 * If the in-memory record is neither of these, we probably
		 * caught the syncer while it was syncing the record to the
		 * media.  Since we hold a shared lock on the cursor, the
		 * in-memory record had better be marked deleted at this
		 * point.
		 */
		if (r == 0) {
			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
					cursor->flags |= HAMMER_CURSOR_ATEMEM;
					goto next_btree;
				}
			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
				}
				/* fall through to memory entry */
			} else {
				panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x",
				      cursor->iprec, cursor->iprec->type,
				      cursor->iprec->flags);
				cursor->flags |= HAMMER_CURSOR_ATEMEM;
				goto next_memory;
			}
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.
		 */
		cursor->leaf = &cursor->iprec->leaf;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;

		/*
		 * If the memory entry is an on-disk deletion we should also
		 * have found a B-Tree record.  If the backend had beaten us
		 * to it, it would have interlocked the cursor and we would
		 * have seen the in-memory record marked DELETED_FE.
		 */
		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
			panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x",
			      cursor->iprec, cursor->iprec->flags);
		}
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid.
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid.
		 *
		 * XXX error not set properly
		 */
		cursor->leaf = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}
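
/*
 * Example: once hammer_ip_first()/hammer_ip_next() returns 0, a caller
 * can determine which side of the merge produced the current record
 * with hammer_cursor_inmem() or hammer_cursor_ondisk(), as the
 * truncation code below does.  Illustrative sketch only (not compiled
 * in).
 */
#if 0
static void
example_classify_current(hammer_cursor_t cursor)
{
	if (hammer_cursor_inmem(cursor)) {
		/* current record is cursor->iprec (in-memory RB tree) */
	} else {
		/* current record is the B-Tree element at cursor->index */
	}
}
#endif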

/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * The data associated with an in-memory record is usually
		 * kmalloced, but reserve-ahead data records will have an
		 * on-disk reference.
		 *
		 * NOTE: Reserve-ahead data records must be handled in the
		 * context of the related high level buffer cache buffer
		 * to interlock against async writes.
		 */
		record = cursor->iprec;
		cursor->data = record->data;
		error = 0;
		if (cursor->data == NULL) {
			KKASSERT(record->leaf.base.rec_type ==
				 HAMMER_RECTYPE_DATA);
			cursor->data = hammer_bread_ext(cursor->trans->hmp,
						    record->leaf.data_offset,
						    record->leaf.data_len,
						    &error,
						    &cursor->data_buffer);
		}
	} else {
		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
	}
	return(error);
}
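
/*
 * Example: resolving the data for the current record in a merged
 * iteration.  Illustrative sketch only (not compiled in); the cursor
 * is assumed to be positioned by hammer_ip_first()/hammer_ip_next()
 * and error handling is abbreviated.
 */
#if 0
static int
example_read_current_data(hammer_cursor_t cursor)
{
	int error;

	error = hammer_ip_resolve_data(cursor);
	if (error == 0) {
		/*
		 * cursor->data now points at the record's data and
		 * cursor->leaf->data_len gives its length.
		 */
	}
	return(error);
}
#endif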

/*
 * Backend truncation / record replacement - delete records in range.
 *
 * Delete all records within the specified range for inode ip.  In-memory
 * records still associated with the frontend are ignored.
 *
 * If truncating is non-zero, in-memory records associated with the back-end
 * are also ignored.  If truncating is > 1 we can return EWOULDBLOCK.
 *
 * NOTES:
 *
 *	* An unaligned range will cause new records to be added to cover
 *	  the edge cases. (XXX not implemented yet).
 *
 *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
 *	  also does not deal with unaligned ranges.
 *
 *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 *	* Record keys for regular file data have to be special-cased since
 *	  they indicate the end of the range (key = base + bytes).
 *
 *	* This function may be asked to delete ridiculously huge ranges, for
 *	  example if someone truncates or removes a 1TB regular file.  We
 *	  must be very careful on restarts and we may have to stop with
 *	  EWOULDBLOCK to avoid blowing out the buffer cache.
 */
int
hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end, int truncating)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_btree_leaf_elm_t leaf;
	int error;
	int64_t off;
	int64_t tmp64;

#if 0
	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
#endif

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_normalize_cursor(cursor);
	cursor->key_beg.localization = ip->obj_localization +
				       HAMMER_LOCALIZE_MISC;
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.obj_type = 0;

	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor->key_beg.key = ran_beg;
		cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		cursor->key_beg.key = ran_beg + 1;
		cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
	}

	cursor->key_end = cursor->key_beg;
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor->key_end.key = ran_end;
	} else {
		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor->key_end.key = ran_end + MAXPHYS + 1;
	}

	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(cursor);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		leaf = cursor->leaf;

		KKASSERT(leaf->base.delete_tid == 0);
		KKASSERT(leaf->base.obj_id == ip->obj_id);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is (base + len),
		 * NOT (base).
		 *
		 * Note that due to duplicates (mem & media) allowed by
		 * DELETE_VISIBILITY, off can wind up less than ran_beg.
		 */
		if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) {
			off = leaf->base.key - leaf->data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg && leaf->base.key > ran_beg) {
				panic("hammer left edge case %016llx %d\n",
					leaf->base.key, leaf->data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (leaf->base.key - 1 > ran_end) {
				if (leaf->base.key - leaf->data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		} else {
			off = leaf->base.key;
		}

		/*
		 * Delete the record.  When truncating we do not delete
		 * in-memory (data) records because they represent data
		 * written after the truncation.
		 *
		 * This will also physically destroy the B-Tree entry and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		if (truncating == 0 || hammer_cursor_ondisk(cursor)) {
			error = hammer_ip_delete_record(cursor, ip, trans->tid);
			/*
			 * If we have built up too many meta-buffers we risk
			 * deadlocking the kernel and must stop.  This can
			 * occur when deleting ridiculously huge files.
			 * sync_trunc_off is updated so the next cycle does
			 * not re-iterate records we have already deleted.
			 *
			 * This is only done with formal truncations.
			 */
			if (truncating > 1 && error == 0 &&
			    hammer_flusher_meta_limit(ip->hmp)) {
				ip->sync_trunc_off = off;
				error = EWOULDBLOCK;
			}
		}
		if (error)
			break;
		ran_beg = off;	/* for restart */
		error = hammer_ip_next(cursor);
	}
	if (cursor->node)
		hammer_cache_node(&ip->cache[1], cursor->node);

	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	if (error == ENOENT)
		error = 0;
	return(error);
}
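
/*
 * Example: a flusher-side truncation calls this function with
 * truncating > 1 and must treat EWOULDBLOCK as a soft stop.  This is
 * an illustrative sketch only (not compiled in); the real caller
 * lives in the inode sync code and tracks the restart point via
 * ip->sync_trunc_off as described above.
 */
#if 0
static int
example_truncate_tail(hammer_cursor_t cursor, hammer_inode_t ip,
		      int64_t trunc_off)
{
	int error;

	/*
	 * Delete all data records from trunc_off through the maximum
	 * possible offset (ran_end is inclusive).
	 */
	error = hammer_ip_delete_range(cursor, ip, trunc_off,
				       0x7FFFFFFFFFFFFFFFLL, 2);
	if (error == EWOULDBLOCK) {
		/*
		 * Too much dirty meta-data built up; ip->sync_trunc_off
		 * was updated so a later flush cycle can resume where
		 * this one left off.
		 */
	}
	return(error);
}
#endif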

/*
 * This backend function deletes the specified record on-disk, similar to
 * delete_range but for a specific record.  Unlike the exact deletions
 * used when deleting a directory entry, this function uses an ASOF search
 * like delete_range.
 *
 * This function may be called with ip->obj_asof set for a slave snapshot,
 * so do not use ip->obj_asof here.  We always delete non-historical
 * records only.
 */
static int
hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip,
		      hammer_btree_leaf_elm_t leaf)
{
	hammer_transaction_t trans = cursor->trans;
	int error;

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_normalize_cursor(cursor);
	cursor->key_beg = leaf->base;
	cursor->asof = HAMMER_MAX_TID;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	error = hammer_btree_lookup(cursor);
	if (error == 0) {
		error = hammer_ip_delete_record(cursor, ip, trans->tid);
	}
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	return(error);
}

/*
 * This function deletes remaining auxiliary records when an inode is
 * being deleted.  This function explicitly does not delete the
 * inode record, directory entry, data, or db records.  Those must be
 * properly disposed of prior to this call.
 */
int
hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_btree_leaf_elm_t leaf;
	int error;

	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_normalize_cursor(cursor);
	cursor->key_beg.localization = ip->obj_localization +
				       HAMMER_LOCALIZE_MISC;
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.obj_type = 0;
	cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START;
	cursor->key_beg.key = HAMMER_MIN_KEY;

	cursor->key_end = cursor->key_beg;
	cursor->key_end.rec_type = HAMMER_RECTYPE_MAX;
	cursor->key_end.key = HAMMER_MAX_KEY;

	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor->flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_ip_first(cursor);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		leaf = cursor->leaf;

		KKASSERT(leaf->base.delete_tid == 0);

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 *
		 * Directory entries (and delete-on-disk directory entries)
		 * must be synced and cannot be deleted.
		 */
		error = hammer_ip_delete_record(cursor, ip, trans->tid);
		++*countp;
		if (error)
			break;
		error = hammer_ip_next(cursor);
	}
	if (cursor->node)
		hammer_cache_node(&ip->cache[1], cursor->node);
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	if (error == ENOENT)
		error = 0;
	return(error);
}
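
/*
 * Example: callers of hammer_ip_delete_clean() receive the number of
 * records the scan visited via *countp and can use it for accounting.
 * Illustrative sketch only (not compiled in).
 */
#if 0
static int
example_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int count = 0;
	int error;

	error = hammer_ip_delete_clean(cursor, ip, &count);
	/* count reflects the auxiliary records processed by the scan */
	return(error);
}
#endif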

/*
 * Delete the record at the current cursor.  On success the cursor will
 * be positioned appropriately for an iteration but may no longer be at
 * a leaf node.
 *
 * This routine is only called from the backend.
 *
 * NOTE: This can return EDEADLK, requiring the caller to terminate the
 * cursor and retry.
 */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip,
			hammer_tid_t tid)
{
	hammer_off_t zone2_offset;
	hammer_record_t iprec;
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error;

	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);
	KKASSERT(tid != 0);
	hmp = cursor->node->hmp;

	/*
	 * In-memory (unsynchronized) records can simply be freed.  This
	 * only occurs in range iterations since all other records are
	 * individually synchronized.  Thus there should be no confusion with
	 * the interlock.
	 *
	 * An in-memory record may be deleted before being committed to disk,
	 * but could have been accessed in the meantime.  The backing store
	 * may never have been marked allocated and so hammer_blockmap_free()
	 * may never get called on it.  Because of this we have to make sure
	 * that we've gotten rid of any related hammer_buffer or buffer cache
	 * buffer.
	 */
	if (hammer_cursor_inmem(cursor)) {
		iprec = cursor->iprec;
		KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
		iprec->flags |= HAMMER_RECF_DELETED_FE;
		iprec->flags |= HAMMER_RECF_DELETED_BE;

		if (iprec->leaf.data_offset && iprec->leaf.data_len) {
			zone2_offset = hammer_blockmap_lookup(hmp,
						iprec->leaf.data_offset,
						&error);
			KKASSERT(error == 0);
			hammer_del_buffers(hmp,
					   iprec->leaf.data_offset,
					   zone2_offset,
					   iprec->leaf.data_len);
		}
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
	 * on their create_tid).
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
	elm = NULL;

	if (error == 0) {
		error = hammer_delete_at_cursor(
				cursor,
				HAMMER_DELETE_ADJUST | hammer_nohistory(ip),
				NULL);
	}
	return(error);
}
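
/*
 * Example: the EDEADLK retry idiom required of callers, exactly as
 * implemented by hammer_ip_delete_range() and hammer_delete_general()
 * above.  Illustrative sketch only (not compiled in); positioning the
 * cursor on the target record is omitted.
 */
#if 0
static int
example_delete_with_retry(hammer_transaction_t trans,
			  hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

retry:
	/* ...position the cursor on the target record... */
	error = hammer_ip_delete_record(cursor, ip, trans->tid);
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip);
		if (error == 0)
			goto retry;
	}
	return(error);
}
#endif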

/*
 * Delete the B-Tree element at the current cursor and do any necessary
 * mirror propagation.
 *
 * The cursor must be properly positioned for an iteration on return but
 * may be pointing at an internal element.
 */
int
hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags,
			int64_t *stat_bytes)
{
	struct hammer_btree_leaf_elm save_leaf;
	hammer_btree_leaf_elm_t leaf;
	hammer_node_t node;
	hammer_btree_elm_t elm;
	hammer_off_t data_offset;
	int32_t data_len;
	u_int16_t rec_type;
	int error;
	int doprop;

	error = hammer_cursor_upgrade(cursor);
	if (error)
		return(error);

	node = cursor->node;
	elm = &node->ondisk->elms[cursor->index];
	leaf = &elm->leaf;
	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);

	/*
	 * Adjust the delete_tid.  Update the mirror_tid propagation field
	 * as well.
	 */
	doprop = 0;
	if (delete_flags & HAMMER_DELETE_ADJUST) {
		hammer_modify_node(cursor->trans, node, elm, sizeof(*elm));
		elm->leaf.base.delete_tid = cursor->trans->tid;
		elm->leaf.delete_ts = cursor->trans->time32;
		hammer_modify_node_done(node);

		if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) {
			hammer_modify_node_field(cursor->trans, node,
						 mirror_tid);
			node->ondisk->mirror_tid = elm->leaf.base.delete_tid;
			hammer_modify_node_done(node);
			doprop = 1;
		}

		/*
		 * Adjust for the iteration.  We have deleted the current
		 * element and want to clear ATEDISK so the iteration does
		 * not skip the element after, which now becomes the current
		 * element.
		 */
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			cursor->flags |= HAMMER_CURSOR_DELBTREE;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}

		/*
		 * An on-disk record cannot have the same delete_tid
		 * as its create_tid.  In a chain of record updates
		 * this could result in a duplicate record.
		 */
		KKASSERT(elm->leaf.base.delete_tid !=
			 elm->leaf.base.create_tid);
	}

	/*
	 * Destroy the B-Tree element if asked (typically if a nohistory
	 * file or mount, or when called by the pruning code).
	 *
	 * Adjust the ATEDISK flag to properly support iterations.
	 */
	if (delete_flags & HAMMER_DELETE_DESTROY) {
		data_offset = elm->leaf.data_offset;
		data_len = elm->leaf.data_len;
		rec_type = elm->leaf.base.rec_type;
		if (doprop) {
			save_leaf = elm->leaf;
			leaf = &save_leaf;
		}

		error = hammer_btree_delete(cursor);
		if (error == 0) {
			/*
			 * This forces a fixup for the iteration because
			 * the cursor is now either sitting at the 'next'
			 * element or sitting at the end of a leaf.
			 */
			if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
				cursor->flags |= HAMMER_CURSOR_DELBTREE;
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			}
		}
		if (error == 0) {
			switch(data_offset & HAMMER_OFF_ZONE_MASK) {
			case HAMMER_ZONE_LARGE_DATA:
			case HAMMER_ZONE_SMALL_DATA:
			case HAMMER_ZONE_META:
				hammer_blockmap_free(cursor->trans,
						     data_offset, data_len);
				break;
			default:
				break;
			}
		}
	}

	/*
	 * mirror_tid propagation occurs if the node's mirror_tid had to be
	 * updated while adjusting the delete_tid.
	 *
	 * This occurs when deleting even in nohistory mode, but does not
	 * occur when pruning an already-deleted node.
	 */
	if (doprop) {
		KKASSERT(cursor->ip != NULL);
		hammer_btree_do_propagation(cursor, cursor->ip, leaf);
	}
	return(error);
}
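
/*
 * Example: the two delete_flags modes of hammer_delete_at_cursor().
 * HAMMER_DELETE_ADJUST marks the element deleted by setting its
 * delete_tid (the normal history-retaining path), while
 * HAMMER_DELETE_DESTROY physically removes the element and frees its
 * data, as used for nohistory operation or by the pruning code.
 * Illustrative sketch only (not compiled in); cursor positioning is
 * omitted.
 */
#if 0
static int
example_delete_modes(hammer_cursor_t cursor, hammer_inode_t ip, int prune)
{
	int error;

	if (prune) {
		/* physically destroy the element and free its data */
		error = hammer_delete_at_cursor(cursor,
						HAMMER_DELETE_DESTROY,
						NULL);
	} else {
		/* mark deleted; hammer_nohistory() may add DESTROY */
		error = hammer_delete_at_cursor(cursor,
						HAMMER_DELETE_ADJUST |
						hammer_nohistory(ip),
						NULL);
	}
	return(error);
}
#endif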

/*
 * Determine whether we can remove a directory.  This routine checks whether
 * a directory is empty or not and enforces flush connectivity.
 *
 * Flush connectivity requires that we block if the target directory is
 * currently flushing, otherwise it may not end up in the same flush group.
 *
 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
 */
int
hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	int error;

	/*
	 * Check whether the directory is empty by scanning for any record
	 * type past the inode record itself.
	 */
	hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);

	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);
	if (error == ENOENT)
		error = 0;
	else if (error == 0)
		error = ENOTEMPTY;
	hammer_done_cursor(&cursor);
	return(error);
}
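
/*
 * Example: an rmdir-style caller can use the check above before
 * removing a directory, propagating ENOTEMPTY to the user.
 * Illustrative sketch only (not compiled in); namespace locking is
 * omitted.
 */
#if 0
static int
example_rmdir_check(hammer_transaction_t trans, hammer_inode_t ip)
{
	int error;

	error = hammer_ip_check_directory_empty(trans, ip);
	if (error == ENOTEMPTY) {
		/* the directory still has entries; refuse removal */
	}
	return(error);
}
#endif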