/*
 * HAMMER 40E/Many: Inode/link-count sequencer cleanup pass.
 * [dragonfly.git] / sys / vfs / hammer / hammer_object.c
 */
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.52 2008/05/03 07:59:06 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71
72         /*
73          * Never match against an item deleted by the front-end.
74          */
75         if (rec1->flags & HAMMER_RECF_DELETED_FE)
76                 return(1);
77         if (rec2->flags & HAMMER_RECF_DELETED_FE)
78                 return(-1);
79
80         return(0);
81 }
82
83 static int
84 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
85 {
86         if (info->rec_type < rec->rec.base.base.rec_type)
87                 return(-3);
88         if (info->rec_type > rec->rec.base.base.rec_type)
89                 return(3);
90
91         if (info->key < rec->rec.base.base.key)
92                 return(-2);
93         if (info->key > rec->rec.base.base.key)
94                 return(2);
95
96         if (info->create_tid == 0) {
97                 if (rec->rec.base.base.create_tid == 0)
98                         return(0);
99                 return(1);
100         }
101         if (rec->rec.base.base.create_tid == 0)
102                 return(-1);
103         if (info->create_tid < rec->rec.base.base.create_tid)
104                 return(-1);
105         if (info->create_tid > rec->rec.base.base.create_tid)
106                 return(1);
107         return(0);
108 }
109
110 /*
111  * RB_SCAN comparison code for hammer_mem_first().  The argument order
112  * is reversed so the comparison result has to be negated.  key_beg and
113  * key_end are both range-inclusive.
114  *
115  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
116  * These do not stop the scan.
117  *
118  * Localized deletions are not cached in-memory.
119  */
120 static
121 int
122 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
123 {
124         hammer_cursor_t cursor = data;
125         int r;
126
127         r = hammer_rec_compare(&cursor->key_beg, rec);
128         if (r > 1)
129                 return(-1);
130         r = hammer_rec_compare(&cursor->key_end, rec);
131         if (r < -1)
132                 return(1);
133         return(0);
134 }
135
136 /*
137  * This compare function is used when simply looking up key_beg.
138  */
139 static
140 int
141 hammer_rec_find_cmp(hammer_record_t rec, void *data)
142 {
143         hammer_cursor_t cursor = data;
144         int r;
145
146         r = hammer_rec_compare(&cursor->key_beg, rec);
147         if (r > 1)
148                 return(-1);
149         if (r < -1)
150                 return(1);
151         return(0);
152 }
153
154 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
155 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
156                     hammer_rec_compare, hammer_base_elm_t);
157
/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced (one ref held via record->lock), zero-filled,
 * associated with inode (ip), and in the HAMMER_FST_IDLE flush state.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip)
{
        hammer_record_t record;

        /* global accounting of live in-memory records */
        ++hammer_count_records;
        record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
        record->flush_state = HAMMER_FST_IDLE;
        record->ip = ip;
        record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        hammer_ref(&record->lock);
        return (record);
}
175
/*
 * Block until the record leaves the HAMMER_FST_FLUSH state.  Setting
 * HAMMER_RECF_WANTED requests a wakeup() from hammer_flush_record_done().
 * The loop re-tests the state after each sleep in case of spurious or
 * unrelated wakeups.
 */
void
hammer_wait_mem_record(hammer_record_t record)
{
        while (record->flush_state == HAMMER_FST_FLUSH) {
                record->flags |= HAMMER_RECF_WANTED;
                tsleep(record, 0, "hmrrc2", 0);
        }
}
184
/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
        hammer_inode_t target_ip;

        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

        if (error) {
                /*
                 * An error occurred, the backend was unable to sync the
                 * record to its media.  Leave the record intact.
                 */
                Debugger("flush_record_done error");
        }

        if (record->flags & HAMMER_RECF_DELETED_BE) {
                /*
                 * The backend deleted the record.  Sever the dependency
                 * on the target inode (re-testing its flush state) and
                 * return the record to the idle state.
                 */
                if ((target_ip = record->target_ip) != NULL) {
                        TAILQ_REMOVE(&target_ip->target_list, record,
                                     target_entry);
                        record->target_ip = NULL;
                        hammer_test_inode(target_ip);
                }
                record->flush_state = HAMMER_FST_IDLE;
        } else {
                /*
                 * The record survives.  If it still has a target inode
                 * dependency it drops back to SETUP, otherwise IDLE.
                 */
                if (record->target_ip)
                        record->flush_state = HAMMER_FST_SETUP;
                else
                        record->flush_state = HAMMER_FST_IDLE;
        }
        record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
        /*
         * Wake up anyone blocked in hammer_wait_mem_record().
         */
        if (record->flags & HAMMER_RECF_WANTED) {
                record->flags &= ~HAMMER_RECF_WANTED;
                wakeup(record);
        }
        /* drop the reference held while in the FLUSH state */
        hammer_rel_mem_record(record);
}
230
231 /*
232  * Release a memory record.  Records marked for deletion are immediately
233  * removed from the RB-Tree but otherwise left intact until the last ref
234  * goes away.
235  */
236 void
237 hammer_rel_mem_record(struct hammer_record *record)
238 {
239         hammer_inode_t ip, target_ip;
240
241         hammer_unref(&record->lock);
242
243         if (record->flags & HAMMER_RECF_DELETED_FE) {
244                 if (record->lock.refs == 0) {
245                         KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
246
247                         ip = record->ip;
248                         if ((target_ip = record->target_ip) != NULL) {
249                                 TAILQ_REMOVE(&target_ip->target_list,
250                                              record, target_entry);
251                                 record->target_ip = NULL;
252                                 hammer_test_inode(target_ip);
253                         }
254
255                         if (record->flags & HAMMER_RECF_ONRBTREE) {
256                                 RB_REMOVE(hammer_rec_rb_tree,
257                                           &record->ip->rec_tree,
258                                           record);
259                                 record->flags &= ~HAMMER_RECF_ONRBTREE;
260                                 if (RB_EMPTY(&record->ip->rec_tree)) {
261                                         record->ip->flags &= ~HAMMER_INODE_XDIRTY;
262                                         hammer_test_inode(record->ip);
263                                 }
264                         }
265                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
266                                 --hammer_count_record_datas;
267                                 kfree(record->data, M_HAMMER);
268                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
269                         }
270                         record->data = NULL;
271                         --hammer_count_records;
272                         kfree(record, M_HAMMER);
273                         return;
274                 }
275         }
276 }
277
278 /*
279  * Record visibility depends on whether the record is being accessed by
280  * the backend or the frontend.
281  *
282  * Return non-zero if the record is visible, zero if it isn't or if it is
283  * deleted.
284  */
285 static __inline
286 int
287 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
288 {
289         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
290                 if (record->flags & HAMMER_RECF_DELETED_BE)
291                         return(0);
292 #if 0
293                 if ((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0)
294                         return(0);
295 #endif
296         } else {
297                 if (record->flags & HAMMER_RECF_DELETED_FE)
298                         return(0);
299         }
300         return(1);
301 }
302
/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
        hammer_cursor_t cursor = data;

        /*
         * We terminate on success, so this should be NULL on entry.
         */
        KKASSERT(cursor->iprec == NULL);

        /*
         * Skip if the record was marked deleted (by whichever side of
         * the frontend/backend split this cursor belongs to).
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
                return(0);

        /*
         * Skip if not visible due to our as-of TID: records created
         * after the as-of point, or deleted at or before it, are
         * invisible to this cursor.
         */
        if (cursor->flags & HAMMER_CURSOR_ASOF) {
                if (cursor->asof < rec->rec.base.base.create_tid)
                        return(0);
                if (rec->rec.base.base.delete_tid &&
                    cursor->asof >= rec->rec.base.base.delete_tid) {
                        return(0);
                }
        }

        /*
         * If the record is queued to the flusher we have to block until
         * it isn't.  Otherwise we may see duplication between our memory
         * cache and the media.
         *
         * (The wait is currently disabled below because it deadlocks --
         * see the #warning.)
         */
        hammer_ref(&rec->lock);

#warning "This deadlocks"
#if 0
        if (rec->flush_state == HAMMER_FST_FLUSH)
                hammer_wait_mem_record(rec);
#endif

        /*
         * The record may have been deleted while we were blocked.
         */
        if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
                hammer_rel_mem_record(rec);
                return(0);
        }

        /*
         * Set the matching record (still referenced) and stop the scan.
         */
        cursor->iprec = rec;
        return(-1);
}
368
369
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 *
 * Returns 0 with cursor->iprec referencing the matching record, or
 * ENOENT if no visible in-memory record matched.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
        int error;

        /*
         * Release any record held over from a prior use of the cursor.
         */
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }
        /*
         * Retarget the cursor from its previous inode (if any) to ip,
         * maintaining the per-inode cursor_ip_refs accounting.
         */
        if (cursor->ip) {
                KKASSERT(cursor->ip->cursor_ip_refs > 0);
                --cursor->ip->cursor_ip_refs;
#if 0
                hammer_rec_rb_tree_scan_info_done(&cursor->scan,
                                                  &cursor->ip->rec_tree);
#endif
        }
        cursor->ip = ip;
#if 0
        hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
        ++ip->cursor_ip_refs;

#if 0
        cursor->scan.node = NULL;
#endif
        /*
         * Scan the inode's RB-tree for key_beg; the callback stores the
         * first visible match (referenced) in cursor->iprec.
         */
        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
                                   hammer_rec_scan_callback, cursor);

        if (cursor->iprec == NULL)
                error = ENOENT;
        else
                error = 0;
        return(error);
}
413
/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range [key_beg, key_end].
 *
 * Returns 0 with the record referenced in cursor->iprec, or ENOENT if
 * no visible record falls inside the range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor)
{
        hammer_inode_t ip;

        ip = cursor->ip;
        KKASSERT(ip != NULL);

        /*
         * Release any record left over from a prior iteration.
         */
        if (cursor->iprec) {
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
                                   hammer_rec_scan_callback, cursor);

        /*
         * Adjust scan.node and keep it linked into the RB-tree so we can
         * hold the cursor through third party modifications of the RB-tree.
         * (NOTE(review): the scan.node linkage appears disabled elsewhere
         * via #if 0 -- this comment may be historical.)
         */
        if (cursor->iprec)
                return(0);
        return(ENOENT);
}
443
444 void
445 hammer_mem_done(hammer_cursor_t cursor)
446 {
447         if (cursor->iprec) {
448                 hammer_rel_mem_record(cursor->iprec);
449                 cursor->iprec = NULL;
450         }
451 }
452
453 /************************************************************************
454  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
455  ************************************************************************
456  *
457  * These functions manipulate in-memory records.  Such records typically
458  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
459  */
460
/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * Bumps ip's link count and binds the new record to ip as a flush
 * dependency.  Returns 0 on success or an errno from hammer_mem_add().
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
                     struct hammer_inode *dip, struct hammer_inode *ncp,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        int error;
        int bytes;

        record = hammer_alloc_mem_record(dip);

        bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
        /*
         * Advance the namekey iterator, skipping 0 (see the note above
         * about all-0 low bits).
         */
        if (++trans->hmp->namekey_iterator == 0)
                ++trans->hmp->namekey_iterator;

        /*
         * Fill in the directory-entry record: keyed under the directory
         * inode (dip), with the namekey plus the iterator in the key,
         * referencing the target inode's obj_id.  record->data points at
         * the name; hammer_mem_add() copies it (see NOTE below).
         */
        record->type = HAMMER_MEM_RECORD_ADD;
        record->rec.entry.base.base.obj_id = dip->obj_id;
        record->rec.entry.base.base.key =
                hammer_directory_namekey(ncp->nc_name, bytes);
        record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
        record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
        record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
        record->rec.entry.obj_id = ip->obj_id;
        record->data = (void *)ncp->nc_name;
        record->rec.entry.base.data_len = bytes;
        ++ip->ino_rec.ino_nlinks;
        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

        /*
         * The target inode and the directory entry are bound together.
         */
        record->target_ip = ip;
        record->flush_state = HAMMER_FST_SETUP;
        TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

        /*
         * The inode now has a dependency and must be taken out of the idle
         * state.  An inode not in an idle state is given an extra reference.
         */
        if (ip->flush_state == HAMMER_FST_IDLE) {
                hammer_ref(&ip->lock);
                ip->flush_state = HAMMER_FST_SETUP;
        }

        /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
}
517
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
                     hammer_cursor_t cursor, struct hammer_inode *dip,
                     struct hammer_inode *ip)
{
        hammer_record_t record;
        int error;

        if (cursor->record == &cursor->iprec->rec) {
                /*
                 * In-memory (unsynchronized) records can simply be freed.
                 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
                 * by the backend, we must still avoid races against the
                 * backend potentially syncing the record to the media. 
                 *
                 * We cannot call hammer_ip_delete_record(), that routine may
                 * only be called from the backend.
                 */
                record = cursor->iprec;
                if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
                        /*
                         * The backend is syncing this record right now.
                         * Hand it back (referenced) via deadlk_rec so the
                         * caller can wait on it and retry.
                         */
                        KKASSERT(cursor->deadlk_rec == NULL);
                        hammer_ref(&record->lock);
                        cursor->deadlk_rec = record;
                        error = EDEADLK;
                } else {
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        error = 0;
                }
        } else {
                /*
                 * If the record is on-disk we have to queue the deletion by
                 * the record's key.  This also causes lookups to skip the
                 * record.
                 */
                record = hammer_alloc_mem_record(dip);
                record->type = HAMMER_MEM_RECORD_DEL;
                record->rec.entry.base.base = cursor->record->base.base;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

                /*
                 * Bind the deletion record to the target inode as a
                 * flush dependency, as in hammer_ip_add_directory().
                 */
                record->target_ip = ip;
                record->flush_state = HAMMER_FST_SETUP;
                TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

                /*
                 * The inode now has a dependency and must be taken out of
                 * the idle state.  An inode not in an idle state is given
                 * an extra reference.
                 */
                if (ip->flush_state == HAMMER_FST_IDLE) {
                        hammer_ref(&ip->lock);
                        ip->flush_state = HAMMER_FST_SETUP;
                }

                error = hammer_mem_add(trans, record);
        }

        /*
         * One less link.  The file may still be open in the OS even after
         * all links have gone away.
         *
         * We have to terminate the cursor before syncing the inode to
         * avoid deadlocking against ourselves.  XXX this may no longer
         * be true.
         *
         * If nlinks drops to zero and the vnode is inactive (or there is
         * no vnode), call hammer_inode_unloadable_check() to zonk the
         * inode.  If we don't do this here the inode will not be destroyed
         * on-media until we unmount.
         */
        if (error == 0) {
                --ip->ino_rec.ino_nlinks;
                hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
                if (ip->ino_rec.ino_nlinks == 0 &&
                    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
                        hammer_done_cursor(cursor);
                        hammer_inode_unloadable_check(ip);
                        hammer_flush_inode(ip, 0);
                }

        }
        return(error);
}
611
/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data         (a copy will be kmalloc'd if it cannot be embedded)
 *
 * Returns 0 on success or an errno from hammer_mem_add().
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
        hammer_inode_t ip = record->ip;
        int error;

        /*
         * Fill in the fields derived from the owning inode; the caller
         * supplied key, rec_type, data_len and data (see above).
         */
        record->rec.base.base.obj_id = ip->obj_id;
        record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;

        hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

        /* NOTE: copies record->data */
        error = hammer_mem_add(trans, record);
        return(error);
}
641
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 *
 * The file offset must be HAMMER_BUFSIZE aligned but the data length
 * can be truncated.  The record (currently) always represents a BUFSIZE
 * swath of space whether the data is truncated or not.
 */
int
hammer_ip_sync_data(hammer_cursor_t cursor, hammer_inode_t ip,
                       int64_t offset, void *data, int bytes)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
        hammer_off_t rec_offset;
        void *bdata;
        int error;

        KKASSERT((offset & HAMMER_BUFMASK) == 0);
        KKASSERT(trans->type == HAMMER_TRANS_FLS);
        KKASSERT(bytes != 0);
retry:
        /*
         * Position the cursor for a backend insertion at the current
         * transaction TID.  Note that the record's key is the END of
         * the data swath (offset + bytes).
         */
        hammer_normalize_cursor(cursor);
        cursor->key_beg.obj_id = ip->obj_id;
        cursor->key_beg.key = offset + bytes;
        cursor->key_beg.create_tid = trans->tid;
        cursor->key_beg.delete_tid = 0;
        cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
        cursor->asof = trans->tid;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_INSERT;
        cursor->flags |= HAMMER_CURSOR_BACKEND;

        /*
         * Issue a lookup to position the cursor.  A successful lookup
         * (error == 0) means a duplicate data record already exists,
         * which is fatal; only ENOENT allows us to proceed.
         */
        error = hammer_btree_lookup(cursor);
        if (error == 0) {
                kprintf("hammer_ip_sync_data: duplicate data at "
                        "(%lld,%d) tid %016llx\n",
                        offset, bytes, trans->tid);
                hammer_print_btree_elm(&cursor->node->ondisk->
                                        elms[cursor->index],
                                       HAMMER_BTREE_TYPE_LEAF, cursor->index);
                panic("Duplicate data");
                error = EIO;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Allocate record and data space.  HAMMER_RECTYPE_DATA records
         * can cross buffer boundaries so we may have to split our bcopy.
         */
        rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
                                  &cursor->record_buffer,
                                  bytes, &bdata,
                                  &cursor->data_buffer, &error);
        if (rec == NULL)
                goto done;
        if (hammer_debug_general & 0x1000)
                kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);

        /*
         * Fill everything in and insert our B-Tree node.
         *
         * NOTE: hammer_alloc_record() has already marked the related
         * buffers as modified.  If we do it again we will generate
         * unnecessary undo elements.
         */
        hammer_modify_buffer(trans, cursor->record_buffer, NULL, 0);
        rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
        rec->base.base.obj_id = ip->obj_id;
        rec->base.base.key = offset + bytes;
        rec->base.base.create_tid = trans->tid;
        rec->base.base.delete_tid = 0;
        rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
        rec->base.data_crc = crc32(data, bytes);
        hammer_modify_buffer_done(cursor->record_buffer);
        KKASSERT(rec->base.data_len == bytes);

        /*
         * Copy the payload into the allocated data space.
         */
        hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
        bcopy(data, bdata, bytes);
        hammer_modify_buffer_done(cursor->data_buffer);

        /*
         * Build the B-Tree leaf element referencing the new record
         * and data.
         */
        elm.leaf.base = rec->base.base;
        elm.leaf.rec_offset = rec_offset;
        elm.leaf.data_offset = rec->base.data_off;
        elm.leaf.data_len = bytes;
        elm.leaf.data_crc = rec->base.data_crc;

        /*
         * Data records can wind up on-disk before the inode itself is
         * on-disk.  One must assume data records may be on-disk if either
         * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
         */
        ip->flags |= HAMMER_INODE_DONDISK;

        error = hammer_btree_insert(cursor, &elm);
        if (error == 0)
                goto done;

        /*
         * The B-Tree insertion failed; release the record/data space
         * allocated above.
         */
        hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
done:
        /*
         * On a deadlock, rebuild the cursor from scratch and retry the
         * entire operation.
         */
        if (error == EDEADLK) {
                hammer_done_cursor(cursor);
                error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
                if (error == 0)
                        goto retry;
        }
        return(error);
}
758
759 /*
760  * Sync an in-memory record to the disk.  This is called by the backend.
761  * This code is responsible for actually writing a record out to the disk.
762  *
763  * This routine can only be called by the backend and the record
764  * must have been interlocked with BE.  It will remain interlocked on
765  * return.  If no error occurs the record will be marked deleted but
766  * the caller is responsible for its final disposition.
767  *
768  * Multiple calls may be aggregated with the same cursor using
769  * hammer_ip_sync_record_cursor().  The caller must handle EDEADLK
770  * in that case.
771  */
772 int
773 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
774 {
775         struct hammer_cursor cursor;
776         int error;
777
778         do {
779                 error = hammer_init_cursor(trans, &cursor,
780                                            &record->ip->cache[0], record->ip);
781                 if (error)
782                         return(error);
783                 error = hammer_ip_sync_record_cursor(&cursor, record);
784                 hammer_done_cursor(&cursor);
785         } while (error == EDEADLK);
786         return (error);
787 }
788
/*
 * Sync an in-memory record to the media using the caller's cursor.  This
 * is the backend flush path: the record must be in HAMMER_FST_FLUSH state
 * and interlocked against the frontend (HAMMER_RECF_INTERLOCK_BE).
 *
 * Returns 0 on success, EDEADLK if the caller must terminate the cursor
 * and retry, or another error code.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_ondisk_t rec;
        union hammer_btree_elm elm;
        hammer_off_t rec_offset;
        void *bdata;
        int error;

        KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
        KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

        /*
         * Position the cursor at the record's key and force backend
         * semantics (the INITMASK bits are cleared so stale frontend
         * cursor state cannot leak through).
         */
        hammer_normalize_cursor(cursor);
        cursor->key_beg = record->rec.base.base;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_BACKEND;
        cursor->flags &= ~HAMMER_CURSOR_INSERT;

        /*
         * If we are deleting, an exact match must be found on-disk.
         */
        if (record->type == HAMMER_MEM_RECORD_DEL) {
                error = hammer_btree_lookup(cursor);
                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, trans->tid);
                        if (error == 0) {
                                /*
                                 * The in-memory DEL record has served its
                                 * purpose and can itself be destroyed.
                                 */
                                record->flags |= HAMMER_RECF_DELETED_FE;
                                record->flags |= HAMMER_RECF_DELETED_BE;
                        }
                }
                goto done;
        }

        /*
         * We are inserting.
         *
         * Issue a lookup to position the cursor and locate the cluster.  The
         * target key should not exist.  If we are creating a directory entry
         * we may have to iterate the low 32 bits of the key to find an unused
         * key.
         */
        cursor->flags |= HAMMER_CURSOR_INSERT;

        for (;;) {
                error = hammer_btree_lookup(cursor);
                if (error)
                        break;
                if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
                        /* non-directory collision is a hard error */
                        kprintf("hammer_ip_sync_record: duplicate rec "
                                "at (%016llx)\n", record->rec.base.base.key);
                        Debugger("duplicate record1");
                        error = EIO;
                        break;
                }
                /* 0 is reserved; skip it when the iterator wraps */
                if (++trans->hmp->namekey_iterator == 0)
                        ++trans->hmp->namekey_iterator;
                record->rec.base.base.key &= ~(0xFFFFFFFFLL);
                record->rec.base.base.key |= trans->hmp->namekey_iterator;
                cursor->key_beg.key = record->rec.base.base.key;
        }
        if (error != ENOENT)
                goto done;

        /*
         * Allocate the record and data.  The result buffers will be
         * marked as being modified and further calls to
         * hammer_modify_buffer() will result in unneeded UNDO records.
         *
         * Support zero-fill records (data == NULL and data_len != 0)
         */
        if (record->data == NULL) {
                rec = hammer_alloc_record(trans, &rec_offset,
                                          record->rec.base.base.rec_type,
                                          &cursor->record_buffer,
                                          0, &bdata,
                                          NULL, &error);
                if (hammer_debug_general & 0x1000)
                        kprintf("NULL RECORD DATA\n");
        } else if (record->flags & HAMMER_RECF_INBAND) {
                /* data small enough to live inside the record itself */
                rec = hammer_alloc_record(trans, &rec_offset,
                                          record->rec.base.base.rec_type,
                                          &cursor->record_buffer,
                                          record->rec.base.data_len, &bdata,
                                          NULL, &error);
                if (hammer_debug_general & 0x1000)
                        kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
        } else {
                /* out-of-band data goes into a separate data buffer */
                rec = hammer_alloc_record(trans, &rec_offset,
                                          record->rec.base.base.rec_type,
                                          &cursor->record_buffer,
                                          record->rec.base.data_len, &bdata,
                                          &cursor->data_buffer, &error);
                if (hammer_debug_general & 0x1000)
                        kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
        }

        if (rec == NULL)
                goto done;

        /*
         * Fill in the remaining fields and insert our B-Tree node.
         */
        hammer_modify_buffer(trans, cursor->record_buffer, NULL, 0);
        rec->base.base = record->rec.base.base;
        bcopy(&record->rec.base + 1, &rec->base + 1,
              HAMMER_RECORD_SIZE - sizeof(record->rec.base));

        /*
         * Copy the data and deal with zero-fill support.
         */
        if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
                rec->base.data_crc = crc32(record->data, rec->base.data_len);
                bcopy(record->data, bdata, rec->base.data_len);
        } else if (record->data) {
                rec->base.data_crc = crc32(record->data, rec->base.data_len);
                hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
                bcopy(record->data, bdata, rec->base.data_len);
                hammer_modify_buffer_done(cursor->data_buffer);
        } else {
                /* zero-fill record: length only, no backing data copied */
                rec->base.data_len = record->rec.base.data_len;
        }
        hammer_modify_buffer_done(cursor->record_buffer);

        elm.leaf.base = record->rec.base.base;
        elm.leaf.rec_offset = rec_offset;
        elm.leaf.data_offset = rec->base.data_off;
        elm.leaf.data_len = rec->base.data_len;
        elm.leaf.data_crc = rec->base.data_crc;

        error = hammer_btree_insert(cursor, &elm);

        /*
         * This occurs when the frontend creates a record and queues it to
         * the backend, then tries to delete the record.  The backend must
         * still sync the record to the media as if it were not deleted,
         * but must interlock with the frontend to ensure that the 
         * synchronized record is not visible to the frontend, which means
         * converting it from an ADD record to a DEL record.
         *
         * The DEL record then masks the record synced to disk until another
         * round can delete it for real.
         */
        if (error == 0) {
                if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
                        KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
                        record->flags &= ~HAMMER_RECF_DELETED_FE;
                        record->type = HAMMER_MEM_RECORD_DEL;
                        if (record->flush_state == HAMMER_FST_SETUP) {
                                hammer_test_inode(record->ip);
                                hammer_test_inode(record->target_ip);
                        }
                        record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
                } else {
                        /*
                         * Normal case: the record is now on-media, destroy
                         * the in-memory copy.
                         */
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        record->flags |= HAMMER_RECF_DELETED_BE;
                }
        } else {
                /* insert failed, return the allocated record space */
                hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
                /* XXX free data buffer? */
        }

done:
        return(error);
}
954
955 /*
956  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
957  * entry's key is used to deal with hash collisions in the upper 32 bits.
958  * A unique 64 bit key is generated in-memory and may be regenerated a
959  * second time when the directory record is flushed to the on-disk B-Tree.
960  *
961  * A referenced record is passed to this function.  This function
962  * eats the reference.  If an error occurs the record will be deleted.
963  *
964  * A copy of the temporary record->data pointer provided by the caller
965  * will be made.
966  */
967 static
968 int
969 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
970 {
971         void *data;
972         int bytes;
973         int reclen;
974                 
975         /*
976          * Make a private copy of record->data
977          */
978         if (record->data) {
979                 /*
980                  * Try to embed the data in extra space in the record
981                  * union, otherwise allocate a copy.
982                  */
983                 bytes = record->rec.base.data_len;
984                 switch(record->rec.base.base.rec_type) {
985                 case HAMMER_RECTYPE_DIRENTRY:
986                         reclen = offsetof(struct hammer_entry_record, name[0]);
987                         break;
988                 case HAMMER_RECTYPE_DATA:
989                         reclen = offsetof(struct hammer_data_record, data[0]);
990                         break;
991                 default:
992                         reclen = sizeof(record->rec);
993                         break;
994                 }
995                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
996                         bcopy(record->data, (char *)&record->rec + reclen,
997                               bytes);
998                         record->data = (void *)((char *)&record->rec + reclen);
999                         record->flags |= HAMMER_RECF_INBAND;
1000                 } else {
1001                         ++hammer_count_record_datas;
1002                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
1003                         record->flags |= HAMMER_RECF_ALLOCDATA;
1004                         bcopy(record->data, data, bytes);
1005                         record->data = data;
1006                 }
1007         }
1008
1009         /*
1010          * Insert into the RB tree, find an unused iterator if this is
1011          * a directory entry.
1012          */
1013         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
1014                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
1015                         record->flags |= HAMMER_RECF_DELETED_FE;
1016                         hammer_rel_mem_record(record);
1017                         return (EEXIST);
1018                 }
1019                 if (++trans->hmp->namekey_iterator == 0)
1020                         ++trans->hmp->namekey_iterator;
1021                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
1022                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
1023         }
1024         record->flags |= HAMMER_RECF_ONRBTREE;
1025         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
1026         hammer_rel_mem_record(record);
1027         return(0);
1028 }
1029
1030 /************************************************************************
1031  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
1032  ************************************************************************
1033  *
1034  * These functions augment the B-Tree scanning functions in hammer_btree.c
1035  * by merging in-memory records with on-disk records.
1036  */
1037
1038 /*
1039  * Locate a particular record either in-memory or on-disk.
1040  *
1041  * NOTE: This is basically a standalone routine, hammer_ip_next() may
1042  * NOT be called to iterate results.
1043  */
1044 int
1045 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
1046 {
1047         int error;
1048
1049         /*
1050          * If the element is in-memory return it without searching the
1051          * on-disk B-Tree
1052          */
1053         error = hammer_mem_lookup(cursor, ip);
1054         if (error == 0) {
1055                 cursor->record = &cursor->iprec->rec;
1056                 return(error);
1057         }
1058         if (error != ENOENT)
1059                 return(error);
1060
1061         /*
1062          * If the inode has on-disk components search the on-disk B-Tree.
1063          */
1064         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
1065                 return(error);
1066         error = hammer_btree_lookup(cursor);
1067         if (error == 0)
1068                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1069         return(error);
1070 }
1071
1072 /*
1073  * Locate the first record within the cursor's key_beg/key_end range,
1074  * restricted to a particular inode.  0 is returned on success, ENOENT
1075  * if no records matched the requested range, or some other error.
1076  *
1077  * When 0 is returned hammer_ip_next() may be used to iterate additional
1078  * records within the requested range.
1079  *
1080  * This function can return EDEADLK, requiring the caller to terminate
1081  * the cursor and try again.
1082  */
int
hammer_ip_first(hammer_cursor_t cursor)
{
        hammer_inode_t ip = cursor->ip;
        int error;

        KKASSERT(ip != NULL);

        /*
         * Clean up fields and setup for merged scan.  Both sources start
         * out "eaten" and at EOF; the lookups below clear the appropriate
         * bits for whichever sources actually have records.
         */
        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
        cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
        cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
        if (cursor->iprec) {
                /* drop any in-memory record held from a prior scan */
                hammer_rel_mem_record(cursor->iprec);
                cursor->iprec = NULL;
        }

        /*
         * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
         * exact lookup so if we get ENOENT we have to call the iterate
         * function to validate the first record after the begin key.
         *
         * The ATEDISK flag is used by hammer_btree_iterate to determine
         * whether it must index forwards or not.  It is also used here
         * to select the next record from in-memory or on-disk.
         *
         * EDEADLK can only occur if the lookup hit an empty internal
         * element and couldn't delete it.  Since this could only occur
         * in-range, we can just iterate from the failure point.
         */
        if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
                error = hammer_btree_lookup(cursor);
                if (error == ENOENT || error == EDEADLK) {
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        if (hammer_debug_general & 0x2000)
                                kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
                        error = hammer_btree_iterate(cursor);
                }
                if (error && error != ENOENT) 
                        return(error);
                if (error == 0) {
                        /* disk has at least one in-range record */
                        cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                } else {
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                }
        }

        /*
         * Search the in-memory record list (Red-Black tree).  Unlike the
         * B-Tree search, mem_first checks for records in the range.
         */
        error = hammer_mem_first(cursor);
        if (error && error != ENOENT)
                return(error);
        if (error == 0) {
                cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
                /* a stale memory record is treated as already-consumed */
                if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
                        cursor->flags |= HAMMER_CURSOR_ATEMEM;
        }

        /*
         * This will return the first matching record.
         */
        return(hammer_ip_next(cursor));
}
1152
1153 /*
1154  * Retrieve the next record in a merged iteration within the bounds of the
1155  * cursor.  This call may be made multiple times after the cursor has been
1156  * initially searched with hammer_ip_first().
1157  *
1158  * 0 is returned on success, ENOENT if no further records match the
1159  * requested range, or some other error code is returned.
1160  */
int
hammer_ip_next(hammer_cursor_t cursor)
{
        hammer_btree_elm_t elm;
        hammer_record_t rec, save;
        int error;
        int r;

next_btree:
        /*
         * Load the current on-disk and in-memory record.  If we ate any
         * records we have to get the next one. 
         *
         * If we deleted the last on-disk record we had scanned ATEDISK will
         * be clear and DELBTREE will be set, forcing a call to iterate. The
         * fact that ATEDISK is clear causes iterate to re-test the 'current'
         * element.  If ATEDISK is set, iterate will skip the 'current'
         * element.
         *
         * Get the next on-disk record
         */
        if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        error = hammer_btree_iterate(cursor);
                        cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
                        if (error == 0)
                                cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                        else
                                cursor->flags |= HAMMER_CURSOR_DISKEOF |
                                                 HAMMER_CURSOR_ATEDISK;
                }
        }

next_memory:
        /*
         * Get the next in-memory record.  The record can be ripped out
         * of the RB tree so we maintain a scan_info structure to track
         * the next node.
         *
         * hammer_rec_scan_cmp:  Is the record still in our general range,
         *                       (non-inclusive of snapshot exclusions)?
         * hammer_rec_scan_callback: Is the record in our snapshot?
         */
        if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
                if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
                        save = cursor->iprec;
                        cursor->iprec = NULL;
                        rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
                        while (rec) {
                                /* stop once we walk out of the key range */
                                if (hammer_rec_scan_cmp(rec, cursor) != 0)
                                        break;
                                /* callback accepts the record by setting
                                 * cursor->iprec and returning non-zero */
                                if (hammer_rec_scan_callback(rec, cursor) != 0)
                                        break;
                                rec = hammer_rec_rb_tree_RB_NEXT(rec);
                        }
                        if (save)
                                hammer_rel_mem_record(save);
                        if (cursor->iprec) {
                                KKASSERT(cursor->iprec == rec);
                                cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
                        } else {
                                cursor->flags |= HAMMER_CURSOR_MEMEOF;
                        }
                }
        }

        /*
         * The memory record may have become stale while being held in
         * cursor->iprec.  We are interlocked against the backend with
         * regards to B-Tree entries.
         */
        if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) {
                if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) {
                        cursor->flags |= HAMMER_CURSOR_ATEMEM;
                        goto next_memory;
                }
        }

        /*
         * Extract either the disk or memory record depending on their
         * relative position.
         */
        error = 0;
        switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
        case 0:
                /*
                 * Both entries valid.   Return the btree entry if it is
                 * in front of the memory entry.
                 */
                elm = &cursor->node->ondisk->elms[cursor->index];
                r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
                if (r < 0) {
                        error = hammer_btree_extract(cursor,
                                                     HAMMER_CURSOR_GET_RECORD);
                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                        break;
                }

                /*
                 * If the entries match exactly the memory entry typically
                 * specifies an on-disk deletion and we eat both entries.
                 *
                 * If the in-memory record is not an on-disk deletion we
                 * probably caught the syncer while it was syncing it to
                 * the media.  Since we hold a shared lock on the cursor,
                 * the in-memory record had better be marked deleted at
                 * this point.
                 */
                if (r == 0) {
                        if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
                                if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
                                        /* eat both and restart the scan */
                                        cursor->flags |= HAMMER_CURSOR_ATEDISK;
                                        cursor->flags |= HAMMER_CURSOR_ATEMEM;
                                        goto next_btree;
                                }
                        } else {
                                panic("hammer_ip_next: duplicate mem/b-tree entry");
                                /* not reached; defensive recovery path */
                                cursor->flags |= HAMMER_CURSOR_ATEMEM;
                                goto next_memory;
                        }
                }
                /* fall through to the memory entry */
        case HAMMER_CURSOR_ATEDISK:
                /*
                 * Only the memory entry is valid.
                 */
                cursor->record = &cursor->iprec->rec;
                cursor->flags |= HAMMER_CURSOR_ATEMEM;

                /*
                 * If the memory entry is an on-disk deletion we should have
                 * also had found a B-Tree record.  If the backend beat us
                 * to it it would have interlocked the cursor and we should
                 * have seen the in-memory record marked DELETED_FE.
                 */
                if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
                        panic("hammer_ip_next: del-on-disk with no b-tree entry");
                }
                break;
        case HAMMER_CURSOR_ATEMEM:
                /*
                 * Only the disk entry is valid
                 */
                error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
                cursor->flags |= HAMMER_CURSOR_ATEDISK;
                break;
        default:
                /*
                 * Neither entry is valid
                 *
                 * XXX error not set properly
                 */
                cursor->record = NULL;
                error = ENOENT;
                break;
        }
        return(error);
}
1319
1320 /*
1321  * Resolve the cursor->data pointer for the current cursor position in
1322  * a merged iteration.
1323  */
1324 int
1325 hammer_ip_resolve_data(hammer_cursor_t cursor)
1326 {
1327         int error;
1328
1329         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1330                 cursor->data = cursor->iprec->data;
1331                 error = 0;
1332         } else {
1333                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1334         }
1335         return(error);
1336 }
1337
1338 int
1339 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1340 {
1341         int error;
1342
1343         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1344                 cursor->data = cursor->iprec->data;
1345                 error = 0;
1346         } else {
1347                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1348                                                      HAMMER_CURSOR_GET_RECORD);
1349         }
1350         return(error);
1351 }
1352
1353 /*
1354  * Delete all records within the specified range for inode ip.
1355  *
1356  * NOTE: An unaligned range will cause new records to be added to cover
1357  * the edge cases. (XXX not implemented yet).
1358  *
1359  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1360  *
1361  * NOTE: Record keys for regular file data have to be special-cased since
1362  * they indicate the end of the range (key = base + bytes).
1363  */
int
hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip,
                       int64_t ran_beg, int64_t ran_end)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_ondisk_t rec;
        hammer_base_elm_t base;
        int error;
        int64_t off;

#if 0
        kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
#endif

        KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
        /*
         * (Re)initialize the cursor for a backend as-of scan over the
         * requested range.  We come back here after an EDEADLK forces
         * the cursor to be rebuilt.
         */
        hammer_normalize_cursor(cursor);
        cursor->key_beg.obj_id = ip->obj_id;
        cursor->key_beg.create_tid = 0;
        cursor->key_beg.delete_tid = 0;
        cursor->key_beg.obj_type = 0;
        cursor->asof = ip->obj_asof;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_ASOF;
        cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
        cursor->flags |= HAMMER_CURSOR_BACKEND;

        cursor->key_end = cursor->key_beg;
        if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
                /* DB files key records directly by ran_beg/ran_end */
                cursor->key_beg.key = ran_beg;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_DB;
                cursor->key_end.rec_type = HAMMER_RECTYPE_DB;
                cursor->key_end.key = ran_end;
        } else {
                /*
                 * The key in the B-Tree is (base+bytes), so the first possible
                 * matching key is ran_beg + 1.
                 */
                int64_t tmp64;

                cursor->key_beg.key = ran_beg + 1;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA;
                cursor->key_end.rec_type = HAMMER_RECTYPE_DATA;

                tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
                if (tmp64 < ran_end)
                        cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL;
                else
                        cursor->key_end.key = ran_end + MAXPHYS + 1;
        }
        cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE;

        error = hammer_ip_first(cursor);

        /*
         * Iterate through matching records and mark them as deleted.
         */
        while (error == 0) {
                rec = cursor->record;
                base = &rec->base.base;

                KKASSERT(base->delete_tid == 0);

                /*
                 * There may be overlap cases for regular file data.  Also
                 * remember the key for a regular file record is the offset
                 * of the last byte of the record (base + len - 1), NOT the
                 * base offset.
                 */
#if 0
                kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
                if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
                        kprintf("delete_range loop key %016llx,%d\n",
                                base->key - rec->base.data_len, rec->base.data_len);
#endif
                        /* off is the inclusive left edge of this record */
                        off = base->key - rec->base.data_len;
                        /*
                         * Check the left edge case.  We currently do not
                         * split existing records.
                         */
                        if (off < ran_beg) {
                                panic("hammer left edge case %016llx %d\n",
                                        base->key, rec->base.data_len);
                        }

                        /*
                         * Check the right edge case.  Note that the
                         * record can be completely out of bounds, which
                         * terminates the search.
                         *
                         * base->key is exclusive of the right edge while
                         * ran_end is inclusive of the right edge.  The
                         * (key - data_len) left boundary is inclusive.
                         *
                         * XXX theory-check this test at some point, are
                         * we missing a + 1 somewhere?  Note that ran_end
                         * could overflow.
                         */
                        if (base->key - 1 > ran_end) {
                                if (base->key - rec->base.data_len > ran_end)
                                        break;
                                panic("hammer right edge case\n");
                        }
                }

                /*
                 * Mark the record and B-Tree entry as deleted.  This will
                 * also physically delete the B-Tree entry, record, and
                 * data if the retention policy dictates.  The function
                 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
                 * uses to perform a fixup.
                 */
                error = hammer_ip_delete_record(cursor, trans->tid);
                if (error)
                        break;
                error = hammer_ip_next(cursor);
        }
        /* rebuild the cursor and restart the whole scan on deadlock */
        if (error == EDEADLK) {
                hammer_done_cursor(cursor);
                error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
                if (error == 0)
                        goto retry;
        }
        /* exhausting the range is success, not an error */
        if (error == ENOENT)
                error = 0;
        return(error);
}
1493
1494 /*
1495  * Delete all user records associated with an inode except the inode record
1496  * itself.  Directory entries are not deleted (they must be properly disposed
1497  * of or nlinks would get upset).
1498  */
int
hammer_ip_delete_range_all(hammer_cursor_t cursor, hammer_inode_t ip,
			   int *countp)
{
	hammer_transaction_t trans = cursor->trans;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;

	/*
	 * Bulk deletion of an inode's records is strictly a backend
	 * (flusher) operation.
	 */
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	/*
	 * Key range: every record type past the inode record itself
	 * (HAMMER_RECTYPE_INODE + 1 through 0xFFFF) across the entire
	 * key space, as-of the inode's snapshot TID.
	 */
	hammer_normalize_cursor(cursor);
	cursor->key_beg.obj_id = ip->obj_id;
	cursor->key_beg.create_tid = 0;
	cursor->key_beg.delete_tid = 0;
	cursor->key_beg.obj_type = 0;
	cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor->key_beg.key = HAMMER_MIN_KEY;

	cursor->key_end = cursor->key_beg;
	cursor->key_end.rec_type = 0xFFFF;
	cursor->key_end.key = HAMMER_MAX_KEY;

	cursor->asof = ip->obj_asof;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
	cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor->flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_ip_first(cursor);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor->record;
		base = &rec->base.base;

		/* the iteration should never present an already-deleted record */
		KKASSERT(base->delete_tid == 0);

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 *
		 * Directory entries (and delete-on-disk directory entries)
		 * must be synced and cannot be deleted.
		 *
		 * NOTE(review): *countp is incremented even when
		 * hammer_ip_delete_record() fails — confirm the callers
		 * only consult the count on success.
		 */
		if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
			error = hammer_ip_delete_record(cursor, trans->tid);
			++*countp;
		}
		if (error)
			break;
		error = hammer_ip_next(cursor);
	}

	/*
	 * A deadlock forces us to tear down the cursor, rebuild it, and
	 * restart the scan from the top.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(cursor);
		error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
		if (error == 0)
			goto retry;
	}
	/* running off the end of the range is normal termination */
	if (error == ENOENT)
		error = 0;
	return(error);
}
1567
1568 /*
1569  * Delete the record at the current cursor.  On success the cursor will
1570  * be positioned appropriately for an iteration but may no longer be at
1571  * a leaf node.
1572  *
1573  * This routine is only called from the backend.
1574  *
1575  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1576  * cursor and retry.
1577  */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error;
	int dodelete;

	/* record deletion is a backend-only operation */
	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);

	/*
	 * In-memory (unsynchronized) records can simply be freed.  This
	 * only occurs in range iterations since all other records are
	 * individually synchronized.  Thus there should be no confusion with
	 * the interlock.
	 */
	if (cursor->record == &cursor->iprec->rec) {
		KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0);
		cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
		cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
	 * on their create_tid).
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	elm = NULL;
	hmp = cursor->node->hmp;

	dodelete = 0;
	if (error == 0) {
		error = hammer_cursor_upgrade(cursor);
		if (error == 0) {
			/*
			 * Stamp delete_tid into the B-Tree element; the
			 * modify/modify_done pair brackets the in-place
			 * update of the node.
			 */
			elm = &cursor->node->ondisk->elms[cursor->index];
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->leaf.base.delete_tid,
					   sizeof(elm->leaf.base.delete_tid));
			elm->leaf.base.delete_tid = tid;
			hammer_modify_node_done(cursor->node);

			/*
			 * An on-disk record cannot have the same delete_tid
			 * as its create_tid.  In a chain of record updates
			 * this could result in a duplicate record.
			 */
			KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
			/* mirror the delete_tid into the on-disk record itself */
			hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
			cursor->record->base.base.delete_tid = tid;
			hammer_modify_buffer_done(cursor->record_buffer);
		}
	}

	/*
	 * If we were mounted with the nohistory option, we physically
	 * delete the record.
	 */
	if (hmp->hflags & HMNT_NOHISTORY)
		dodelete = 1;

	if (error == 0 && dodelete) {
		error = hammer_delete_at_cursor(cursor, NULL);
		if (error) {
			panic("hammer_ip_delete_record: unable to physically delete the record!\n");
			/* NOTE(review): unreachable if panic() never returns */
			error = 0;
		}
	}
	return(error);
}
1649
1650 int
1651 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1652 {
1653         hammer_btree_elm_t elm;
1654         hammer_off_t rec_offset;
1655         hammer_off_t data_offset;
1656         int32_t data_len;
1657         u_int16_t rec_type;
1658         int error;
1659
1660         elm = &cursor->node->ondisk->elms[cursor->index];
1661         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1662
1663         rec_offset = elm->leaf.rec_offset;
1664         data_offset = elm->leaf.data_offset;
1665         data_len = elm->leaf.data_len;
1666         rec_type = elm->leaf.base.rec_type;
1667
1668         error = hammer_btree_delete(cursor);
1669         if (error == 0) {
1670                 /*
1671                  * This forces a fixup for the iteration because
1672                  * the cursor is now either sitting at the 'next'
1673                  * element or sitting at the end of a leaf.
1674                  */
1675                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1676                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1677                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1678                 }
1679         }
1680         if (error == 0) {
1681                 hammer_blockmap_free(cursor->trans, rec_offset,
1682                                      sizeof(union hammer_record_ondisk));
1683         }
1684         if (error == 0) {
1685                 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1686                 case HAMMER_ZONE_LARGE_DATA:
1687                 case HAMMER_ZONE_SMALL_DATA:
1688                         hammer_blockmap_free(cursor->trans,
1689                                              data_offset, data_len);
1690                         break;
1691                 default:
1692                         break;
1693                 }
1694         }
1695 #if 0
1696         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1697                 "(%d remain in cluster)\n",
1698                 cluster->volume->vol_no, cluster->clu_no,
1699                 rec_offset, data_offset, data_len,
1700                 cluster->ondisk->stat_records);
1701 #endif
1702         return (error);
1703 }
1704
1705 /*
1706  * Determine whether we can remove a directory.  This routine checks whether
1707  * a directory is empty or not and enforces flush connectivity.
1708  *
1709  * Flush connectivity requires that we block if the target directory is
1710  * currently flushing, otherwise it may not end up in the same flush group.
1711  *
1712  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
1713  */
1714 int
1715 hammer_ip_check_directory_empty(hammer_transaction_t trans,
1716                         hammer_cursor_t parent_cursor, hammer_inode_t ip)
1717 {
1718         struct hammer_cursor cursor;
1719         int error;
1720
1721         /*
1722          * Check directory empty
1723          */
1724         hammer_init_cursor(trans, &cursor, &ip->cache[0], ip);
1725
1726         cursor.key_beg.obj_id = ip->obj_id;
1727         cursor.key_beg.create_tid = 0;
1728         cursor.key_beg.delete_tid = 0;
1729         cursor.key_beg.obj_type = 0;
1730         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1731         cursor.key_beg.key = HAMMER_MIN_KEY;
1732
1733         cursor.key_end = cursor.key_beg;
1734         cursor.key_end.rec_type = 0xFFFF;
1735         cursor.key_end.key = HAMMER_MAX_KEY;
1736
1737         cursor.asof = ip->obj_asof;
1738         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1739
1740         error = hammer_ip_first(&cursor);
1741         if (error == ENOENT)
1742                 error = 0;
1743         else if (error == 0)
1744                 error = ENOTEMPTY;
1745         hammer_done_cursor(&cursor);
1746         return(error);
1747 }
1748