HAMMER 38C/Many: Undo/Synchronization and crash recovery
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.42 2008/04/25 21:49:49 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71         return(0);
72 }
73
74 static int
75 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
76 {
77         if (info->rec_type < rec->rec.base.base.rec_type)
78                 return(-3);
79         if (info->rec_type > rec->rec.base.base.rec_type)
80                 return(3);
81
82         if (info->key < rec->rec.base.base.key)
83                 return(-2);
84         if (info->key > rec->rec.base.base.key)
85                 return(2);
86
87         if (info->create_tid == 0) {
88                 if (rec->rec.base.base.create_tid == 0)
89                         return(0);
90                 return(1);
91         }
92         if (rec->rec.base.base.create_tid == 0)
93                 return(-1);
94         if (info->create_tid < rec->rec.base.base.create_tid)
95                 return(-1);
96         if (info->create_tid > rec->rec.base.base.create_tid)
97                 return(1);
98         return(0);
99 }
100
101 /*
102  * RB_SCAN comparison code for hammer_mem_first().  The argument order
103  * is reversed so the comparison result has to be negated.  key_beg and
104  * key_end are both range-inclusive.
105  *
106  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
107  * These do not stop the scan.
108  *
109  * Localized deletions are not cached in-memory.
110  */
111 static
112 int
113 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
114 {
115         hammer_cursor_t cursor = data;
116         int r;
117
118         r = hammer_rec_compare(&cursor->key_beg, rec);
119         if (r > 1)
120                 return(-1);
121         r = hammer_rec_compare(&cursor->key_end, rec);
122         if (r < -1)
123                 return(1);
124         return(0);
125 }
126
127 /*
128  * This compare function is used when simply looking up key_beg.
129  */
130 static
131 int
132 hammer_rec_find_cmp(hammer_record_t rec, void *data)
133 {
134         hammer_cursor_t cursor = data;
135         int r;
136
137         r = hammer_rec_compare(&cursor->key_beg, rec);
138         if (r > 1)
139                 return(-1);
140         if (r < -1)
141                 return(1);
142         return(0);
143 }
144
145 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
146 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
147                     hammer_rec_compare, hammer_base_elm_t);
148
/*
 * Allocate a zero'd record for the caller to finish filling in.  The
 * record is returned in the HAMMER_FST_IDLE state with one reference
 * held (via hammer_ref); the caller owns that reference and releases
 * it with hammer_rel_mem_record().
 *
 * hammer_count_records is a global allocation counter (bookkeeping);
 * it is decremented when the record is finally freed.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip)
{
	hammer_record_t record;

	++hammer_count_records;
	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
	record->state = HAMMER_FST_IDLE;
	record->ip = ip;
	record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	hammer_ref(&record->lock);
	return (record);
}
166
/*
 * Block until the record is no longer queued to the flusher
 * (HAMMER_FST_FLUSH).  Sets HAMMER_RECF_WANTED so the backend's
 * hammer_flush_record_done()/hammer_rel_mem_record() will wakeup()
 * this thread.  May sleep; the loop re-checks because the record can
 * be requeued between wakeup and rescheduling.
 */
void
hammer_wait_mem_record(hammer_record_t record)
{
	while (record->state == HAMMER_FST_FLUSH) {
		record->flags |= HAMMER_RECF_WANTED;
		tsleep(record, 0, "hmrrc2", 0);
	}
}
175
/*
 * Called from the backend, hammer_inode.c, when a record has been
 * flushed to disk.  Transitions the record back to HAMMER_FST_IDLE,
 * wakes any thread sleeping in hammer_wait_mem_record(), and drops
 * the flush-held reference.
 *
 * The backend has likely marked this record for deletion as well,
 * in which case the release below may be what destroys it.
 */
void
hammer_flush_record_done(hammer_record_t record)
{
	KKASSERT(record->state == HAMMER_FST_FLUSH);
	record->state = HAMMER_FST_IDLE;
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
	hammer_rel_mem_record(record);
}
193
194
/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 *
 * Only the frontend-deletion flag (HAMMER_RECF_DELETED_FE) triggers
 * destruction here; see hammer_ip_iterate_mem_good() for how the two
 * deletion flags differ between frontend and backend visibility.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_unref(&record->lock);

	if (record->flags & HAMMER_RECF_DELETED_FE) {
		if (record->lock.refs == 0) {
			/*
			 * Last reference on a deleted record: unlink it
			 * from the inode's rec_tree and free any record-
			 * owned data before freeing the record itself.
			 */
			if (record->flags & HAMMER_RECF_ONRBTREE) {
				RB_REMOVE(hammer_rec_rb_tree,
					  &record->ip->rec_tree,
					  record);
				record->flags &= ~HAMMER_RECF_ONRBTREE;
			}
			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, M_HAMMER);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, M_HAMMER);
			return;
		}
	}

	/*
	 * If someone wanted the record wake them up.
	 */
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
}
233
234 /*
235  * The deletion state of a record will appear different to the backend
236  * then it does to the frontend.
237  */
238 static __inline
239 int
240 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t rec)
241 {
242         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
243                 if (rec->flags & HAMMER_RECF_DELETED_BE)
244                         return(0);
245         } else {
246                 if (rec->flags & HAMMER_RECF_DELETED_FE)
247                         return(0);
248         }
249         return(1);
250 }
251
/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted (frontend vs backend
	 * visibility decided by hammer_ip_iterate_mem_good()).
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID: the record must
	 * exist at the as-of point (created at or before it, and not
	 * already deleted as of it).
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->rec.base.base.create_tid)
			return(0);
		if (rec->rec.base.base.delete_tid &&
		    cursor->asof >= rec->rec.base.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * If the record is queued to the flusher we have to block until
	 * it isn't.  Otherwise we may see duplication between our memory
	 * cache and the media.
	 */
	hammer_ref(&rec->lock);

#warning "This deadlocks"
#if 0
	if (rec->state == HAMMER_FST_FLUSH)
		hammer_wait_mem_record(rec);
#endif

	/*
	 * The record may have been deleted while we were blocked.
	 * (With the wait disabled above this re-check is cheap
	 * insurance.)
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record and stop the scan; the reference
	 * acquired above is handed off to cursor->iprec.
	 */
	cursor->iprec = rec;
	return(-1);
}
317
318
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 *
 * Returns 0 with a referenced record in cursor->iprec, or ENOENT if no
 * matching record exists.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/*
	 * Release any record left over from a previous scan/lookup.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/*
	 * Drop the per-inode cursor count on the previously associated
	 * inode before retargeting the cursor at ip.
	 */
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
	}
	cursor->ip = ip;
#if 0
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
	++ip->cursor_ip_refs;

#if 0
	cursor->scan.node = NULL;
#endif
	/*
	 * Scan for an exact key match; the callback references the
	 * record and stores it in cursor->iprec on success.
	 */
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
				   hammer_rec_scan_callback, cursor);

	if (cursor->iprec == NULL)
		error = ENOENT;
	else
		error = 0;
	return(error);
}
362
/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range [key_beg, key_end].
 *
 * Returns 0 with a referenced record in cursor->iprec, or ENOENT if the
 * range contains no visible record.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/*
	 * Release any record left over from a previous scan/lookup.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/*
	 * Drop the per-inode cursor count on the previously associated
	 * inode before retargeting the cursor at ip.
	 */
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
	}
	cursor->ip = ip;
#if 0
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
	++ip->cursor_ip_refs;

#if 0
	cursor->scan.node = NULL;
#endif
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
#if 0
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
#endif
		return(0);
	}
	return(ENOENT);
}
407
408 void
409 hammer_mem_done(hammer_cursor_t cursor)
410 {
411         if (cursor->ip) {
412                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
413                 --cursor->ip->cursor_ip_refs;
414 #if 0
415                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
416                                                   &cursor->ip->rec_tree);
417 #endif
418                 cursor->ip = NULL;
419         }
420         if (cursor->iprec) {
421                 hammer_rel_mem_record(cursor->iprec);
422                 cursor->iprec = NULL;
423         }
424 }
425
426 /************************************************************************
427  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
428  ************************************************************************
429  *
430  * These functions manipulate in-memory records.  Such records typically
431  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
432  */
433
/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * Returns 0 on success or an errno from hammer_mem_add().
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	/*
	 * Skip 0 so the low 32 bits of the key are never all zero in
	 * memory.
	 */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->rec.entry.base.base.obj_id = dip->obj_id;
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	/*
	 * Point record->data at the namecache name for now; a private
	 * copy is made below.
	 */
	record->data = (void *)ncp->nc_name;
	record->rec.entry.base.data_len = bytes;
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	/* NOTE: copies record->data */
	error = hammer_mem_add(trans, record);
	return(error);
}
472
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record (cursor->deadlk_rec),
 * and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;

	if (cursor->record == &cursor->iprec->rec) {
		/*
		 * In-memory (unsynchronized) records can simply be freed.
		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
		 * by the backend, we must still avoid races against the
		 * backend potentially syncing the record to the media. 
		 *
		 * We cannot call hammer_ip_delete_record(), that routine may
		 * only be called from the backend.
		 */
		record = cursor->iprec;
		if (record->state == HAMMER_FST_FLUSH) {
			/*
			 * Record is queued to the flusher: hand a
			 * referenced copy back via cursor->deadlk_rec so
			 * the caller can wait on it and retry.
			 */
			KKASSERT(cursor->deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor->deadlk_rec = record;
			error = EDEADLK;
		} else {
			cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	} else {
		/*
		 * If the record is on-disk we have to queue the deletion by
		 * the record's key.  This also causes lookups to skip the
		 * record.
		 */
		record = hammer_alloc_mem_record(dip);
		record->rec.entry.base.base = cursor->record->base.base;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		record->flags |= HAMMER_RECF_DELETE_ONDISK;

		error = hammer_mem_add(trans, record);
	}

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we only try to sync if the OS has
	 * no references and nlinks falls to 0.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.
	 *
	 * XXX we can't sync the inode here because the encompassing
	 * transaction might be a rename and might update the inode
	 * again with a new link.  That would force the delete_tid to be
	 * the same as the create_tid and cause a panic.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		if (ip->ino_rec.ino_nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_done_cursor(cursor);
		}

	}
	return(error);
}
549
/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields before calling:
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data		(a copy will be kmalloc'd if it cannot be embedded)
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * Returns 0 on success or an errno from hammer_mem_add().
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;

	/* Fill in the fields derived from the target inode. */
	record->rec.base.base.obj_id = ip->obj_id;
	record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;

	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
	/* NOTE: copies record->data */
	error = hammer_mem_add(trans, record);
	return(error);
}
578
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy code from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 *
 * The file offset must be HAMMER_BUFSIZE aligned but the data length
 * can be truncated.  The record (currently) always represents a BUFSIZE
 * swath of space whether the data is truncated or not.
 *
 * Returns 0 on success or an errno; EDEADLK is handled internally by
 * retrying with a fresh cursor.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	hammer_off_t rec_offset;
	void *bdata;
	int error;

	KKASSERT((offset & HAMMER_BUFMASK) == 0);
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
	if (error)
		return(error);
	/*
	 * Data records are keyed by the END of the swath (offset + bytes).
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.asof = trans->tid;
	cursor.flags |= HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor.  An exact match means
	 * a duplicate data record, which should be impossible.
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_ip_sync_data: duplicate data at "
			"(%lld,%d) tid %016llx\n",
			offset, bytes, trans->tid);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		panic("Duplicate data");
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space.  HAMMER_RECTYPE_DATA records
	 * can cross buffer boundaries so we may have to split our bcopy.
	 */
	rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
				  &cursor.record_buffer,
				  bytes, &bdata,
				  &cursor.data_buffer, &error);
	if (rec == NULL)
		goto done;
	if (hammer_debug_general & 0x1000)
		kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);

	/*
	 * Fill everything in and insert our B-Tree node.
	 *
	 * NOTE: hammer_alloc_record() has already marked the related
	 * buffers as modified.  If we do it again we will generate
	 * unnecessary undo elements.
	 */
	hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	rec->base.base.obj_id = ip->obj_id;
	rec->base.base.key = offset + bytes;
	rec->base.base.create_tid = trans->tid;
	rec->base.base.delete_tid = 0;
	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
	rec->base.data_crc = crc32(data, bytes);
	hammer_modify_buffer_done(cursor.record_buffer);
	KKASSERT(rec->base.data_len == bytes);

	hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
	bcopy(data, bdata, bytes);
	hammer_modify_buffer_done(cursor.data_buffer);

	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = rec_offset;
	elm.leaf.data_offset = rec->base.data_off;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	/*
	 * Data records can wind up on-disk before the inode itself is
	 * on-disk.  One must assume data records may be on-disk if either
	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
	 */
	ip->flags |= HAMMER_INODE_DONDISK;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/*
	 * Insert failed: unwind the record/data allocation.
	 */
	hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
done:
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}
690
691 /*
692  * Sync an in-memory record to the disk.  This is called by the backend.
693  * This code is responsible for actually writing a record out to the disk.
694  *
695  * NOTE: The frontend can mark the record deleted while it is queued to
696  * the backend.  The deletion applies to a frontend operation and the
697  * record must be treated as NOT having been deleted on the backend, so
698  * we ignore the flag.
699  */
700 int
701 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
702 {
703         struct hammer_cursor cursor;
704         hammer_record_ondisk_t rec;
705         union hammer_btree_elm elm;
706         hammer_off_t rec_offset;
707         void *bdata;
708         int error;
709
710         KKASSERT(record->state == HAMMER_FST_FLUSH);
711
712 retry:
713         /*
714          * Get a cursor, we will either be inserting or deleting.
715          */
716         error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
717         if (error)
718                 return(error);
719         cursor.key_beg = record->rec.base.base;
720
721         /*
722          * If we are deleting an exact match must be found on-disk.
723          */
724         if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
725                 error = hammer_btree_lookup(&cursor);
726                 kprintf("DELETE MEM ENTRY1 %d\n", error);
727                 if (error == 0)
728                         error = hammer_ip_delete_record(&cursor, trans->tid);
729                 kprintf("DELETE MEM ENTRY2 %d\n", error);
730                 if (error == 0)
731                         record->flags |= HAMMER_RECF_DELETED_FE;
732                 goto done;
733         }
734
735         /*
736          * We are inserting.
737          *
738          * Issue a lookup to position the cursor and locate the cluster.  The
739          * target key should not exist.  If we are creating a directory entry
740          * we may have to iterate the low 32 bits of the key to find an unused
741          * key.
742          */
743         cursor.flags |= HAMMER_CURSOR_INSERT;
744
745         for (;;) {
746                 error = hammer_btree_lookup(&cursor);
747                 if (error)
748                         break;
749                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
750                         kprintf("hammer_ip_sync_record: duplicate rec "
751                                 "at (%016llx)\n", record->rec.base.base.key);
752                         Debugger("duplicate record1");
753                         error = EIO;
754                         break;
755                 }
756                 if (++trans->hmp->namekey_iterator == 0)
757                         ++trans->hmp->namekey_iterator;
758                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
759                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
760                 cursor.key_beg.key = record->rec.base.base.key;
761         }
762         if (error != ENOENT)
763                 goto done;
764
765         /*
766          * Allocate the record and data.  The result buffers will be
767          * marked as being modified and further calls to
768          * hammer_modify_buffer() will result in unneeded UNDO records.
769          *
770          * Support zero-fill records (data == NULL and data_len != 0)
771          */
772         if (record->data == NULL) {
773                 rec = hammer_alloc_record(trans, &rec_offset,
774                                           record->rec.base.base.rec_type,
775                                           &cursor.record_buffer,
776                                           0, &bdata,
777                                           NULL, &error);
778                 if (hammer_debug_general & 0x1000)
779                         kprintf("NULL RECORD DATA\n");
780         } else if (record->flags & HAMMER_RECF_INBAND) {
781                 rec = hammer_alloc_record(trans, &rec_offset,
782                                           record->rec.base.base.rec_type,
783                                           &cursor.record_buffer,
784                                           record->rec.base.data_len, &bdata,
785                                           NULL, &error);
786                 if (hammer_debug_general & 0x1000)
787                         kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
788         } else {
789                 rec = hammer_alloc_record(trans, &rec_offset,
790                                           record->rec.base.base.rec_type,
791                                           &cursor.record_buffer,
792                                           record->rec.base.data_len, &bdata,
793                                           &cursor.data_buffer, &error);
794                 if (hammer_debug_general & 0x1000)
795                         kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
796         }
797
798         if (rec == NULL)
799                 goto done;
800
801         /*
802          * Fill in the remaining fields and insert our B-Tree node.
803          */
804         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
805         rec->base.base = record->rec.base.base;
806         bcopy(&record->rec.base + 1, &rec->base + 1,
807               HAMMER_RECORD_SIZE - sizeof(record->rec.base));
808
809         /*
810          * Copy the data and deal with zero-fill support.
811          */
812         if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
813                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
814                 bcopy(record->data, bdata, rec->base.data_len);
815         } else if (record->data) {
816                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
817                 hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
818                 bcopy(record->data, bdata, rec->base.data_len);
819                 hammer_modify_buffer_done(cursor.data_buffer);
820         } else {
821                 rec->base.data_len = record->rec.base.data_len;
822         }
823         hammer_modify_buffer_done(cursor.record_buffer);
824
825         elm.leaf.base = record->rec.base.base;
826         elm.leaf.rec_offset = rec_offset;
827         elm.leaf.data_offset = rec->base.data_off;
828         elm.leaf.data_len = rec->base.data_len;
829         elm.leaf.data_crc = rec->base.data_crc;
830
831         error = hammer_btree_insert(&cursor, &elm);
832
833         /*
834          * Clean up on success, or fall through on error.
835          */
836         if (error == 0) {
837                 record->flags |= HAMMER_RECF_DELETED_FE;
838                 goto done;
839         }
840
841         /*
842          * Try to unwind the allocation
843          */
844         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
845 done:
846         hammer_done_cursor(&cursor);
847         if (error == EDEADLK)
848                 goto retry;
849         return(error);
850 }
851
852 /*
853  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
854  * entry's key is used to deal with hash collisions in the upper 32 bits.
855  * A unique 64 bit key is generated in-memory and may be regenerated a
856  * second time when the directory record is flushed to the on-disk B-Tree.
857  *
858  * A referenced record is passed to this function.  This function
859  * eats the reference.  If an error occurs the record will be deleted.
860  *
861  * A copy of the temporary record->data pointer provided by the caller
862  * will be made.
863  */
864 static
865 int
866 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
867 {
868         void *data;
869         int bytes;
870         int reclen;
871                 
872         /*
873          * Make a private copy of record->data
874          */
875         if (record->data) {
876                 /*
877                  * Try to embed the data in extra space in the record
878                  * union, otherwise allocate a copy.
879                  */
880                 bytes = record->rec.base.data_len;
881                 switch(record->rec.base.base.rec_type) {
882                 case HAMMER_RECTYPE_DIRENTRY:
883                         reclen = offsetof(struct hammer_entry_record, name[0]);
884                         break;
885                 case HAMMER_RECTYPE_DATA:
886                         reclen = offsetof(struct hammer_data_record, data[0]);
887                         break;
888                 default:
889                         reclen = sizeof(record->rec);
890                         break;
891                 }
892                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
893                         bcopy(record->data, (char *)&record->rec + reclen,
894                               bytes);
895                         record->data = (void *)((char *)&record->rec + reclen);
896                         record->flags |= HAMMER_RECF_INBAND;
897                 } else {
898                         ++hammer_count_record_datas;
899                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
900                         record->flags |= HAMMER_RECF_ALLOCDATA;
901                         bcopy(record->data, data, bytes);
902                         record->data = data;
903                 }
904         }
905
906         /*
907          * Insert into the RB tree, find an unused iterator if this is
908          * a directory entry.
909          */
910         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
911                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
912                         record->flags |= HAMMER_RECF_DELETED_FE;
913                         hammer_rel_mem_record(record);
914                         return (EEXIST);
915                 }
916                 if (++trans->hmp->namekey_iterator == 0)
917                         ++trans->hmp->namekey_iterator;
918                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
919                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
920         }
921         record->flags |= HAMMER_RECF_ONRBTREE;
922         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
923         hammer_rel_mem_record(record);
924         return(0);
925 }
926
927 /************************************************************************
928  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
929  ************************************************************************
930  *
931  * These functions augment the B-Tree scanning functions in hammer_btree.c
932  * by merging in-memory records with on-disk records.
933  */
934
935 /*
936  * Locate a particular record either in-memory or on-disk.
937  *
938  * NOTE: This is basically a standalone routine, hammer_ip_next() may
939  * NOT be called to iterate results.
940  */
941 int
942 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
943 {
944         int error;
945
946         /*
947          * If the element is in-memory return it without searching the
948          * on-disk B-Tree
949          */
950         error = hammer_mem_lookup(cursor, ip);
951         if (error == 0) {
952                 cursor->record = &cursor->iprec->rec;
953                 return(error);
954         }
955         if (error != ENOENT)
956                 return(error);
957
958         /*
959          * If the inode has on-disk components search the on-disk B-Tree.
960          */
961         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
962                 return(error);
963         error = hammer_btree_lookup(cursor);
964         if (error == 0)
965                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
966         return(error);
967 }
968
969 /*
970  * Locate the first record within the cursor's key_beg/key_end range,
971  * restricted to a particular inode.  0 is returned on success, ENOENT
972  * if no records matched the requested range, or some other error.
973  *
974  * When 0 is returned hammer_ip_next() may be used to iterate additional
975  * records within the requested range.
976  *
977  * This function can return EDEADLK, requiring the caller to terminate
978  * the cursor and try again.
979  */
980 int
981 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
982 {
983         int error;
984
985         /*
986          * Clean up fields and setup for merged scan
987          */
988         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
989         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
990         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
991         if (cursor->iprec) {
992                 hammer_rel_mem_record(cursor->iprec);
993                 cursor->iprec = NULL;
994         }
995
996         /*
997          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
998          * exact lookup so if we get ENOENT we have to call the iterate
999          * function to validate the first record after the begin key.
1000          *
1001          * The ATEDISK flag is used by hammer_btree_iterate to determine
1002          * whether it must index forwards or not.  It is also used here
1003          * to select the next record from in-memory or on-disk.
1004          *
1005          * EDEADLK can only occur if the lookup hit an empty internal
1006          * element and couldn't delete it.  Since this could only occur
1007          * in-range, we can just iterate from the failure point.
1008          */
1009         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1010                 error = hammer_btree_lookup(cursor);
1011                 if (error == ENOENT || error == EDEADLK) {
1012                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1013                         if (hammer_debug_general & 0x2000)
1014                                 kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1015                         error = hammer_btree_iterate(cursor);
1016                 }
1017                 if (error && error != ENOENT) 
1018                         return(error);
1019                 if (error == 0) {
1020                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1021                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1022                 } else {
1023                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1024                 }
1025         }
1026
1027         /*
1028          * Search the in-memory record list (Red-Black tree).  Unlike the
1029          * B-Tree search, mem_first checks for records in the range.
1030          */
1031         error = hammer_mem_first(cursor, ip);
1032         if (error && error != ENOENT)
1033                 return(error);
1034         if (error == 0) {
1035                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1036                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1037                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1038                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1039         }
1040
1041         /*
1042          * This will return the first matching record.
1043          */
1044         return(hammer_ip_next(cursor));
1045 }
1046
1047 /*
1048  * Retrieve the next record in a merged iteration within the bounds of the
1049  * cursor.  This call may be made multiple times after the cursor has been
1050  * initially searched with hammer_ip_first().
1051  *
1052  * 0 is returned on success, ENOENT if no further records match the
1053  * requested range, or some other error code is returned.
1054  */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec, save;
	int error;
	int r;

next_btree:
	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one. 
	 *
	 * If we deleted the last on-disk record we had scanned ATEDISK will
	 * be clear and DELBTREE will be set, forcing a call to iterate. The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
			if (error == 0)
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			else
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
		}
	}

next_memory:
	/*
	 * Get the next in-memory record.  The record can be ripped out
	 * of the RB tree so we maintain a scan_info structure to track
	 * the next node.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *                       (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			save = cursor->iprec;
			cursor->iprec = NULL;
			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
			while (rec) {
				/*
				 * Skip records not visible to this scan;
				 * stop at the first in-range record that
				 * the snapshot callback accepts.
				 */
				if (hammer_ip_iterate_mem_good(cursor, rec)) {
					if (hammer_rec_scan_cmp(rec, cursor) != 0)
						break;
					if (hammer_rec_scan_callback(rec, cursor) != 0)
						break;
				}
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (save)
				hammer_rel_mem_record(save);
			if (cursor->iprec) {
				/*
				 * NOTE(review): the scan callback appears
				 * to latch the accepted record into
				 * cursor->iprec (it must equal rec here)
				 * -- confirm against hammer_rec_scan_callback.
				 */
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
#if 0
				cursor->scan.node =
					hammer_rec_rb_tree_RB_NEXT(rec);
#endif
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid; compare keys to decide which one
		 * to return.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
		if (r < 0) {
			/* the disk element sorts first, return it */
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_RECORD);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}

		/*
		 * If the entries match the memory entry must specify
		 * an on-disk deletion.  Eat both entries unless the
		 * caller wants visibility into the special records.
		 */
		if (r == 0) {
			KKASSERT(cursor->iprec->flags & 
				 HAMMER_RECF_DELETE_ONDISK);
			if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
				cursor->flags |= HAMMER_CURSOR_ATEDISK;
				cursor->flags |= HAMMER_CURSOR_ATEMEM;
				/* NOTE(review): unconditional debug output */
				kprintf("SKIP MEM ENTRY\n");
				goto next_btree;
			}
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.  If the record is
		 * placemarking an on-disk deletion, we skip it unless
		 * the caller wants special record visibility.
		 */
		cursor->record = &cursor->iprec->rec;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) {
			if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
				goto next_memory;
		}
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->record = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}
1194
1195 /*
1196  * Resolve the cursor->data pointer for the current cursor position in
1197  * a merged iteration.
1198  */
1199 int
1200 hammer_ip_resolve_data(hammer_cursor_t cursor)
1201 {
1202         int error;
1203
1204         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1205                 cursor->data = cursor->iprec->data;
1206                 error = 0;
1207         } else {
1208                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1209         }
1210         return(error);
1211 }
1212
1213 int
1214 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1215 {
1216         int error;
1217
1218         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1219                 cursor->data = cursor->iprec->data;
1220                 error = 0;
1221         } else {
1222                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1223                                                      HAMMER_CURSOR_GET_RECORD);
1224         }
1225         return(error);
1226 }
1227
1228 /*
1229  * Delete all records within the specified range for inode ip.
1230  *
1231  * NOTE: An unaligned range will cause new records to be added to cover
1232  * the edge cases. (XXX not implemented yet).
1233  *
1234  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1235  *
1236  * NOTE: Record keys for regular file data have to be special-cased since
1237  * they indicate the end of the range (key = base + bytes).
1238  */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int64_t off;

#if 0
	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
#endif

	/* range deletions are only performed by the backend flusher */
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_init_cursor(trans, &cursor, &ip->cache[0]);

	/*
	 * Set up an as-of scan over the inode's records.
	 * DELETE_VISIBILITY exposes in-memory deletion placemarkers to
	 * the iteration; BACKEND marks this as a flusher-side cursor.
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		/* DB records key directly on the caller's range */
		cursor.key_beg.key = ran_beg;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = ran_end;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		int64_t tmp64;

		cursor.key_beg.key = ran_beg + 1;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

		/*
		 * Pad the end key by MAXPHYS + 1 so a record whose end key
		 * lies beyond ran_end is still visited, clamping on signed
		 * overflow.  NOTE(review): this presumes MAXPHYS bounds a
		 * data record's size -- confirm.
		 */
		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		/* already-deleted records should not be visible here */
		KKASSERT(base->delete_tid == 0);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is the offset
		 * of the last byte of the record (base + len - 1), NOT the
		 * base offset.
		 */
#if 0
		kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
		if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
			kprintf("delete_range loop key %016llx,%d\n",
				base->key - rec->base.data_len, rec->base.data_len);
#endif
			/* off = first byte offset covered by this record */
			off = base->key - rec->base.data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg) {
				panic("hammer left edge case %016llx %d\n",
					base->key, rec->base.data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (base->key - 1 > ran_end) {
				if (base->key - rec->base.data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		}

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}
1365
1366 /*
1367  * Delete all records associated with an inode except the inode record
1368  * itself.
1369  */
1370 int
1371 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1372 {
1373         struct hammer_cursor cursor;
1374         hammer_record_ondisk_t rec;
1375         hammer_base_elm_t base;
1376         int error;
1377
1378         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1379 retry:
1380         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1381
1382         cursor.key_beg.obj_id = ip->obj_id;
1383         cursor.key_beg.create_tid = 0;
1384         cursor.key_beg.delete_tid = 0;
1385         cursor.key_beg.obj_type = 0;
1386         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1387         cursor.key_beg.key = HAMMER_MIN_KEY;
1388
1389         cursor.key_end = cursor.key_beg;
1390         cursor.key_end.rec_type = 0xFFFF;
1391         cursor.key_end.key = HAMMER_MAX_KEY;
1392
1393         cursor.asof = ip->obj_asof;
1394         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1395         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1396         cursor.flags |= HAMMER_CURSOR_BACKEND;
1397
1398         error = hammer_ip_first(&cursor, ip);
1399
1400         /*
1401          * Iterate through matching records and mark them as deleted.
1402          */
1403         while (error == 0) {
1404                 rec = cursor.record;
1405                 base = &rec->base.base;
1406
1407                 KKASSERT(base->delete_tid == 0);
1408
1409                 /*
1410                  * Mark the record and B-Tree entry as deleted.  This will
1411                  * also physically delete the B-Tree entry, record, and
1412                  * data if the retention policy dictates.  The function
1413                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1414                  * uses to perform a fixup.
1415                  */
1416                 error = hammer_ip_delete_record(&cursor, trans->tid);
1417                 if (error)
1418                         break;
1419                 error = hammer_ip_next(&cursor);
1420         }
1421         hammer_done_cursor(&cursor);
1422         if (error == EDEADLK)
1423                 goto retry;
1424         if (error == ENOENT)
1425                 error = 0;
1426         return(error);
1427 }
1428
1429 /*
1430  * Delete the record at the current cursor.  On success the cursor will
1431  * be positioned appropriately for an iteration but may no longer be at
1432  * a leaf node.
1433  *
1434  * This routine is only called from the backend.
1435  *
1436  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1437  * cursor and retry.
1438  */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error;
	int dodelete;

	/*
	 * In-memory (unsynchronized) records can simply be freed.  Flag
	 * the record deleted for both the frontend and the backend; the
	 * reference counting in hammer_rel_mem_record() does the rest.
	 */
	if (cursor->record == &cursor->iprec->rec) {
		cursor->iprec->flags |= HAMMER_RECF_DELETED_FE |
					HAMMER_RECF_DELETED_BE;
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not affect their position in the B-Tree (which is based
	 * on their create_tid).
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	elm = NULL;
	hmp = cursor->node->hmp;

	dodelete = 0;
	if (error == 0) {
		error = hammer_cursor_upgrade(cursor);
		if (error == 0) {
			/*
			 * Set delete_tid in both the B-Tree element and
			 * the on-disk record, each bracketed by
			 * modify/modify_done calls (undo generation).
			 */
			elm = &cursor->node->ondisk->elms[cursor->index];
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->leaf.base.delete_tid,
					   sizeof(elm->leaf.base.delete_tid));
			elm->leaf.base.delete_tid = tid;
			hammer_modify_node_done(cursor->node);

			/*
			 * An on-disk record cannot have the same delete_tid
			 * as its create_tid.  In a chain of record updates
			 * this could result in a duplicate record.
			 */
			KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
			hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
			cursor->record->base.base.delete_tid = tid;
			hammer_modify_buffer_done(cursor->record_buffer);
		}
	}

	/*
	 * If we were mounted with the nohistory option, we physically
	 * delete the record.  (dodelete remains 0 otherwise.)
	 */
	if (hmp->hflags & HMNT_NOHISTORY)
		dodelete = 1;

	if (error == 0 && dodelete) {
		error = hammer_delete_at_cursor(cursor, NULL);
		if (error) {
			/*
			 * NOTE(review): panic() should not return, making
			 * the error reset below dead code -- confirm.
			 */
			panic("hammer_ip_delete_record: unable to physically delete the record!\n");
			error = 0;
		}
	}
	return(error);
}
1504
1505 int
1506 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1507 {
1508         hammer_btree_elm_t elm;
1509         hammer_off_t rec_offset;
1510         hammer_off_t data_offset;
1511         int32_t data_len;
1512         u_int16_t rec_type;
1513         int error;
1514
1515         elm = &cursor->node->ondisk->elms[cursor->index];
1516         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1517
1518         rec_offset = elm->leaf.rec_offset;
1519         data_offset = elm->leaf.data_offset;
1520         data_len = elm->leaf.data_len;
1521         rec_type = elm->leaf.base.rec_type;
1522
1523         error = hammer_btree_delete(cursor);
1524         if (error == 0) {
1525                 /*
1526                  * This forces a fixup for the iteration because
1527                  * the cursor is now either sitting at the 'next'
1528                  * element or sitting at the end of a leaf.
1529                  */
1530                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1531                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1532                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1533                 }
1534         }
1535         if (error == 0) {
1536                 hammer_blockmap_free(cursor->trans, rec_offset,
1537                                      sizeof(union hammer_record_ondisk));
1538         }
1539         if (error == 0) {
1540                 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1541                 case HAMMER_ZONE_LARGE_DATA:
1542                 case HAMMER_ZONE_SMALL_DATA:
1543                         hammer_blockmap_free(cursor->trans,
1544                                              data_offset, data_len);
1545                         break;
1546                 default:
1547                         break;
1548                 }
1549         }
1550 #if 0
1551         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1552                 "(%d remain in cluster)\n",
1553                 cluster->volume->vol_no, cluster->clu_no,
1554                 rec_offset, data_offset, data_len,
1555                 cluster->ondisk->stat_records);
1556 #endif
1557         return (error);
1558 }
1559
1560 /*
1561  * Determine whether a directory is empty or not.  Returns 0 if the directory
1562  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1563  */
1564 int
1565 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1566 {
1567         struct hammer_cursor cursor;
1568         int error;
1569
1570         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1571
1572         cursor.key_beg.obj_id = ip->obj_id;
1573         cursor.key_beg.create_tid = 0;
1574         cursor.key_beg.delete_tid = 0;
1575         cursor.key_beg.obj_type = 0;
1576         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1577         cursor.key_beg.key = HAMMER_MIN_KEY;
1578
1579         cursor.key_end = cursor.key_beg;
1580         cursor.key_end.rec_type = 0xFFFF;
1581         cursor.key_end.key = HAMMER_MAX_KEY;
1582
1583         cursor.asof = ip->obj_asof;
1584         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1585
1586         error = hammer_ip_first(&cursor, ip);
1587         if (error == ENOENT)
1588                 error = 0;
1589         else if (error == 0)
1590                 error = ENOTEMPTY;
1591         hammer_done_cursor(&cursor);
1592         return(error);
1593 }
1594