HAMMER 38D/Many: Undo/Synchronization and crash recovery
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.43 2008/04/26 02:54:00 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71         return(0);
72 }
73
74 static int
75 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
76 {
77         if (info->rec_type < rec->rec.base.base.rec_type)
78                 return(-3);
79         if (info->rec_type > rec->rec.base.base.rec_type)
80                 return(3);
81
82         if (info->key < rec->rec.base.base.key)
83                 return(-2);
84         if (info->key > rec->rec.base.base.key)
85                 return(2);
86
87         if (info->create_tid == 0) {
88                 if (rec->rec.base.base.create_tid == 0)
89                         return(0);
90                 return(1);
91         }
92         if (rec->rec.base.base.create_tid == 0)
93                 return(-1);
94         if (info->create_tid < rec->rec.base.base.create_tid)
95                 return(-1);
96         if (info->create_tid > rec->rec.base.base.create_tid)
97                 return(1);
98         return(0);
99 }
100
101 /*
102  * RB_SCAN comparison code for hammer_mem_first().  The argument order
103  * is reversed so the comparison result has to be negated.  key_beg and
104  * key_end are both range-inclusive.
105  *
106  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
107  * These do not stop the scan.
108  *
109  * Localized deletions are not cached in-memory.
110  */
111 static
112 int
113 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
114 {
115         hammer_cursor_t cursor = data;
116         int r;
117
118         r = hammer_rec_compare(&cursor->key_beg, rec);
119         if (r > 1)
120                 return(-1);
121         r = hammer_rec_compare(&cursor->key_end, rec);
122         if (r < -1)
123                 return(1);
124         return(0);
125 }
126
127 /*
128  * This compare function is used when simply looking up key_beg.
129  */
130 static
131 int
132 hammer_rec_find_cmp(hammer_record_t rec, void *data)
133 {
134         hammer_cursor_t cursor = data;
135         int r;
136
137         r = hammer_rec_compare(&cursor->key_beg, rec);
138         if (r > 1)
139                 return(-1);
140         if (r < -1)
141                 return(1);
142         return(0);
143 }
144
145 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
146 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
147                     hammer_rec_compare, hammer_base_elm_t);
148
149 /*
150  * Allocate a record for the caller to finish filling in.  The record is
151  * returned referenced.
152  */
153 hammer_record_t
154 hammer_alloc_mem_record(hammer_inode_t ip)
155 {
156         hammer_record_t record;
157
158         ++hammer_count_records;
159         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
160         record->state = HAMMER_FST_IDLE;
161         record->ip = ip;
162         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
163         hammer_ref(&record->lock);
164         return (record);
165 }
166
167 void
168 hammer_wait_mem_record(hammer_record_t record)
169 {
170         while (record->state == HAMMER_FST_FLUSH) {
171                 record->flags |= HAMMER_RECF_WANTED;
172                 tsleep(record, 0, "hmrrc2", 0);
173         }
174 }
175
176 /*
177  * Called from the backend, hammer_inode.c, when a record has been
178  * flushed to disk.
179  *
180  * The backend has likely marked this record for deletion as well.
181  */
182 void
183 hammer_flush_record_done(hammer_record_t record)
184 {
185         KKASSERT(record->state == HAMMER_FST_FLUSH);
186         record->state = HAMMER_FST_IDLE;
187         if (record->flags & HAMMER_RECF_WANTED) {
188                 record->flags &= ~HAMMER_RECF_WANTED;
189                 wakeup(record);
190         }
191         hammer_rel_mem_record(record);
192 }
193
194
195 /*
196  * Release a memory record.  Records marked for deletion are immediately
197  * removed from the RB-Tree but otherwise left intact until the last ref
198  * goes away.
199  */
200 void
201 hammer_rel_mem_record(struct hammer_record *record)
202 {
203         hammer_unref(&record->lock);
204
205         if (record->flags & HAMMER_RECF_DELETED_FE) {
206                 if (record->lock.refs == 0) {
207                         if (record->flags & HAMMER_RECF_ONRBTREE) {
208                                 RB_REMOVE(hammer_rec_rb_tree,
209                                           &record->ip->rec_tree,
210                                           record);
211                                 record->flags &= ~HAMMER_RECF_ONRBTREE;
212                         }
213                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
214                                 --hammer_count_record_datas;
215                                 kfree(record->data, M_HAMMER);
216                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
217                         }
218                         record->data = NULL;
219                         --hammer_count_records;
220                         kfree(record, M_HAMMER);
221                         return;
222                 }
223         }
224
225         /*
226          * If someone wanted the record wake them up.
227          */
228         if (record->flags & HAMMER_RECF_WANTED) {
229                 record->flags &= ~HAMMER_RECF_WANTED;
230                 wakeup(record);
231         }
232 }
233
234 /*
235  * The deletion state of a record will appear different to the backend
236  * then it does to the frontend.
237  */
238 static __inline
239 int
240 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t rec)
241 {
242         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
243                 if (rec->flags & HAMMER_RECF_DELETED_BE)
244                         return(0);
245         } else {
246                 if (rec->flags & HAMMER_RECF_DELETED_FE)
247                         return(0);
248         }
249         return(1);
250 }
251
252 /*
253  * This callback is used as part of the RB_SCAN function for in-memory
254  * records.  We terminate it (return -1) as soon as we get a match.
255  *
256  * This routine is used by frontend code.
257  *
258  * The primary compare code does not account for ASOF lookups.  This
259  * code handles that case as well as a few others.
260  */
261 static
262 int
263 hammer_rec_scan_callback(hammer_record_t rec, void *data)
264 {
265         hammer_cursor_t cursor = data;
266
267         /*
268          * We terminate on success, so this should be NULL on entry.
269          */
270         KKASSERT(cursor->iprec == NULL);
271
272         /*
273          * Skip if the record was marked deleted.
274          */
275         if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
276                 return(0);
277
278         /*
279          * Skip if not visible due to our as-of TID
280          */
281         if (cursor->flags & HAMMER_CURSOR_ASOF) {
282                 if (cursor->asof < rec->rec.base.base.create_tid)
283                         return(0);
284                 if (rec->rec.base.base.delete_tid &&
285                     cursor->asof >= rec->rec.base.base.delete_tid) {
286                         return(0);
287                 }
288         }
289
290         /*
291          * If the record is queued to the flusher we have to block until
292          * it isn't.  Otherwise we may see duplication between our memory
293          * cache and the media.
294          */
295         hammer_ref(&rec->lock);
296
297 #warning "This deadlocks"
298 #if 0
299         if (rec->state == HAMMER_FST_FLUSH)
300                 hammer_wait_mem_record(rec);
301 #endif
302
303         /*
304          * The record may have been deleted while we were blocked.
305          */
306         if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
307                 hammer_rel_mem_record(rec);
308                 return(0);
309         }
310
311         /*
312          * Set the matching record and stop the scan.
313          */
314         cursor->iprec = rec;
315         return(-1);
316 }
317
318
319 /*
320  * Lookup an in-memory record given the key specified in the cursor.  Works
321  * just like hammer_btree_lookup() but operates on an inode's in-memory
322  * record list.
323  *
324  * The lookup must fail if the record is marked for deferred deletion.
325  */
326 static
327 int
328 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
329 {
330         int error;
331
332         if (cursor->iprec) {
333                 hammer_rel_mem_record(cursor->iprec);
334                 cursor->iprec = NULL;
335         }
336         if (cursor->ip) {
337                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
338                 --cursor->ip->cursor_ip_refs;
339 #if 0
340                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
341                                                   &cursor->ip->rec_tree);
342 #endif
343         }
344         cursor->ip = ip;
345 #if 0
346         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
347 #endif
348         ++ip->cursor_ip_refs;
349
350 #if 0
351         cursor->scan.node = NULL;
352 #endif
353         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
354                                    hammer_rec_scan_callback, cursor);
355
356         if (cursor->iprec == NULL)
357                 error = ENOENT;
358         else
359                 error = 0;
360         return(error);
361 }
362
363 /*
364  * hammer_mem_first() - locate the first in-memory record matching the
365  * cursor within the bounds of the key range.
366  */
367 static
368 int
369 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
370 {
371         if (cursor->iprec) {
372                 hammer_rel_mem_record(cursor->iprec);
373                 cursor->iprec = NULL;
374         }
375         if (cursor->ip) {
376                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
377                 --cursor->ip->cursor_ip_refs;
378 #if 0
379                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
380                                                   &cursor->ip->rec_tree);
381 #endif
382         }
383         cursor->ip = ip;
384 #if 0
385         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
386 #endif
387         ++ip->cursor_ip_refs;
388
389 #if 0
390         cursor->scan.node = NULL;
391 #endif
392         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
393                                    hammer_rec_scan_callback, cursor);
394
395         /*
396          * Adjust scan.node and keep it linked into the RB-tree so we can
397          * hold the cursor through third party modifications of the RB-tree.
398          */
399         if (cursor->iprec) {
400 #if 0
401                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
402 #endif
403                 return(0);
404         }
405         return(ENOENT);
406 }
407
408 void
409 hammer_mem_done(hammer_cursor_t cursor)
410 {
411         if (cursor->ip) {
412                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
413                 --cursor->ip->cursor_ip_refs;
414 #if 0
415                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
416                                                   &cursor->ip->rec_tree);
417 #endif
418                 cursor->ip = NULL;
419         }
420         if (cursor->iprec) {
421                 hammer_rel_mem_record(cursor->iprec);
422                 cursor->iprec = NULL;
423         }
424 }
425
426 /************************************************************************
427  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
428  ************************************************************************
429  *
430  * These functions manipulate in-memory records.  Such records typically
431  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
432  */
433
434 /*
435  * Add a directory entry (dip,ncp) which references inode (ip).
436  *
437  * Note that the low 32 bits of the namekey are set temporarily to create
438  * a unique in-memory record, and may be modified a second time when the
439  * record is synchronized to disk.  In particular, the low 32 bits cannot be
440  * all 0's when synching to disk, which is not handled here.
441  */
442 int
443 hammer_ip_add_directory(struct hammer_transaction *trans,
444                      struct hammer_inode *dip, struct namecache *ncp,
445                      struct hammer_inode *ip)
446 {
447         hammer_record_t record;
448         int error;
449         int bytes;
450
451         record = hammer_alloc_mem_record(dip);
452
453         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
454         if (++trans->hmp->namekey_iterator == 0)
455                 ++trans->hmp->namekey_iterator;
456
457         record->rec.entry.base.base.obj_id = dip->obj_id;
458         record->rec.entry.base.base.key =
459                 hammer_directory_namekey(ncp->nc_name, bytes);
460         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
461         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
462         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
463         record->rec.entry.obj_id = ip->obj_id;
464         record->data = (void *)ncp->nc_name;
465         record->rec.entry.base.data_len = bytes;
466         ++ip->ino_rec.ino_nlinks;
467         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
468         /* NOTE: copies record->data */
469         error = hammer_mem_add(trans, record);
470         return(error);
471 }
472
473 /*
474  * Delete the directory entry and update the inode link count.  The
475  * cursor must be seeked to the directory entry record being deleted.
476  *
477  * The related inode should be share-locked by the caller.  The caller is
478  * on the frontend.
479  *
480  * This function can return EDEADLK requiring the caller to terminate
481  * the cursor, any locks, wait on the returned record, and retry.
482  */
483 int
484 hammer_ip_del_directory(struct hammer_transaction *trans,
485                      hammer_cursor_t cursor, struct hammer_inode *dip,
486                      struct hammer_inode *ip)
487 {
488         hammer_record_t record;
489         int error;
490
491         if (cursor->record == &cursor->iprec->rec) {
492                 /*
493                  * In-memory (unsynchronized) records can simply be freed.
494                  * Even though the HAMMER_RECF_DELETED_FE flag is ignored
495                  * by the backend, we must still avoid races against the
496                  * backend potentially syncing the record to the media. 
497                  *
498                  * We cannot call hammer_ip_delete_record(), that routine may
499                  * only be called from the backend.
500                  */
501                 record = cursor->iprec;
502                 if (record->state == HAMMER_FST_FLUSH) {
503                         KKASSERT(cursor->deadlk_rec == NULL);
504                         hammer_ref(&record->lock);
505                         cursor->deadlk_rec = record;
506                         error = EDEADLK;
507                 } else {
508                         cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
509                         error = 0;
510                 }
511         } else {
512                 /*
513                  * If the record is on-disk we have to queue the deletion by
514                  * the record's key.  This also causes lookups to skip the
515                  * record.
516                  */
517                 record = hammer_alloc_mem_record(dip);
518                 record->rec.entry.base.base = cursor->record->base.base;
519                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
520                 record->flags |= HAMMER_RECF_DELETE_ONDISK;
521
522                 error = hammer_mem_add(trans, record);
523         }
524
525         /*
526          * One less link.  The file may still be open in the OS even after
527          * all links have gone away so we only try to sync if the OS has
528          * no references and nlinks falls to 0.
529          *
530          * We have to terminate the cursor before syncing the inode to
531          * avoid deadlocking against ourselves.
532          *
533          * XXX we can't sync the inode here because the encompassing
534          * transaction might be a rename and might update the inode
535          * again with a new link.  That would force the delete_tid to be
536          * the same as the create_tid and cause a panic.
537          */
538         if (error == 0) {
539                 --ip->ino_rec.ino_nlinks;
540                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
541                 if (ip->ino_rec.ino_nlinks == 0 &&
542                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
543                         hammer_done_cursor(cursor);
544                 }
545
546         }
547         return(error);
548 }
549
550 /*
551  * Add a record to an inode.
552  *
553  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
554  * initialize the following additional fields:
555  *
556  * The related inode should be share-locked by the caller.  The caller is
557  * on the frontend.
558  *
559  * record->rec.entry.base.base.key
560  * record->rec.entry.base.base.rec_type
561  * record->rec.entry.base.base.data_len
562  * record->data         (a copy will be kmalloc'd if it cannot be embedded)
563  */
564 int
565 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
566 {
567         hammer_inode_t ip = record->ip;
568         int error;
569
570         record->rec.base.base.obj_id = ip->obj_id;
571         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
572
573         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
574         /* NOTE: copies record->data */
575         error = hammer_mem_add(trans, record);
576         return(error);
577 }
578
579 /*
580  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
581  * is called via the strategy called from a cached data source.  This code
582  * is responsible for actually writing a data record out to the disk.
583  *
584  * This can only occur non-historically (i.e. 'current' data only).
585  *
586  * The file offset must be HAMMER_BUFSIZE aligned but the data length
587  * can be truncated.  The record (currently) always represents a BUFSIZE
588  * swath of space whether the data is truncated or not.
589  */
590 int
591 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
592                        int64_t offset, void *data, int bytes)
593 {
594         struct hammer_cursor cursor;
595         hammer_record_ondisk_t rec;
596         union hammer_btree_elm elm;
597         hammer_off_t rec_offset;
598         void *bdata;
599         int error;
600
601         KKASSERT((offset & HAMMER_BUFMASK) == 0);
602         KKASSERT(trans->type == HAMMER_TRANS_FLS);
603 retry:
604         error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
605         if (error)
606                 return(error);
607         cursor.key_beg.obj_id = ip->obj_id;
608         cursor.key_beg.key = offset + bytes;
609         cursor.key_beg.create_tid = trans->tid;
610         cursor.key_beg.delete_tid = 0;
611         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
612         cursor.asof = trans->tid;
613         cursor.flags |= HAMMER_CURSOR_INSERT;
614
615         /*
616          * Issue a lookup to position the cursor.
617          */
618         error = hammer_btree_lookup(&cursor);
619         if (error == 0) {
620                 kprintf("hammer_ip_sync_data: duplicate data at "
621                         "(%lld,%d) tid %016llx\n",
622                         offset, bytes, trans->tid);
623                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
624                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
625                 panic("Duplicate data");
626                 error = EIO;
627         }
628         if (error != ENOENT)
629                 goto done;
630
631         /*
632          * Allocate record and data space.  HAMMER_RECTYPE_DATA records
633          * can cross buffer boundaries so we may have to split our bcopy.
634          */
635         rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
636                                   &cursor.record_buffer,
637                                   bytes, &bdata,
638                                   &cursor.data_buffer, &error);
639         if (rec == NULL)
640                 goto done;
641         if (hammer_debug_general & 0x1000)
642                 kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);
643
644         /*
645          * Fill everything in and insert our B-Tree node.
646          *
647          * NOTE: hammer_alloc_record() has already marked the related
648          * buffers as modified.  If we do it again we will generate
649          * unnecessary undo elements.
650          */
651         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
652         rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
653         rec->base.base.obj_id = ip->obj_id;
654         rec->base.base.key = offset + bytes;
655         rec->base.base.create_tid = trans->tid;
656         rec->base.base.delete_tid = 0;
657         rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
658         rec->base.data_crc = crc32(data, bytes);
659         hammer_modify_buffer_done(cursor.record_buffer);
660         KKASSERT(rec->base.data_len == bytes);
661
662         hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
663         bcopy(data, bdata, bytes);
664         hammer_modify_buffer_done(cursor.data_buffer);
665
666         elm.leaf.base = rec->base.base;
667         elm.leaf.rec_offset = rec_offset;
668         elm.leaf.data_offset = rec->base.data_off;
669         elm.leaf.data_len = bytes;
670         elm.leaf.data_crc = rec->base.data_crc;
671
672         /*
673          * Data records can wind up on-disk before the inode itself is
674          * on-disk.  One must assume data records may be on-disk if either
675          * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
676          */
677         ip->flags |= HAMMER_INODE_DONDISK;
678
679         error = hammer_btree_insert(&cursor, &elm);
680         if (error == 0)
681                 goto done;
682
683         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
684 done:
685         hammer_done_cursor(&cursor);
686         if (error == EDEADLK)
687                 goto retry;
688         return(error);
689 }
690
691 /*
692  * Sync an in-memory record to the disk.  This is called by the backend.
693  * This code is responsible for actually writing a record out to the disk.
694  *
695  * NOTE: The frontend can mark the record deleted while it is queued to
696  * the backend.  The deletion applies to a frontend operation and the
697  * record must be treated as NOT having been deleted on the backend, so
698  * we ignore the flag.
699  */
700 int
701 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
702 {
703         struct hammer_cursor cursor;
704         hammer_record_ondisk_t rec;
705         union hammer_btree_elm elm;
706         hammer_off_t rec_offset;
707         void *bdata;
708         int error;
709
710         KKASSERT(record->state == HAMMER_FST_FLUSH);
711
712 retry:
713         /*
714          * Get a cursor, we will either be inserting or deleting.
715          */
716         error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
717         if (error)
718                 return(error);
719         cursor.key_beg = record->rec.base.base;
720
721         /*
722          * If we are deleting an exact match must be found on-disk.
723          */
724         if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
725                 error = hammer_btree_lookup(&cursor);
726                 if (error == 0)
727                         error = hammer_ip_delete_record(&cursor, trans->tid);
728                 if (error == 0)
729                         record->flags |= HAMMER_RECF_DELETED_FE;
730                 goto done;
731         }
732
733         /*
734          * We are inserting.
735          *
736          * Issue a lookup to position the cursor and locate the cluster.  The
737          * target key should not exist.  If we are creating a directory entry
738          * we may have to iterate the low 32 bits of the key to find an unused
739          * key.
740          */
741         cursor.flags |= HAMMER_CURSOR_INSERT;
742
743         for (;;) {
744                 error = hammer_btree_lookup(&cursor);
745                 if (error)
746                         break;
747                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
748                         kprintf("hammer_ip_sync_record: duplicate rec "
749                                 "at (%016llx)\n", record->rec.base.base.key);
750                         Debugger("duplicate record1");
751                         error = EIO;
752                         break;
753                 }
754                 if (++trans->hmp->namekey_iterator == 0)
755                         ++trans->hmp->namekey_iterator;
756                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
757                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
758                 cursor.key_beg.key = record->rec.base.base.key;
759         }
760         if (error != ENOENT)
761                 goto done;
762
763         /*
764          * Allocate the record and data.  The result buffers will be
765          * marked as being modified and further calls to
766          * hammer_modify_buffer() will result in unneeded UNDO records.
767          *
768          * Support zero-fill records (data == NULL and data_len != 0)
769          */
770         if (record->data == NULL) {
771                 rec = hammer_alloc_record(trans, &rec_offset,
772                                           record->rec.base.base.rec_type,
773                                           &cursor.record_buffer,
774                                           0, &bdata,
775                                           NULL, &error);
776                 if (hammer_debug_general & 0x1000)
777                         kprintf("NULL RECORD DATA\n");
778         } else if (record->flags & HAMMER_RECF_INBAND) {
779                 rec = hammer_alloc_record(trans, &rec_offset,
780                                           record->rec.base.base.rec_type,
781                                           &cursor.record_buffer,
782                                           record->rec.base.data_len, &bdata,
783                                           NULL, &error);
784                 if (hammer_debug_general & 0x1000)
785                         kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
786         } else {
787                 rec = hammer_alloc_record(trans, &rec_offset,
788                                           record->rec.base.base.rec_type,
789                                           &cursor.record_buffer,
790                                           record->rec.base.data_len, &bdata,
791                                           &cursor.data_buffer, &error);
792                 if (hammer_debug_general & 0x1000)
793                         kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
794         }
795
796         if (rec == NULL)
797                 goto done;
798
799         /*
800          * Fill in the remaining fields and insert our B-Tree node.
801          */
802         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
803         rec->base.base = record->rec.base.base;
804         bcopy(&record->rec.base + 1, &rec->base + 1,
805               HAMMER_RECORD_SIZE - sizeof(record->rec.base));
806
807         /*
808          * Copy the data and deal with zero-fill support.
809          */
810         if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
811                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
812                 bcopy(record->data, bdata, rec->base.data_len);
813         } else if (record->data) {
814                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
815                 hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
816                 bcopy(record->data, bdata, rec->base.data_len);
817                 hammer_modify_buffer_done(cursor.data_buffer);
818         } else {
819                 rec->base.data_len = record->rec.base.data_len;
820         }
821         hammer_modify_buffer_done(cursor.record_buffer);
822
823         elm.leaf.base = record->rec.base.base;
824         elm.leaf.rec_offset = rec_offset;
825         elm.leaf.data_offset = rec->base.data_off;
826         elm.leaf.data_len = rec->base.data_len;
827         elm.leaf.data_crc = rec->base.data_crc;
828
829         error = hammer_btree_insert(&cursor, &elm);
830
831         /*
832          * Clean up on success, or fall through on error.
833          */
834         if (error == 0) {
835                 record->flags |= HAMMER_RECF_DELETED_FE;
836                 goto done;
837         }
838
839         /*
840          * Try to unwind the allocation
841          */
842         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
843 done:
844         hammer_done_cursor(&cursor);
845         if (error == EDEADLK)
846                 goto retry;
847         return(error);
848 }
849
850 /*
851  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
852  * entry's key is used to deal with hash collisions in the upper 32 bits.
853  * A unique 64 bit key is generated in-memory and may be regenerated a
854  * second time when the directory record is flushed to the on-disk B-Tree.
855  *
856  * A referenced record is passed to this function.  This function
857  * eats the reference.  If an error occurs the record will be deleted.
858  *
859  * A copy of the temporary record->data pointer provided by the caller
860  * will be made.
861  */
862 static
863 int
864 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
865 {
866         void *data;
867         int bytes;
868         int reclen;
869                 
870         /*
871          * Make a private copy of record->data
872          */
873         if (record->data) {
874                 /*
875                  * Try to embed the data in extra space in the record
876                  * union, otherwise allocate a copy.
877                  */
878                 bytes = record->rec.base.data_len;
879                 switch(record->rec.base.base.rec_type) {
880                 case HAMMER_RECTYPE_DIRENTRY:
881                         reclen = offsetof(struct hammer_entry_record, name[0]);
882                         break;
883                 case HAMMER_RECTYPE_DATA:
884                         reclen = offsetof(struct hammer_data_record, data[0]);
885                         break;
886                 default:
887                         reclen = sizeof(record->rec);
888                         break;
889                 }
890                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
891                         bcopy(record->data, (char *)&record->rec + reclen,
892                               bytes);
893                         record->data = (void *)((char *)&record->rec + reclen);
894                         record->flags |= HAMMER_RECF_INBAND;
895                 } else {
896                         ++hammer_count_record_datas;
897                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
898                         record->flags |= HAMMER_RECF_ALLOCDATA;
899                         bcopy(record->data, data, bytes);
900                         record->data = data;
901                 }
902         }
903
904         /*
905          * Insert into the RB tree, find an unused iterator if this is
906          * a directory entry.
907          */
908         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
909                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
910                         record->flags |= HAMMER_RECF_DELETED_FE;
911                         hammer_rel_mem_record(record);
912                         return (EEXIST);
913                 }
914                 if (++trans->hmp->namekey_iterator == 0)
915                         ++trans->hmp->namekey_iterator;
916                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
917                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
918         }
919         record->flags |= HAMMER_RECF_ONRBTREE;
920         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
921         hammer_rel_mem_record(record);
922         return(0);
923 }
924
925 /************************************************************************
926  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
927  ************************************************************************
928  *
929  * These functions augment the B-Tree scanning functions in hammer_btree.c
930  * by merging in-memory records with on-disk records.
931  */
932
933 /*
934  * Locate a particular record either in-memory or on-disk.
935  *
936  * NOTE: This is basically a standalone routine, hammer_ip_next() may
937  * NOT be called to iterate results.
938  */
939 int
940 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
941 {
942         int error;
943
944         /*
945          * If the element is in-memory return it without searching the
946          * on-disk B-Tree
947          */
948         error = hammer_mem_lookup(cursor, ip);
949         if (error == 0) {
950                 cursor->record = &cursor->iprec->rec;
951                 return(error);
952         }
953         if (error != ENOENT)
954                 return(error);
955
956         /*
957          * If the inode has on-disk components search the on-disk B-Tree.
958          */
959         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
960                 return(error);
961         error = hammer_btree_lookup(cursor);
962         if (error == 0)
963                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
964         return(error);
965 }
966
967 /*
968  * Locate the first record within the cursor's key_beg/key_end range,
969  * restricted to a particular inode.  0 is returned on success, ENOENT
970  * if no records matched the requested range, or some other error.
971  *
972  * When 0 is returned hammer_ip_next() may be used to iterate additional
973  * records within the requested range.
974  *
975  * This function can return EDEADLK, requiring the caller to terminate
976  * the cursor and try again.
977  */
978 int
979 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
980 {
981         int error;
982
983         /*
984          * Clean up fields and setup for merged scan
985          */
986         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
987         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
988         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
989         if (cursor->iprec) {
990                 hammer_rel_mem_record(cursor->iprec);
991                 cursor->iprec = NULL;
992         }
993
994         /*
995          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
996          * exact lookup so if we get ENOENT we have to call the iterate
997          * function to validate the first record after the begin key.
998          *
999          * The ATEDISK flag is used by hammer_btree_iterate to determine
1000          * whether it must index forwards or not.  It is also used here
1001          * to select the next record from in-memory or on-disk.
1002          *
1003          * EDEADLK can only occur if the lookup hit an empty internal
1004          * element and couldn't delete it.  Since this could only occur
1005          * in-range, we can just iterate from the failure point.
1006          */
1007         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1008                 error = hammer_btree_lookup(cursor);
1009                 if (error == ENOENT || error == EDEADLK) {
1010                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1011                         if (hammer_debug_general & 0x2000)
1012                                 kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1013                         error = hammer_btree_iterate(cursor);
1014                 }
1015                 if (error && error != ENOENT) 
1016                         return(error);
1017                 if (error == 0) {
1018                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1019                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1020                 } else {
1021                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1022                 }
1023         }
1024
1025         /*
1026          * Search the in-memory record list (Red-Black tree).  Unlike the
1027          * B-Tree search, mem_first checks for records in the range.
1028          */
1029         error = hammer_mem_first(cursor, ip);
1030         if (error && error != ENOENT)
1031                 return(error);
1032         if (error == 0) {
1033                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1034                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1035                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1036                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1037         }
1038
1039         /*
1040          * This will return the first matching record.
1041          */
1042         return(hammer_ip_next(cursor));
1043 }
1044
1045 /*
1046  * Retrieve the next record in a merged iteration within the bounds of the
1047  * cursor.  This call may be made multiple times after the cursor has been
1048  * initially searched with hammer_ip_first().
1049  *
1050  * 0 is returned on success, ENOENT if no further records match the
1051  * requested range, or some other error code is returned.
1052  */
1053 int
1054 hammer_ip_next(hammer_cursor_t cursor)
1055 {
1056         hammer_btree_elm_t elm;
1057         hammer_record_t rec, save;
1058         int error;
1059         int r;
1060
1061 next_btree:
1062         /*
1063          * Load the current on-disk and in-memory record.  If we ate any
1064          * records we have to get the next one. 
1065          *
1066          * If we deleted the last on-disk record we had scanned ATEDISK will
1067          * be clear and DELBTREE will be set, forcing a call to iterate. The
1068          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1069          * element.  If ATEDISK is set, iterate will skip the 'current'
1070          * element.
1071          *
1072          * Get the next on-disk record
1073          */
1074         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1075                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1076                         error = hammer_btree_iterate(cursor);
1077                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1078                         if (error == 0)
1079                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1080                         else
1081                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1082                                                  HAMMER_CURSOR_ATEDISK;
1083                 }
1084         }
1085
1086 next_memory:
1087         /*
1088          * Get the next in-memory record.  The record can be ripped out
1089          * of the RB tree so we maintain a scan_info structure to track
1090          * the next node.
1091          *
1092          * hammer_rec_scan_cmp:  Is the record still in our general range,
1093          *                       (non-inclusive of snapshot exclusions)?
1094          * hammer_rec_scan_callback: Is the record in our snapshot?
1095          */
1096         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1097                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1098                         save = cursor->iprec;
1099                         cursor->iprec = NULL;
1100                         rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1101                         while (rec) {
1102                                 if (hammer_ip_iterate_mem_good(cursor, rec)) {
1103                                         if (hammer_rec_scan_cmp(rec, cursor) != 0)
1104                                                 break;
1105                                         if (hammer_rec_scan_callback(rec, cursor) != 0)
1106                                                 break;
1107                                 }
1108                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1109                         }
1110                         if (save)
1111                                 hammer_rel_mem_record(save);
1112                         if (cursor->iprec) {
1113                                 KKASSERT(cursor->iprec == rec);
1114                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1115 #if 0
1116                                 cursor->scan.node =
1117                                         hammer_rec_rb_tree_RB_NEXT(rec);
1118 #endif
1119                         } else {
1120                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1121                         }
1122                 }
1123         }
1124
1125         /*
1126          * Extract either the disk or memory record depending on their
1127          * relative position.
1128          */
1129         error = 0;
1130         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1131         case 0:
1132                 /*
1133                  * Both entries valid
1134                  */
1135                 elm = &cursor->node->ondisk->elms[cursor->index];
1136                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1137                 if (r < 0) {
1138                         error = hammer_btree_extract(cursor,
1139                                                      HAMMER_CURSOR_GET_RECORD);
1140                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1141                         break;
1142                 }
1143
1144                 /*
1145                  * If the entries match the memory entry must specify
1146                  * an on-disk deletion.  Eat both entries unless the
1147                  * caller wants visibility into the special records.
1148                  */
1149                 if (r == 0) {
1150                         KKASSERT(cursor->iprec->flags & 
1151                                  HAMMER_RECF_DELETE_ONDISK);
1152                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1153                                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1154                                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1155                                 goto next_btree;
1156                         }
1157                 }
1158                 /* fall through to the memory entry */
1159         case HAMMER_CURSOR_ATEDISK:
1160                 /*
1161                  * Only the memory entry is valid.  If the record is
1162                  * placemarking an on-disk deletion, we skip it unless
1163                  * the caller wants special record visibility.
1164                  */
1165                 cursor->record = &cursor->iprec->rec;
1166                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1167                 if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) {
1168                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
1169                                 goto next_memory;
1170                 }
1171                 break;
1172         case HAMMER_CURSOR_ATEMEM:
1173                 /*
1174                  * Only the disk entry is valid
1175                  */
1176                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1177                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1178                 break;
1179         default:
1180                 /*
1181                  * Neither entry is valid
1182                  *
1183                  * XXX error not set properly
1184                  */
1185                 cursor->record = NULL;
1186                 error = ENOENT;
1187                 break;
1188         }
1189         return(error);
1190 }
1191
1192 /*
1193  * Resolve the cursor->data pointer for the current cursor position in
1194  * a merged iteration.
1195  */
1196 int
1197 hammer_ip_resolve_data(hammer_cursor_t cursor)
1198 {
1199         int error;
1200
1201         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1202                 cursor->data = cursor->iprec->data;
1203                 error = 0;
1204         } else {
1205                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1206         }
1207         return(error);
1208 }
1209
1210 int
1211 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1212 {
1213         int error;
1214
1215         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1216                 cursor->data = cursor->iprec->data;
1217                 error = 0;
1218         } else {
1219                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1220                                                      HAMMER_CURSOR_GET_RECORD);
1221         }
1222         return(error);
1223 }
1224
1225 /*
1226  * Delete all records within the specified range for inode ip.
1227  *
1228  * NOTE: An unaligned range will cause new records to be added to cover
1229  * the edge cases. (XXX not implemented yet).
1230  *
1231  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1232  *
1233  * NOTE: Record keys for regular file data have to be special-cased since
1234  * they indicate the end of the range (key = base + bytes).
1235  */
1236 int
1237 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1238                        int64_t ran_beg, int64_t ran_end)
1239 {
1240         struct hammer_cursor cursor;
1241         hammer_record_ondisk_t rec;
1242         hammer_base_elm_t base;
1243         int error;
1244         int64_t off;
1245
1246 #if 0
1247         kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1248 #endif
1249
1250         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1251 retry:
1252         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1253
1254         cursor.key_beg.obj_id = ip->obj_id;
1255         cursor.key_beg.create_tid = 0;
1256         cursor.key_beg.delete_tid = 0;
1257         cursor.key_beg.obj_type = 0;
1258         cursor.asof = ip->obj_asof;
1259         cursor.flags |= HAMMER_CURSOR_ASOF;
1260         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1261         cursor.flags |= HAMMER_CURSOR_BACKEND;
1262
1263         cursor.key_end = cursor.key_beg;
1264         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1265                 cursor.key_beg.key = ran_beg;
1266                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1267                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1268                 cursor.key_end.key = ran_end;
1269         } else {
1270                 /*
1271                  * The key in the B-Tree is (base+bytes), so the first possible
1272                  * matching key is ran_beg + 1.
1273                  */
1274                 int64_t tmp64;
1275
1276                 cursor.key_beg.key = ran_beg + 1;
1277                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1278                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1279
1280                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1281                 if (tmp64 < ran_end)
1282                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1283                 else
1284                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1285         }
1286         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1287
1288         error = hammer_ip_first(&cursor, ip);
1289
1290         /*
1291          * Iterate through matching records and mark them as deleted.
1292          */
1293         while (error == 0) {
1294                 rec = cursor.record;
1295                 base = &rec->base.base;
1296
1297                 KKASSERT(base->delete_tid == 0);
1298
1299                 /*
1300                  * There may be overlap cases for regular file data.  Also
1301                  * remember the key for a regular file record is the offset
1302                  * of the last byte of the record (base + len - 1), NOT the
1303                  * base offset.
1304                  */
1305 #if 0
1306                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1307 #endif
1308                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1309 #if 0
1310                         kprintf("delete_range loop key %016llx,%d\n",
1311                                 base->key - rec->base.data_len, rec->base.data_len);
1312 #endif
1313                         off = base->key - rec->base.data_len;
1314                         /*
1315                          * Check the left edge case.  We currently do not
1316                          * split existing records.
1317                          */
1318                         if (off < ran_beg) {
1319                                 panic("hammer left edge case %016llx %d\n",
1320                                         base->key, rec->base.data_len);
1321                         }
1322
1323                         /*
1324                          * Check the right edge case.  Note that the
1325                          * record can be completely out of bounds, which
1326                          * terminates the search.
1327                          *
1328                          * base->key is exclusive of the right edge while
1329                          * ran_end is inclusive of the right edge.  The
1330                          * (key - data_len) left boundary is inclusive.
1331                          *
1332                          * XXX theory-check this test at some point, are
1333                          * we missing a + 1 somewhere?  Note that ran_end
1334                          * could overflow.
1335                          */
1336                         if (base->key - 1 > ran_end) {
1337                                 if (base->key - rec->base.data_len > ran_end)
1338                                         break;
1339                                 panic("hammer right edge case\n");
1340                         }
1341                 }
1342
1343                 /*
1344                  * Mark the record and B-Tree entry as deleted.  This will
1345                  * also physically delete the B-Tree entry, record, and
1346                  * data if the retention policy dictates.  The function
1347                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1348                  * uses to perform a fixup.
1349                  */
1350                 error = hammer_ip_delete_record(&cursor, trans->tid);
1351                 if (error)
1352                         break;
1353                 error = hammer_ip_next(&cursor);
1354         }
1355         hammer_done_cursor(&cursor);
1356         if (error == EDEADLK)
1357                 goto retry;
1358         if (error == ENOENT)
1359                 error = 0;
1360         return(error);
1361 }
1362
1363 /*
1364  * Delete all records associated with an inode except the inode record
1365  * itself.
1366  */
1367 int
1368 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1369 {
1370         struct hammer_cursor cursor;
1371         hammer_record_ondisk_t rec;
1372         hammer_base_elm_t base;
1373         int error;
1374
1375         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1376 retry:
1377         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1378
1379         cursor.key_beg.obj_id = ip->obj_id;
1380         cursor.key_beg.create_tid = 0;
1381         cursor.key_beg.delete_tid = 0;
1382         cursor.key_beg.obj_type = 0;
1383         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1384         cursor.key_beg.key = HAMMER_MIN_KEY;
1385
1386         cursor.key_end = cursor.key_beg;
1387         cursor.key_end.rec_type = 0xFFFF;
1388         cursor.key_end.key = HAMMER_MAX_KEY;
1389
1390         cursor.asof = ip->obj_asof;
1391         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1392         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1393         cursor.flags |= HAMMER_CURSOR_BACKEND;
1394
1395         error = hammer_ip_first(&cursor, ip);
1396
1397         /*
1398          * Iterate through matching records and mark them as deleted.
1399          */
1400         while (error == 0) {
1401                 rec = cursor.record;
1402                 base = &rec->base.base;
1403
1404                 KKASSERT(base->delete_tid == 0);
1405
1406                 /*
1407                  * Mark the record and B-Tree entry as deleted.  This will
1408                  * also physically delete the B-Tree entry, record, and
1409                  * data if the retention policy dictates.  The function
1410                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1411                  * uses to perform a fixup.
1412                  */
1413                 error = hammer_ip_delete_record(&cursor, trans->tid);
1414                 if (error)
1415                         break;
1416                 error = hammer_ip_next(&cursor);
1417         }
1418         hammer_done_cursor(&cursor);
1419         if (error == EDEADLK)
1420                 goto retry;
1421         if (error == ENOENT)
1422                 error = 0;
1423         return(error);
1424 }
1425
1426 /*
1427  * Delete the record at the current cursor.  On success the cursor will
1428  * be positioned appropriately for an iteration but may no longer be at
1429  * a leaf node.
1430  *
1431  * This routine is only called from the backend.
1432  *
1433  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1434  * cursor and retry.
1435  */
1436 int
1437 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1438 {
1439         hammer_btree_elm_t elm;
1440         hammer_mount_t hmp;
1441         int error;
1442         int dodelete;
1443
1444         /*
1445          * In-memory (unsynchronized) records can simply be freed.
1446          */
1447         if (cursor->record == &cursor->iprec->rec) {
1448                 cursor->iprec->flags |= HAMMER_RECF_DELETED_FE |
1449                                         HAMMER_RECF_DELETED_BE;
1450                 return(0);
1451         }
1452
1453         /*
1454          * On-disk records are marked as deleted by updating their delete_tid.
1455          * This does not effect their position in the B-Tree (which is based
1456          * on their create_tid).
1457          */
1458         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1459         elm = NULL;
1460         hmp = cursor->node->hmp;
1461
1462         dodelete = 0;
1463         if (error == 0) {
1464                 error = hammer_cursor_upgrade(cursor);
1465                 if (error == 0) {
1466                         elm = &cursor->node->ondisk->elms[cursor->index];
1467                         hammer_modify_node(cursor->trans, cursor->node,
1468                                            &elm->leaf.base.delete_tid,
1469                                            sizeof(elm->leaf.base.delete_tid));
1470                         elm->leaf.base.delete_tid = tid;
1471                         hammer_modify_node_done(cursor->node);
1472
1473                         /*
1474                          * An on-disk record cannot have the same delete_tid
1475                          * as its create_tid.  In a chain of record updates
1476                          * this could result in a duplicate record.
1477                          */
1478                         KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
1479                         hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
1480                         cursor->record->base.base.delete_tid = tid;
1481                         hammer_modify_buffer_done(cursor->record_buffer);
1482                 }
1483         }
1484
1485         /*
1486          * If we were mounted with the nohistory option, we physically
1487          * delete the record.
1488          */
1489         if (hmp->hflags & HMNT_NOHISTORY)
1490                 dodelete = 1;
1491
1492         if (error == 0 && dodelete) {
1493                 error = hammer_delete_at_cursor(cursor, NULL);
1494                 if (error) {
1495                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1496                         error = 0;
1497                 }
1498         }
1499         return(error);
1500 }
1501
1502 int
1503 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1504 {
1505         hammer_btree_elm_t elm;
1506         hammer_off_t rec_offset;
1507         hammer_off_t data_offset;
1508         int32_t data_len;
1509         u_int16_t rec_type;
1510         int error;
1511
1512         elm = &cursor->node->ondisk->elms[cursor->index];
1513         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1514
1515         rec_offset = elm->leaf.rec_offset;
1516         data_offset = elm->leaf.data_offset;
1517         data_len = elm->leaf.data_len;
1518         rec_type = elm->leaf.base.rec_type;
1519
1520         error = hammer_btree_delete(cursor);
1521         if (error == 0) {
1522                 /*
1523                  * This forces a fixup for the iteration because
1524                  * the cursor is now either sitting at the 'next'
1525                  * element or sitting at the end of a leaf.
1526                  */
1527                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1528                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1529                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1530                 }
1531         }
1532         if (error == 0) {
1533                 hammer_blockmap_free(cursor->trans, rec_offset,
1534                                      sizeof(union hammer_record_ondisk));
1535         }
1536         if (error == 0) {
1537                 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1538                 case HAMMER_ZONE_LARGE_DATA:
1539                 case HAMMER_ZONE_SMALL_DATA:
1540                         hammer_blockmap_free(cursor->trans,
1541                                              data_offset, data_len);
1542                         break;
1543                 default:
1544                         break;
1545                 }
1546         }
1547 #if 0
1548         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1549                 "(%d remain in cluster)\n",
1550                 cluster->volume->vol_no, cluster->clu_no,
1551                 rec_offset, data_offset, data_len,
1552                 cluster->ondisk->stat_records);
1553 #endif
1554         return (error);
1555 }
1556
1557 /*
1558  * Determine whether a directory is empty or not.  Returns 0 if the directory
1559  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1560  */
1561 int
1562 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1563 {
1564         struct hammer_cursor cursor;
1565         int error;
1566
1567         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1568
1569         cursor.key_beg.obj_id = ip->obj_id;
1570         cursor.key_beg.create_tid = 0;
1571         cursor.key_beg.delete_tid = 0;
1572         cursor.key_beg.obj_type = 0;
1573         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1574         cursor.key_beg.key = HAMMER_MIN_KEY;
1575
1576         cursor.key_end = cursor.key_beg;
1577         cursor.key_end.rec_type = 0xFFFF;
1578         cursor.key_end.key = HAMMER_MAX_KEY;
1579
1580         cursor.asof = ip->obj_asof;
1581         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1582
1583         error = hammer_ip_first(&cursor, ip);
1584         if (error == ENOENT)
1585                 error = 0;
1586         else if (error == 0)
1587                 error = ENOTEMPTY;
1588         hammer_done_cursor(&cursor);
1589         return(error);
1590 }
1591