HAMMER 38A/Many: Undo/Synchronization and crash recovery
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.41 2008/04/24 21:20:33 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71         return(0);
72 }
73
74 static int
75 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
76 {
77         if (info->rec_type < rec->rec.base.base.rec_type)
78                 return(-3);
79         if (info->rec_type > rec->rec.base.base.rec_type)
80                 return(3);
81
82         if (info->key < rec->rec.base.base.key)
83                 return(-2);
84         if (info->key > rec->rec.base.base.key)
85                 return(2);
86
87         if (info->create_tid == 0) {
88                 if (rec->rec.base.base.create_tid == 0)
89                         return(0);
90                 return(1);
91         }
92         if (rec->rec.base.base.create_tid == 0)
93                 return(-1);
94         if (info->create_tid < rec->rec.base.base.create_tid)
95                 return(-1);
96         if (info->create_tid > rec->rec.base.base.create_tid)
97                 return(1);
98         return(0);
99 }
100
101 /*
102  * RB_SCAN comparison code for hammer_mem_first().  The argument order
103  * is reversed so the comparison result has to be negated.  key_beg and
104  * key_end are both range-inclusive.
105  *
106  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
107  * These do not stop the scan.
108  *
109  * Localized deletions are not cached in-memory.
110  */
111 static
112 int
113 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
114 {
115         hammer_cursor_t cursor = data;
116         int r;
117
118         r = hammer_rec_compare(&cursor->key_beg, rec);
119         if (r > 1)
120                 return(-1);
121         r = hammer_rec_compare(&cursor->key_end, rec);
122         if (r < -1)
123                 return(1);
124         return(0);
125 }
126
127 /*
128  * This compare function is used when simply looking up key_beg.
129  */
130 static
131 int
132 hammer_rec_find_cmp(hammer_record_t rec, void *data)
133 {
134         hammer_cursor_t cursor = data;
135         int r;
136
137         r = hammer_rec_compare(&cursor->key_beg, rec);
138         if (r > 1)
139                 return(-1);
140         if (r < -1)
141                 return(1);
142         return(0);
143 }
144
145 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
146 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
147                     hammer_rec_compare, hammer_base_elm_t);
148
149 /*
150  * Allocate a record for the caller to finish filling in.  The record is
151  * returned referenced.
152  */
153 hammer_record_t
154 hammer_alloc_mem_record(hammer_inode_t ip)
155 {
156         hammer_record_t record;
157
158         ++hammer_count_records;
159         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
160         record->state = HAMMER_FST_IDLE;
161         record->ip = ip;
162         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
163         hammer_ref(&record->lock);
164         return (record);
165 }
166
167 void
168 hammer_wait_mem_record(hammer_record_t record)
169 {
170         while (record->state == HAMMER_FST_FLUSH) {
171                 record->flags |= HAMMER_RECF_WANTED;
172                 tsleep(record, 0, "hmrrc2", 0);
173         }
174 }
175
176 /*
177  * Called from the backend, hammer_inode.c, when a record has been
178  * flushed to disk.
179  *
180  * The backend has likely marked this record for deletion as well.
181  */
182 void
183 hammer_flush_record_done(hammer_record_t record)
184 {
185         KKASSERT(record->state == HAMMER_FST_FLUSH);
186         record->state = HAMMER_FST_IDLE;
187         if (record->flags & HAMMER_RECF_WANTED) {
188                 record->flags &= ~HAMMER_RECF_WANTED;
189                 wakeup(record);
190         }
191         hammer_rel_mem_record(record);
192 }
193
194
195 /*
196  * Release a memory record.  Records marked for deletion are immediately
197  * removed from the RB-Tree but otherwise left intact until the last ref
198  * goes away.
199  */
200 void
201 hammer_rel_mem_record(struct hammer_record *record)
202 {
203         hammer_unref(&record->lock);
204
205         if (record->flags & HAMMER_RECF_DELETED_FE) {
206                 if (record->lock.refs == 0) {
207                         if (record->flags & HAMMER_RECF_ONRBTREE) {
208                                 RB_REMOVE(hammer_rec_rb_tree,
209                                           &record->ip->rec_tree,
210                                           record);
211                                 record->flags &= ~HAMMER_RECF_ONRBTREE;
212                         }
213                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
214                                 --hammer_count_record_datas;
215                                 kfree(record->data, M_HAMMER);
216                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
217                         }
218                         record->data = NULL;
219                         --hammer_count_records;
220                         kfree(record, M_HAMMER);
221                         return;
222                 }
223         }
224
225         /*
226          * If someone wanted the record wake them up.
227          */
228         if (record->flags & HAMMER_RECF_WANTED) {
229                 record->flags &= ~HAMMER_RECF_WANTED;
230                 wakeup(record);
231         }
232 }
233
234 /*
235  * The deletion state of a record will appear different to the backend
236  * then it does to the frontend.
237  */
238 static __inline
239 int
240 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t rec)
241 {
242         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
243                 if (rec->flags & HAMMER_RECF_DELETED_BE)
244                         return(0);
245         } else {
246                 if (rec->flags & HAMMER_RECF_DELETED_FE)
247                         return(0);
248         }
249         return(1);
250 }
251
252 /*
253  * This callback is used as part of the RB_SCAN function for in-memory
254  * records.  We terminate it (return -1) as soon as we get a match.
255  *
256  * This routine is used by frontend code.
257  *
258  * The primary compare code does not account for ASOF lookups.  This
259  * code handles that case as well as a few others.
260  */
261 static
262 int
263 hammer_rec_scan_callback(hammer_record_t rec, void *data)
264 {
265         hammer_cursor_t cursor = data;
266
267         /*
268          * We terminate on success, so this should be NULL on entry.
269          */
270         KKASSERT(cursor->iprec == NULL);
271
272         /*
273          * Skip if the record was marked deleted.
274          */
275         if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
276                 return(0);
277
278         /*
279          * Skip if not visible due to our as-of TID
280          */
281         if (cursor->flags & HAMMER_CURSOR_ASOF) {
282                 if (cursor->asof < rec->rec.base.base.create_tid)
283                         return(0);
284                 if (rec->rec.base.base.delete_tid &&
285                     cursor->asof >= rec->rec.base.base.delete_tid) {
286                         return(0);
287                 }
288         }
289
290         /*
291          * If the record is queued to the flusher we have to block until
292          * it isn't.  Otherwise we may see duplication between our memory
293          * cache and the media.
294          */
295         hammer_ref(&rec->lock);
296
297 #warning "This deadlocks"
298 #if 0
299         if (rec->state == HAMMER_FST_FLUSH)
300                 hammer_wait_mem_record(rec);
301 #endif
302
303         /*
304          * The record may have been deleted while we were blocked.
305          */
306         if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
307                 hammer_rel_mem_record(rec);
308                 return(0);
309         }
310
311         /*
312          * Set the matching record and stop the scan.
313          */
314         cursor->iprec = rec;
315         return(-1);
316 }
317
318
319 /*
320  * Lookup an in-memory record given the key specified in the cursor.  Works
321  * just like hammer_btree_lookup() but operates on an inode's in-memory
322  * record list.
323  *
324  * The lookup must fail if the record is marked for deferred deletion.
325  */
326 static
327 int
328 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
329 {
330         int error;
331
332         if (cursor->iprec) {
333                 hammer_rel_mem_record(cursor->iprec);
334                 cursor->iprec = NULL;
335         }
336         if (cursor->ip) {
337                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
338                 --cursor->ip->cursor_ip_refs;
339 #if 0
340                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
341                                                   &cursor->ip->rec_tree);
342 #endif
343         }
344         cursor->ip = ip;
345 #if 0
346         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
347 #endif
348         ++ip->cursor_ip_refs;
349
350 #if 0
351         cursor->scan.node = NULL;
352 #endif
353         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
354                                    hammer_rec_scan_callback, cursor);
355
356         if (cursor->iprec == NULL)
357                 error = ENOENT;
358         else
359                 error = 0;
360         return(error);
361 }
362
363 /*
364  * hammer_mem_first() - locate the first in-memory record matching the
365  * cursor within the bounds of the key range.
366  */
367 static
368 int
369 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
370 {
371         if (cursor->iprec) {
372                 hammer_rel_mem_record(cursor->iprec);
373                 cursor->iprec = NULL;
374         }
375         if (cursor->ip) {
376                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
377                 --cursor->ip->cursor_ip_refs;
378 #if 0
379                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
380                                                   &cursor->ip->rec_tree);
381 #endif
382         }
383         cursor->ip = ip;
384 #if 0
385         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
386 #endif
387         ++ip->cursor_ip_refs;
388
389 #if 0
390         cursor->scan.node = NULL;
391 #endif
392         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
393                                    hammer_rec_scan_callback, cursor);
394
395         /*
396          * Adjust scan.node and keep it linked into the RB-tree so we can
397          * hold the cursor through third party modifications of the RB-tree.
398          */
399         if (cursor->iprec) {
400 #if 0
401                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
402 #endif
403                 return(0);
404         }
405         return(ENOENT);
406 }
407
408 void
409 hammer_mem_done(hammer_cursor_t cursor)
410 {
411         if (cursor->ip) {
412                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
413                 --cursor->ip->cursor_ip_refs;
414 #if 0
415                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
416                                                   &cursor->ip->rec_tree);
417 #endif
418                 cursor->ip = NULL;
419         }
420         if (cursor->iprec) {
421                 hammer_rel_mem_record(cursor->iprec);
422                 cursor->iprec = NULL;
423         }
424 }
425
426 /************************************************************************
427  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
428  ************************************************************************
429  *
430  * These functions manipulate in-memory records.  Such records typically
431  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
432  */
433
434 /*
435  * Add a directory entry (dip,ncp) which references inode (ip).
436  *
437  * Note that the low 32 bits of the namekey are set temporarily to create
438  * a unique in-memory record, and may be modified a second time when the
439  * record is synchronized to disk.  In particular, the low 32 bits cannot be
440  * all 0's when synching to disk, which is not handled here.
441  */
442 int
443 hammer_ip_add_directory(struct hammer_transaction *trans,
444                      struct hammer_inode *dip, struct namecache *ncp,
445                      struct hammer_inode *ip)
446 {
447         hammer_record_t record;
448         int error;
449         int bytes;
450
451         record = hammer_alloc_mem_record(dip);
452
453         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
454         if (++trans->hmp->namekey_iterator == 0)
455                 ++trans->hmp->namekey_iterator;
456
457         record->rec.entry.base.base.obj_id = dip->obj_id;
458         record->rec.entry.base.base.key =
459                 hammer_directory_namekey(ncp->nc_name, bytes);
460         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
461         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
462         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
463         record->rec.entry.obj_id = ip->obj_id;
464         record->data = (void *)ncp->nc_name;
465         record->rec.entry.base.data_len = bytes;
466         ++ip->ino_rec.ino_nlinks;
467         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
468         /* NOTE: copies record->data */
469         error = hammer_mem_add(trans, record);
470         return(error);
471 }
472
473 /*
474  * Delete the directory entry and update the inode link count.  The
475  * cursor must be seeked to the directory entry record being deleted.
476  *
477  * The related inode should be share-locked by the caller.  The caller is
478  * on the frontend.
479  *
480  * This function can return EDEADLK requiring the caller to terminate
481  * the cursor, any locks, wait on the returned record, and retry.
482  */
483 int
484 hammer_ip_del_directory(struct hammer_transaction *trans,
485                      hammer_cursor_t cursor, struct hammer_inode *dip,
486                      struct hammer_inode *ip)
487 {
488         hammer_record_t record;
489         int error;
490
491         if (cursor->record == &cursor->iprec->rec) {
492                 /*
493                  * In-memory (unsynchronized) records can simply be freed.
494                  * Even though the HAMMER_RECF_DELETED_FE flag is ignored
495                  * by the backend, we must still avoid races against the
496                  * backend potentially syncing the record to the media. 
497                  *
498                  * We cannot call hammer_ip_delete_record(), that routine may
499                  * only be called from the backend.
500                  */
501                 record = cursor->iprec;
502                 if (record->state == HAMMER_FST_FLUSH) {
503                         KKASSERT(cursor->deadlk_rec == NULL);
504                         hammer_ref(&record->lock);
505                         cursor->deadlk_rec = record;
506                         error = EDEADLK;
507                 } else {
508                         cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
509                         error = 0;
510                 }
511         } else {
512                 /*
513                  * If the record is on-disk we have to queue the deletion by
514                  * the record's key.  This also causes lookups to skip the
515                  * record.
516                  */
517                 record = hammer_alloc_mem_record(dip);
518                 record->rec.entry.base.base = cursor->record->base.base;
519                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
520                 record->flags |= HAMMER_RECF_DELETE_ONDISK;
521
522                 error = hammer_mem_add(trans, record);
523         }
524
525         /*
526          * One less link.  The file may still be open in the OS even after
527          * all links have gone away so we only try to sync if the OS has
528          * no references and nlinks falls to 0.
529          *
530          * We have to terminate the cursor before syncing the inode to
531          * avoid deadlocking against ourselves.
532          *
533          * XXX we can't sync the inode here because the encompassing
534          * transaction might be a rename and might update the inode
535          * again with a new link.  That would force the delete_tid to be
536          * the same as the create_tid and cause a panic.
537          */
538         if (error == 0) {
539                 --ip->ino_rec.ino_nlinks;
540                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
541                 if (ip->ino_rec.ino_nlinks == 0 &&
542                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
543                         hammer_done_cursor(cursor);
544                 }
545
546         }
547         return(error);
548 }
549
550 /*
551  * Add a record to an inode.
552  *
553  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
554  * initialize the following additional fields:
555  *
556  * The related inode should be share-locked by the caller.  The caller is
557  * on the frontend.
558  *
559  * record->rec.entry.base.base.key
560  * record->rec.entry.base.base.rec_type
561  * record->rec.entry.base.base.data_len
562  * record->data         (a copy will be kmalloc'd if it cannot be embedded)
563  */
564 int
565 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
566 {
567         hammer_inode_t ip = record->ip;
568         int error;
569
570         record->rec.base.base.obj_id = ip->obj_id;
571         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
572
573         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
574         /* NOTE: copies record->data */
575         error = hammer_mem_add(trans, record);
576         return(error);
577 }
578
579 /*
580  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
581  * is called via the strategy called from a cached data source.  This code
582  * is responsible for actually writing a data record out to the disk.
583  *
584  * This can only occur non-historically (i.e. 'current' data only).
585  *
586  * The file offset must be HAMMER_BUFSIZE aligned but the data length
587  * can be truncated.  The record (currently) always represents a BUFSIZE
588  * swath of space whether the data is truncated or not.
589  */
590 int
591 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
592                        int64_t offset, void *data, int bytes)
593 {
594         struct hammer_cursor cursor;
595         hammer_record_ondisk_t rec;
596         union hammer_btree_elm elm;
597         hammer_off_t rec_offset;
598         void *bdata;
599         int error;
600
601         KKASSERT((offset & HAMMER_BUFMASK) == 0);
602         KKASSERT(trans->type == HAMMER_TRANS_FLS);
603 retry:
604         error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
605         if (error)
606                 return(error);
607         cursor.key_beg.obj_id = ip->obj_id;
608         cursor.key_beg.key = offset + bytes;
609         cursor.key_beg.create_tid = trans->tid;
610         cursor.key_beg.delete_tid = 0;
611         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
612         cursor.asof = trans->tid;
613         cursor.flags |= HAMMER_CURSOR_INSERT;
614
615         /*
616          * Issue a lookup to position the cursor.
617          */
618         error = hammer_btree_lookup(&cursor);
619         if (error == 0) {
620                 kprintf("hammer_ip_sync_data: duplicate data at "
621                         "(%lld,%d) tid %016llx\n",
622                         offset, bytes, trans->tid);
623                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
624                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
625                 panic("Duplicate data");
626                 error = EIO;
627         }
628         if (error != ENOENT)
629                 goto done;
630
631         /*
632          * Allocate record and data space.  HAMMER_RECTYPE_DATA records
633          * can cross buffer boundaries so we may have to split our bcopy.
634          */
635         rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
636                                   &cursor.record_buffer,
637                                   bytes, &bdata,
638                                   &cursor.data_buffer, &error);
639         if (rec == NULL)
640                 goto done;
641         if (hammer_debug_general & 0x1000)
642                 kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);
643
644         /*
645          * Fill everything in and insert our B-Tree node.
646          *
647          * NOTE: hammer_alloc_record() has already marked the related
648          * buffers as modified.  If we do it again we will generate
649          * unnecessary undo elements.
650          */
651         rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
652         rec->base.base.obj_id = ip->obj_id;
653         rec->base.base.key = offset + bytes;
654         rec->base.base.create_tid = trans->tid;
655         rec->base.base.delete_tid = 0;
656         rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
657         rec->base.data_crc = crc32(data, bytes);
658         KKASSERT(rec->base.data_len == bytes);
659
660         bcopy(data, bdata, bytes);
661
662         elm.leaf.base = rec->base.base;
663         elm.leaf.rec_offset = rec_offset;
664         elm.leaf.data_offset = rec->base.data_off;
665         elm.leaf.data_len = bytes;
666         elm.leaf.data_crc = rec->base.data_crc;
667
668         /*
669          * Data records can wind up on-disk before the inode itself is
670          * on-disk.  One must assume data records may be on-disk if either
671          * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
672          */
673         ip->flags |= HAMMER_INODE_DONDISK;
674
675         error = hammer_btree_insert(&cursor, &elm);
676         if (error == 0)
677                 goto done;
678
679         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
680 done:
681         hammer_done_cursor(&cursor);
682         if (error == EDEADLK)
683                 goto retry;
684         return(error);
685 }
686
687 /*
688  * Sync an in-memory record to the disk.  This is called by the backend.
689  * This code is responsible for actually writing a record out to the disk.
690  *
691  * NOTE: The frontend can mark the record deleted while it is queued to
692  * the backend.  The deletion applies to a frontend operation and the
693  * record must be treated as NOT having been deleted on the backend, so
694  * we ignore the flag.
695  */
696 int
697 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
698 {
699         struct hammer_cursor cursor;
700         hammer_record_ondisk_t rec;
701         union hammer_btree_elm elm;
702         hammer_off_t rec_offset;
703         void *bdata;
704         int error;
705
706         KKASSERT(record->state == HAMMER_FST_FLUSH);
707
708 retry:
709         /*
710          * Get a cursor, we will either be inserting or deleting.
711          */
712         error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
713         if (error)
714                 return(error);
715         cursor.key_beg = record->rec.base.base;
716
717         /*
718          * If we are deleting an exact match must be found on-disk.
719          */
720         if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
721                 error = hammer_btree_lookup(&cursor);
722                 kprintf("DELETE MEM ENTRY1 %d\n", error);
723                 if (error == 0)
724                         error = hammer_ip_delete_record(&cursor, trans->tid);
725                 kprintf("DELETE MEM ENTRY2 %d\n", error);
726                 if (error == 0)
727                         record->flags |= HAMMER_RECF_DELETED_FE;
728                 goto done;
729         }
730
731         /*
732          * We are inserting.
733          *
734          * Issue a lookup to position the cursor and locate the cluster.  The
735          * target key should not exist.  If we are creating a directory entry
736          * we may have to iterate the low 32 bits of the key to find an unused
737          * key.
738          */
739         cursor.flags |= HAMMER_CURSOR_INSERT;
740
741         for (;;) {
742                 error = hammer_btree_lookup(&cursor);
743                 if (error)
744                         break;
745                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
746                         kprintf("hammer_ip_sync_record: duplicate rec "
747                                 "at (%016llx)\n", record->rec.base.base.key);
748                         Debugger("duplicate record1");
749                         error = EIO;
750                         break;
751                 }
752                 if (++trans->hmp->namekey_iterator == 0)
753                         ++trans->hmp->namekey_iterator;
754                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
755                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
756                 cursor.key_beg.key = record->rec.base.base.key;
757         }
758         if (error != ENOENT)
759                 goto done;
760
761         /*
762          * Allocate the record and data.  The result buffers will be
763          * marked as being modified and further calls to
764          * hammer_modify_buffer() will result in unneeded UNDO records.
765          *
766          * Support zero-fill records (data == NULL and data_len != 0)
767          */
768         if (record->data == NULL) {
769                 rec = hammer_alloc_record(trans, &rec_offset,
770                                           record->rec.base.base.rec_type,
771                                           &cursor.record_buffer,
772                                           0, &bdata,
773                                           NULL, &error);
774                 if (hammer_debug_general & 0x1000)
775                         kprintf("NULL RECORD DATA\n");
776         } else if (record->flags & HAMMER_RECF_INBAND) {
777                 rec = hammer_alloc_record(trans, &rec_offset,
778                                           record->rec.base.base.rec_type,
779                                           &cursor.record_buffer,
780                                           record->rec.base.data_len, &bdata,
781                                           NULL, &error);
782                 if (hammer_debug_general & 0x1000)
783                         kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
784         } else {
785                 rec = hammer_alloc_record(trans, &rec_offset,
786                                           record->rec.base.base.rec_type,
787                                           &cursor.record_buffer,
788                                           record->rec.base.data_len, &bdata,
789                                           &cursor.data_buffer, &error);
790                 if (hammer_debug_general & 0x1000)
791                         kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
792         }
793
794         if (rec == NULL)
795                 goto done;
796
797         /*
798          * Fill in the remaining fields and insert our B-Tree node.
799          */
800         rec->base.base = record->rec.base.base;
801         bcopy(&record->rec.base + 1, &rec->base + 1,
802               HAMMER_RECORD_SIZE - sizeof(record->rec.base));
803
804         /*
805          * Copy the data and deal with zero-fill support.
806          */
807         if (record->data) {
808                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
809                 bcopy(record->data, bdata, rec->base.data_len);
810         } else {
811                 rec->base.data_len = record->rec.base.data_len;
812         }
813
814         elm.leaf.base = record->rec.base.base;
815         elm.leaf.rec_offset = rec_offset;
816         elm.leaf.data_offset = rec->base.data_off;
817         elm.leaf.data_len = rec->base.data_len;
818         elm.leaf.data_crc = rec->base.data_crc;
819
820         error = hammer_btree_insert(&cursor, &elm);
821
822         /*
823          * Clean up on success, or fall through on error.
824          */
825         if (error == 0) {
826                 record->flags |= HAMMER_RECF_DELETED_FE;
827                 goto done;
828         }
829
830         /*
831          * Try to unwind the allocation
832          */
833         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
834 done:
835         hammer_done_cursor(&cursor);
836         if (error == EDEADLK)
837                 goto retry;
838         return(error);
839 }
840
841 /*
842  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
843  * entry's key is used to deal with hash collisions in the upper 32 bits.
844  * A unique 64 bit key is generated in-memory and may be regenerated a
845  * second time when the directory record is flushed to the on-disk B-Tree.
846  *
847  * A referenced record is passed to this function.  This function
848  * eats the reference.  If an error occurs the record will be deleted.
849  *
850  * A copy of the temporary record->data pointer provided by the caller
851  * will be made.
852  */
853 static
854 int
855 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
856 {
857         void *data;
858         int bytes;
859         int reclen;
860                 
861         /*
862          * Make a private copy of record->data
863          */
864         if (record->data) {
865                 /*
866                  * Try to embed the data in extra space in the record
867                  * union, otherwise allocate a copy.
868                  */
869                 bytes = record->rec.base.data_len;
870                 switch(record->rec.base.base.rec_type) {
871                 case HAMMER_RECTYPE_DIRENTRY:
872                         reclen = offsetof(struct hammer_entry_record, name[0]);
873                         break;
874                 case HAMMER_RECTYPE_DATA:
875                         reclen = offsetof(struct hammer_data_record, data[0]);
876                         break;
877                 default:
878                         reclen = sizeof(record->rec);
879                         break;
880                 }
881                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
882                         bcopy(record->data, (char *)&record->rec + reclen,
883                               bytes);
884                         record->data = (void *)((char *)&record->rec + reclen);
885                         record->flags |= HAMMER_RECF_INBAND;
886                 } else {
887                         ++hammer_count_record_datas;
888                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
889                         record->flags |= HAMMER_RECF_ALLOCDATA;
890                         bcopy(record->data, data, bytes);
891                         record->data = data;
892                 }
893         }
894
895         /*
896          * Insert into the RB tree, find an unused iterator if this is
897          * a directory entry.
898          */
899         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
900                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
901                         record->flags |= HAMMER_RECF_DELETED_FE;
902                         hammer_rel_mem_record(record);
903                         return (EEXIST);
904                 }
905                 if (++trans->hmp->namekey_iterator == 0)
906                         ++trans->hmp->namekey_iterator;
907                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
908                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
909         }
910         record->flags |= HAMMER_RECF_ONRBTREE;
911         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
912         hammer_rel_mem_record(record);
913         return(0);
914 }
915
916 /************************************************************************
917  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
918  ************************************************************************
919  *
920  * These functions augment the B-Tree scanning functions in hammer_btree.c
921  * by merging in-memory records with on-disk records.
922  */
923
924 /*
925  * Locate a particular record either in-memory or on-disk.
926  *
927  * NOTE: This is basically a standalone routine, hammer_ip_next() may
928  * NOT be called to iterate results.
929  */
930 int
931 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
932 {
933         int error;
934
935         /*
936          * If the element is in-memory return it without searching the
937          * on-disk B-Tree
938          */
939         error = hammer_mem_lookup(cursor, ip);
940         if (error == 0) {
941                 cursor->record = &cursor->iprec->rec;
942                 return(error);
943         }
944         if (error != ENOENT)
945                 return(error);
946
947         /*
948          * If the inode has on-disk components search the on-disk B-Tree.
949          */
950         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
951                 return(error);
952         error = hammer_btree_lookup(cursor);
953         if (error == 0)
954                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
955         return(error);
956 }
957
958 /*
959  * Locate the first record within the cursor's key_beg/key_end range,
960  * restricted to a particular inode.  0 is returned on success, ENOENT
961  * if no records matched the requested range, or some other error.
962  *
963  * When 0 is returned hammer_ip_next() may be used to iterate additional
964  * records within the requested range.
965  *
966  * This function can return EDEADLK, requiring the caller to terminate
967  * the cursor and try again.
968  */
969 int
970 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
971 {
972         int error;
973
974         /*
975          * Clean up fields and setup for merged scan
976          */
977         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
978         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
979         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
980         if (cursor->iprec) {
981                 hammer_rel_mem_record(cursor->iprec);
982                 cursor->iprec = NULL;
983         }
984
985         /*
986          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
987          * exact lookup so if we get ENOENT we have to call the iterate
988          * function to validate the first record after the begin key.
989          *
990          * The ATEDISK flag is used by hammer_btree_iterate to determine
991          * whether it must index forwards or not.  It is also used here
992          * to select the next record from in-memory or on-disk.
993          *
994          * EDEADLK can only occur if the lookup hit an empty internal
995          * element and couldn't delete it.  Since this could only occur
996          * in-range, we can just iterate from the failure point.
997          */
998         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
999                 error = hammer_btree_lookup(cursor);
1000                 if (error == ENOENT || error == EDEADLK) {
1001                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1002                         if (hammer_debug_general & 0x2000)
1003                                 kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1004                         error = hammer_btree_iterate(cursor);
1005                 }
1006                 if (error && error != ENOENT) 
1007                         return(error);
1008                 if (error == 0) {
1009                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1010                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1011                 } else {
1012                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1013                 }
1014         }
1015
1016         /*
1017          * Search the in-memory record list (Red-Black tree).  Unlike the
1018          * B-Tree search, mem_first checks for records in the range.
1019          */
1020         error = hammer_mem_first(cursor, ip);
1021         if (error && error != ENOENT)
1022                 return(error);
1023         if (error == 0) {
1024                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1025                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1026                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1027                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1028         }
1029
1030         /*
1031          * This will return the first matching record.
1032          */
1033         return(hammer_ip_next(cursor));
1034 }
1035
1036 /*
1037  * Retrieve the next record in a merged iteration within the bounds of the
1038  * cursor.  This call may be made multiple times after the cursor has been
1039  * initially searched with hammer_ip_first().
1040  *
1041  * 0 is returned on success, ENOENT if no further records match the
1042  * requested range, or some other error code is returned.
1043  */
1044 int
1045 hammer_ip_next(hammer_cursor_t cursor)
1046 {
1047         hammer_btree_elm_t elm;
1048         hammer_record_t rec, save;
1049         int error;
1050         int r;
1051
1052 next_btree:
1053         /*
1054          * Load the current on-disk and in-memory record.  If we ate any
1055          * records we have to get the next one. 
1056          *
1057          * If we deleted the last on-disk record we had scanned ATEDISK will
1058          * be clear and DELBTREE will be set, forcing a call to iterate. The
1059          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1060          * element.  If ATEDISK is set, iterate will skip the 'current'
1061          * element.
1062          *
1063          * Get the next on-disk record
1064          */
1065         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1066                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1067                         error = hammer_btree_iterate(cursor);
1068                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1069                         if (error == 0)
1070                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1071                         else
1072                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1073                                                  HAMMER_CURSOR_ATEDISK;
1074                 }
1075         }
1076
1077 next_memory:
1078         /*
1079          * Get the next in-memory record.  The record can be ripped out
1080          * of the RB tree so we maintain a scan_info structure to track
1081          * the next node.
1082          *
1083          * hammer_rec_scan_cmp:  Is the record still in our general range,
1084          *                       (non-inclusive of snapshot exclusions)?
1085          * hammer_rec_scan_callback: Is the record in our snapshot?
1086          */
1087         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1088                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1089                         save = cursor->iprec;
1090                         cursor->iprec = NULL;
1091                         rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1092                         while (rec) {
1093                                 if (hammer_ip_iterate_mem_good(cursor, rec)) {
1094                                         if (hammer_rec_scan_cmp(rec, cursor) != 0)
1095                                                 break;
1096                                         if (hammer_rec_scan_callback(rec, cursor) != 0)
1097                                                 break;
1098                                 }
1099                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1100                         }
1101                         if (save)
1102                                 hammer_rel_mem_record(save);
1103                         if (cursor->iprec) {
1104                                 KKASSERT(cursor->iprec == rec);
1105                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1106 #if 0
1107                                 cursor->scan.node =
1108                                         hammer_rec_rb_tree_RB_NEXT(rec);
1109 #endif
1110                         } else {
1111                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1112                         }
1113                 }
1114         }
1115
1116         /*
1117          * Extract either the disk or memory record depending on their
1118          * relative position.
1119          */
1120         error = 0;
1121         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1122         case 0:
1123                 /*
1124                  * Both entries valid
1125                  */
1126                 elm = &cursor->node->ondisk->elms[cursor->index];
1127                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1128                 if (r < 0) {
1129                         error = hammer_btree_extract(cursor,
1130                                                      HAMMER_CURSOR_GET_RECORD);
1131                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1132                         break;
1133                 }
1134
1135                 /*
1136                  * If the entries match the memory entry must specify
1137                  * an on-disk deletion.  Eat both entries unless the
1138                  * caller wants visibility into the special records.
1139                  */
1140                 if (r == 0) {
1141                         KKASSERT(cursor->iprec->flags & 
1142                                  HAMMER_RECF_DELETE_ONDISK);
1143                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1144                                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1145                                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1146                                 kprintf("SKIP MEM ENTRY\n");
1147                                 goto next_btree;
1148                         }
1149                 }
1150                 /* fall through to the memory entry */
1151         case HAMMER_CURSOR_ATEDISK:
1152                 /*
1153                  * Only the memory entry is valid.  If the record is
1154                  * placemarking an on-disk deletion, we skip it unless
1155                  * the caller wants special record visibility.
1156                  */
1157                 cursor->record = &cursor->iprec->rec;
1158                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1159                 if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) {
1160                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
1161                                 goto next_memory;
1162                 }
1163                 break;
1164         case HAMMER_CURSOR_ATEMEM:
1165                 /*
1166                  * Only the disk entry is valid
1167                  */
1168                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1169                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1170                 break;
1171         default:
1172                 /*
1173                  * Neither entry is valid
1174                  *
1175                  * XXX error not set properly
1176                  */
1177                 cursor->record = NULL;
1178                 error = ENOENT;
1179                 break;
1180         }
1181         return(error);
1182 }
1183
1184 /*
1185  * Resolve the cursor->data pointer for the current cursor position in
1186  * a merged iteration.
1187  */
1188 int
1189 hammer_ip_resolve_data(hammer_cursor_t cursor)
1190 {
1191         int error;
1192
1193         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1194                 cursor->data = cursor->iprec->data;
1195                 error = 0;
1196         } else {
1197                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1198         }
1199         return(error);
1200 }
1201
1202 int
1203 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1204 {
1205         int error;
1206
1207         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1208                 cursor->data = cursor->iprec->data;
1209                 error = 0;
1210         } else {
1211                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1212                                                      HAMMER_CURSOR_GET_RECORD);
1213         }
1214         return(error);
1215 }
1216
1217 /*
1218  * Delete all records within the specified range for inode ip.
1219  *
1220  * NOTE: An unaligned range will cause new records to be added to cover
1221  * the edge cases. (XXX not implemented yet).
1222  *
1223  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1224  *
1225  * NOTE: Record keys for regular file data have to be special-cased since
1226  * they indicate the end of the range (key = base + bytes).
1227  */
1228 int
1229 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1230                        int64_t ran_beg, int64_t ran_end)
1231 {
1232         struct hammer_cursor cursor;
1233         hammer_record_ondisk_t rec;
1234         hammer_base_elm_t base;
1235         int error;
1236         int64_t off;
1237
1238 #if 0
1239         kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1240 #endif
1241
1242         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1243 retry:
1244         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1245
1246         cursor.key_beg.obj_id = ip->obj_id;
1247         cursor.key_beg.create_tid = 0;
1248         cursor.key_beg.delete_tid = 0;
1249         cursor.key_beg.obj_type = 0;
1250         cursor.asof = ip->obj_asof;
1251         cursor.flags |= HAMMER_CURSOR_ASOF;
1252         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1253         cursor.flags |= HAMMER_CURSOR_BACKEND;
1254
1255         cursor.key_end = cursor.key_beg;
1256         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1257                 cursor.key_beg.key = ran_beg;
1258                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1259                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1260                 cursor.key_end.key = ran_end;
1261         } else {
1262                 /*
1263                  * The key in the B-Tree is (base+bytes), so the first possible
1264                  * matching key is ran_beg + 1.
1265                  */
1266                 int64_t tmp64;
1267
1268                 cursor.key_beg.key = ran_beg + 1;
1269                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1270                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1271
1272                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1273                 if (tmp64 < ran_end)
1274                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1275                 else
1276                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1277         }
1278         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1279
1280         error = hammer_ip_first(&cursor, ip);
1281
1282         /*
1283          * Iterate through matching records and mark them as deleted.
1284          */
1285         while (error == 0) {
1286                 rec = cursor.record;
1287                 base = &rec->base.base;
1288
1289                 KKASSERT(base->delete_tid == 0);
1290
1291                 /*
1292                  * There may be overlap cases for regular file data.  Also
1293                  * remember the key for a regular file record is the offset
1294                  * of the last byte of the record (base + len - 1), NOT the
1295                  * base offset.
1296                  */
1297 #if 0
1298                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1299 #endif
1300                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1301 #if 0
1302                         kprintf("delete_range loop key %016llx,%d\n",
1303                                 base->key - rec->base.data_len, rec->base.data_len);
1304 #endif
1305                         off = base->key - rec->base.data_len;
1306                         /*
1307                          * Check the left edge case.  We currently do not
1308                          * split existing records.
1309                          */
1310                         if (off < ran_beg) {
1311                                 panic("hammer left edge case %016llx %d\n",
1312                                         base->key, rec->base.data_len);
1313                         }
1314
1315                         /*
1316                          * Check the right edge case.  Note that the
1317                          * record can be completely out of bounds, which
1318                          * terminates the search.
1319                          *
1320                          * base->key is exclusive of the right edge while
1321                          * ran_end is inclusive of the right edge.  The
1322                          * (key - data_len) left boundary is inclusive.
1323                          *
1324                          * XXX theory-check this test at some point, are
1325                          * we missing a + 1 somewhere?  Note that ran_end
1326                          * could overflow.
1327                          */
1328                         if (base->key - 1 > ran_end) {
1329                                 if (base->key - rec->base.data_len > ran_end)
1330                                         break;
1331                                 panic("hammer right edge case\n");
1332                         }
1333                 }
1334
1335                 /*
1336                  * Mark the record and B-Tree entry as deleted.  This will
1337                  * also physically delete the B-Tree entry, record, and
1338                  * data if the retention policy dictates.  The function
1339                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1340                  * uses to perform a fixup.
1341                  */
1342                 error = hammer_ip_delete_record(&cursor, trans->tid);
1343                 if (error)
1344                         break;
1345                 error = hammer_ip_next(&cursor);
1346         }
1347         hammer_done_cursor(&cursor);
1348         if (error == EDEADLK)
1349                 goto retry;
1350         if (error == ENOENT)
1351                 error = 0;
1352         return(error);
1353 }
1354
1355 /*
1356  * Delete all records associated with an inode except the inode record
1357  * itself.
1358  */
1359 int
1360 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1361 {
1362         struct hammer_cursor cursor;
1363         hammer_record_ondisk_t rec;
1364         hammer_base_elm_t base;
1365         int error;
1366
1367         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1368 retry:
1369         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1370
1371         cursor.key_beg.obj_id = ip->obj_id;
1372         cursor.key_beg.create_tid = 0;
1373         cursor.key_beg.delete_tid = 0;
1374         cursor.key_beg.obj_type = 0;
1375         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1376         cursor.key_beg.key = HAMMER_MIN_KEY;
1377
1378         cursor.key_end = cursor.key_beg;
1379         cursor.key_end.rec_type = 0xFFFF;
1380         cursor.key_end.key = HAMMER_MAX_KEY;
1381
1382         cursor.asof = ip->obj_asof;
1383         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1384         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1385         cursor.flags |= HAMMER_CURSOR_BACKEND;
1386
1387         error = hammer_ip_first(&cursor, ip);
1388
1389         /*
1390          * Iterate through matching records and mark them as deleted.
1391          */
1392         while (error == 0) {
1393                 rec = cursor.record;
1394                 base = &rec->base.base;
1395
1396                 KKASSERT(base->delete_tid == 0);
1397
1398                 /*
1399                  * Mark the record and B-Tree entry as deleted.  This will
1400                  * also physically delete the B-Tree entry, record, and
1401                  * data if the retention policy dictates.  The function
1402                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1403                  * uses to perform a fixup.
1404                  */
1405                 error = hammer_ip_delete_record(&cursor, trans->tid);
1406                 if (error)
1407                         break;
1408                 error = hammer_ip_next(&cursor);
1409         }
1410         hammer_done_cursor(&cursor);
1411         if (error == EDEADLK)
1412                 goto retry;
1413         if (error == ENOENT)
1414                 error = 0;
1415         return(error);
1416 }
1417
1418 /*
1419  * Delete the record at the current cursor.  On success the cursor will
1420  * be positioned appropriately for an iteration but may no longer be at
1421  * a leaf node.
1422  *
1423  * This routine is only called from the backend.
1424  *
1425  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1426  * cursor and retry.
1427  */
1428 int
1429 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1430 {
1431         hammer_btree_elm_t elm;
1432         hammer_mount_t hmp;
1433         int error;
1434         int dodelete;
1435
1436         /*
1437          * In-memory (unsynchronized) records can simply be freed.
1438          */
1439         if (cursor->record == &cursor->iprec->rec) {
1440                 cursor->iprec->flags |= HAMMER_RECF_DELETED_FE |
1441                                         HAMMER_RECF_DELETED_BE;
1442                 return(0);
1443         }
1444
1445         /*
1446          * On-disk records are marked as deleted by updating their delete_tid.
1447          * This does not effect their position in the B-Tree (which is based
1448          * on their create_tid).
1449          */
1450         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1451         elm = NULL;
1452         hmp = cursor->node->hmp;
1453
1454         dodelete = 0;
1455         if (error == 0) {
1456                 error = hammer_cursor_upgrade(cursor);
1457                 if (error == 0) {
1458                         elm = &cursor->node->ondisk->elms[cursor->index];
1459                         hammer_modify_node(cursor->trans, cursor->node,
1460                                            elm, sizeof(*elm));
1461                         elm->leaf.base.delete_tid = tid;
1462
1463                         /*
1464                          * An on-disk record cannot have the same delete_tid
1465                          * as its create_tid.  In a chain of record updates
1466                          * this could result in a duplicate record.
1467                          */
1468                         KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
1469                         hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
1470                         cursor->record->base.base.delete_tid = tid;
1471                 }
1472         }
1473
1474         /*
1475          * If we were mounted with the nohistory option, we physically
1476          * delete the record.
1477          */
1478         if (hmp->hflags & HMNT_NOHISTORY)
1479                 dodelete = 1;
1480
1481         if (error == 0 && dodelete) {
1482                 error = hammer_delete_at_cursor(cursor, NULL);
1483                 if (error) {
1484                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1485                         error = 0;
1486                 }
1487         }
1488         return(error);
1489 }
1490
1491 int
1492 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1493 {
1494         hammer_btree_elm_t elm;
1495         hammer_off_t rec_offset;
1496         hammer_off_t data_offset;
1497         int32_t data_len;
1498         u_int16_t rec_type;
1499         int error;
1500
1501         elm = &cursor->node->ondisk->elms[cursor->index];
1502         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1503
1504         rec_offset = elm->leaf.rec_offset;
1505         data_offset = elm->leaf.data_offset;
1506         data_len = elm->leaf.data_len;
1507         rec_type = elm->leaf.base.rec_type;
1508
1509         error = hammer_btree_delete(cursor);
1510         if (error == 0) {
1511                 /*
1512                  * This forces a fixup for the iteration because
1513                  * the cursor is now either sitting at the 'next'
1514                  * element or sitting at the end of a leaf.
1515                  */
1516                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1517                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1518                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1519                 }
1520         }
1521         if (error == 0) {
1522                 hammer_blockmap_free(cursor->trans, rec_offset,
1523                                      sizeof(union hammer_record_ondisk));
1524         }
1525         if (error == 0) {
1526                 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1527                 case HAMMER_ZONE_LARGE_DATA:
1528                 case HAMMER_ZONE_SMALL_DATA:
1529                         hammer_blockmap_free(cursor->trans,
1530                                              data_offset, data_len);
1531                         break;
1532                 default:
1533                         break;
1534                 }
1535         }
1536 #if 0
1537         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1538                 "(%d remain in cluster)\n",
1539                 cluster->volume->vol_no, cluster->clu_no,
1540                 rec_offset, data_offset, data_len,
1541                 cluster->ondisk->stat_records);
1542 #endif
1543         return (error);
1544 }
1545
1546 /*
1547  * Determine whether a directory is empty or not.  Returns 0 if the directory
1548  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1549  */
1550 int
1551 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1552 {
1553         struct hammer_cursor cursor;
1554         int error;
1555
1556         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1557
1558         cursor.key_beg.obj_id = ip->obj_id;
1559         cursor.key_beg.create_tid = 0;
1560         cursor.key_beg.delete_tid = 0;
1561         cursor.key_beg.obj_type = 0;
1562         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1563         cursor.key_beg.key = HAMMER_MIN_KEY;
1564
1565         cursor.key_end = cursor.key_beg;
1566         cursor.key_end.rec_type = 0xFFFF;
1567         cursor.key_end.key = HAMMER_MAX_KEY;
1568
1569         cursor.asof = ip->obj_asof;
1570         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1571
1572         error = hammer_ip_first(&cursor, ip);
1573         if (error == ENOENT)
1574                 error = 0;
1575         else if (error == 0)
1576                 error = ENOTEMPTY;
1577         hammer_done_cursor(&cursor);
1578         return(error);
1579 }
1580