HAMMER 38E/Many: Undo/Synchronization and crash recovery
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.44 2008/04/26 08:02:17 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71
72         /*
73          * Never match against a deleted item.
74          */
75         if (rec1->flags & HAMMER_RECF_DELETED_FE)
76                 return(1);
77         if (rec2->flags & HAMMER_RECF_DELETED_FE)
78                 return(-1);
79
80         return(0);
81 }
82
83 static int
84 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
85 {
86         if (info->rec_type < rec->rec.base.base.rec_type)
87                 return(-3);
88         if (info->rec_type > rec->rec.base.base.rec_type)
89                 return(3);
90
91         if (info->key < rec->rec.base.base.key)
92                 return(-2);
93         if (info->key > rec->rec.base.base.key)
94                 return(2);
95
96         if (info->create_tid == 0) {
97                 if (rec->rec.base.base.create_tid == 0)
98                         return(0);
99                 return(1);
100         }
101         if (rec->rec.base.base.create_tid == 0)
102                 return(-1);
103         if (info->create_tid < rec->rec.base.base.create_tid)
104                 return(-1);
105         if (info->create_tid > rec->rec.base.base.create_tid)
106                 return(1);
107         return(0);
108 }
109
110 /*
111  * RB_SCAN comparison code for hammer_mem_first().  The argument order
112  * is reversed so the comparison result has to be negated.  key_beg and
113  * key_end are both range-inclusive.
114  *
115  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
116  * These do not stop the scan.
117  *
118  * Localized deletions are not cached in-memory.
119  */
120 static
121 int
122 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
123 {
124         hammer_cursor_t cursor = data;
125         int r;
126
127         r = hammer_rec_compare(&cursor->key_beg, rec);
128         if (r > 1)
129                 return(-1);
130         r = hammer_rec_compare(&cursor->key_end, rec);
131         if (r < -1)
132                 return(1);
133         return(0);
134 }
135
136 /*
137  * This compare function is used when simply looking up key_beg.
138  */
139 static
140 int
141 hammer_rec_find_cmp(hammer_record_t rec, void *data)
142 {
143         hammer_cursor_t cursor = data;
144         int r;
145
146         r = hammer_rec_compare(&cursor->key_beg, rec);
147         if (r > 1)
148                 return(-1);
149         if (r < -1)
150                 return(1);
151         return(0);
152 }
153
154 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
155 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
156                     hammer_rec_compare, hammer_base_elm_t);
157
158 /*
159  * Allocate a record for the caller to finish filling in.  The record is
160  * returned referenced.
161  */
162 hammer_record_t
163 hammer_alloc_mem_record(hammer_inode_t ip)
164 {
165         hammer_record_t record;
166
167         ++hammer_count_records;
168         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
169         record->state = HAMMER_FST_IDLE;
170         record->ip = ip;
171         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
172         TAILQ_INIT(&record->depend_list);
173         hammer_ref(&record->lock);
174         return (record);
175 }
176
177 void
178 hammer_wait_mem_record(hammer_record_t record)
179 {
180         while (record->state == HAMMER_FST_FLUSH) {
181                 record->flags |= HAMMER_RECF_WANTED;
182                 tsleep(record, 0, "hmrrc2", 0);
183         }
184 }
185
186 /*
187  * Called from the backend, hammer_inode.c, when a record has been
188  * flushed to disk.
189  *
190  * The backend has likely marked this record for deletion as well.
191  */
192 void
193 hammer_flush_record_done(hammer_record_t record)
194 {
195         KKASSERT(record->state == HAMMER_FST_FLUSH);
196         record->state = HAMMER_FST_IDLE;
197         if (record->flags & HAMMER_RECF_WANTED) {
198                 record->flags &= ~HAMMER_RECF_WANTED;
199                 wakeup(record);
200         }
201         hammer_rel_mem_record(record);
202 }
203
204 /*
205  * Destroy a memory record.  The record is marked deleted and will
206  * be freed when the last reference goes away.   Any dependancies are
207  * also destroyed.
208  */
209 void
210 hammer_delete_mem_record(hammer_record_t record)
211 {
212         hammer_depend_t depend;
213
214         record->flags |= HAMMER_RECF_DELETED_FE;
215         while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) {
216                 TAILQ_REMOVE(&record->depend_list, depend, rec_entry);
217                 TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry);
218                 --depend->ip->depend_count;
219                 /* NOTE: inode is not flushed */
220                 hammer_rel_inode(depend->ip, 0);
221                 hammer_unref(&record->lock);
222                 KKASSERT(record->lock.refs > 0);
223                 kfree(depend, M_HAMMER);
224         }
225 }
226
227 /*
228  * Release a memory record.  Records marked for deletion are immediately
229  * removed from the RB-Tree but otherwise left intact until the last ref
230  * goes away.
231  */
232 void
233 hammer_rel_mem_record(struct hammer_record *record)
234 {
235         hammer_unref(&record->lock);
236
237         if (record->flags & HAMMER_RECF_DELETED_FE) {
238                 if (record->lock.refs == 0) {
239                         if (record->flags & HAMMER_RECF_ONRBTREE) {
240                                 RB_REMOVE(hammer_rec_rb_tree,
241                                           &record->ip->rec_tree,
242                                           record);
243                                 record->flags &= ~HAMMER_RECF_ONRBTREE;
244                         }
245                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
246                                 --hammer_count_record_datas;
247                                 kfree(record->data, M_HAMMER);
248                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
249                         }
250                         record->data = NULL;
251                         --hammer_count_records;
252                         kfree(record, M_HAMMER);
253                         return;
254                 }
255         }
256
257         /*
258          * If someone wanted the record wake them up.
259          */
260         if (record->flags & HAMMER_RECF_WANTED) {
261                 record->flags &= ~HAMMER_RECF_WANTED;
262                 wakeup(record);
263         }
264 }
265
266 /*
267  * The deletion state of a record will appear different to the backend
268  * then it does to the frontend.
269  */
270 static __inline
271 int
272 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t rec)
273 {
274         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
275                 if (rec->flags & HAMMER_RECF_DELETED_BE)
276                         return(0);
277         } else {
278                 if (rec->flags & HAMMER_RECF_DELETED_FE)
279                         return(0);
280         }
281         return(1);
282 }
283
284 /*
285  * This callback is used as part of the RB_SCAN function for in-memory
286  * records.  We terminate it (return -1) as soon as we get a match.
287  *
288  * This routine is used by frontend code.
289  *
290  * The primary compare code does not account for ASOF lookups.  This
291  * code handles that case as well as a few others.
292  */
293 static
294 int
295 hammer_rec_scan_callback(hammer_record_t rec, void *data)
296 {
297         hammer_cursor_t cursor = data;
298
299         /*
300          * We terminate on success, so this should be NULL on entry.
301          */
302         KKASSERT(cursor->iprec == NULL);
303
304         /*
305          * Skip if the record was marked deleted.
306          */
307         if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
308                 return(0);
309
310         /*
311          * Skip if not visible due to our as-of TID
312          */
313         if (cursor->flags & HAMMER_CURSOR_ASOF) {
314                 if (cursor->asof < rec->rec.base.base.create_tid)
315                         return(0);
316                 if (rec->rec.base.base.delete_tid &&
317                     cursor->asof >= rec->rec.base.base.delete_tid) {
318                         return(0);
319                 }
320         }
321
322         /*
323          * If the record is queued to the flusher we have to block until
324          * it isn't.  Otherwise we may see duplication between our memory
325          * cache and the media.
326          */
327         hammer_ref(&rec->lock);
328
329 #warning "This deadlocks"
330 #if 0
331         if (rec->state == HAMMER_FST_FLUSH)
332                 hammer_wait_mem_record(rec);
333 #endif
334
335         /*
336          * The record may have been deleted while we were blocked.
337          */
338         if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
339                 hammer_rel_mem_record(rec);
340                 return(0);
341         }
342
343         /*
344          * Set the matching record and stop the scan.
345          */
346         cursor->iprec = rec;
347         return(-1);
348 }
349
350
351 /*
352  * Lookup an in-memory record given the key specified in the cursor.  Works
353  * just like hammer_btree_lookup() but operates on an inode's in-memory
354  * record list.
355  *
356  * The lookup must fail if the record is marked for deferred deletion.
357  */
358 static
359 int
360 hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
361 {
362         int error;
363
364         if (cursor->iprec) {
365                 hammer_rel_mem_record(cursor->iprec);
366                 cursor->iprec = NULL;
367         }
368         if (cursor->ip) {
369                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
370                 --cursor->ip->cursor_ip_refs;
371 #if 0
372                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
373                                                   &cursor->ip->rec_tree);
374 #endif
375         }
376         cursor->ip = ip;
377 #if 0
378         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
379 #endif
380         ++ip->cursor_ip_refs;
381
382 #if 0
383         cursor->scan.node = NULL;
384 #endif
385         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
386                                    hammer_rec_scan_callback, cursor);
387
388         if (cursor->iprec == NULL)
389                 error = ENOENT;
390         else
391                 error = 0;
392         return(error);
393 }
394
395 /*
396  * hammer_mem_first() - locate the first in-memory record matching the
397  * cursor within the bounds of the key range.
398  */
399 static
400 int
401 hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
402 {
403         if (cursor->iprec) {
404                 hammer_rel_mem_record(cursor->iprec);
405                 cursor->iprec = NULL;
406         }
407         if (cursor->ip) {
408                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
409                 --cursor->ip->cursor_ip_refs;
410 #if 0
411                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
412                                                   &cursor->ip->rec_tree);
413 #endif
414         }
415         cursor->ip = ip;
416 #if 0
417         hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
418 #endif
419         ++ip->cursor_ip_refs;
420
421 #if 0
422         cursor->scan.node = NULL;
423 #endif
424         hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
425                                    hammer_rec_scan_callback, cursor);
426
427         /*
428          * Adjust scan.node and keep it linked into the RB-tree so we can
429          * hold the cursor through third party modifications of the RB-tree.
430          */
431         if (cursor->iprec) {
432 #if 0
433                 cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
434 #endif
435                 return(0);
436         }
437         return(ENOENT);
438 }
439
440 void
441 hammer_mem_done(hammer_cursor_t cursor)
442 {
443         if (cursor->ip) {
444                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
445                 --cursor->ip->cursor_ip_refs;
446 #if 0
447                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
448                                                   &cursor->ip->rec_tree);
449 #endif
450                 cursor->ip = NULL;
451         }
452         if (cursor->iprec) {
453                 hammer_rel_mem_record(cursor->iprec);
454                 cursor->iprec = NULL;
455         }
456 }
457
458 /************************************************************************
459  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
460  ************************************************************************
461  *
462  * These functions manipulate in-memory records.  Such records typically
463  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
464  */
465
466 /*
467  * Add a directory entry (dip,ncp) which references inode (ip).
468  *
469  * Note that the low 32 bits of the namekey are set temporarily to create
470  * a unique in-memory record, and may be modified a second time when the
471  * record is synchronized to disk.  In particular, the low 32 bits cannot be
472  * all 0's when synching to disk, which is not handled here.
473  */
474 int
475 hammer_ip_add_directory(struct hammer_transaction *trans,
476                      struct hammer_inode *dip, struct namecache *ncp,
477                      struct hammer_inode *ip)
478 {
479         hammer_record_t record;
480         hammer_depend_t depend;
481         int error;
482         int bytes;
483
484         record = hammer_alloc_mem_record(dip);
485         depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO);
486
487         bytes = ncp->nc_nlen;   /* NOTE: terminating \0 is NOT included */
488         if (++trans->hmp->namekey_iterator == 0)
489                 ++trans->hmp->namekey_iterator;
490
491         record->rec.entry.base.base.obj_id = dip->obj_id;
492         record->rec.entry.base.base.key =
493                 hammer_directory_namekey(ncp->nc_name, bytes);
494         record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
495         record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
496         record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
497         record->rec.entry.obj_id = ip->obj_id;
498         record->data = (void *)ncp->nc_name;
499         record->rec.entry.base.data_len = bytes;
500         ++ip->ino_rec.ino_nlinks;
501         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
502         /* NOTE: copies record->data */
503
504         /*
505          * If the inode gets synced cause the directory entry
506          * to be synced as well, or vise-versa.
507          */
508         hammer_ref(&record->lock);      /* for depend entry */
509         hammer_ref(&ip->lock);          /* for depend entry */
510         depend->ip = ip;
511         depend->record = record;
512         TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry);
513         TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry);
514         ++ip->depend_count;
515
516         error = hammer_mem_add(trans, record);
517         return(error);
518 }
519
520 /*
521  * Delete the directory entry and update the inode link count.  The
522  * cursor must be seeked to the directory entry record being deleted.
523  *
524  * The related inode should be share-locked by the caller.  The caller is
525  * on the frontend.
526  *
527  * This function can return EDEADLK requiring the caller to terminate
528  * the cursor, any locks, wait on the returned record, and retry.
529  */
530 int
531 hammer_ip_del_directory(struct hammer_transaction *trans,
532                      hammer_cursor_t cursor, struct hammer_inode *dip,
533                      struct hammer_inode *ip)
534 {
535         hammer_record_t record;
536         hammer_depend_t depend;
537         int error;
538
539         if (cursor->record == &cursor->iprec->rec) {
540                 /*
541                  * In-memory (unsynchronized) records can simply be freed.
542                  * Even though the HAMMER_RECF_DELETED_FE flag is ignored
543                  * by the backend, we must still avoid races against the
544                  * backend potentially syncing the record to the media. 
545                  *
546                  * We cannot call hammer_ip_delete_record(), that routine may
547                  * only be called from the backend.
548                  */
549                 record = cursor->iprec;
550                 if (record->state == HAMMER_FST_FLUSH) {
551                         KKASSERT(cursor->deadlk_rec == NULL);
552                         hammer_ref(&record->lock);
553                         cursor->deadlk_rec = record;
554                         error = EDEADLK;
555                 } else {
556                         hammer_delete_mem_record(cursor->iprec);
557                         error = 0;
558                 }
559         } else {
560                 /*
561                  * If the record is on-disk we have to queue the deletion by
562                  * the record's key.  This also causes lookups to skip the
563                  * record.
564                  */
565                 depend = kmalloc(sizeof(*depend), M_HAMMER, M_WAITOK|M_ZERO);
566
567                 record = hammer_alloc_mem_record(dip);
568                 record->rec.entry.base.base = cursor->record->base.base;
569                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
570                 record->flags |= HAMMER_RECF_DELETE_ONDISK;
571
572                 /*
573                  * If the inode gets synced cause the directory entry
574                  * to be synced as well, or vise-versa.
575                  */
576                 hammer_ref(&ip->lock);          /* for depend entry */
577                 hammer_ref(&record->lock);      /* for depend entry */
578                 depend->ip = ip;
579                 depend->record = record;
580                 TAILQ_INSERT_TAIL(&ip->depend_list, depend, ip_entry);
581                 TAILQ_INSERT_TAIL(&record->depend_list, depend, rec_entry);
582                 ++ip->depend_count;
583
584                 error = hammer_mem_add(trans, record);
585         }
586
587         /*
588          * One less link.  The file may still be open in the OS even after
589          * all links have gone away so we only try to sync if the OS has
590          * no references and nlinks falls to 0.
591          *
592          * We have to terminate the cursor before syncing the inode to
593          * avoid deadlocking against ourselves.
594          *
595          * XXX we can't sync the inode here because the encompassing
596          * transaction might be a rename and might update the inode
597          * again with a new link.  That would force the delete_tid to be
598          * the same as the create_tid and cause a panic.
599          */
600         if (error == 0) {
601                 --ip->ino_rec.ino_nlinks;
602                 hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
603                 if (ip->ino_rec.ino_nlinks == 0 &&
604                     (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
605                         hammer_done_cursor(cursor);
606                 }
607
608         }
609         return(error);
610 }
611
612 /*
613  * Add a record to an inode.
614  *
615  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
616  * initialize the following additional fields:
617  *
618  * The related inode should be share-locked by the caller.  The caller is
619  * on the frontend.
620  *
621  * record->rec.entry.base.base.key
622  * record->rec.entry.base.base.rec_type
623  * record->rec.entry.base.base.data_len
624  * record->data         (a copy will be kmalloc'd if it cannot be embedded)
625  */
626 int
627 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
628 {
629         hammer_inode_t ip = record->ip;
630         int error;
631
632         record->rec.base.base.obj_id = ip->obj_id;
633         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
634
635         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
636         /* NOTE: copies record->data */
637         error = hammer_mem_add(trans, record);
638         return(error);
639 }
640
641 /*
642  * Sync data from a buffer cache buffer (typically) to the filesystem.  This
643  * is called via the strategy called from a cached data source.  This code
644  * is responsible for actually writing a data record out to the disk.
645  *
646  * This can only occur non-historically (i.e. 'current' data only).
647  *
648  * The file offset must be HAMMER_BUFSIZE aligned but the data length
649  * can be truncated.  The record (currently) always represents a BUFSIZE
650  * swath of space whether the data is truncated or not.
651  */
652 int
653 hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
654                        int64_t offset, void *data, int bytes)
655 {
656         struct hammer_cursor cursor;
657         hammer_record_ondisk_t rec;
658         union hammer_btree_elm elm;
659         hammer_off_t rec_offset;
660         void *bdata;
661         int error;
662
663         KKASSERT((offset & HAMMER_BUFMASK) == 0);
664         KKASSERT(trans->type == HAMMER_TRANS_FLS);
665 retry:
666         error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
667         if (error)
668                 return(error);
669         cursor.key_beg.obj_id = ip->obj_id;
670         cursor.key_beg.key = offset + bytes;
671         cursor.key_beg.create_tid = trans->tid;
672         cursor.key_beg.delete_tid = 0;
673         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
674         cursor.asof = trans->tid;
675         cursor.flags |= HAMMER_CURSOR_INSERT;
676
677         /*
678          * Issue a lookup to position the cursor.
679          */
680         error = hammer_btree_lookup(&cursor);
681         if (error == 0) {
682                 kprintf("hammer_ip_sync_data: duplicate data at "
683                         "(%lld,%d) tid %016llx\n",
684                         offset, bytes, trans->tid);
685                 hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
686                                        HAMMER_BTREE_TYPE_LEAF, cursor.index);
687                 panic("Duplicate data");
688                 error = EIO;
689         }
690         if (error != ENOENT)
691                 goto done;
692
693         /*
694          * Allocate record and data space.  HAMMER_RECTYPE_DATA records
695          * can cross buffer boundaries so we may have to split our bcopy.
696          */
697         rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
698                                   &cursor.record_buffer,
699                                   bytes, &bdata,
700                                   &cursor.data_buffer, &error);
701         if (rec == NULL)
702                 goto done;
703         if (hammer_debug_general & 0x1000)
704                 kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);
705
706         /*
707          * Fill everything in and insert our B-Tree node.
708          *
709          * NOTE: hammer_alloc_record() has already marked the related
710          * buffers as modified.  If we do it again we will generate
711          * unnecessary undo elements.
712          */
713         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
714         rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
715         rec->base.base.obj_id = ip->obj_id;
716         rec->base.base.key = offset + bytes;
717         rec->base.base.create_tid = trans->tid;
718         rec->base.base.delete_tid = 0;
719         rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
720         rec->base.data_crc = crc32(data, bytes);
721         hammer_modify_buffer_done(cursor.record_buffer);
722         KKASSERT(rec->base.data_len == bytes);
723
724         hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
725         bcopy(data, bdata, bytes);
726         hammer_modify_buffer_done(cursor.data_buffer);
727
728         elm.leaf.base = rec->base.base;
729         elm.leaf.rec_offset = rec_offset;
730         elm.leaf.data_offset = rec->base.data_off;
731         elm.leaf.data_len = bytes;
732         elm.leaf.data_crc = rec->base.data_crc;
733
734         /*
735          * Data records can wind up on-disk before the inode itself is
736          * on-disk.  One must assume data records may be on-disk if either
737          * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
738          */
739         ip->flags |= HAMMER_INODE_DONDISK;
740
741         error = hammer_btree_insert(&cursor, &elm);
742         if (error == 0)
743                 goto done;
744
745         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
746 done:
747         hammer_done_cursor(&cursor);
748         if (error == EDEADLK)
749                 goto retry;
750         return(error);
751 }
752
753 /*
754  * Sync an in-memory record to the disk.  This is called by the backend.
755  * This code is responsible for actually writing a record out to the disk.
756  *
757  * Any inode dependancies will queue the inode to the backend.
758  *
759  * NOTE: The frontend can mark the record deleted while it is queued to
760  * the backend.  The deletion applies to a frontend operation and the
761  * record must be treated as NOT having been deleted on the backend, so
762  * we ignore the flag.
763  */
764 int
765 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
766 {
767         struct hammer_cursor cursor;
768         hammer_record_ondisk_t rec;
769         union hammer_btree_elm elm;
770         hammer_depend_t depend;
771         hammer_off_t rec_offset;
772         void *bdata;
773         int error;
774
775         KKASSERT(record->state == HAMMER_FST_FLUSH);
776
777         while ((depend = TAILQ_FIRST(&record->depend_list)) != NULL) {
778                 TAILQ_REMOVE(&record->depend_list, depend, rec_entry);
779                 TAILQ_REMOVE(&depend->ip->depend_list, depend, ip_entry);
780                 --depend->ip->depend_count;
781                 kprintf("S");
782                 hammer_flush_inode(depend->ip, 0);
783                 hammer_rel_inode(depend->ip, 0);
784                 hammer_unref(&record->lock);
785                 KKASSERT(record->lock.refs > 0);
786                 kfree(depend, M_HAMMER);
787         }
788
789 retry:
790         /*
791          * Get a cursor, we will either be inserting or deleting.
792          */
793         error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
794         if (error)
795                 return(error);
796         cursor.key_beg = record->rec.base.base;
797
798         /*
799          * If we are deleting an exact match must be found on-disk.
800          */
801         if (record->flags & HAMMER_RECF_DELETE_ONDISK) {
802                 error = hammer_btree_lookup(&cursor);
803                 if (error == 0)
804                         error = hammer_ip_delete_record(&cursor, trans->tid);
805                 if (error == 0)
806                         hammer_delete_mem_record(record);
807                 goto done;
808         }
809
810         /*
811          * We are inserting.
812          *
813          * Issue a lookup to position the cursor and locate the cluster.  The
814          * target key should not exist.  If we are creating a directory entry
815          * we may have to iterate the low 32 bits of the key to find an unused
816          * key.
817          */
818         cursor.flags |= HAMMER_CURSOR_INSERT;
819
820         for (;;) {
821                 error = hammer_btree_lookup(&cursor);
822                 if (error)
823                         break;
824                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
825                         kprintf("hammer_ip_sync_record: duplicate rec "
826                                 "at (%016llx)\n", record->rec.base.base.key);
827                         Debugger("duplicate record1");
828                         error = EIO;
829                         break;
830                 }
831                 if (++trans->hmp->namekey_iterator == 0)
832                         ++trans->hmp->namekey_iterator;
833                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
834                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
835                 cursor.key_beg.key = record->rec.base.base.key;
836         }
837         if (error != ENOENT)
838                 goto done;
839
840         /*
841          * Allocate the record and data.  The result buffers will be
842          * marked as being modified and further calls to
843          * hammer_modify_buffer() will result in unneeded UNDO records.
844          *
845          * Support zero-fill records (data == NULL and data_len != 0)
846          */
847         if (record->data == NULL) {
848                 rec = hammer_alloc_record(trans, &rec_offset,
849                                           record->rec.base.base.rec_type,
850                                           &cursor.record_buffer,
851                                           0, &bdata,
852                                           NULL, &error);
853                 if (hammer_debug_general & 0x1000)
854                         kprintf("NULL RECORD DATA\n");
855         } else if (record->flags & HAMMER_RECF_INBAND) {
856                 rec = hammer_alloc_record(trans, &rec_offset,
857                                           record->rec.base.base.rec_type,
858                                           &cursor.record_buffer,
859                                           record->rec.base.data_len, &bdata,
860                                           NULL, &error);
861                 if (hammer_debug_general & 0x1000)
862                         kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
863         } else {
864                 rec = hammer_alloc_record(trans, &rec_offset,
865                                           record->rec.base.base.rec_type,
866                                           &cursor.record_buffer,
867                                           record->rec.base.data_len, &bdata,
868                                           &cursor.data_buffer, &error);
869                 if (hammer_debug_general & 0x1000)
870                         kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
871         }
872
873         if (rec == NULL)
874                 goto done;
875
876         /*
877          * Fill in the remaining fields and insert our B-Tree node.
878          */
879         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
880         rec->base.base = record->rec.base.base;
881         bcopy(&record->rec.base + 1, &rec->base + 1,
882               HAMMER_RECORD_SIZE - sizeof(record->rec.base));
883
884         /*
885          * Copy the data and deal with zero-fill support.
886          */
887         if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
888                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
889                 bcopy(record->data, bdata, rec->base.data_len);
890         } else if (record->data) {
891                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
892                 hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
893                 bcopy(record->data, bdata, rec->base.data_len);
894                 hammer_modify_buffer_done(cursor.data_buffer);
895         } else {
896                 rec->base.data_len = record->rec.base.data_len;
897         }
898         hammer_modify_buffer_done(cursor.record_buffer);
899
900         elm.leaf.base = record->rec.base.base;
901         elm.leaf.rec_offset = rec_offset;
902         elm.leaf.data_offset = rec->base.data_off;
903         elm.leaf.data_len = rec->base.data_len;
904         elm.leaf.data_crc = rec->base.data_crc;
905
906         error = hammer_btree_insert(&cursor, &elm);
907
908         /*
909          * Clean up on success, or fall through on error.
910          */
911         if (error == 0) {
912                 hammer_delete_mem_record(record);
913                 goto done;
914         }
915
916         /*
917          * Try to unwind the allocation
918          */
919         hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
920 done:
921         hammer_done_cursor(&cursor);
922         if (error == EDEADLK)
923                 goto retry;
924         return(error);
925 }
926
927 /*
928  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
929  * entry's key is used to deal with hash collisions in the upper 32 bits.
930  * A unique 64 bit key is generated in-memory and may be regenerated a
931  * second time when the directory record is flushed to the on-disk B-Tree.
932  *
933  * A referenced record is passed to this function.  This function
934  * eats the reference.  If an error occurs the record will be deleted.
935  *
936  * A copy of the temporary record->data pointer provided by the caller
937  * will be made.
938  */
939 static
940 int
941 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
942 {
943         void *data;
944         int bytes;
945         int reclen;
946                 
947         /*
948          * Make a private copy of record->data
949          */
950         if (record->data) {
951                 /*
952                  * Try to embed the data in extra space in the record
953                  * union, otherwise allocate a copy.
954                  */
955                 bytes = record->rec.base.data_len;
956                 switch(record->rec.base.base.rec_type) {
957                 case HAMMER_RECTYPE_DIRENTRY:
958                         reclen = offsetof(struct hammer_entry_record, name[0]);
959                         break;
960                 case HAMMER_RECTYPE_DATA:
961                         reclen = offsetof(struct hammer_data_record, data[0]);
962                         break;
963                 default:
964                         reclen = sizeof(record->rec);
965                         break;
966                 }
967                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
968                         bcopy(record->data, (char *)&record->rec + reclen,
969                               bytes);
970                         record->data = (void *)((char *)&record->rec + reclen);
971                         record->flags |= HAMMER_RECF_INBAND;
972                 } else {
973                         ++hammer_count_record_datas;
974                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
975                         record->flags |= HAMMER_RECF_ALLOCDATA;
976                         bcopy(record->data, data, bytes);
977                         record->data = data;
978                 }
979         }
980
981         /*
982          * Insert into the RB tree, find an unused iterator if this is
983          * a directory entry.
984          */
985         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
986                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
987                         hammer_delete_mem_record(record);
988                         hammer_rel_mem_record(record);
989                         return (EEXIST);
990                 }
991                 if (++trans->hmp->namekey_iterator == 0)
992                         ++trans->hmp->namekey_iterator;
993                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
994                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
995         }
996         record->flags |= HAMMER_RECF_ONRBTREE;
997         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
998         hammer_rel_mem_record(record);
999         return(0);
1000 }
1001
1002 /************************************************************************
1003  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
1004  ************************************************************************
1005  *
1006  * These functions augment the B-Tree scanning functions in hammer_btree.c
1007  * by merging in-memory records with on-disk records.
1008  */
1009
1010 /*
1011  * Locate a particular record either in-memory or on-disk.
1012  *
1013  * NOTE: This is basically a standalone routine, hammer_ip_next() may
1014  * NOT be called to iterate results.
1015  */
1016 int
1017 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
1018 {
1019         int error;
1020
1021         /*
1022          * If the element is in-memory return it without searching the
1023          * on-disk B-Tree
1024          */
1025         error = hammer_mem_lookup(cursor, ip);
1026         if (error == 0) {
1027                 cursor->record = &cursor->iprec->rec;
1028                 return(error);
1029         }
1030         if (error != ENOENT)
1031                 return(error);
1032
1033         /*
1034          * If the inode has on-disk components search the on-disk B-Tree.
1035          */
1036         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
1037                 return(error);
1038         error = hammer_btree_lookup(cursor);
1039         if (error == 0)
1040                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1041         return(error);
1042 }
1043
1044 /*
1045  * Locate the first record within the cursor's key_beg/key_end range,
1046  * restricted to a particular inode.  0 is returned on success, ENOENT
1047  * if no records matched the requested range, or some other error.
1048  *
1049  * When 0 is returned hammer_ip_next() may be used to iterate additional
1050  * records within the requested range.
1051  *
1052  * This function can return EDEADLK, requiring the caller to terminate
1053  * the cursor and try again.
1054  */
1055 int
1056 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
1057 {
1058         int error;
1059
1060         /*
1061          * Clean up fields and setup for merged scan
1062          */
1063         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1064         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
1065         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
1066         if (cursor->iprec) {
1067                 hammer_rel_mem_record(cursor->iprec);
1068                 cursor->iprec = NULL;
1069         }
1070
1071         /*
1072          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
1073          * exact lookup so if we get ENOENT we have to call the iterate
1074          * function to validate the first record after the begin key.
1075          *
1076          * The ATEDISK flag is used by hammer_btree_iterate to determine
1077          * whether it must index forwards or not.  It is also used here
1078          * to select the next record from in-memory or on-disk.
1079          *
1080          * EDEADLK can only occur if the lookup hit an empty internal
1081          * element and couldn't delete it.  Since this could only occur
1082          * in-range, we can just iterate from the failure point.
1083          */
1084         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1085                 error = hammer_btree_lookup(cursor);
1086                 if (error == ENOENT || error == EDEADLK) {
1087                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1088                         if (hammer_debug_general & 0x2000)
1089                                 kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1090                         error = hammer_btree_iterate(cursor);
1091                 }
1092                 if (error && error != ENOENT) 
1093                         return(error);
1094                 if (error == 0) {
1095                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1096                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1097                 } else {
1098                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1099                 }
1100         }
1101
1102         /*
1103          * Search the in-memory record list (Red-Black tree).  Unlike the
1104          * B-Tree search, mem_first checks for records in the range.
1105          */
1106         error = hammer_mem_first(cursor, ip);
1107         if (error && error != ENOENT)
1108                 return(error);
1109         if (error == 0) {
1110                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1111                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1112                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1113                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1114         }
1115
1116         /*
1117          * This will return the first matching record.
1118          */
1119         return(hammer_ip_next(cursor));
1120 }
1121
1122 /*
1123  * Retrieve the next record in a merged iteration within the bounds of the
1124  * cursor.  This call may be made multiple times after the cursor has been
1125  * initially searched with hammer_ip_first().
1126  *
1127  * 0 is returned on success, ENOENT if no further records match the
1128  * requested range, or some other error code is returned.
1129  */
1130 int
1131 hammer_ip_next(hammer_cursor_t cursor)
1132 {
1133         hammer_btree_elm_t elm;
1134         hammer_record_t rec, save;
1135         int error;
1136         int r;
1137
1138 next_btree:
1139         /*
1140          * Load the current on-disk and in-memory record.  If we ate any
1141          * records we have to get the next one. 
1142          *
1143          * If we deleted the last on-disk record we had scanned ATEDISK will
1144          * be clear and DELBTREE will be set, forcing a call to iterate. The
1145          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1146          * element.  If ATEDISK is set, iterate will skip the 'current'
1147          * element.
1148          *
1149          * Get the next on-disk record
1150          */
1151         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1152                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1153                         error = hammer_btree_iterate(cursor);
1154                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1155                         if (error == 0)
1156                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1157                         else
1158                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1159                                                  HAMMER_CURSOR_ATEDISK;
1160                 }
1161         }
1162
1163 next_memory:
1164         /*
1165          * Get the next in-memory record.  The record can be ripped out
1166          * of the RB tree so we maintain a scan_info structure to track
1167          * the next node.
1168          *
1169          * hammer_rec_scan_cmp:  Is the record still in our general range,
1170          *                       (non-inclusive of snapshot exclusions)?
1171          * hammer_rec_scan_callback: Is the record in our snapshot?
1172          */
1173         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1174                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1175                         save = cursor->iprec;
1176                         cursor->iprec = NULL;
1177                         rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1178                         while (rec) {
1179                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
1180                                         break;
1181                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
1182                                         break;
1183                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1184                         }
1185                         if (save)
1186                                 hammer_rel_mem_record(save);
1187                         if (cursor->iprec) {
1188                                 KKASSERT(cursor->iprec == rec);
1189                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1190 #if 0
1191                                 cursor->scan.node =
1192                                         hammer_rec_rb_tree_RB_NEXT(rec);
1193 #endif
1194                         } else {
1195                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1196                         }
1197                 }
1198         }
1199
1200         /*
1201          * Extract either the disk or memory record depending on their
1202          * relative position.
1203          */
1204         error = 0;
1205         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1206         case 0:
1207                 /*
1208                  * Both entries valid
1209                  */
1210                 elm = &cursor->node->ondisk->elms[cursor->index];
1211                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1212                 if (r < 0) {
1213                         error = hammer_btree_extract(cursor,
1214                                                      HAMMER_CURSOR_GET_RECORD);
1215                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1216                         break;
1217                 }
1218
1219                 /*
1220                  * If the entries match the memory entry must specify
1221                  * an on-disk deletion.  Eat both entries unless the
1222                  * caller wants visibility into the special records.
1223                  */
1224                 if (r == 0) {
1225                         KKASSERT(cursor->iprec->flags & 
1226                                  HAMMER_RECF_DELETE_ONDISK);
1227                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1228                                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1229                                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1230                                 goto next_btree;
1231                         }
1232                 }
1233                 /* fall through to the memory entry */
1234         case HAMMER_CURSOR_ATEDISK:
1235                 /*
1236                  * Only the memory entry is valid.  If the record is
1237                  * placemarking an on-disk deletion, we skip it unless
1238                  * the caller wants special record visibility.
1239                  */
1240                 cursor->record = &cursor->iprec->rec;
1241                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1242                 if (cursor->iprec->flags & HAMMER_RECF_DELETE_ONDISK) {
1243                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
1244                                 goto next_memory;
1245                 }
1246                 break;
1247         case HAMMER_CURSOR_ATEMEM:
1248                 /*
1249                  * Only the disk entry is valid
1250                  */
1251                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1252                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1253                 break;
1254         default:
1255                 /*
1256                  * Neither entry is valid
1257                  *
1258                  * XXX error not set properly
1259                  */
1260                 cursor->record = NULL;
1261                 error = ENOENT;
1262                 break;
1263         }
1264         return(error);
1265 }
1266
1267 /*
1268  * Resolve the cursor->data pointer for the current cursor position in
1269  * a merged iteration.
1270  */
1271 int
1272 hammer_ip_resolve_data(hammer_cursor_t cursor)
1273 {
1274         int error;
1275
1276         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1277                 cursor->data = cursor->iprec->data;
1278                 error = 0;
1279         } else {
1280                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1281         }
1282         return(error);
1283 }
1284
1285 int
1286 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1287 {
1288         int error;
1289
1290         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1291                 cursor->data = cursor->iprec->data;
1292                 error = 0;
1293         } else {
1294                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1295                                                      HAMMER_CURSOR_GET_RECORD);
1296         }
1297         return(error);
1298 }
1299
1300 /*
1301  * Delete all records within the specified range for inode ip.
1302  *
1303  * NOTE: An unaligned range will cause new records to be added to cover
1304  * the edge cases. (XXX not implemented yet).
1305  *
1306  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1307  *
1308  * NOTE: Record keys for regular file data have to be special-cased since
1309  * they indicate the end of the range (key = base + bytes).
1310  */
1311 int
1312 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1313                        int64_t ran_beg, int64_t ran_end)
1314 {
1315         struct hammer_cursor cursor;
1316         hammer_record_ondisk_t rec;
1317         hammer_base_elm_t base;
1318         int error;
1319         int64_t off;
1320
1321 #if 0
1322         kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1323 #endif
1324
1325         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1326 retry:
1327         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1328
1329         cursor.key_beg.obj_id = ip->obj_id;
1330         cursor.key_beg.create_tid = 0;
1331         cursor.key_beg.delete_tid = 0;
1332         cursor.key_beg.obj_type = 0;
1333         cursor.asof = ip->obj_asof;
1334         cursor.flags |= HAMMER_CURSOR_ASOF;
1335         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1336         cursor.flags |= HAMMER_CURSOR_BACKEND;
1337
1338         cursor.key_end = cursor.key_beg;
1339         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1340                 cursor.key_beg.key = ran_beg;
1341                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1342                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1343                 cursor.key_end.key = ran_end;
1344         } else {
1345                 /*
1346                  * The key in the B-Tree is (base+bytes), so the first possible
1347                  * matching key is ran_beg + 1.
1348                  */
1349                 int64_t tmp64;
1350
1351                 cursor.key_beg.key = ran_beg + 1;
1352                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1353                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1354
1355                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1356                 if (tmp64 < ran_end)
1357                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1358                 else
1359                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1360         }
1361         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1362
1363         error = hammer_ip_first(&cursor, ip);
1364
1365         /*
1366          * Iterate through matching records and mark them as deleted.
1367          */
1368         while (error == 0) {
1369                 rec = cursor.record;
1370                 base = &rec->base.base;
1371
1372                 KKASSERT(base->delete_tid == 0);
1373
1374                 /*
1375                  * There may be overlap cases for regular file data.  Also
1376                  * remember the key for a regular file record is the offset
1377                  * of the last byte of the record (base + len - 1), NOT the
1378                  * base offset.
1379                  */
1380 #if 0
1381                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1382 #endif
1383                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1384 #if 0
1385                         kprintf("delete_range loop key %016llx,%d\n",
1386                                 base->key - rec->base.data_len, rec->base.data_len);
1387 #endif
1388                         off = base->key - rec->base.data_len;
1389                         /*
1390                          * Check the left edge case.  We currently do not
1391                          * split existing records.
1392                          */
1393                         if (off < ran_beg) {
1394                                 panic("hammer left edge case %016llx %d\n",
1395                                         base->key, rec->base.data_len);
1396                         }
1397
1398                         /*
1399                          * Check the right edge case.  Note that the
1400                          * record can be completely out of bounds, which
1401                          * terminates the search.
1402                          *
1403                          * base->key is exclusive of the right edge while
1404                          * ran_end is inclusive of the right edge.  The
1405                          * (key - data_len) left boundary is inclusive.
1406                          *
1407                          * XXX theory-check this test at some point, are
1408                          * we missing a + 1 somewhere?  Note that ran_end
1409                          * could overflow.
1410                          */
1411                         if (base->key - 1 > ran_end) {
1412                                 if (base->key - rec->base.data_len > ran_end)
1413                                         break;
1414                                 panic("hammer right edge case\n");
1415                         }
1416                 }
1417
1418                 /*
1419                  * Mark the record and B-Tree entry as deleted.  This will
1420                  * also physically delete the B-Tree entry, record, and
1421                  * data if the retention policy dictates.  The function
1422                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1423                  * uses to perform a fixup.
1424                  */
1425                 error = hammer_ip_delete_record(&cursor, trans->tid);
1426                 if (error)
1427                         break;
1428                 error = hammer_ip_next(&cursor);
1429         }
1430         hammer_done_cursor(&cursor);
1431         if (error == EDEADLK)
1432                 goto retry;
1433         if (error == ENOENT)
1434                 error = 0;
1435         return(error);
1436 }
1437
1438 /*
1439  * Delete all records associated with an inode except the inode record
1440  * itself.
1441  */
1442 int
1443 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1444 {
1445         struct hammer_cursor cursor;
1446         hammer_record_ondisk_t rec;
1447         hammer_base_elm_t base;
1448         int error;
1449
1450         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1451 retry:
1452         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1453
1454         cursor.key_beg.obj_id = ip->obj_id;
1455         cursor.key_beg.create_tid = 0;
1456         cursor.key_beg.delete_tid = 0;
1457         cursor.key_beg.obj_type = 0;
1458         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1459         cursor.key_beg.key = HAMMER_MIN_KEY;
1460
1461         cursor.key_end = cursor.key_beg;
1462         cursor.key_end.rec_type = 0xFFFF;
1463         cursor.key_end.key = HAMMER_MAX_KEY;
1464
1465         cursor.asof = ip->obj_asof;
1466         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1467         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1468         cursor.flags |= HAMMER_CURSOR_BACKEND;
1469
1470         error = hammer_ip_first(&cursor, ip);
1471
1472         /*
1473          * Iterate through matching records and mark them as deleted.
1474          */
1475         while (error == 0) {
1476                 rec = cursor.record;
1477                 base = &rec->base.base;
1478
1479                 KKASSERT(base->delete_tid == 0);
1480
1481                 /*
1482                  * Mark the record and B-Tree entry as deleted.  This will
1483                  * also physically delete the B-Tree entry, record, and
1484                  * data if the retention policy dictates.  The function
1485                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1486                  * uses to perform a fixup.
1487                  */
1488                 error = hammer_ip_delete_record(&cursor, trans->tid);
1489                 if (error)
1490                         break;
1491                 error = hammer_ip_next(&cursor);
1492         }
1493         hammer_done_cursor(&cursor);
1494         if (error == EDEADLK)
1495                 goto retry;
1496         if (error == ENOENT)
1497                 error = 0;
1498         return(error);
1499 }
1500
1501 /*
1502  * Delete the record at the current cursor.  On success the cursor will
1503  * be positioned appropriately for an iteration but may no longer be at
1504  * a leaf node.
1505  *
1506  * This routine is only called from the backend.
1507  *
1508  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1509  * cursor and retry.
1510  */
1511 int
1512 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
1513 {
1514         hammer_btree_elm_t elm;
1515         hammer_mount_t hmp;
1516         int error;
1517         int dodelete;
1518
1519         /*
1520          * In-memory (unsynchronized) records can simply be freed.
1521          */
1522         if (cursor->record == &cursor->iprec->rec) {
1523                 hammer_delete_mem_record(cursor->iprec);
1524                 cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
1525                 return(0);
1526         }
1527
1528         /*
1529          * On-disk records are marked as deleted by updating their delete_tid.
1530          * This does not effect their position in the B-Tree (which is based
1531          * on their create_tid).
1532          */
1533         error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1534         elm = NULL;
1535         hmp = cursor->node->hmp;
1536
1537         dodelete = 0;
1538         if (error == 0) {
1539                 error = hammer_cursor_upgrade(cursor);
1540                 if (error == 0) {
1541                         elm = &cursor->node->ondisk->elms[cursor->index];
1542                         hammer_modify_node(cursor->trans, cursor->node,
1543                                            &elm->leaf.base.delete_tid,
1544                                            sizeof(elm->leaf.base.delete_tid));
1545                         elm->leaf.base.delete_tid = tid;
1546                         hammer_modify_node_done(cursor->node);
1547
1548                         /*
1549                          * An on-disk record cannot have the same delete_tid
1550                          * as its create_tid.  In a chain of record updates
1551                          * this could result in a duplicate record.
1552                          */
1553                         KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
1554                         hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
1555                         cursor->record->base.base.delete_tid = tid;
1556                         hammer_modify_buffer_done(cursor->record_buffer);
1557                 }
1558         }
1559
1560         /*
1561          * If we were mounted with the nohistory option, we physically
1562          * delete the record.
1563          */
1564         if (hmp->hflags & HMNT_NOHISTORY)
1565                 dodelete = 1;
1566
1567         if (error == 0 && dodelete) {
1568                 error = hammer_delete_at_cursor(cursor, NULL);
1569                 if (error) {
1570                         panic("hammer_ip_delete_record: unable to physically delete the record!\n");
1571                         error = 0;
1572                 }
1573         }
1574         return(error);
1575 }
1576
1577 int
1578 hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
1579 {
1580         hammer_btree_elm_t elm;
1581         hammer_off_t rec_offset;
1582         hammer_off_t data_offset;
1583         int32_t data_len;
1584         u_int16_t rec_type;
1585         int error;
1586
1587         elm = &cursor->node->ondisk->elms[cursor->index];
1588         KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);
1589
1590         rec_offset = elm->leaf.rec_offset;
1591         data_offset = elm->leaf.data_offset;
1592         data_len = elm->leaf.data_len;
1593         rec_type = elm->leaf.base.rec_type;
1594
1595         error = hammer_btree_delete(cursor);
1596         if (error == 0) {
1597                 /*
1598                  * This forces a fixup for the iteration because
1599                  * the cursor is now either sitting at the 'next'
1600                  * element or sitting at the end of a leaf.
1601                  */
1602                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1603                         cursor->flags |= HAMMER_CURSOR_DELBTREE;
1604                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1605                 }
1606         }
1607         if (error == 0) {
1608                 hammer_blockmap_free(cursor->trans, rec_offset,
1609                                      sizeof(union hammer_record_ondisk));
1610         }
1611         if (error == 0) {
1612                 switch(data_offset & HAMMER_OFF_ZONE_MASK) {
1613                 case HAMMER_ZONE_LARGE_DATA:
1614                 case HAMMER_ZONE_SMALL_DATA:
1615                         hammer_blockmap_free(cursor->trans,
1616                                              data_offset, data_len);
1617                         break;
1618                 default:
1619                         break;
1620                 }
1621         }
1622 #if 0
1623         kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
1624                 "(%d remain in cluster)\n",
1625                 cluster->volume->vol_no, cluster->clu_no,
1626                 rec_offset, data_offset, data_len,
1627                 cluster->ondisk->stat_records);
1628 #endif
1629         return (error);
1630 }
1631
1632 /*
1633  * Determine whether a directory is empty or not.  Returns 0 if the directory
1634  * is empty, ENOTEMPTY if it isn't, plus other possible errors.
1635  */
1636 int
1637 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip)
1638 {
1639         struct hammer_cursor cursor;
1640         int error;
1641
1642         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1643
1644         cursor.key_beg.obj_id = ip->obj_id;
1645         cursor.key_beg.create_tid = 0;
1646         cursor.key_beg.delete_tid = 0;
1647         cursor.key_beg.obj_type = 0;
1648         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1649         cursor.key_beg.key = HAMMER_MIN_KEY;
1650
1651         cursor.key_end = cursor.key_beg;
1652         cursor.key_end.rec_type = 0xFFFF;
1653         cursor.key_end.key = HAMMER_MAX_KEY;
1654
1655         cursor.asof = ip->obj_asof;
1656         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1657
1658         error = hammer_ip_first(&cursor, ip);
1659         if (error == ENOENT)
1660                 error = 0;
1661         else if (error == 0)
1662                 error = ENOTEMPTY;
1663         hammer_done_cursor(&cursor);
1664         return(error);
1665 }
1666