HAMMER 40A/Many: Inode/link-count sequencer.
[dragonfly.git] / sys / vfs / hammer / hammer_object.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.48 2008/05/02 01:00:42 dillon Exp $
35  */
36
37 #include "hammer.h"
38
39 static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
40 static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
41 static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);
42
43 /*
44  * Red-black tree support.
45  */
46 static int
47 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
48 {
49         if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
50                 return(-1);
51         if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
52                 return(1);
53
54         if (rec1->rec.base.base.key < rec2->rec.base.base.key)
55                 return(-1);
56         if (rec1->rec.base.base.key > rec2->rec.base.base.key)
57                 return(1);
58
59         if (rec1->rec.base.base.create_tid == 0) {
60                 if (rec2->rec.base.base.create_tid == 0)
61                         return(0);
62                 return(1);
63         }
64         if (rec2->rec.base.base.create_tid == 0)
65                 return(-1);
66
67         if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
68                 return(-1);
69         if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
70                 return(1);
71
72         /*
73          * Never match against an item deleted by the front-end.
74          */
75         if (rec1->flags & HAMMER_RECF_DELETED_FE)
76                 return(1);
77         if (rec2->flags & HAMMER_RECF_DELETED_FE)
78                 return(-1);
79
80         return(0);
81 }
82
83 static int
84 hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
85 {
86         if (info->rec_type < rec->rec.base.base.rec_type)
87                 return(-3);
88         if (info->rec_type > rec->rec.base.base.rec_type)
89                 return(3);
90
91         if (info->key < rec->rec.base.base.key)
92                 return(-2);
93         if (info->key > rec->rec.base.base.key)
94                 return(2);
95
96         if (info->create_tid == 0) {
97                 if (rec->rec.base.base.create_tid == 0)
98                         return(0);
99                 return(1);
100         }
101         if (rec->rec.base.base.create_tid == 0)
102                 return(-1);
103         if (info->create_tid < rec->rec.base.base.create_tid)
104                 return(-1);
105         if (info->create_tid > rec->rec.base.base.create_tid)
106                 return(1);
107         return(0);
108 }
109
110 /*
111  * RB_SCAN comparison code for hammer_mem_first().  The argument order
112  * is reversed so the comparison result has to be negated.  key_beg and
113  * key_end are both range-inclusive.
114  *
115  * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
116  * These do not stop the scan.
117  *
118  * Localized deletions are not cached in-memory.
119  */
120 static
121 int
122 hammer_rec_scan_cmp(hammer_record_t rec, void *data)
123 {
124         hammer_cursor_t cursor = data;
125         int r;
126
127         r = hammer_rec_compare(&cursor->key_beg, rec);
128         if (r > 1)
129                 return(-1);
130         r = hammer_rec_compare(&cursor->key_end, rec);
131         if (r < -1)
132                 return(1);
133         return(0);
134 }
135
136 /*
137  * This compare function is used when simply looking up key_beg.
138  */
139 static
140 int
141 hammer_rec_find_cmp(hammer_record_t rec, void *data)
142 {
143         hammer_cursor_t cursor = data;
144         int r;
145
146         r = hammer_rec_compare(&cursor->key_beg, rec);
147         if (r > 1)
148                 return(-1);
149         if (r < -1)
150                 return(1);
151         return(0);
152 }
153
154 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
155 RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
156                     hammer_rec_compare, hammer_base_elm_t);
157
158 /*
159  * Allocate a record for the caller to finish filling in.  The record is
160  * returned referenced.
161  */
162 hammer_record_t
163 hammer_alloc_mem_record(hammer_inode_t ip)
164 {
165         hammer_record_t record;
166
167         ++hammer_count_records;
168         record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
169         record->flush_state = HAMMER_FST_IDLE;
170         record->ip = ip;
171         record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
172         hammer_ref(&record->lock);
173         return (record);
174 }
175
/*
 * Block until the record is no longer queued to the backend flusher.
 *
 * HAMMER_RECF_WANTED requests a wakeup(record) from
 * hammer_flush_record_done() when the FLUSH state is cleared.  The
 * loop re-tests flush_state because the record may be requeued for
 * another flush cycle before we run again.
 */
void
hammer_wait_mem_record(hammer_record_t record)
{
	while (record->flush_state == HAMMER_FST_FLUSH) {
		record->flags |= HAMMER_RECF_WANTED;
		tsleep(record, 0, "hmrrc2", 0);
	}
}
184
/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
	hammer_inode_t target_ip;
	int cleanup = 0;	/* non-zero: sever the target_ip dependancy */

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

	if (error) {
		/*
		 * An error occured, the backend was unable to sync the
		 * record to its media.  Leave the record intact.
		 */
		Debugger("flush_record_done error");
	} else if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
		/*
		 * deleted-record to delete-on-disk conversion, occurs when
		 * we sync a record to disk which is marked deleted by the
		 * frontend, but not deleted from the point of view of the
		 * backend.
		 */
		if (record->flags & HAMMER_RECF_DELETED_BE) {
			record->flags |= HAMMER_RECF_DELETED_FE;
			cleanup = 1;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_DEL);
		}
	} else {
		/*
		 * Normal completion, record has been disposed of (by
		 * having been synchronized to the media).  Mark it
		 * deleted on both sides so it is torn down on last release.
		 */
		record->flags |= HAMMER_RECF_DELETED_FE;
		record->flags |= HAMMER_RECF_DELETED_BE;
		cleanup = 1;
	}
	if (cleanup) {
		/*
		 * The record no longer holds up the target inode; remove
		 * it from the target list and re-test the inode's flush
		 * eligibility.
		 */
		if ((target_ip = record->target_ip) != NULL) {
			TAILQ_REMOVE(&target_ip->target_list, record,
				     target_entry);
			record->target_ip = NULL;
			hammer_test_inode(target_ip);
		}
		record->flush_state = HAMMER_FST_IDLE;
	} else {
		/*
		 * Record survives: drop back to SETUP while the target
		 * dependancy remains, otherwise IDLE.
		 */
		if (record->target_ip)
			record->flush_state = HAMMER_FST_SETUP;
		else
			record->flush_state = HAMMER_FST_IDLE;
	}

	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
	/*
	 * Wake anyone blocked in hammer_wait_mem_record().
	 */
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
	hammer_rel_mem_record(record);
}
253
254 /*
255  * Release a memory record.  Records marked for deletion are immediately
256  * removed from the RB-Tree but otherwise left intact until the last ref
257  * goes away.
258  */
259 void
260 hammer_rel_mem_record(struct hammer_record *record)
261 {
262         hammer_inode_t ip, target_ip;
263
264         hammer_unref(&record->lock);
265
266         if (record->flags & HAMMER_RECF_DELETED_FE) {
267                 if (record->lock.refs == 0) {
268                         KKASSERT(record->flush_state != HAMMER_FST_FLUSH);
269
270                         ip = record->ip;
271                         if ((target_ip = record->target_ip) != NULL) {
272                                 TAILQ_REMOVE(&target_ip->target_list,
273                                              record, target_entry);
274                                 record->target_ip = NULL;
275                                 hammer_test_inode(target_ip);
276                         }
277
278                         if (record->flags & HAMMER_RECF_ONRBTREE) {
279                                 RB_REMOVE(hammer_rec_rb_tree,
280                                           &record->ip->rec_tree,
281                                           record);
282                                 record->flags &= ~HAMMER_RECF_ONRBTREE;
283                         }
284                         if (record->flags & HAMMER_RECF_ALLOCDATA) {
285                                 --hammer_count_record_datas;
286                                 kfree(record->data, M_HAMMER);
287                                 record->flags &= ~HAMMER_RECF_ALLOCDATA;
288                         }
289                         record->data = NULL;
290                         --hammer_count_records;
291                         kfree(record, M_HAMMER);
292                         return;
293                 }
294         }
295 }
296
297 /*
298  * Record visibility depends on whether the record is being accessed by
299  * the backend or the frontend.
300  *
301  * Return non-zero if the record is visible, zero if it isn't or if it is
302  * deleted.
303  */
304 static __inline
305 int
306 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
307 {
308         if (cursor->flags & HAMMER_CURSOR_BACKEND) {
309                 if (record->flags & HAMMER_RECF_DELETED_BE)
310                         return(0);
311                 if ((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0)
312                         return(0);
313         } else {
314                 if (record->flags & HAMMER_RECF_DELETED_FE)
315                         return(0);
316         }
317         return(1);
318 }
319
/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted (frontend or backend view,
	 * depending on the cursor).
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID: created after the
	 * as-of point, or already deleted at-or-before it.
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->rec.base.base.create_tid)
			return(0);
		if (rec->rec.base.base.delete_tid &&
		    cursor->asof >= rec->rec.base.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * If the record is queued to the flusher we have to block until
	 * it isn't.  Otherwise we may see duplication between our memory
	 * cache and the media.
	 */
	hammer_ref(&rec->lock);

/* NOTE(review): the wait below is disabled because it deadlocks. */
#warning "This deadlocks"
#if 0
	if (rec->flush_state == HAMMER_FST_FLUSH)
		hammer_wait_mem_record(rec);
#endif

	/*
	 * The record may have been deleted while we were blocked.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record (reference is handed to the cursor)
	 * and stop the scan.
	 */
	cursor->iprec = rec;
	return(-1);
}
385
386
/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 *
 * On success the matching record is referenced and stored in
 * cursor->iprec.  Returns 0 on success, ENOENT if no record matched.
 * The cursor is (re)bound to (ip), transferring the cursor_ip_refs
 * count from any previously bound inode.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/*
	 * Release any record held from a prior lookup/iteration.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/*
	 * Drop the ref on the previously bound inode, if any.
	 */
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
	}
	cursor->ip = ip;
#if 0
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
	++ip->cursor_ip_refs;

#if 0
	cursor->scan.node = NULL;
#endif
	/*
	 * Exact-position scan; hammer_rec_scan_callback() sets
	 * cursor->iprec on the first visible match.
	 */
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
				   hammer_rec_scan_callback, cursor);

	if (cursor->iprec == NULL)
		error = ENOENT;
	else
		error = 0;
	return(error);
}
430
/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 *
 * Same cursor-binding protocol as hammer_mem_lookup(): any prior iprec
 * is released, cursor_ip_refs is moved to (ip), and on success the first
 * visible record in [key_beg, key_end] is referenced via cursor->iprec.
 * Returns 0 on success, ENOENT if the range is empty.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/*
	 * Release any record held from a prior lookup/iteration.
	 */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/*
	 * Drop the ref on the previously bound inode, if any.
	 */
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
	}
	cursor->ip = ip;
#if 0
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
	++ip->cursor_ip_refs;

#if 0
	cursor->scan.node = NULL;
#endif
	/*
	 * Ranged scan; hammer_rec_scan_callback() sets cursor->iprec on
	 * the first visible record in range.
	 */
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
#if 0
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
#endif
		return(0);
	}
	return(ENOENT);
}
475
476 void
477 hammer_mem_done(hammer_cursor_t cursor)
478 {
479         if (cursor->ip) {
480                 KKASSERT(cursor->ip->cursor_ip_refs > 0);
481                 --cursor->ip->cursor_ip_refs;
482 #if 0
483                 hammer_rec_rb_tree_scan_info_done(&cursor->scan,
484                                                   &cursor->ip->rec_tree);
485 #endif
486                 cursor->ip = NULL;
487         }
488         if (cursor->iprec) {
489                 hammer_rel_mem_record(cursor->iprec);
490                 cursor->iprec = NULL;
491         }
492 }
493
494 /************************************************************************
495  *                   HAMMER IN-MEMORY RECORD FUNCTIONS                  *
496  ************************************************************************
497  *
498  * These functions manipulate in-memory records.  Such records typically
499  * exist prior to being committed to disk or indexed via the on-disk B-Tree.
500  */
501
/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 *
 * Bumps ip's link count, binds the new record to (ip) as a flush
 * dependancy, and queues the record via hammer_mem_add().  Returns 0
 * or an error from hammer_mem_add().
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */

	/*
	 * Advance the per-mount iterator, skipping 0 (an all-zero low
	 * 32 bits is reserved -- see the function comment).
	 */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->type = HAMMER_MEM_RECORD_ADD;
	record->rec.entry.base.base.obj_id = dip->obj_id;
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	/*
	 * The name is borrowed from the namecache entry here;
	 * hammer_mem_add() copies record->data.
	 */
	record->data = (void *)ncp->nc_name;
	record->rec.entry.base.data_len = bytes;
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

	/*
	 * The target inode and the directory entry are bound together.
	 */
	record->target_ip = ip;
	record->flush_state = HAMMER_FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

	/*
	 * The inode now has a dependancy and must be taken out of the idle
	 * state.  An inode not in an idle state is given an extra reference.
	 */
	if (ip->flush_state == HAMMER_FST_IDLE) {
		hammer_ref(&ip->lock);
		ip->flush_state = HAMMER_FST_SETUP;
	}

	/* NOTE: copies record->data */
	error = hammer_mem_add(trans, record);
	return(error);
}
558
/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record (cursor->deadlk_rec),
 * and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;

	if (cursor->record == &cursor->iprec->rec) {
		/*
		 * In-memory (unsynchronized) records can simply be freed.
		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
		 * by the backend, we must still avoid races against the
		 * backend potentially syncing the record to the media. 
		 *
		 * We cannot call hammer_ip_delete_record(), that routine may
		 * only be called from the backend.
		 */
		record = cursor->iprec;
		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
			/*
			 * The backend holds the record; hand the caller a
			 * referenced copy to wait on and ask for a retry.
			 */
			KKASSERT(cursor->deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor->deadlk_rec = record;
			error = EDEADLK;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	} else {
		/*
		 * If the record is on-disk we have to queue the deletion by
		 * the record's key.  This also causes lookups to skip the
		 * record.
		 */
		record = hammer_alloc_mem_record(dip);
		record->type = HAMMER_MEM_RECORD_DEL;
		record->rec.entry.base.base = cursor->record->base.base;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

		/*
		 * The deletion record and the target inode are bound
		 * together (same protocol as hammer_ip_add_directory()).
		 */
		record->target_ip = ip;
		record->flush_state = HAMMER_FST_SETUP;
		TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

		/*
		 * The inode now has a dependancy and must be taken out of
		 * the idle state.  An inode not in an idle state is given
		 * an extra reference.
		 */
		if (ip->flush_state == HAMMER_FST_IDLE) {
			hammer_ref(&ip->lock);
			ip->flush_state = HAMMER_FST_SETUP;
		}

		error = hammer_mem_add(trans, record);
	}

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we only try to sync if the OS has
	 * no references and nlinks falls to 0.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.
	 *
	 * XXX we can't sync the inode here because the encompassing
	 * transaction might be a rename and might update the inode
	 * again with a new link.  That would force the delete_tid to be
	 * the same as the create_tid and cause a panic.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		if (ip->ino_rec.ino_nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_done_cursor(cursor);
		}

	}
	return(error);
}
650
651 /*
652  * Add a record to an inode.
653  *
654  * The caller must allocate the record with hammer_alloc_mem_record(ip) and
655  * initialize the following additional fields:
656  *
657  * The related inode should be share-locked by the caller.  The caller is
658  * on the frontend.
659  *
660  * record->rec.entry.base.base.key
661  * record->rec.entry.base.base.rec_type
662  * record->rec.entry.base.base.data_len
663  * record->data         (a copy will be kmalloc'd if it cannot be embedded)
664  */
665 int
666 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
667 {
668         hammer_inode_t ip = record->ip;
669         int error;
670
671         record->rec.base.base.obj_id = ip->obj_id;
672         record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;
673
674         hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
675
676         /* NOTE: copies record->data */
677         error = hammer_mem_add(trans, record);
678         return(error);
679 }
680
/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 *
 * The file offset must be HAMMER_BUFSIZE aligned but the data length
 * can be truncated.  The record (currently) always represents a BUFSIZE
 * swath of space whether the data is truncated or not.
 *
 * Runs on the backend (trans must be a flush transaction).  Retries the
 * whole operation from scratch on EDEADLK.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	hammer_off_t rec_offset;
	void *bdata;
	int error;

	KKASSERT((offset & HAMMER_BUFMASK) == 0);
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
	if (error)
		return(error);
	/*
	 * Data records are keyed by the END of the swath (offset + bytes).
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.asof = trans->tid;
	cursor.flags |= HAMMER_CURSOR_INSERT;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * Issue a lookup to position the cursor.  ENOENT is the expected
	 * result; an exact match means the data already exists on-media,
	 * which should be impossible.
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_ip_sync_data: duplicate data at "
			"(%lld,%d) tid %016llx\n",
			offset, bytes, trans->tid);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		panic("Duplicate data");
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space.  HAMMER_RECTYPE_DATA records
	 * can cross buffer boundaries so we may have to split our bcopy.
	 */
	rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
				  &cursor.record_buffer,
				  bytes, &bdata,
				  &cursor.data_buffer, &error);
	if (rec == NULL)
		goto done;
	if (hammer_debug_general & 0x1000)
		kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);

	/*
	 * Fill everything in and insert our B-Tree node.
	 *
	 * NOTE: hammer_alloc_record() has already marked the related
	 * buffers as modified.  If we do it again we will generate
	 * unnecessary undo elements.
	 */
	hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	rec->base.base.obj_id = ip->obj_id;
	rec->base.base.key = offset + bytes;
	rec->base.base.create_tid = trans->tid;
	rec->base.base.delete_tid = 0;
	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
	rec->base.data_crc = crc32(data, bytes);
	hammer_modify_buffer_done(cursor.record_buffer);
	KKASSERT(rec->base.data_len == bytes);

	/*
	 * Copy the payload into the allocated data space.
	 */
	hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
	bcopy(data, bdata, bytes);
	hammer_modify_buffer_done(cursor.data_buffer);

	/*
	 * Build the B-Tree leaf element referencing the record and data.
	 */
	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = rec_offset;
	elm.leaf.data_offset = rec->base.data_off;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	/*
	 * Data records can wind up on-disk before the inode itself is
	 * on-disk.  One must assume data records may be on-disk if either
	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
	 */
	ip->flags |= HAMMER_INODE_DONDISK;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/*
	 * Insert failed: return the allocated record/data space.
	 */
	hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
done:
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}
793
794 /*
795  * Sync an in-memory record to the disk.  This is called by the backend.
796  * This code is responsible for actually writing a record out to the disk.
797  *
798  * This routine can only be called by the backend and the record
799  * must have been interlocked with BE.  It will remain interlocked on
800  * return.  The caller is responsible for the record's disposition.
801  */
802 int
803 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record)
804 {
805         struct hammer_cursor cursor;
806         hammer_record_ondisk_t rec;
807         union hammer_btree_elm elm;
808         hammer_off_t rec_offset;
809         void *bdata;
810         int error;
811
812         KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
813         KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
814
815 retry:
816         /*
817          * Get a cursor, we will either be inserting or deleting.
818          */
819         error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]);
820         if (error)
821                 return(error);
822         cursor.key_beg = record->rec.base.base;
823         cursor.flags |= HAMMER_CURSOR_BACKEND;
824
825         /*
826          * If we are deleting an exact match must be found on-disk.
827          */
828         if (record->type == HAMMER_MEM_RECORD_DEL) {
829                 error = hammer_btree_lookup(&cursor);
830                 if (error == 0)
831                         error = hammer_ip_delete_record(&cursor, trans->tid);
832                 goto done;
833         }
834
835         /*
836          * We are inserting.
837          *
838          * Issue a lookup to position the cursor and locate the cluster.  The
839          * target key should not exist.  If we are creating a directory entry
840          * we may have to iterate the low 32 bits of the key to find an unused
841          * key.
842          */
843         cursor.flags |= HAMMER_CURSOR_INSERT;
844
845         for (;;) {
846                 error = hammer_btree_lookup(&cursor);
847                 if (error)
848                         break;
849                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) {
850                         kprintf("hammer_ip_sync_record: duplicate rec "
851                                 "at (%016llx)\n", record->rec.base.base.key);
852                         Debugger("duplicate record1");
853                         error = EIO;
854                         break;
855                 }
856                 if (++trans->hmp->namekey_iterator == 0)
857                         ++trans->hmp->namekey_iterator;
858                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
859                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
860                 cursor.key_beg.key = record->rec.base.base.key;
861         }
862         if (error != ENOENT)
863                 goto done;
864
865         /*
866          * Allocate the record and data.  The result buffers will be
867          * marked as being modified and further calls to
868          * hammer_modify_buffer() will result in unneeded UNDO records.
869          *
870          * Support zero-fill records (data == NULL and data_len != 0)
871          */
872         if (record->data == NULL) {
873                 rec = hammer_alloc_record(trans, &rec_offset,
874                                           record->rec.base.base.rec_type,
875                                           &cursor.record_buffer,
876                                           0, &bdata,
877                                           NULL, &error);
878                 if (hammer_debug_general & 0x1000)
879                         kprintf("NULL RECORD DATA\n");
880         } else if (record->flags & HAMMER_RECF_INBAND) {
881                 rec = hammer_alloc_record(trans, &rec_offset,
882                                           record->rec.base.base.rec_type,
883                                           &cursor.record_buffer,
884                                           record->rec.base.data_len, &bdata,
885                                           NULL, &error);
886                 if (hammer_debug_general & 0x1000)
887                         kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
888         } else {
889                 rec = hammer_alloc_record(trans, &rec_offset,
890                                           record->rec.base.base.rec_type,
891                                           &cursor.record_buffer,
892                                           record->rec.base.data_len, &bdata,
893                                           &cursor.data_buffer, &error);
894                 if (hammer_debug_general & 0x1000)
895                         kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len);
896         }
897
898         if (rec == NULL)
899                 goto done;
900
901         /*
902          * Fill in the remaining fields and insert our B-Tree node.
903          */
904         hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
905         rec->base.base = record->rec.base.base;
906         bcopy(&record->rec.base + 1, &rec->base + 1,
907               HAMMER_RECORD_SIZE - sizeof(record->rec.base));
908
909         /*
910          * Copy the data and deal with zero-fill support.
911          */
912         if (record->data && (record->flags & HAMMER_RECF_INBAND)) {
913                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
914                 bcopy(record->data, bdata, rec->base.data_len);
915         } else if (record->data) {
916                 rec->base.data_crc = crc32(record->data, rec->base.data_len);
917                 hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
918                 bcopy(record->data, bdata, rec->base.data_len);
919                 hammer_modify_buffer_done(cursor.data_buffer);
920         } else {
921                 rec->base.data_len = record->rec.base.data_len;
922         }
923         hammer_modify_buffer_done(cursor.record_buffer);
924
925         elm.leaf.base = record->rec.base.base;
926         elm.leaf.rec_offset = rec_offset;
927         elm.leaf.data_offset = rec->base.data_off;
928         elm.leaf.data_len = rec->base.data_len;
929         elm.leaf.data_crc = rec->base.data_crc;
930
931         error = hammer_btree_insert(&cursor, &elm);
932
933         /*
934          * This occurs when the frontend creates a record and queues it to
935          * the backend, then tries to delete the record.  The backend must
936          * still sync the record to the media as if it were not deleted,
937          * but must interlock with the frontend to ensure that the 
938          * synchronized record is not visible to the frontend, which means
939          * converting it from an ADD record to a DEL record.
940          *
941          * The DEL record then masks the record synced to disk until another
942          * round can delete it for real.
943          */
944         if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE)) {
945                 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
946                 record->flags &= ~HAMMER_RECF_DELETED_FE;
947                 record->type = HAMMER_MEM_RECORD_DEL;
948                 if (record->flush_state == HAMMER_FST_SETUP) {
949                         hammer_test_inode(record->ip);
950                         hammer_test_inode(record->target_ip);
951                 }
952         }
953
954         /*
955          * If the error occured unwind the operation.
956          */
957         if (error)
958                 hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
959
960 done:
961         hammer_done_cursor(&cursor);
962         if (error == EDEADLK)
963                 goto retry;
964         return(error);
965 }
966
967 /*
968  * Add the record to the inode's rec_tree.  The low 32 bits of a directory
969  * entry's key is used to deal with hash collisions in the upper 32 bits.
970  * A unique 64 bit key is generated in-memory and may be regenerated a
971  * second time when the directory record is flushed to the on-disk B-Tree.
972  *
973  * A referenced record is passed to this function.  This function
974  * eats the reference.  If an error occurs the record will be deleted.
975  *
976  * A copy of the temporary record->data pointer provided by the caller
977  * will be made.
978  */
979 static
980 int
981 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
982 {
983         void *data;
984         int bytes;
985         int reclen;
986                 
987         /*
988          * Make a private copy of record->data
989          */
990         if (record->data) {
991                 /*
992                  * Try to embed the data in extra space in the record
993                  * union, otherwise allocate a copy.
994                  */
995                 bytes = record->rec.base.data_len;
996                 switch(record->rec.base.base.rec_type) {
997                 case HAMMER_RECTYPE_DIRENTRY:
998                         reclen = offsetof(struct hammer_entry_record, name[0]);
999                         break;
1000                 case HAMMER_RECTYPE_DATA:
1001                         reclen = offsetof(struct hammer_data_record, data[0]);
1002                         break;
1003                 default:
1004                         reclen = sizeof(record->rec);
1005                         break;
1006                 }
1007                 if (reclen + bytes <= HAMMER_RECORD_SIZE) {
1008                         bcopy(record->data, (char *)&record->rec + reclen,
1009                               bytes);
1010                         record->data = (void *)((char *)&record->rec + reclen);
1011                         record->flags |= HAMMER_RECF_INBAND;
1012                 } else {
1013                         ++hammer_count_record_datas;
1014                         data = kmalloc(bytes, M_HAMMER, M_WAITOK);
1015                         record->flags |= HAMMER_RECF_ALLOCDATA;
1016                         bcopy(record->data, data, bytes);
1017                         record->data = data;
1018                 }
1019         }
1020
1021         /*
1022          * Insert into the RB tree, find an unused iterator if this is
1023          * a directory entry.
1024          */
1025         while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
1026                 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
1027                         record->flags |= HAMMER_RECF_DELETED_FE;
1028                         hammer_rel_mem_record(record);
1029                         return (EEXIST);
1030                 }
1031                 if (++trans->hmp->namekey_iterator == 0)
1032                         ++trans->hmp->namekey_iterator;
1033                 record->rec.base.base.key &= ~(0xFFFFFFFFLL);
1034                 record->rec.base.base.key |= trans->hmp->namekey_iterator;
1035         }
1036         record->flags |= HAMMER_RECF_ONRBTREE;
1037         hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
1038         hammer_rel_mem_record(record);
1039         return(0);
1040 }
1041
1042 /************************************************************************
1043  *                   HAMMER INODE MERGED-RECORD FUNCTIONS               *
1044  ************************************************************************
1045  *
1046  * These functions augment the B-Tree scanning functions in hammer_btree.c
1047  * by merging in-memory records with on-disk records.
1048  */
1049
1050 /*
1051  * Locate a particular record either in-memory or on-disk.
1052  *
1053  * NOTE: This is basically a standalone routine, hammer_ip_next() may
1054  * NOT be called to iterate results.
1055  */
1056 int
1057 hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
1058 {
1059         int error;
1060
1061         /*
1062          * If the element is in-memory return it without searching the
1063          * on-disk B-Tree
1064          */
1065         error = hammer_mem_lookup(cursor, ip);
1066         if (error == 0) {
1067                 cursor->record = &cursor->iprec->rec;
1068                 return(error);
1069         }
1070         if (error != ENOENT)
1071                 return(error);
1072
1073         /*
1074          * If the inode has on-disk components search the on-disk B-Tree.
1075          */
1076         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
1077                 return(error);
1078         error = hammer_btree_lookup(cursor);
1079         if (error == 0)
1080                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1081         return(error);
1082 }
1083
1084 /*
1085  * Locate the first record within the cursor's key_beg/key_end range,
1086  * restricted to a particular inode.  0 is returned on success, ENOENT
1087  * if no records matched the requested range, or some other error.
1088  *
1089  * When 0 is returned hammer_ip_next() may be used to iterate additional
1090  * records within the requested range.
1091  *
1092  * This function can return EDEADLK, requiring the caller to terminate
1093  * the cursor and try again.
1094  */
1095 int
1096 hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
1097 {
1098         int error;
1099
1100         /*
1101          * Clean up fields and setup for merged scan
1102          */
1103         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1104         cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
1105         cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
1106         if (cursor->iprec) {
1107                 hammer_rel_mem_record(cursor->iprec);
1108                 cursor->iprec = NULL;
1109         }
1110
1111         /*
1112          * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
1113          * exact lookup so if we get ENOENT we have to call the iterate
1114          * function to validate the first record after the begin key.
1115          *
1116          * The ATEDISK flag is used by hammer_btree_iterate to determine
1117          * whether it must index forwards or not.  It is also used here
1118          * to select the next record from in-memory or on-disk.
1119          *
1120          * EDEADLK can only occur if the lookup hit an empty internal
1121          * element and couldn't delete it.  Since this could only occur
1122          * in-range, we can just iterate from the failure point.
1123          */
1124         if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
1125                 error = hammer_btree_lookup(cursor);
1126                 if (error == ENOENT || error == EDEADLK) {
1127                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1128                         if (hammer_debug_general & 0x2000)
1129                                 kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index);
1130                         error = hammer_btree_iterate(cursor);
1131                 }
1132                 if (error && error != ENOENT) 
1133                         return(error);
1134                 if (error == 0) {
1135                         cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
1136                         cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1137                 } else {
1138                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1139                 }
1140         }
1141
1142         /*
1143          * Search the in-memory record list (Red-Black tree).  Unlike the
1144          * B-Tree search, mem_first checks for records in the range.
1145          */
1146         error = hammer_mem_first(cursor, ip);
1147         if (error && error != ENOENT)
1148                 return(error);
1149         if (error == 0) {
1150                 cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
1151                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1152                 if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
1153                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1154         }
1155
1156         /*
1157          * This will return the first matching record.
1158          */
1159         return(hammer_ip_next(cursor));
1160 }
1161
1162 /*
1163  * Retrieve the next record in a merged iteration within the bounds of the
1164  * cursor.  This call may be made multiple times after the cursor has been
1165  * initially searched with hammer_ip_first().
1166  *
1167  * 0 is returned on success, ENOENT if no further records match the
1168  * requested range, or some other error code is returned.
1169  */
1170 int
1171 hammer_ip_next(hammer_cursor_t cursor)
1172 {
1173         hammer_btree_elm_t elm;
1174         hammer_record_t rec, save;
1175         int error;
1176         int r;
1177
1178 next_btree:
1179         /*
1180          * Load the current on-disk and in-memory record.  If we ate any
1181          * records we have to get the next one. 
1182          *
1183          * If we deleted the last on-disk record we had scanned ATEDISK will
1184          * be clear and DELBTREE will be set, forcing a call to iterate. The
1185          * fact that ATEDISK is clear causes iterate to re-test the 'current'
1186          * element.  If ATEDISK is set, iterate will skip the 'current'
1187          * element.
1188          *
1189          * Get the next on-disk record
1190          */
1191         if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
1192                 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
1193                         error = hammer_btree_iterate(cursor);
1194                         cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
1195                         if (error == 0)
1196                                 cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
1197                         else
1198                                 cursor->flags |= HAMMER_CURSOR_DISKEOF |
1199                                                  HAMMER_CURSOR_ATEDISK;
1200                 }
1201         }
1202
1203 next_memory:
1204         /*
1205          * Get the next in-memory record.  The record can be ripped out
1206          * of the RB tree so we maintain a scan_info structure to track
1207          * the next node.
1208          *
1209          * hammer_rec_scan_cmp:  Is the record still in our general range,
1210          *                       (non-inclusive of snapshot exclusions)?
1211          * hammer_rec_scan_callback: Is the record in our snapshot?
1212          */
1213         if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
1214                 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
1215                         save = cursor->iprec;
1216                         cursor->iprec = NULL;
1217                         rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
1218                         while (rec) {
1219                                 if (hammer_rec_scan_cmp(rec, cursor) != 0)
1220                                         break;
1221                                 if (hammer_rec_scan_callback(rec, cursor) != 0)
1222                                         break;
1223                                 rec = hammer_rec_rb_tree_RB_NEXT(rec);
1224                         }
1225                         if (save)
1226                                 hammer_rel_mem_record(save);
1227                         if (cursor->iprec) {
1228                                 KKASSERT(cursor->iprec == rec);
1229                                 cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
1230 #if 0
1231                                 cursor->scan.node =
1232                                         hammer_rec_rb_tree_RB_NEXT(rec);
1233 #endif
1234                         } else {
1235                                 cursor->flags |= HAMMER_CURSOR_MEMEOF;
1236                         }
1237                 }
1238         }
1239
1240         /*
1241          * Extract either the disk or memory record depending on their
1242          * relative position.
1243          */
1244         error = 0;
1245         switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
1246         case 0:
1247                 /*
1248                  * Both entries valid
1249                  */
1250                 elm = &cursor->node->ondisk->elms[cursor->index];
1251                 r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
1252                 if (r < 0) {
1253                         error = hammer_btree_extract(cursor,
1254                                                      HAMMER_CURSOR_GET_RECORD);
1255                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1256                         break;
1257                 }
1258
1259                 /*
1260                  * If the entries match exactly the memory entry typically
1261                  * specifies an on-disk deletion and we eat both entries.
1262                  *
1263                  * If the in-memory record is not an on-disk deletion we
1264                  * probably caught the syncer while it was syncing it to
1265                  * the media.  Since we hold a shared lock on the cursor,
1266                  * the in-memory record had better be marked deleted at
1267                  * this point.
1268                  */
1269                 if (r == 0) {
1270                         if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1271                                 if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
1272                                         cursor->flags |= HAMMER_CURSOR_ATEDISK;
1273                                         cursor->flags |= HAMMER_CURSOR_ATEMEM;
1274                                         goto next_btree;
1275                                 }
1276                         } else {
1277                                 KKASSERT(hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0);
1278                                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1279                                 goto next_memory;
1280                         }
1281                 }
1282                 /* fall through to the memory entry */
1283         case HAMMER_CURSOR_ATEDISK:
1284                 /*
1285                  * Only the memory entry is valid.  If the record is
1286                  * placemarking an on-disk deletion, we skip it unless
1287                  * the caller wants special record visibility.
1288                  */
1289                 cursor->record = &cursor->iprec->rec;
1290                 cursor->flags |= HAMMER_CURSOR_ATEMEM;
1291                 if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
1292                         if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
1293                                 goto next_memory;
1294                 }
1295                 break;
1296         case HAMMER_CURSOR_ATEMEM:
1297                 /*
1298                  * Only the disk entry is valid
1299                  */
1300                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
1301                 cursor->flags |= HAMMER_CURSOR_ATEDISK;
1302                 break;
1303         default:
1304                 /*
1305                  * Neither entry is valid
1306                  *
1307                  * XXX error not set properly
1308                  */
1309                 cursor->record = NULL;
1310                 error = ENOENT;
1311                 break;
1312         }
1313         return(error);
1314 }
1315
1316 /*
1317  * Resolve the cursor->data pointer for the current cursor position in
1318  * a merged iteration.
1319  */
1320 int
1321 hammer_ip_resolve_data(hammer_cursor_t cursor)
1322 {
1323         int error;
1324
1325         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1326                 cursor->data = cursor->iprec->data;
1327                 error = 0;
1328         } else {
1329                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
1330         }
1331         return(error);
1332 }
1333
1334 int
1335 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor)
1336 {
1337         int error;
1338
1339         if (cursor->iprec && cursor->record == &cursor->iprec->rec) {
1340                 cursor->data = cursor->iprec->data;
1341                 error = 0;
1342         } else {
1343                 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA |
1344                                                      HAMMER_CURSOR_GET_RECORD);
1345         }
1346         return(error);
1347 }
1348
1349 /*
1350  * Delete all records within the specified range for inode ip.
1351  *
1352  * NOTE: An unaligned range will cause new records to be added to cover
1353  * the edge cases. (XXX not implemented yet).
1354  *
1355  * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
1356  *
1357  * NOTE: Record keys for regular file data have to be special-cased since
1358  * they indicate the end of the range (key = base + bytes).
1359  */
1360 int
1361 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
1362                        int64_t ran_beg, int64_t ran_end)
1363 {
1364         struct hammer_cursor cursor;
1365         hammer_record_ondisk_t rec;
1366         hammer_base_elm_t base;
1367         int error;
1368         int64_t off;
1369
1370 #if 0
1371         kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
1372 #endif
1373
1374         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1375 retry:
1376         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1377
1378         cursor.key_beg.obj_id = ip->obj_id;
1379         cursor.key_beg.create_tid = 0;
1380         cursor.key_beg.delete_tid = 0;
1381         cursor.key_beg.obj_type = 0;
1382         cursor.asof = ip->obj_asof;
1383         cursor.flags |= HAMMER_CURSOR_ASOF;
1384         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1385         cursor.flags |= HAMMER_CURSOR_BACKEND;
1386
1387         cursor.key_end = cursor.key_beg;
1388         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1389                 cursor.key_beg.key = ran_beg;
1390                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1391                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1392                 cursor.key_end.key = ran_end;
1393         } else {
1394                 /*
1395                  * The key in the B-Tree is (base+bytes), so the first possible
1396                  * matching key is ran_beg + 1.
1397                  */
1398                 int64_t tmp64;
1399
1400                 cursor.key_beg.key = ran_beg + 1;
1401                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1402                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1403
1404                 tmp64 = ran_end + MAXPHYS + 1;  /* work around GCC-4 bug */
1405                 if (tmp64 < ran_end)
1406                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1407                 else
1408                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1409         }
1410         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1411
1412         error = hammer_ip_first(&cursor, ip);
1413
1414         /*
1415          * Iterate through matching records and mark them as deleted.
1416          */
1417         while (error == 0) {
1418                 rec = cursor.record;
1419                 base = &rec->base.base;
1420
1421                 KKASSERT(base->delete_tid == 0);
1422
1423                 /*
1424                  * There may be overlap cases for regular file data.  Also
1425                  * remember the key for a regular file record is the offset
1426                  * of the last byte of the record (base + len - 1), NOT the
1427                  * base offset.
1428                  */
1429 #if 0
1430                 kprintf("delete_range rec_type %02x\n", base->rec_type);
1431 #endif
1432                 if (base->rec_type == HAMMER_RECTYPE_DATA) {
1433 #if 0
1434                         kprintf("delete_range loop key %016llx,%d\n",
1435                                 base->key - rec->base.data_len, rec->base.data_len);
1436 #endif
1437                         off = base->key - rec->base.data_len;
1438                         /*
1439                          * Check the left edge case.  We currently do not
1440                          * split existing records.
1441                          */
1442                         if (off < ran_beg) {
1443                                 panic("hammer left edge case %016llx %d\n",
1444                                         base->key, rec->base.data_len);
1445                         }
1446
1447                         /*
1448                          * Check the right edge case.  Note that the
1449                          * record can be completely out of bounds, which
1450                          * terminates the search.
1451                          *
1452                          * base->key is exclusive of the right edge while
1453                          * ran_end is inclusive of the right edge.  The
1454                          * (key - data_len) left boundary is inclusive.
1455                          *
1456                          * XXX theory-check this test at some point, are
1457                          * we missing a + 1 somewhere?  Note that ran_end
1458                          * could overflow.
1459                          */
1460                         if (base->key - 1 > ran_end) {
1461                                 if (base->key - rec->base.data_len > ran_end)
1462                                         break;
1463                                 panic("hammer right edge case\n");
1464                         }
1465                 }
1466
1467                 /*
1468                  * Mark the record and B-Tree entry as deleted.  This will
1469                  * also physically delete the B-Tree entry, record, and
1470                  * data if the retention policy dictates.  The function
1471                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1472                  * uses to perform a fixup.
1473                  */
1474                 error = hammer_ip_delete_record(&cursor, trans->tid);
1475                 if (error)
1476                         break;
1477                 error = hammer_ip_next(&cursor);
1478         }
1479         hammer_done_cursor(&cursor);
1480         if (error == EDEADLK)
1481                 goto retry;
1482         if (error == ENOENT)
1483                 error = 0;
1484         return(error);
1485 }
1486
1487 /*
1488  * Delete all user records associated with an inode except the inode record
1489  * itself.  Directory entries are not deleted (they must be properly disposed
1490  * of or nlinks would get upset).
1491  */
1492 int
1493 hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
1494 {
1495         struct hammer_cursor cursor;
1496         hammer_record_ondisk_t rec;
1497         hammer_base_elm_t base;
1498         int error;
1499
1500         KKASSERT(trans->type == HAMMER_TRANS_FLS);
1501 retry:
1502         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1503
1504         cursor.key_beg.obj_id = ip->obj_id;
1505         cursor.key_beg.create_tid = 0;
1506         cursor.key_beg.delete_tid = 0;
1507         cursor.key_beg.obj_type = 0;
1508         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1509         cursor.key_beg.key = HAMMER_MIN_KEY;
1510
1511         cursor.key_end = cursor.key_beg;
1512         cursor.key_end.rec_type = 0xFFFF;
1513         cursor.key_end.key = HAMMER_MAX_KEY;
1514
1515         cursor.asof = ip->obj_asof;
1516         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1517         cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
1518         cursor.flags |= HAMMER_CURSOR_BACKEND;
1519
1520         error = hammer_ip_first(&cursor, ip);
1521
1522         /*
1523          * Iterate through matching records and mark them as deleted.
1524          */
1525         while (error == 0) {
1526                 rec = cursor.record;
1527                 base = &rec->base.base;
1528
1529                 KKASSERT(base->delete_tid == 0);
1530
1531                 /*
1532                  * Mark the record and B-Tree entry as deleted.  This will
1533                  * also physically delete the B-Tree entry, record, and
1534                  * data if the retention policy dictates.  The function
1535                  * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
1536                  * uses to perform a fixup.
1537                  *
1538                  * Directory entries (and delete-on-disk directory entries)
1539                  * must be synced and cannot be deleted.
1540                  */
1541                 if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY)
1542                         error = hammer_ip_delete_record(&cursor, trans->tid);
1543                 if (error)
1544                         break;
1545                 error = hammer_ip_next(&cursor);
1546         }
1547         hammer_done_cursor(&cursor);
1548         if (error == EDEADLK)
1549                 goto retry;
1550         if (error == ENOENT)
1551                 error = 0;
1552         return(error);
1553 }
1554
1555 /*
1556  * Delete the record at the current cursor.  On success the cursor will
1557  * be positioned appropriately for an iteration but may no longer be at
1558  * a leaf node.
1559  *
1560  * This routine is only called from the backend.
1561  *
1562  * NOTE: This can return EDEADLK, requiring the caller to terminate the
1563  * cursor and retry.
1564  */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
        hammer_btree_elm_t elm;
        hammer_mount_t hmp;
        int error;
        int dodelete;

        /* Only the backend may delete records through a cursor. */
        KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);

        /*
         * In-memory (unsynchronized) records can simply be freed.  This
         * only occurs in range iterations since all other records are
         * individually synchronized.  Thus there should be no confusion with
         * the interlock.
         */
        if (cursor->record == &cursor->iprec->rec) {
                KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0);
                /* mark deleted from both the frontend and backend views */
                cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
                cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
                return(0);
        }

        /*
         * On-disk records are marked as deleted by updating their delete_tid.
         * This does not effect their position in the B-Tree (which is based
         * on their create_tid).
         */
        error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
        elm = NULL;
        hmp = cursor->node->hmp;

        dodelete = 0;
        if (error == 0) {
                /* modifying the node requires an exclusively locked cursor */
                error = hammer_cursor_upgrade(cursor);
                if (error == 0) {
                        /*
                         * Set delete_tid in both the B-Tree element and the
                         * record itself.  Each store is bracketed by
                         * hammer_modify_*()/..._done() calls.
                         * NOTE(review): the modify brackets presumably
                         * generate UNDO for crash recovery -- confirm; do
                         * not reorder the stores outside their brackets.
                         */
                        elm = &cursor->node->ondisk->elms[cursor->index];
                        hammer_modify_node(cursor->trans, cursor->node,
                                           &elm->leaf.base.delete_tid,
                                           sizeof(elm->leaf.base.delete_tid));
                        elm->leaf.base.delete_tid = tid;
                        hammer_modify_node_done(cursor->node);

                        /*
                         * An on-disk record cannot have the same delete_tid
                         * as its create_tid.  In a chain of record updates
                         * this could result in a duplicate record.
                         */
                        KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
                        hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
                        cursor->record->base.base.delete_tid = tid;
                        hammer_modify_buffer_done(cursor->record_buffer);
                }
        }

        /*
         * If we were mounted with the nohistory option, we physically
         * delete the record.
         */
        if (hmp->hflags & HMNT_NOHISTORY)
                dodelete = 1;

        if (error == 0 && dodelete) {
                error = hammer_delete_at_cursor(cursor, NULL);
                if (error) {
                        /*
                         * panic() does not return; the error reset below is
                         * dead code kept as a defensive fallback.
                         */
                        panic("hammer_ip_delete_record: unable to physically delete the record!\n");
                        error = 0;
                }
        }
        return(error);
}
1636
/*
 * Physically delete the B-Tree element at the cursor and free the
 * blockmap storage backing its record and (for data zones) its data.
 *
 * NOTE(review): stat_bytes is accepted but never used here; callers in
 * this file pass NULL.  Presumably intended to return the number of
 * bytes released -- confirm before relying on it.
 */
int
hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
{
        hammer_btree_elm_t elm;
        hammer_off_t rec_offset;
        hammer_off_t data_offset;
        int32_t data_len;
        u_int16_t rec_type;
        int error;

        elm = &cursor->node->ondisk->elms[cursor->index];
        KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);

        /*
         * Capture the element's storage references now; the deletion below
         * invalidates the element.  (rec_type is captured but currently
         * unused afterwards.)
         */
        rec_offset = elm->leaf.rec_offset;
        data_offset = elm->leaf.data_offset;
        data_len = elm->leaf.data_len;
        rec_type = elm->leaf.base.rec_type;

        error = hammer_btree_delete(cursor);
        if (error == 0) {
                /*
                 * This forces a fixup for the iteration because
                 * the cursor is now either sitting at the 'next'
                 * element or sitting at the end of a leaf.
                 */
                if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
                        cursor->flags |= HAMMER_CURSOR_DELBTREE;
                        cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
                }
        }
        if (error == 0) {
                /* release the record's blockmap storage */
                hammer_blockmap_free(cursor->trans, rec_offset,
                                     sizeof(union hammer_record_ondisk));
        }
        if (error == 0) {
                /*
                 * Only data stored in the large/small data zones is
                 * blockmap-allocated and needs freeing; other zones
                 * (e.g. inline/embedded data) are left alone.
                 */
                switch(data_offset & HAMMER_OFF_ZONE_MASK) {
                case HAMMER_ZONE_LARGE_DATA:
                case HAMMER_ZONE_SMALL_DATA:
                        hammer_blockmap_free(cursor->trans,
                                             data_offset, data_len);
                        break;
                default:
                        break;
                }
        }
#if 0
        kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
                "(%d remain in cluster)\n",
                cluster->volume->vol_no, cluster->clu_no,
                rec_offset, data_offset, data_len,
                cluster->ondisk->stat_records);
#endif
        return (error);
}
1691
1692 /*
1693  * Determine whether we can remove a directory.  This routine checks whether
1694  * a directory is empty or not and enforces flush connectivity.
1695  *
1696  * Flush connectivity requires that we block if the target directory is
1697  * currently flushing, otherwise it may not end up in the same flush group.
1698  *
1699  * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure.
1700  */
1701 int
1702 hammer_ip_check_directory_empty(hammer_transaction_t trans,
1703                         hammer_cursor_t parent_cursor, hammer_inode_t ip)
1704 {
1705         struct hammer_cursor cursor;
1706         int error;
1707
1708 #if 0
1709         /*
1710          * Check flush connectivity
1711          */
1712         if (ip->flush_state != HAMMER_FST_IDLE) {
1713                 kprintf("FWAIT\n");
1714                 hammer_done_cursor(parent_cursor);
1715                 hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL);
1716                 hammer_wait_inode(ip);
1717                 return (EDEADLK);
1718         }
1719 #endif
1720
1721         /*
1722          * Check directory empty
1723          */
1724         hammer_init_cursor(trans, &cursor, &ip->cache[0]);
1725
1726         cursor.key_beg.obj_id = ip->obj_id;
1727         cursor.key_beg.create_tid = 0;
1728         cursor.key_beg.delete_tid = 0;
1729         cursor.key_beg.obj_type = 0;
1730         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
1731         cursor.key_beg.key = HAMMER_MIN_KEY;
1732
1733         cursor.key_end = cursor.key_beg;
1734         cursor.key_end.rec_type = 0xFFFF;
1735         cursor.key_end.key = HAMMER_MAX_KEY;
1736
1737         cursor.asof = ip->obj_asof;
1738         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1739
1740         error = hammer_ip_first(&cursor, ip);
1741         if (error == ENOENT)
1742                 error = 0;
1743         else if (error == 0)
1744                 error = ENOTEMPTY;
1745         hammer_done_cursor(&cursor);
1746         return(error);
1747 }
1748