ef413f342b55f5768641a0144ca4762f7aef731f
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
35  */
36
37 #include "hammer.h"
38 #include <vm/vm_extern.h>
39 #include <sys/buf.h>
40 #include <sys/buf2.h>
41
42 static int      hammer_unload_inode(struct hammer_inode *ip);
43 static void     hammer_free_inode(hammer_inode_t ip);
44 static void     hammer_flush_inode_core(hammer_inode_t ip,
45                                         hammer_flush_group_t flg, int flags);
46 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
47 #if 0
48 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
49 #endif
50 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
51                                         hammer_flush_group_t flg);
52 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
53                                         int depth, hammer_flush_group_t flg);
54 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
55
56 #ifdef DEBUG_TRUNCATE
57 extern struct hammer_inode *HammerTruncIp;
58 #endif
59
60 /*
61  * RB-Tree support for inode structures
62  */
63 int
64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65 {
66         if (ip1->obj_localization < ip2->obj_localization)
67                 return(-1);
68         if (ip1->obj_localization > ip2->obj_localization)
69                 return(1);
70         if (ip1->obj_id < ip2->obj_id)
71                 return(-1);
72         if (ip1->obj_id > ip2->obj_id)
73                 return(1);
74         if (ip1->obj_asof < ip2->obj_asof)
75                 return(-1);
76         if (ip1->obj_asof > ip2->obj_asof)
77                 return(1);
78         return(0);
79 }
80
81 /*
82  * RB-Tree support for inode structures / special LOOKUP_INFO
83  */
84 static int
85 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
86 {
87         if (info->obj_localization < ip->obj_localization)
88                 return(-1);
89         if (info->obj_localization > ip->obj_localization)
90                 return(1);
91         if (info->obj_id < ip->obj_id)
92                 return(-1);
93         if (info->obj_id > ip->obj_id)
94                 return(1);
95         if (info->obj_asof < ip->obj_asof)
96                 return(-1);
97         if (info->obj_asof > ip->obj_asof)
98                 return(1);
99         return(0);
100 }
101
102 /*
103  * Used by hammer_scan_inode_snapshots() to locate all of an object's
104  * snapshots.  Note that the asof field is not tested, which we can get
105  * away with because it is the lowest-priority field.
106  */
107 static int
108 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
109 {
110         hammer_inode_info_t info = data;
111
112         if (ip->obj_localization > info->obj_localization)
113                 return(1);
114         if (ip->obj_localization < info->obj_localization)
115                 return(-1);
116         if (ip->obj_id > info->obj_id)
117                 return(1);
118         if (ip->obj_id < info->obj_id)
119                 return(-1);
120         return(0);
121 }
122
123 /*
124  * Used by hammer_unload_pseudofs() to locate all inodes associated with
125  * a particular PFS.
126  */
127 static int
128 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
129 {
130         u_int32_t localization = *(u_int32_t *)data;
131         if (ip->obj_localization > localization)
132                 return(1);
133         if (ip->obj_localization < localization)
134                 return(-1);
135         return(0);
136 }
137
138 /*
139  * RB-Tree support for pseudofs structures
140  */
141 static int
142 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
143 {
144         if (p1->localization < p2->localization)
145                 return(-1);
146         if (p1->localization > p2->localization)
147                 return(1);
148         return(0);
149 }
150
151
/*
 * Instantiate the red-black tree code used by this file:
 *
 *  - hammer_ino_rb_tree: in-memory inodes linked through rb_node and
 *    ordered by hammer_ino_rb_compare(), plus an INFO lookup variant that
 *    takes a hammer_inode_info_t key and uses hammer_inode_info_cmp().
 *  - hammer_pfs_rb_tree: in-memory pseudofs structures ordered by
 *    hammer_pfs_rb_compare(), with the u_int32_t localization field
 *    named as the key.
 */
RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node,
            hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
                    hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
             hammer_pfs_rb_compare, u_int32_t, localization);
157
158 /*
159  * The kernel is not actively referencing this vnode but is still holding
160  * it cached.
161  *
162  * This is called from the frontend.
163  */
164 int
165 hammer_vop_inactive(struct vop_inactive_args *ap)
166 {
167         struct hammer_inode *ip = VTOI(ap->a_vp);
168
169         /*
170          * Degenerate case
171          */
172         if (ip == NULL) {
173                 vrecycle(ap->a_vp);
174                 return(0);
175         }
176
177         /*
178          * If the inode no longer has visibility in the filesystem try to
179          * recycle it immediately, even if the inode is dirty.  Recycling
180          * it quickly allows the system to reclaim buffer cache and VM
181          * resources which can matter a lot in a heavily loaded system.
182          *
183          * This can deadlock in vfsync() if we aren't careful.
184          * 
185          * Do not queue the inode to the flusher if we still have visibility,
186          * otherwise namespace calls such as chmod will unnecessarily generate
187          * multiple inode updates.
188          */
189         hammer_inode_unloadable_check(ip, 0);
190         if (ip->ino_data.nlinks == 0) {
191                 if (ip->flags & HAMMER_INODE_MODMASK)
192                         hammer_flush_inode(ip, 0);
193                 vrecycle(ap->a_vp);
194         }
195         return(0);
196 }
197
198 /*
199  * Release the vnode association.  This is typically (but not always)
200  * the last reference on the inode.
201  *
202  * Once the association is lost we are on our own with regards to
203  * flushing the inode.
204  */
205 int
206 hammer_vop_reclaim(struct vop_reclaim_args *ap)
207 {
208         struct hammer_inode *ip;
209         hammer_mount_t hmp;
210         struct vnode *vp;
211
212         vp = ap->a_vp;
213
214         if ((ip = vp->v_data) != NULL) {
215                 hmp = ip->hmp;
216                 vp->v_data = NULL;
217                 ip->vp = NULL;
218
219                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
220                         ++hammer_count_reclaiming;
221                         ++hmp->inode_reclaims;
222                         ip->flags |= HAMMER_INODE_RECLAIM;
223                 }
224                 hammer_rel_inode(ip, 1);
225         }
226         return(0);
227 }
228
229 /*
230  * Return a locked vnode for the specified inode.  The inode must be
231  * referenced but NOT LOCKED on entry and will remain referenced on
232  * return.
233  *
234  * Called from the frontend.
235  */
236 int
237 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
238 {
239         hammer_mount_t hmp;
240         struct vnode *vp;
241         int error = 0;
242         u_int8_t obj_type;
243
244         hmp = ip->hmp;
245
246         for (;;) {
247                 if ((vp = ip->vp) == NULL) {
248                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
249                         if (error)
250                                 break;
251                         hammer_lock_ex(&ip->lock);
252                         if (ip->vp != NULL) {
253                                 hammer_unlock(&ip->lock);
254                                 vp = *vpp;
255                                 vp->v_type = VBAD;
256                                 vx_put(vp);
257                                 continue;
258                         }
259                         hammer_ref(&ip->lock);
260                         vp = *vpp;
261                         ip->vp = vp;
262
263                         obj_type = ip->ino_data.obj_type;
264                         vp->v_type = hammer_get_vnode_type(obj_type);
265
266                         hammer_inode_wakereclaims(ip);
267
268                         switch(ip->ino_data.obj_type) {
269                         case HAMMER_OBJTYPE_CDEV:
270                         case HAMMER_OBJTYPE_BDEV:
271                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
272                                 addaliasu(vp, ip->ino_data.rmajor,
273                                           ip->ino_data.rminor);
274                                 break;
275                         case HAMMER_OBJTYPE_FIFO:
276                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
277                                 break;
278                         case HAMMER_OBJTYPE_REGFILE:
279                                 /*
280                                  * MPSAFE read supported.
281                                  */
282                                 vp->v_flag |= VMP_READ;
283                                 break;
284                         default:
285                                 break;
286                         }
287                         vp->v_flag |= VMP_GETATTR;
288
289                         /*
290                          * Only mark as the root vnode if the ip is not
291                          * historical, otherwise the VFS cache will get
292                          * confused.  The other half of the special handling
293                          * is in hammer_vop_nlookupdotdot().
294                          *
295                          * Pseudo-filesystem roots can be accessed via
296                          * non-root filesystem paths and setting VROOT may
297                          * confuse the namecache.  Set VPFSROOT instead.
298                          */
299                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
300                             ip->obj_asof == hmp->asof) {
301                                 if (ip->obj_localization == 0)
302                                         vp->v_flag |= VROOT;
303                                 else
304                                         vp->v_flag |= VPFSROOT;
305                         }
306
307                         vp->v_data = (void *)ip;
308                         /* vnode locked by getnewvnode() */
309                         /* make related vnode dirty if inode dirty? */
310                         hammer_unlock(&ip->lock);
311                         if (vp->v_type == VREG)
312                                 vinitvmio(vp, ip->ino_data.size);
313                         break;
314                 }
315
316                 /*
317                  * loop if the vget fails (aka races), or if the vp
318                  * no longer matches ip->vp.
319                  */
320                 if (vget(vp, LK_EXCLUSIVE) == 0) {
321                         if (vp == ip->vp)
322                                 break;
323                         vput(vp);
324                 }
325         }
326         *vpp = vp;
327         return(error);
328 }
329
330 /*
331  * Locate all copies of the inode for obj_id compatible with the specified
332  * asof, reference, and issue the related call-back.  This routine is used
333  * for direct-io invalidation and does not create any new inodes.
334  */
335 void
336 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
337                             int (*callback)(hammer_inode_t ip, void *data),
338                             void *data)
339 {
340         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
341                                    hammer_inode_info_cmp_all_history,
342                                    callback, iinfo);
343 }
344
345 /*
346  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
347  * do not attach or detach the related vnode (use hammer_get_vnode() for
348  * that).
349  *
350  * The flags argument is only applied for newly created inodes, and only
351  * certain flags are inherited.
352  *
353  * Called from the frontend.
354  */
355 struct hammer_inode *
356 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
357                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
358                  int flags, int *errorp)
359 {
360         hammer_mount_t hmp = trans->hmp;
361         struct hammer_node_cache *cachep;
362         struct hammer_inode_info iinfo;
363         struct hammer_cursor cursor;
364         struct hammer_inode *ip;
365
366
367         /*
368          * Determine if we already have an inode cached.  If we do then
369          * we are golden.
370          *
371          * If we find an inode with no vnode we have to mark the
372          * transaction such that hammer_inode_waitreclaims() is
373          * called later on to avoid building up an infinite number
374          * of inodes.  Otherwise we can continue to * add new inodes
375          * faster then they can be disposed of, even with the tsleep
376          * delay.
377          *
378          * If we find a dummy inode we return a failure so dounlink
379          * (which does another lookup) doesn't try to mess with the
380          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
381          * to ref dummy inodes.
382          */
383         iinfo.obj_id = obj_id;
384         iinfo.obj_asof = asof;
385         iinfo.obj_localization = localization;
386 loop:
387         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
388         if (ip) {
389                 if (ip->flags & HAMMER_INODE_DUMMY) {
390                         *errorp = ENOENT;
391                         return(NULL);
392                 }
393                 hammer_ref(&ip->lock);
394                 *errorp = 0;
395                 return(ip);
396         }
397
398         /*
399          * Allocate a new inode structure and deal with races later.
400          */
401         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
402         ++hammer_count_inodes;
403         ++hmp->count_inodes;
404         ip->obj_id = obj_id;
405         ip->obj_asof = iinfo.obj_asof;
406         ip->obj_localization = localization;
407         ip->hmp = hmp;
408         ip->flags = flags & HAMMER_INODE_RO;
409         ip->cache[0].ip = ip;
410         ip->cache[1].ip = ip;
411         ip->cache[2].ip = ip;
412         ip->cache[3].ip = ip;
413         if (hmp->ronly)
414                 ip->flags |= HAMMER_INODE_RO;
415         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
416                 0x7FFFFFFFFFFFFFFFLL;
417         RB_INIT(&ip->rec_tree);
418         TAILQ_INIT(&ip->target_list);
419         hammer_ref(&ip->lock);
420
421         /*
422          * Locate the on-disk inode.  If this is a PFS root we always
423          * access the current version of the root inode and (if it is not
424          * a master) always access information under it with a snapshot
425          * TID.
426          *
427          * We cache recent inode lookups in this directory in dip->cache[2].
428          * If we can't find it we assume the inode we are looking for is
429          * close to the directory inode.
430          */
431 retry:
432         cachep = NULL;
433         if (dip) {
434                 if (dip->cache[2].node)
435                         cachep = &dip->cache[2];
436                 else
437                         cachep = &dip->cache[0];
438         }
439         hammer_init_cursor(trans, &cursor, cachep, NULL);
440         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
441         cursor.key_beg.obj_id = ip->obj_id;
442         cursor.key_beg.key = 0;
443         cursor.key_beg.create_tid = 0;
444         cursor.key_beg.delete_tid = 0;
445         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
446         cursor.key_beg.obj_type = 0;
447
448         cursor.asof = iinfo.obj_asof;
449         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
450                        HAMMER_CURSOR_ASOF;
451
452         *errorp = hammer_btree_lookup(&cursor);
453         if (*errorp == EDEADLK) {
454                 hammer_done_cursor(&cursor);
455                 goto retry;
456         }
457
458         /*
459          * On success the B-Tree lookup will hold the appropriate
460          * buffer cache buffers and provide a pointer to the requested
461          * information.  Copy the information to the in-memory inode
462          * and cache the B-Tree node to improve future operations.
463          */
464         if (*errorp == 0) {
465                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
466                 ip->ino_data = cursor.data->inode;
467
468                 /*
469                  * cache[0] tries to cache the location of the object inode.
470                  * The assumption is that it is near the directory inode.
471                  *
472                  * cache[1] tries to cache the location of the object data.
473                  * We might have something in the governing directory from
474                  * scan optimizations (see the strategy code in
475                  * hammer_vnops.c).
476                  *
477                  * We update dip->cache[2], if possible, with the location
478                  * of the object inode for future directory shortcuts.
479                  */
480                 hammer_cache_node(&ip->cache[0], cursor.node);
481                 if (dip) {
482                         if (dip->cache[3].node) {
483                                 hammer_cache_node(&ip->cache[1],
484                                                   dip->cache[3].node);
485                         }
486                         hammer_cache_node(&dip->cache[2], cursor.node);
487                 }
488
489                 /*
490                  * The file should not contain any data past the file size
491                  * stored in the inode.  Setting save_trunc_off to the
492                  * file size instead of max reduces B-Tree lookup overheads
493                  * on append by allowing the flusher to avoid checking for
494                  * record overwrites.
495                  */
496                 ip->save_trunc_off = ip->ino_data.size;
497
498                 /*
499                  * Locate and assign the pseudofs management structure to
500                  * the inode.
501                  */
502                 if (dip && dip->obj_localization == ip->obj_localization) {
503                         ip->pfsm = dip->pfsm;
504                         hammer_ref(&ip->pfsm->lock);
505                 } else {
506                         ip->pfsm = hammer_load_pseudofs(trans,
507                                                         ip->obj_localization,
508                                                         errorp);
509                         *errorp = 0;    /* ignore ENOENT */
510                 }
511         }
512
513         /*
514          * The inode is placed on the red-black tree and will be synced to
515          * the media when flushed or by the filesystem sync.  If this races
516          * another instantiation/lookup the insertion will fail.
517          */
518         if (*errorp == 0) {
519                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
520                         hammer_free_inode(ip);
521                         hammer_done_cursor(&cursor);
522                         goto loop;
523                 }
524                 ip->flags |= HAMMER_INODE_ONDISK;
525         } else {
526                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
527                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
528                         --hmp->rsv_inodes;
529                 }
530
531                 hammer_free_inode(ip);
532                 ip = NULL;
533         }
534         hammer_done_cursor(&cursor);
535         trans->flags |= HAMMER_TRANSF_NEWINODE;
536         return (ip);
537 }
538
539 /*
540  * Get a dummy inode to placemark a broken directory entry.
541  */
542 struct hammer_inode *
543 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
544                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
545                  int flags, int *errorp)
546 {
547         hammer_mount_t hmp = trans->hmp;
548         struct hammer_inode_info iinfo;
549         struct hammer_inode *ip;
550
551         /*
552          * Determine if we already have an inode cached.  If we do then
553          * we are golden.
554          *
555          * If we find an inode with no vnode we have to mark the
556          * transaction such that hammer_inode_waitreclaims() is
557          * called later on to avoid building up an infinite number
558          * of inodes.  Otherwise we can continue to * add new inodes
559          * faster then they can be disposed of, even with the tsleep
560          * delay.
561          *
562          * If we find a non-fake inode we return an error.  Only fake
563          * inodes can be returned by this routine.
564          */
565         iinfo.obj_id = obj_id;
566         iinfo.obj_asof = asof;
567         iinfo.obj_localization = localization;
568 loop:
569         *errorp = 0;
570         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
571         if (ip) {
572                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
573                         *errorp = ENOENT;
574                         return(NULL);
575                 }
576                 hammer_ref(&ip->lock);
577                 return(ip);
578         }
579
580         /*
581          * Allocate a new inode structure and deal with races later.
582          */
583         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
584         ++hammer_count_inodes;
585         ++hmp->count_inodes;
586         ip->obj_id = obj_id;
587         ip->obj_asof = iinfo.obj_asof;
588         ip->obj_localization = localization;
589         ip->hmp = hmp;
590         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
591         ip->cache[0].ip = ip;
592         ip->cache[1].ip = ip;
593         ip->cache[2].ip = ip;
594         ip->cache[3].ip = ip;
595         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
596                 0x7FFFFFFFFFFFFFFFLL;
597         RB_INIT(&ip->rec_tree);
598         TAILQ_INIT(&ip->target_list);
599         hammer_ref(&ip->lock);
600
601         /*
602          * Populate the dummy inode.  Leave everything zero'd out.
603          *
604          * (ip->ino_leaf and ip->ino_data)
605          *
606          * Make the dummy inode a FIFO object which most copy programs
607          * will properly ignore.
608          */
609         ip->save_trunc_off = ip->ino_data.size;
610         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
611
612         /*
613          * Locate and assign the pseudofs management structure to
614          * the inode.
615          */
616         if (dip && dip->obj_localization == ip->obj_localization) {
617                 ip->pfsm = dip->pfsm;
618                 hammer_ref(&ip->pfsm->lock);
619         } else {
620                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
621                                                 errorp);
622                 *errorp = 0;    /* ignore ENOENT */
623         }
624
625         /*
626          * The inode is placed on the red-black tree and will be synced to
627          * the media when flushed or by the filesystem sync.  If this races
628          * another instantiation/lookup the insertion will fail.
629          *
630          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
631          */
632         if (*errorp == 0) {
633                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
634                         hammer_free_inode(ip);
635                         goto loop;
636                 }
637         } else {
638                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
639                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
640                         --hmp->rsv_inodes;
641                 }
642                 hammer_free_inode(ip);
643                 ip = NULL;
644         }
645         trans->flags |= HAMMER_TRANSF_NEWINODE;
646         return (ip);
647 }
648
649 /*
650  * Return a referenced inode only if it is in our inode cache.
651  *
652  * Dummy inodes do not count.
653  */
654 struct hammer_inode *
655 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
656                   hammer_tid_t asof, u_int32_t localization)
657 {
658         hammer_mount_t hmp = trans->hmp;
659         struct hammer_inode_info iinfo;
660         struct hammer_inode *ip;
661
662         iinfo.obj_id = obj_id;
663         iinfo.obj_asof = asof;
664         iinfo.obj_localization = localization;
665
666         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
667         if (ip) {
668                 if (ip->flags & HAMMER_INODE_DUMMY)
669                         ip = NULL;
670                 else
671                         hammer_ref(&ip->lock);
672         }
673         return(ip);
674 }
675
676 /*
677  * Create a new filesystem object, returning the inode in *ipp.  The
678  * returned inode will be referenced.  The inode is created in-memory.
679  *
680  * If pfsm is non-NULL the caller wishes to create the root inode for
681  * a master PFS.
682  */
683 int
684 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
685                     struct ucred *cred,
686                     hammer_inode_t dip, const char *name, int namelen,
687                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
688 {
689         hammer_mount_t hmp;
690         hammer_inode_t ip;
691         uid_t xuid;
692         int error;
693         int64_t namekey;
694         u_int32_t dummy;
695
696         hmp = trans->hmp;
697
698         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
699         ++hammer_count_inodes;
700         ++hmp->count_inodes;
701         trans->flags |= HAMMER_TRANSF_NEWINODE;
702
703         if (pfsm) {
704                 KKASSERT(pfsm->localization != 0);
705                 ip->obj_id = HAMMER_OBJID_ROOT;
706                 ip->obj_localization = pfsm->localization;
707         } else {
708                 KKASSERT(dip != NULL);
709                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
710                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
711                 ip->obj_localization = dip->obj_localization;
712         }
713
714         KKASSERT(ip->obj_id != 0);
715         ip->obj_asof = hmp->asof;
716         ip->hmp = hmp;
717         ip->flush_state = HAMMER_FST_IDLE;
718         ip->flags = HAMMER_INODE_DDIRTY |
719                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
720         ip->cache[0].ip = ip;
721         ip->cache[1].ip = ip;
722         ip->cache[2].ip = ip;
723         ip->cache[3].ip = ip;
724
725         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
726         /* ip->save_trunc_off = 0; (already zero) */
727         RB_INIT(&ip->rec_tree);
728         TAILQ_INIT(&ip->target_list);
729
730         ip->ino_data.atime = trans->time;
731         ip->ino_data.mtime = trans->time;
732         ip->ino_data.size = 0;
733         ip->ino_data.nlinks = 0;
734
735         /*
736          * A nohistory designator on the parent directory is inherited by
737          * the child.  We will do this even for pseudo-fs creation... the
738          * sysad can turn it off.
739          */
740         if (dip) {
741                 ip->ino_data.uflags = dip->ino_data.uflags &
742                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
743         }
744
745         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
746         ip->ino_leaf.base.localization = ip->obj_localization +
747                                          HAMMER_LOCALIZE_INODE;
748         ip->ino_leaf.base.obj_id = ip->obj_id;
749         ip->ino_leaf.base.key = 0;
750         ip->ino_leaf.base.create_tid = 0;
751         ip->ino_leaf.base.delete_tid = 0;
752         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
753         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
754
755         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
756         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
757         ip->ino_data.mode = vap->va_mode;
758         ip->ino_data.ctime = trans->time;
759
760         /*
761          * If we are running version 2 or greater directory entries are
762          * inode-localized instead of data-localized.
763          */
764         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
765                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
766                         ip->ino_data.cap_flags |=
767                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
768                 }
769         }
770
771         /*
772          * Setup the ".." pointer.  This only needs to be done for directories
773          * but we do it for all objects as a recovery aid.
774          */
775         if (dip)
776                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
777 #if 0
778         /*
779          * The parent_obj_localization field only applies to pseudo-fs roots.
780          * XXX this is no longer applicable, PFSs are no longer directly
781          * tied into the parent's directory structure.
782          */
783         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
784             ip->obj_id == HAMMER_OBJID_ROOT) {
785                 ip->ino_data.ext.obj.parent_obj_localization = 
786                                                 dip->obj_localization;
787         }
788 #endif
789
790         switch(ip->ino_leaf.base.obj_type) {
791         case HAMMER_OBJTYPE_CDEV:
792         case HAMMER_OBJTYPE_BDEV:
793                 ip->ino_data.rmajor = vap->va_rmajor;
794                 ip->ino_data.rminor = vap->va_rminor;
795                 break;
796         default:
797                 break;
798         }
799
800         /*
801          * Calculate default uid/gid and overwrite with information from
802          * the vap.
803          */
804         if (dip) {
805                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
806                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
807                                              xuid, cred, &vap->va_mode);
808         } else {
809                 xuid = 0;
810         }
811         ip->ino_data.mode = vap->va_mode;
812
813         if (vap->va_vaflags & VA_UID_UUID_VALID)
814                 ip->ino_data.uid = vap->va_uid_uuid;
815         else if (vap->va_uid != (uid_t)VNOVAL)
816                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
817         else
818                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
819
820         if (vap->va_vaflags & VA_GID_UUID_VALID)
821                 ip->ino_data.gid = vap->va_gid_uuid;
822         else if (vap->va_gid != (gid_t)VNOVAL)
823                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
824         else if (dip)
825                 ip->ino_data.gid = dip->ino_data.gid;
826
827         hammer_ref(&ip->lock);
828
829         if (pfsm) {
830                 ip->pfsm = pfsm;
831                 hammer_ref(&pfsm->lock);
832                 error = 0;
833         } else if (dip->obj_localization == ip->obj_localization) {
834                 ip->pfsm = dip->pfsm;
835                 hammer_ref(&ip->pfsm->lock);
836                 error = 0;
837         } else {
838                 ip->pfsm = hammer_load_pseudofs(trans,
839                                                 ip->obj_localization,
840                                                 &error);
841                 error = 0;      /* ignore ENOENT */
842         }
843
844         if (error) {
845                 hammer_free_inode(ip);
846                 ip = NULL;
847         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
848                 panic("hammer_create_inode: duplicate obj_id %llx",
849                       (long long)ip->obj_id);
850                 /* not reached */
851                 hammer_free_inode(ip);
852         }
853         *ipp = ip;
854         return(error);
855 }
856
857 /*
858  * Final cleanup / freeing of an inode structure
859  */
860 static void
861 hammer_free_inode(hammer_inode_t ip)
862 {
863         struct hammer_mount *hmp;
864
865         hmp = ip->hmp;
866         KKASSERT(ip->lock.refs == 1);
867         hammer_uncache_node(&ip->cache[0]);
868         hammer_uncache_node(&ip->cache[1]);
869         hammer_uncache_node(&ip->cache[2]);
870         hammer_uncache_node(&ip->cache[3]);
871         hammer_inode_wakereclaims(ip);
872         if (ip->objid_cache)
873                 hammer_clear_objid(ip);
874         --hammer_count_inodes;
875         --hmp->count_inodes;
876         if (ip->pfsm) {
877                 hammer_rel_pseudofs(hmp, ip->pfsm);
878                 ip->pfsm = NULL;
879         }
880         kfree(ip, hmp->m_inodes);
881         ip = NULL;
882 }
883
884 /*
885  * Retrieve pseudo-fs data.  NULL will never be returned.
886  *
887  * If an error occurs *errorp will be set and a default template is returned,
888  * otherwise *errorp is set to 0.  Typically when an error occurs it will
889  * be ENOENT.
890  */
891 hammer_pseudofs_inmem_t
892 hammer_load_pseudofs(hammer_transaction_t trans,
893                      u_int32_t localization, int *errorp)
894 {
895         hammer_mount_t hmp = trans->hmp;
896         hammer_inode_t ip;
897         hammer_pseudofs_inmem_t pfsm;
898         struct hammer_cursor cursor;
899         int bytes;
900
901 retry:
902         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
903         if (pfsm) {
904                 hammer_ref(&pfsm->lock);
905                 *errorp = 0;
906                 return(pfsm);
907         }
908
909         /*
910          * PFS records are stored in the root inode (not the PFS root inode,
911          * but the real root).  Avoid an infinite recursion if loading
912          * the PFS for the real root.
913          */
914         if (localization) {
915                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
916                                       HAMMER_MAX_TID,
917                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
918         } else {
919                 ip = NULL;
920         }
921
922         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
923         pfsm->localization = localization;
924         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
925         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
926
927         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
928         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
929                                       HAMMER_LOCALIZE_MISC;
930         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
931         cursor.key_beg.create_tid = 0;
932         cursor.key_beg.delete_tid = 0;
933         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
934         cursor.key_beg.obj_type = 0;
935         cursor.key_beg.key = localization;
936         cursor.asof = HAMMER_MAX_TID;
937         cursor.flags |= HAMMER_CURSOR_ASOF;
938
939         if (ip)
940                 *errorp = hammer_ip_lookup(&cursor);
941         else
942                 *errorp = hammer_btree_lookup(&cursor);
943         if (*errorp == 0) {
944                 *errorp = hammer_ip_resolve_data(&cursor);
945                 if (*errorp == 0) {
946                         if (cursor.data->pfsd.mirror_flags &
947                             HAMMER_PFSD_DELETED) {
948                                 *errorp = ENOENT;
949                         } else {
950                                 bytes = cursor.leaf->data_len;
951                                 if (bytes > sizeof(pfsm->pfsd))
952                                         bytes = sizeof(pfsm->pfsd);
953                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
954                         }
955                 }
956         }
957         hammer_done_cursor(&cursor);
958
959         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
960         hammer_ref(&pfsm->lock);
961         if (ip)
962                 hammer_rel_inode(ip, 0);
963         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
964                 kfree(pfsm, hmp->m_misc);
965                 goto retry;
966         }
967         return(pfsm);
968 }
969
970 /*
971  * Store pseudo-fs data.  The backend will automatically delete any prior
972  * on-disk pseudo-fs data but we have to delete in-memory versions.
973  */
974 int
975 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
976 {
977         struct hammer_cursor cursor;
978         hammer_record_t record;
979         hammer_inode_t ip;
980         int error;
981
982         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
983                               HAMMER_DEF_LOCALIZATION, 0, &error);
984 retry:
985         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
986         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
987         cursor.key_beg.localization = ip->obj_localization +
988                                       HAMMER_LOCALIZE_MISC;
989         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
990         cursor.key_beg.create_tid = 0;
991         cursor.key_beg.delete_tid = 0;
992         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
993         cursor.key_beg.obj_type = 0;
994         cursor.key_beg.key = pfsm->localization;
995         cursor.asof = HAMMER_MAX_TID;
996         cursor.flags |= HAMMER_CURSOR_ASOF;
997
998         /*
999          * Replace any in-memory version of the record.
1000          */
1001         error = hammer_ip_lookup(&cursor);
1002         if (error == 0 && hammer_cursor_inmem(&cursor)) {
1003                 record = cursor.iprec;
1004                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1005                         KKASSERT(cursor.deadlk_rec == NULL);
1006                         hammer_ref(&record->lock);
1007                         cursor.deadlk_rec = record;
1008                         error = EDEADLK;
1009                 } else {
1010                         record->flags |= HAMMER_RECF_DELETED_FE;
1011                         error = 0;
1012                 }
1013         }
1014
1015         /*
1016          * Allocate replacement general record.  The backend flush will
1017          * delete any on-disk version of the record.
1018          */
1019         if (error == 0 || error == ENOENT) {
1020                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1021                 record->type = HAMMER_MEM_RECORD_GENERAL;
1022
1023                 record->leaf.base.localization = ip->obj_localization +
1024                                                  HAMMER_LOCALIZE_MISC;
1025                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1026                 record->leaf.base.key = pfsm->localization;
1027                 record->leaf.data_len = sizeof(pfsm->pfsd);
1028                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1029                 error = hammer_ip_add_record(trans, record);
1030         }
1031         hammer_done_cursor(&cursor);
1032         if (error == EDEADLK)
1033                 goto retry;
1034         hammer_rel_inode(ip, 0);
1035         return(error);
1036 }
1037
1038 /*
1039  * Create a root directory for a PFS if one does not alredy exist.
1040  *
1041  * The PFS root stands alone so we must also bump the nlinks count
1042  * to prevent it from being destroyed on release.
1043  */
1044 int
1045 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1046                        hammer_pseudofs_inmem_t pfsm)
1047 {
1048         hammer_inode_t ip;
1049         struct vattr vap;
1050         int error;
1051
1052         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1053                               pfsm->localization, 0, &error);
1054         if (ip == NULL) {
1055                 vattr_null(&vap);
1056                 vap.va_mode = 0755;
1057                 vap.va_type = VDIR;
1058                 error = hammer_create_inode(trans, &vap, cred,
1059                                             NULL, NULL, 0,
1060                                             pfsm, &ip);
1061                 if (error == 0) {
1062                         ++ip->ino_data.nlinks;
1063                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1064                 }
1065         }
1066         if (ip)
1067                 hammer_rel_inode(ip, 0);
1068         return(error);
1069 }
1070
1071 /*
1072  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1073  * if we are unable to disassociate all the inodes.
1074  */
1075 static
1076 int
1077 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1078 {
1079         int res;
1080
1081         hammer_ref(&ip->lock);
1082         if (ip->lock.refs == 2 && ip->vp)
1083                 vclean_unlocked(ip->vp);
1084         if (ip->lock.refs == 1 && ip->vp == NULL)
1085                 res = 0;
1086         else
1087                 res = -1;       /* stop, someone is using the inode */
1088         hammer_rel_inode(ip, 0);
1089         return(res);
1090 }
1091
1092 int
1093 hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1094 {
1095         int res;
1096         int try;
1097
1098         for (try = res = 0; try < 4; ++try) {
1099                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1100                                            hammer_inode_pfs_cmp,
1101                                            hammer_unload_pseudofs_callback,
1102                                            &localization);
1103                 if (res == 0 && try > 1)
1104                         break;
1105                 hammer_flusher_sync(trans->hmp);
1106         }
1107         if (res != 0)
1108                 res = ENOTEMPTY;
1109         return(res);
1110 }
1111
1112
1113 /*
1114  * Release a reference on a PFS
1115  */
1116 void
1117 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1118 {
1119         hammer_unref(&pfsm->lock);
1120         if (pfsm->lock.refs == 0) {
1121                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1122                 kfree(pfsm, hmp->m_misc);
1123         }
1124 }
1125
1126 /*
1127  * Called by hammer_sync_inode().
1128  */
1129 static int
1130 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1131 {
1132         hammer_transaction_t trans = cursor->trans;
1133         hammer_record_t record;
1134         int error;
1135         int redirty;
1136
1137 retry:
1138         error = 0;
1139
1140         /*
1141          * If the inode has a presence on-disk then locate it and mark
1142          * it deleted, setting DELONDISK.
1143          *
1144          * The record may or may not be physically deleted, depending on
1145          * the retention policy.
1146          */
1147         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1148             HAMMER_INODE_ONDISK) {
1149                 hammer_normalize_cursor(cursor);
1150                 cursor->key_beg.localization = ip->obj_localization + 
1151                                                HAMMER_LOCALIZE_INODE;
1152                 cursor->key_beg.obj_id = ip->obj_id;
1153                 cursor->key_beg.key = 0;
1154                 cursor->key_beg.create_tid = 0;
1155                 cursor->key_beg.delete_tid = 0;
1156                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1157                 cursor->key_beg.obj_type = 0;
1158                 cursor->asof = ip->obj_asof;
1159                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1160                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1161                 cursor->flags |= HAMMER_CURSOR_BACKEND;
1162
1163                 error = hammer_btree_lookup(cursor);
1164                 if (hammer_debug_inode)
1165                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1166
1167                 if (error == 0) {
1168                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1169                         if (hammer_debug_inode)
1170                                 kprintf(" error %d\n", error);
1171                         if (error == 0) {
1172                                 ip->flags |= HAMMER_INODE_DELONDISK;
1173                         }
1174                         if (cursor->node)
1175                                 hammer_cache_node(&ip->cache[0], cursor->node);
1176                 }
1177                 if (error == EDEADLK) {
1178                         hammer_done_cursor(cursor);
1179                         error = hammer_init_cursor(trans, cursor,
1180                                                    &ip->cache[0], ip);
1181                         if (hammer_debug_inode)
1182                                 kprintf("IPDED %p %d\n", ip, error);
1183                         if (error == 0)
1184                                 goto retry;
1185                 }
1186         }
1187
1188         /*
1189          * Ok, write out the initial record or a new record (after deleting
1190          * the old one), unless the DELETED flag is set.  This routine will
1191          * clear DELONDISK if it writes out a record.
1192          *
1193          * Update our inode statistics if this is the first application of
1194          * the inode on-disk.
1195          */
1196         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1197                 /*
1198                  * Generate a record and write it to the media.  We clean-up
1199                  * the state before releasing so we do not have to set-up
1200                  * a flush_group.
1201                  */
1202                 record = hammer_alloc_mem_record(ip, 0);
1203                 record->type = HAMMER_MEM_RECORD_INODE;
1204                 record->flush_state = HAMMER_FST_FLUSH;
1205                 record->leaf = ip->sync_ino_leaf;
1206                 record->leaf.base.create_tid = trans->tid;
1207                 record->leaf.data_len = sizeof(ip->sync_ino_data);
1208                 record->leaf.create_ts = trans->time32;
1209                 record->data = (void *)&ip->sync_ino_data;
1210                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1211
1212                 /*
1213                  * If this flag is set we cannot sync the new file size
1214                  * because we haven't finished related truncations.  The
1215                  * inode will be flushed in another flush group to finish
1216                  * the job.
1217                  */
1218                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1219                     ip->sync_ino_data.size != ip->ino_data.size) {
1220                         redirty = 1;
1221                         ip->sync_ino_data.size = ip->ino_data.size;
1222                 } else {
1223                         redirty = 0;
1224                 }
1225
1226                 for (;;) {
1227                         error = hammer_ip_sync_record_cursor(cursor, record);
1228                         if (hammer_debug_inode)
1229                                 kprintf("GENREC %p rec %08x %d\n",      
1230                                         ip, record->flags, error);
1231                         if (error != EDEADLK)
1232                                 break;
1233                         hammer_done_cursor(cursor);
1234                         error = hammer_init_cursor(trans, cursor,
1235                                                    &ip->cache[0], ip);
1236                         if (hammer_debug_inode)
1237                                 kprintf("GENREC reinit %d\n", error);
1238                         if (error)
1239                                 break;
1240                 }
1241
1242                 /*
1243                  * Note:  The record was never on the inode's record tree
1244                  * so just wave our hands importantly and destroy it.
1245                  */
1246                 record->flags |= HAMMER_RECF_COMMITTED;
1247                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1248                 record->flush_state = HAMMER_FST_IDLE;
1249                 ++ip->rec_generation;
1250                 hammer_rel_mem_record(record);
1251
1252                 /*
1253                  * Finish up.
1254                  */
1255                 if (error == 0) {
1256                         if (hammer_debug_inode)
1257                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1258                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1259                                             HAMMER_INODE_ATIME |
1260                                             HAMMER_INODE_MTIME);
1261                         ip->flags &= ~HAMMER_INODE_DELONDISK;
1262                         if (redirty)
1263                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1264
1265                         /*
1266                          * Root volume count of inodes
1267                          */
1268                         hammer_sync_lock_sh(trans);
1269                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1270                                 hammer_modify_volume_field(trans,
1271                                                            trans->rootvol,
1272                                                            vol0_stat_inodes);
1273                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1274                                 hammer_modify_volume_done(trans->rootvol);
1275                                 ip->flags |= HAMMER_INODE_ONDISK;
1276                                 if (hammer_debug_inode)
1277                                         kprintf("NOWONDISK %p\n", ip);
1278                         }
1279                         hammer_sync_unlock(trans);
1280                 }
1281         }
1282
1283         /*
1284          * If the inode has been destroyed, clean out any left-over flags
1285          * that may have been set by the frontend.
1286          */
1287         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
1288                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1289                                     HAMMER_INODE_ATIME |
1290                                     HAMMER_INODE_MTIME);
1291         }
1292         return(error);
1293 }
1294
1295 /*
1296  * Update only the itimes fields.
1297  *
1298  * ATIME can be updated without generating any UNDO.  MTIME is updated
1299  * with UNDO so it is guaranteed to be synchronized properly in case of
1300  * a crash.
1301  *
1302  * Neither field is included in the B-Tree leaf element's CRC, which is how
1303  * we can get away with updating ATIME the way we do.
1304  */
1305 static int
1306 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1307 {
1308         hammer_transaction_t trans = cursor->trans;
1309         int error;
1310
1311 retry:
1312         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1313             HAMMER_INODE_ONDISK) {
1314                 return(0);
1315         }
1316
1317         hammer_normalize_cursor(cursor);
1318         cursor->key_beg.localization = ip->obj_localization + 
1319                                        HAMMER_LOCALIZE_INODE;
1320         cursor->key_beg.obj_id = ip->obj_id;
1321         cursor->key_beg.key = 0;
1322         cursor->key_beg.create_tid = 0;
1323         cursor->key_beg.delete_tid = 0;
1324         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1325         cursor->key_beg.obj_type = 0;
1326         cursor->asof = ip->obj_asof;
1327         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1328         cursor->flags |= HAMMER_CURSOR_ASOF;
1329         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1330         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1331         cursor->flags |= HAMMER_CURSOR_BACKEND;
1332
1333         error = hammer_btree_lookup(cursor);
1334         if (error == 0) {
1335                 hammer_cache_node(&ip->cache[0], cursor->node);
1336                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1337                         /*
1338                          * Updating MTIME requires an UNDO.  Just cover
1339                          * both atime and mtime.
1340                          */
1341                         hammer_sync_lock_sh(trans);
1342                         hammer_modify_buffer(trans, cursor->data_buffer,
1343                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
1344                                      HAMMER_ITIMES_BYTES);
1345                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1346                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1347                         hammer_modify_buffer_done(cursor->data_buffer);
1348                         hammer_sync_unlock(trans);
1349                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1350                         /*
1351                          * Updating atime only can be done in-place with
1352                          * no UNDO.
1353                          */
1354                         hammer_sync_lock_sh(trans);
1355                         hammer_modify_buffer(trans, cursor->data_buffer,
1356                                              NULL, 0);
1357                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1358                         hammer_modify_buffer_done(cursor->data_buffer);
1359                         hammer_sync_unlock(trans);
1360                 }
1361                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1362         }
1363         if (error == EDEADLK) {
1364                 hammer_done_cursor(cursor);
1365                 error = hammer_init_cursor(trans, cursor,
1366                                            &ip->cache[0], ip);
1367                 if (error == 0)
1368                         goto retry;
1369         }
1370         return(error);
1371 }
1372
1373 /*
1374  * Release a reference on an inode, flush as requested.
1375  *
1376  * On the last reference we queue the inode to the flusher for its final
1377  * disposition.
1378  */
1379 void
1380 hammer_rel_inode(struct hammer_inode *ip, int flush)
1381 {
1382         /*hammer_mount_t hmp = ip->hmp;*/
1383
1384         /*
1385          * Handle disposition when dropping the last ref.
1386          */
1387         for (;;) {
1388                 if (ip->lock.refs == 1) {
1389                         /*
1390                          * Determine whether on-disk action is needed for
1391                          * the inode's final disposition.
1392                          */
1393                         KKASSERT(ip->vp == NULL);
1394                         hammer_inode_unloadable_check(ip, 0);
1395                         if (ip->flags & HAMMER_INODE_MODMASK) {
1396                                 hammer_flush_inode(ip, 0);
1397                         } else if (ip->lock.refs == 1) {
1398                                 hammer_unload_inode(ip);
1399                                 break;
1400                         }
1401                 } else {
1402                         if (flush)
1403                                 hammer_flush_inode(ip, 0);
1404
1405                         /*
1406                          * The inode still has multiple refs, try to drop
1407                          * one ref.
1408                          */
1409                         KKASSERT(ip->lock.refs >= 1);
1410                         if (ip->lock.refs > 1) {
1411                                 hammer_unref(&ip->lock);
1412                                 break;
1413                         }
1414                 }
1415         }
1416 }
1417
1418 /*
1419  * Unload and destroy the specified inode.  Must be called with one remaining
1420  * reference.  The reference is disposed of.
1421  *
1422  * The inode must be completely clean.
1423  */
1424 static int
1425 hammer_unload_inode(struct hammer_inode *ip)
1426 {
1427         hammer_mount_t hmp = ip->hmp;
1428
1429         KASSERT(ip->lock.refs == 1,
1430                 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
1431         KKASSERT(ip->vp == NULL);
1432         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1433         KKASSERT(ip->cursor_ip_refs == 0);
1434         KKASSERT(hammer_notlocked(&ip->lock));
1435         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1436
1437         KKASSERT(RB_EMPTY(&ip->rec_tree));
1438         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1439
1440         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1441
1442         hammer_free_inode(ip);
1443         return(0);
1444 }
1445
1446 /*
1447  * Called during unmounting if a critical error occured.  The in-memory
1448  * inode and all related structures are destroyed.
1449  *
1450  * If a critical error did not occur the unmount code calls the standard
1451  * release and asserts that the inode is gone.
1452  */
1453 int
1454 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1455 {
1456         hammer_record_t rec;
1457
1458         /*
1459          * Get rid of the inodes in-memory records, regardless of their
1460          * state, and clear the mod-mask.
1461          */
1462         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1463                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1464                 rec->target_ip = NULL;
1465                 if (rec->flush_state == HAMMER_FST_SETUP)
1466                         rec->flush_state = HAMMER_FST_IDLE;
1467         }
1468         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1469                 if (rec->flush_state == HAMMER_FST_FLUSH)
1470                         --rec->flush_group->refs;
1471                 else
1472                         hammer_ref(&rec->lock);
1473                 KKASSERT(rec->lock.refs == 1);
1474                 rec->flush_state = HAMMER_FST_IDLE;
1475                 rec->flush_group = NULL;
1476                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1477                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1478                 ++ip->rec_generation;
1479                 hammer_rel_mem_record(rec);
1480         }
1481         ip->flags &= ~HAMMER_INODE_MODMASK;
1482         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1483         KKASSERT(ip->vp == NULL);
1484
1485         /*
1486          * Remove the inode from any flush group, force it idle.  FLUSH
1487          * and SETUP states have an inode ref.
1488          */
1489         switch(ip->flush_state) {
1490         case HAMMER_FST_FLUSH:
1491                 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
1492                 --ip->flush_group->refs;
1493                 ip->flush_group = NULL;
1494                 /* fall through */
1495         case HAMMER_FST_SETUP:
1496                 hammer_unref(&ip->lock);
1497                 ip->flush_state = HAMMER_FST_IDLE;
1498                 /* fall through */
1499         case HAMMER_FST_IDLE:
1500                 break;
1501         }
1502
1503         /*
1504          * There shouldn't be any associated vnode.  The unload needs at
1505          * least one ref, if we do have a vp steal its ip ref.
1506          */
1507         if (ip->vp) {
1508                 kprintf("hammer_destroy_inode_callback: Unexpected "
1509                         "vnode association ip %p vp %p\n", ip, ip->vp);
1510                 ip->vp->v_data = NULL;
1511                 ip->vp = NULL;
1512         } else {
1513                 hammer_ref(&ip->lock);
1514         }
1515         hammer_unload_inode(ip);
1516         return(0);
1517 }
1518
1519 /*
1520  * Called on mount -u when switching from RW to RO or vise-versa.  Adjust
1521  * the read-only flag for cached inodes.
1522  *
1523  * This routine is called from a RB_SCAN().
1524  */
1525 int
1526 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1527 {
1528         hammer_mount_t hmp = ip->hmp;
1529
1530         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1531                 ip->flags |= HAMMER_INODE_RO;
1532         else
1533                 ip->flags &= ~HAMMER_INODE_RO;
1534         return(0);
1535 }
1536
1537 /*
1538  * A transaction has modified an inode, requiring updates as specified by
1539  * the passed flags.
1540  *
1541  * HAMMER_INODE_DDIRTY: Inode data has been updated
1542  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1543  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1544  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1545  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1546  */
1547 void
1548 hammer_modify_inode(hammer_inode_t ip, int flags)
1549 {
1550         /* 
1551          * ronly of 0 or 2 does not trigger assertion.
1552          * 2 is a special error state 
1553          */
1554         KKASSERT(ip->hmp->ronly != 1 ||
1555                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
1556                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1557                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1558         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1559                 ip->flags |= HAMMER_INODE_RSV_INODES;
1560                 ++ip->hmp->rsv_inodes;
1561         }
1562
1563         ip->flags |= flags;
1564 }
1565
1566 /*
1567  * Request that an inode be flushed.  This whole mess cannot block and may
1568  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1569  * actively flush the inode until the flush can be done.
1570  *
1571  * The inode may already be flushing, or may be in a setup state.  We can
1572  * place the inode in a flushing state if it is currently idle and flag it
1573  * to reflush if it is currently flushing.
1574  *
1575  * Upon return if the inode could not be flushed due to a setup
1576  * dependancy, then it will be automatically flushed when the dependancy
1577  * is satisfied.
1578  */
1579 void
1580 hammer_flush_inode(hammer_inode_t ip, int flags)
1581 {
1582         hammer_mount_t hmp;
1583         hammer_flush_group_t flg;
1584         int good;
1585
1586         /*
1587          * next_flush_group is the first flush group we can place the inode
1588          * in.  It may be NULL.  If it becomes full we append a new flush
1589          * group and make that the next_flush_group.
1590          */
1591         hmp = ip->hmp;
1592         while ((flg = hmp->next_flush_group) != NULL) {
1593                 KKASSERT(flg->running == 0);
1594                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
1595                         break;
1596                 hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
1597                 hammer_flusher_async(ip->hmp, flg);
1598         }
1599         if (flg == NULL) {
1600                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1601                 hmp->next_flush_group = flg;
1602                 TAILQ_INIT(&flg->flush_list);
1603                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1604         }
1605
1606         /*
1607          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1608          * state we have to put it back into an IDLE state so we can
1609          * drop the extra ref.
1610          *
1611          * If we have a parent dependancy we must still fall through
1612          * so we can run it.
1613          */
1614         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1615                 if (ip->flush_state == HAMMER_FST_SETUP &&
1616                     TAILQ_EMPTY(&ip->target_list)) {
1617                         ip->flush_state = HAMMER_FST_IDLE;
1618                         hammer_rel_inode(ip, 0);
1619                 }
1620                 if (ip->flush_state == HAMMER_FST_IDLE)
1621                         return;
1622         }
1623
1624         /*
1625          * Our flush action will depend on the current state.
1626          */
1627         switch(ip->flush_state) {
1628         case HAMMER_FST_IDLE:
1629                 /*
1630                  * We have no dependancies and can flush immediately.  Some
1631                  * our children may not be flushable so we have to re-test
1632                  * with that additional knowledge.
1633                  */
1634                 hammer_flush_inode_core(ip, flg, flags);
1635                 break;
1636         case HAMMER_FST_SETUP:
1637                 /*
1638                  * Recurse upwards through dependancies via target_list
1639                  * and start their flusher actions going if possible.
1640                  *
1641                  * 'good' is our connectivity.  -1 means we have none and
1642                  * can't flush, 0 means there weren't any dependancies, and
1643                  * 1 means we have good connectivity.
1644                  */
1645                 good = hammer_setup_parent_inodes(ip, 0, flg);
1646
1647                 if (good >= 0) {
1648                         /*
1649                          * We can continue if good >= 0.  Determine how 
1650                          * many records under our inode can be flushed (and
1651                          * mark them).
1652                          */
1653                         hammer_flush_inode_core(ip, flg, flags);
1654                 } else {
1655                         /*
1656                          * Parent has no connectivity, tell it to flush
1657                          * us as soon as it does.
1658                          *
1659                          * The REFLUSH flag is also needed to trigger
1660                          * dependancy wakeups.
1661                          */
1662                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1663                                      HAMMER_INODE_REFLUSH;
1664                         if (flags & HAMMER_FLUSH_SIGNAL) {
1665                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1666                                 hammer_flusher_async(ip->hmp, flg);
1667                         }
1668                 }
1669                 break;
1670         case HAMMER_FST_FLUSH:
1671                 /*
1672                  * We are already flushing, flag the inode to reflush
1673                  * if needed after it completes its current flush.
1674                  *
1675                  * The REFLUSH flag is also needed to trigger
1676                  * dependancy wakeups.
1677                  */
1678                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1679                         ip->flags |= HAMMER_INODE_REFLUSH;
1680                 if (flags & HAMMER_FLUSH_SIGNAL) {
1681                         ip->flags |= HAMMER_INODE_RESIGNAL;
1682                         hammer_flusher_async(ip->hmp, flg);
1683                 }
1684                 break;
1685         }
1686 }
1687
1688 /*
1689  * Scan ip->target_list, which is a list of records owned by PARENTS to our
1690  * ip which reference our ip.
1691  *
1692  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1693  *     so for now do not ref/deref the structures.  Note that if we use the
1694  *     ref/rel code later, the rel CAN block.
1695  */
1696 static int
1697 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1698                            hammer_flush_group_t flg)
1699 {
1700         hammer_record_t depend;
1701         int good;
1702         int r;
1703
1704         /*
1705          * If we hit our recursion limit and we have parent dependencies
1706          * We cannot continue.  Returning < 0 will cause us to be flagged
1707          * for reflush.  Returning -2 cuts off additional dependency checks
1708          * because they are likely to also hit the depth limit.
1709          *
1710          * We cannot return < 0 if there are no dependencies or there might
1711          * not be anything to wakeup (ip).
1712          */
1713         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1714                 kprintf("HAMMER Warning: depth limit reached on "
1715                         "setup recursion, inode %p %016llx\n",
1716                         ip, (long long)ip->obj_id);
1717                 return(-2);
1718         }
1719
1720         /*
1721          * Scan dependencies
1722          */
1723         good = 0;
1724         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1725                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1726                 KKASSERT(depend->target_ip == ip);
1727                 if (r < 0 && good == 0)
1728                         good = -1;
1729                 if (r > 0)
1730                         good = 1;
1731
1732                 /*
1733                  * If we failed due to the recursion depth limit then stop
1734                  * now.
1735                  */
1736                 if (r == -2)
1737                         break;
1738         }
1739         return(good);
1740 }
1741
1742 /*
1743  * This helper function takes a record representing the dependancy between
1744  * the parent inode and child inode.
1745  *
1746  * record->ip           = parent inode
1747  * record->target_ip    = child inode
1748  * 
1749  * We are asked to recurse upwards and convert the record from SETUP
1750  * to FLUSH if possible.
1751  *
1752  * Return 1 if the record gives us connectivity
1753  *
1754  * Return 0 if the record is not relevant 
1755  *
1756  * Return -1 if we can't resolve the dependancy and there is no connectivity.
1757  */
1758 static int
1759 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1760                                   hammer_flush_group_t flg)
1761 {
1762         hammer_mount_t hmp;
1763         hammer_inode_t pip;
1764         int good;
1765
1766         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1767         pip = record->ip;
1768         hmp = pip->hmp;
1769
1770         /*
1771          * If the record is already flushing, is it in our flush group?
1772          *
1773          * If it is in our flush group but it is a general record or a 
1774          * delete-on-disk, it does not improve our connectivity (return 0),
1775          * and if the target inode is not trying to destroy itself we can't
1776          * allow the operation yet anyway (the second return -1).
1777          */
1778         if (record->flush_state == HAMMER_FST_FLUSH) {
1779                 /*
1780                  * If not in our flush group ask the parent to reflush
1781                  * us as soon as possible.
1782                  */
1783                 if (record->flush_group != flg) {
1784                         pip->flags |= HAMMER_INODE_REFLUSH;
1785                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1786                         return(-1);
1787                 }
1788
1789                 /*
1790                  * If in our flush group everything is already set up,
1791                  * just return whether the record will improve our
1792                  * visibility or not.
1793                  */
1794                 if (record->type == HAMMER_MEM_RECORD_ADD)
1795                         return(1);
1796                 return(0);
1797         }
1798
1799         /*
1800          * It must be a setup record.  Try to resolve the setup dependancies
1801          * by recursing upwards so we can place ip on the flush list.
1802          *
1803          * Limit ourselves to 20 levels of recursion to avoid blowing out
1804          * the kernel stack.  If we hit the recursion limit we can't flush
1805          * until the parent flushes.  The parent will flush independantly
1806          * on its own and ultimately a deep recursion will be resolved.
1807          */
1808         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1809
1810         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1811
1812         /*
1813          * If good < 0 the parent has no connectivity and we cannot safely
1814          * flush the directory entry, which also means we can't flush our
1815          * ip.  Flag us for downward recursion once the parent's
1816          * connectivity is resolved.  Flag the parent for [re]flush or it
1817          * may not check for downward recursions.
1818          */
1819         if (good < 0) {
1820                 pip->flags |= HAMMER_INODE_REFLUSH;
1821                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1822                 return(good);
1823         }
1824
1825         /*
1826          * We are go, place the parent inode in a flushing state so we can
1827          * place its record in a flushing state.  Note that the parent
1828          * may already be flushing.  The record must be in the same flush
1829          * group as the parent.
1830          */
1831         if (pip->flush_state != HAMMER_FST_FLUSH)
1832                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1833         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1834         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1835
1836 #if 0
1837         if (record->type == HAMMER_MEM_RECORD_DEL &&
1838             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1839                 /*
1840                  * Regardless of flushing state we cannot sync this path if the
1841                  * record represents a delete-on-disk but the target inode
1842                  * is not ready to sync its own deletion.
1843                  *
1844                  * XXX need to count effective nlinks to determine whether
1845                  * the flush is ok, otherwise removing a hardlink will
1846                  * just leave the DEL record to rot.
1847                  */
1848                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1849                 return(-1);
1850         } else
1851 #endif
1852         if (pip->flush_group == flg) {
1853                 /*
1854                  * Because we have not calculated nlinks yet we can just
1855                  * set records to the flush state if the parent is in
1856                  * the same flush group as we are.
1857                  */
1858                 record->flush_state = HAMMER_FST_FLUSH;
1859                 record->flush_group = flg;
1860                 ++record->flush_group->refs;
1861                 hammer_ref(&record->lock);
1862
1863                 /*
1864                  * A general directory-add contributes to our visibility.
1865                  *
1866                  * Otherwise it is probably a directory-delete or 
1867                  * delete-on-disk record and does not contribute to our
1868                  * visbility (but we can still flush it).
1869                  */
1870                 if (record->type == HAMMER_MEM_RECORD_ADD)
1871                         return(1);
1872                 return(0);
1873         } else {
1874                 /*
1875                  * If the parent is not in our flush group we cannot
1876                  * flush this record yet, there is no visibility.
1877                  * We tell the parent to reflush and mark ourselves
1878                  * so the parent knows it should flush us too.
1879                  */
1880                 pip->flags |= HAMMER_INODE_REFLUSH;
1881                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1882                 return(-1);
1883         }
1884 }
1885
1886 /*
1887  * This is the core routine placing an inode into the FST_FLUSH state.
1888  */
1889 static void
1890 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
1891 {
1892         int go_count;
1893
1894         /*
1895          * Set flush state and prevent the flusher from cycling into
1896          * the next flush group.  Do not place the ip on the list yet.
1897          * Inodes not in the idle state get an extra reference.
1898          */
1899         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1900         if (ip->flush_state == HAMMER_FST_IDLE)
1901                 hammer_ref(&ip->lock);
1902         ip->flush_state = HAMMER_FST_FLUSH;
1903         ip->flush_group = flg;
1904         ++ip->hmp->flusher.group_lock;
1905         ++ip->hmp->count_iqueued;
1906         ++hammer_count_iqueued;
1907         ++flg->total_count;
1908
1909         /*
1910          * If the flush group reaches the autoflush limit we want to signal
1911          * the flusher.  This is particularly important for remove()s.
1912          */
1913         if (flg->total_count == hammer_autoflush)
1914                 flags |= HAMMER_FLUSH_SIGNAL;
1915
1916         /*
1917          * We need to be able to vfsync/truncate from the backend.
1918          */
1919         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1920         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1921                 ip->flags |= HAMMER_INODE_VHELD;
1922                 vref(ip->vp);
1923         }
1924
1925         /*
1926          * Figure out how many in-memory records we can actually flush
1927          * (not including inode meta-data, buffers, etc).
1928          */
1929         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
1930         if (flags & HAMMER_FLUSH_RECURSION) {
1931                 /*
1932                  * If this is a upwards recursion we do not want to
1933                  * recurse down again!
1934                  */
1935                 go_count = 1;
1936 #if 0
1937         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1938                 /*
1939                  * No new records are added if we must complete a flush
1940                  * from a previous cycle, but we do have to move the records
1941                  * from the previous cycle to the current one.
1942                  */
1943 #if 0
1944                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1945                                    hammer_syncgrp_child_callback, NULL);
1946 #endif
1947                 go_count = 1;
1948 #endif
1949         } else {
1950                 /*
1951                  * Normal flush, scan records and bring them into the flush.
1952                  * Directory adds and deletes are usually skipped (they are
1953                  * grouped with the related inode rather then with the
1954                  * directory).
1955                  *
1956                  * go_count can be negative, which means the scan aborted
1957                  * due to the flush group being over-full and we should
1958                  * flush what we have.
1959                  */
1960                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1961                                    hammer_setup_child_callback, NULL);
1962         }
1963
1964         /*
1965          * This is a more involved test that includes go_count.  If we
1966          * can't flush, flag the inode and return.  If go_count is 0 we
1967          * were are unable to flush any records in our rec_tree and
1968          * must ignore the XDIRTY flag.
1969          */
1970         if (go_count == 0) {
1971                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1972                         --ip->hmp->count_iqueued;
1973                         --hammer_count_iqueued;
1974
1975                         --flg->total_count;
1976                         ip->flush_state = HAMMER_FST_SETUP;
1977                         ip->flush_group = NULL;
1978                         if (ip->flags & HAMMER_INODE_VHELD) {
1979                                 ip->flags &= ~HAMMER_INODE_VHELD;
1980                                 vrele(ip->vp);
1981                         }
1982
1983                         /*
1984                          * REFLUSH is needed to trigger dependancy wakeups
1985                          * when an inode is in SETUP.
1986                          */
1987                         ip->flags |= HAMMER_INODE_REFLUSH;
1988                         if (flags & HAMMER_FLUSH_SIGNAL) {
1989                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1990                                 hammer_flusher_async(ip->hmp, flg);
1991                         }
1992                         if (--ip->hmp->flusher.group_lock == 0)
1993                                 wakeup(&ip->hmp->flusher.group_lock);
1994                         return;
1995                 }
1996         }
1997
1998         /*
1999          * Snapshot the state of the inode for the backend flusher.
2000          *
2001          * We continue to retain save_trunc_off even when all truncations
2002          * have been resolved as an optimization to determine if we can
2003          * skip the B-Tree lookup for overwrite deletions.
2004          *
2005          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2006          * and stays in ip->flags.  Once set, it stays set until the
2007          * inode is destroyed.
2008          */
2009         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2010                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2011                 ip->sync_trunc_off = ip->trunc_off;
2012                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2013                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2014                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2015
2016                 /*
2017                  * The save_trunc_off used to cache whether the B-Tree
2018                  * holds any records past that point is not used until
2019                  * after the truncation has succeeded, so we can safely
2020                  * set it now.
2021                  */
2022                 if (ip->save_trunc_off > ip->sync_trunc_off)
2023                         ip->save_trunc_off = ip->sync_trunc_off;
2024         }
2025         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2026                            ~HAMMER_INODE_TRUNCATED);
2027         ip->sync_ino_leaf = ip->ino_leaf;
2028         ip->sync_ino_data = ip->ino_data;
2029         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2030 #ifdef DEBUG_TRUNCATE
2031         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2032                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2033 #endif
2034
2035         /*
2036          * The flusher list inherits our inode and reference.
2037          */
2038         KKASSERT(flg->running == 0);
2039         TAILQ_INSERT_TAIL(&flg->flush_list, ip, flush_entry);
2040         if (--ip->hmp->flusher.group_lock == 0)
2041                 wakeup(&ip->hmp->flusher.group_lock);
2042
2043         if (flags & HAMMER_FLUSH_SIGNAL) {
2044                 hammer_flusher_async(ip->hmp, flg);
2045         }
2046 }
2047
2048 /*
2049  * Callback for scan of ip->rec_tree.  Try to include each record in our
2050  * flush.  ip->flush_group has been set but the inode has not yet been
2051  * moved into a flushing state.
2052  *
2053  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2054  * both inodes.
2055  *
2056  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2057  * the caller from shortcutting the flush.
2058  */
2059 static int
2060 hammer_setup_child_callback(hammer_record_t rec, void *data)
2061 {
2062         hammer_flush_group_t flg;
2063         hammer_inode_t target_ip;
2064         hammer_inode_t ip;
2065         int r;
2066
2067         /*
2068          * Records deleted or committed by the backend are ignored.
2069          * Note that the flush detects deleted frontend records at
2070          * multiple points to deal with races.  This is just the first
2071          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2072          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2073          * messes up link-count calculations.
2074          *
2075          * NOTE: Don't get confused between record deletion and, say,
2076          * directory entry deletion.  The deletion of a directory entry
2077          * which is on-media has nothing to do with the record deletion
2078          * flags.
2079          */
2080         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2081                           HAMMER_RECF_COMMITTED)) {
2082                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2083                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2084                         r = 1;
2085                 } else {
2086                         r = 0;
2087                 }
2088                 return(r);
2089         }
2090
2091         /*
2092          * If the record is in an idle state it has no dependancies and
2093          * can be flushed.
2094          */
2095         ip = rec->ip;
2096         flg = ip->flush_group;
2097         r = 0;
2098
2099         switch(rec->flush_state) {
2100         case HAMMER_FST_IDLE:
2101                 /*
2102                  * The record has no setup dependancy, we can flush it.
2103                  */
2104                 KKASSERT(rec->target_ip == NULL);
2105                 rec->flush_state = HAMMER_FST_FLUSH;
2106                 rec->flush_group = flg;
2107                 ++flg->refs;
2108                 hammer_ref(&rec->lock);
2109                 r = 1;
2110                 break;
2111         case HAMMER_FST_SETUP:
2112                 /*
2113                  * The record has a setup dependancy.  These are typically
2114                  * directory entry adds and deletes.  Such entries will be
2115                  * flushed when their inodes are flushed so we do not
2116                  * usually have to add them to the flush here.  However,
2117                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2118                  * it is asking us to flush this record (and it).
2119                  */
2120                 target_ip = rec->target_ip;
2121                 KKASSERT(target_ip != NULL);
2122                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2123
2124                 /*
2125                  * If the target IP is already flushing in our group
2126                  * we could associate the record, but target_ip has
2127                  * already synced ino_data to sync_ino_data and we
2128                  * would also have to adjust nlinks.   Plus there are
2129                  * ordering issues for adds and deletes.
2130                  *
2131                  * Reflush downward if this is an ADD, and upward if
2132                  * this is a DEL.
2133                  */
2134                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2135                         if (rec->flush_state == HAMMER_MEM_RECORD_ADD)
2136                                 ip->flags |= HAMMER_INODE_REFLUSH;
2137                         else
2138                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2139                         break;
2140                 } 
2141
2142                 /*
2143                  * Target IP is not yet flushing.  This can get complex
2144                  * because we have to be careful about the recursion.
2145                  *
2146                  * Directories create an issue for us in that if a flush
2147                  * of a directory is requested the expectation is to flush
2148                  * any pending directory entries, but this will cause the
2149                  * related inodes to recursively flush as well.  We can't
2150                  * really defer the operation so just get as many as we
2151                  * can and
2152                  */
2153 #if 0
2154                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2155                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2156                         /*
2157                          * We aren't reclaiming and the target ip was not
2158                          * previously prevented from flushing due to this
2159                          * record dependancy.  Do not flush this record.
2160                          */
2161                         /*r = 0;*/
2162                 } else
2163 #endif
2164                 if (flg->total_count + flg->refs >
2165                            ip->hmp->undo_rec_limit) {
2166                         /*
2167                          * Our flush group is over-full and we risk blowing
2168                          * out the UNDO FIFO.  Stop the scan, flush what we
2169                          * have, then reflush the directory.
2170                          *
2171                          * The directory may be forced through multiple
2172                          * flush groups before it can be completely
2173                          * flushed.
2174                          */
2175                         ip->flags |= HAMMER_INODE_RESIGNAL |
2176                                      HAMMER_INODE_REFLUSH;
2177                         r = -1;
2178                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2179                         /*
2180                          * If the target IP is not flushing we can force
2181                          * it to flush, even if it is unable to write out
2182                          * any of its own records we have at least one in
2183                          * hand that we CAN deal with.
2184                          */
2185                         rec->flush_state = HAMMER_FST_FLUSH;
2186                         rec->flush_group = flg;
2187                         ++flg->refs;
2188                         hammer_ref(&rec->lock);
2189                         hammer_flush_inode_core(target_ip, flg,
2190                                                 HAMMER_FLUSH_RECURSION);
2191                         r = 1;
2192                 } else {
2193                         /*
2194                          * General or delete-on-disk record.
2195                          *
2196                          * XXX this needs help.  If a delete-on-disk we could
2197                          * disconnect the target.  If the target has its own
2198                          * dependancies they really need to be flushed.
2199                          *
2200                          * XXX
2201                          */
2202                         rec->flush_state = HAMMER_FST_FLUSH;
2203                         rec->flush_group = flg;
2204                         ++flg->refs;
2205                         hammer_ref(&rec->lock);
2206                         hammer_flush_inode_core(target_ip, flg,
2207                                                 HAMMER_FLUSH_RECURSION);
2208                         r = 1;
2209                 }
2210                 break;
2211         case HAMMER_FST_FLUSH:
2212                 /* 
2213                  * The flush_group should already match.
2214                  */
2215                 KKASSERT(rec->flush_group == flg);
2216                 r = 1;
2217                 break;
2218         }
2219         return(r);
2220 }
2221
#if 0
/*
 * Disabled rec_tree scan callback intended for carrying records which are
 * already in a flush state over to a new flush group.  As written it only
 * asserts that such a record's flush group matches its inode's flush
 * group; records in any other state are ignored.  Always returns 0.
 */
static int
hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
{
	if (rec->flush_state == HAMMER_FST_FLUSH) {
		KKASSERT(rec->flush_group == rec->ip->flush_group);
	}
	return(0);
}
#endif
2242
2243 /*
2244  * Wait for a previously queued flush to complete.
2245  *
2246  * If a critical error occured we don't try to wait.
2247  */
2248 void
2249 hammer_wait_inode(hammer_inode_t ip)
2250 {
2251         hammer_flush_group_t flg;
2252
2253         flg = NULL;
2254         if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2255                 while (ip->flush_state != HAMMER_FST_IDLE &&
2256                        (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2257                         if (ip->flush_state == HAMMER_FST_SETUP)
2258                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2259                         if (ip->flush_state != HAMMER_FST_IDLE) {
2260                                 ip->flags |= HAMMER_INODE_FLUSHW;
2261                                 tsleep(&ip->flags, 0, "hmrwin", 0);
2262                         }
2263                 }
2264         }
2265 }
2266
2267 /*
2268  * Called by the backend code when a flush has been completed.
2269  * The inode has already been removed from the flush list.
2270  *
2271  * A pipelined flush can occur, in which case we must re-enter the
2272  * inode on the list and re-copy its fields.
2273  */
2274 void
2275 hammer_flush_inode_done(hammer_inode_t ip, int error)
2276 {
2277         hammer_mount_t hmp;
2278         int dorel;
2279
2280         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2281
2282         hmp = ip->hmp;
2283
2284         /*
2285          * Auto-reflush if the backend could not completely flush
2286          * the inode.  This fixes a case where a deferred buffer flush
2287          * could cause fsync to return early.
2288          */
2289         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2290                 ip->flags |= HAMMER_INODE_REFLUSH;
2291
2292         /*
2293          * Merge left-over flags back into the frontend and fix the state.
2294          * Incomplete truncations are retained by the backend.
2295          */
2296         ip->error = error;
2297         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2298         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2299
2300         /*
2301          * The backend may have adjusted nlinks, so if the adjusted nlinks
2302          * does not match the fronttend set the frontend's RDIRTY flag again.
2303          */
2304         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2305                 ip->flags |= HAMMER_INODE_DDIRTY;
2306
2307         /*
2308          * Fix up the dirty buffer status.
2309          */
2310         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2311                 ip->flags |= HAMMER_INODE_BUFS;
2312         }
2313
2314         /*
2315          * Re-set the XDIRTY flag if some of the inode's in-memory records
2316          * could not be flushed.
2317          */
2318         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2319                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2320                  (!RB_EMPTY(&ip->rec_tree) &&
2321                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2322
2323         /*
2324          * Do not lose track of inodes which no longer have vnode
2325          * assocations, otherwise they may never get flushed again.
2326          *
2327          * The reflush flag can be set superfluously, causing extra pain
2328          * for no reason.  If the inode is no longer modified it no longer
2329          * needs to be flushed.
2330          */
2331         if (ip->flags & HAMMER_INODE_MODMASK) {
2332                 if (ip->vp == NULL)
2333                         ip->flags |= HAMMER_INODE_REFLUSH;
2334         } else {
2335                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2336         }
2337
2338         /*
2339          * Adjust the flush state.
2340          */
2341         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2342                 /*
2343                  * We were unable to flush out all our records, leave the
2344                  * inode in a flush state and in the current flush group.
2345                  * The flush group will be re-run.
2346                  *
2347                  * This occurs if the UNDO block gets too full or there is
2348                  * too much dirty meta-data and allows the flusher to
2349                  * finalize the UNDO block and then re-flush.
2350                  */
2351                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2352                 dorel = 0;
2353         } else {
2354                 /*
2355                  * Remove from the flush_group
2356                  */
2357                 TAILQ_REMOVE(&ip->flush_group->flush_list, ip, flush_entry);
2358                 ip->flush_group = NULL;
2359
2360                 /*
2361                  * Clean up the vnode ref and tracking counts.
2362                  */
2363                 if (ip->flags & HAMMER_INODE_VHELD) {
2364                         ip->flags &= ~HAMMER_INODE_VHELD;
2365                         vrele(ip->vp);
2366                 }
2367                 --hmp->count_iqueued;
2368                 --hammer_count_iqueued;
2369
2370                 /*
2371                  * And adjust the state.
2372                  */
2373                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2374                         ip->flush_state = HAMMER_FST_IDLE;
2375                         dorel = 1;
2376                 } else {
2377                         ip->flush_state = HAMMER_FST_SETUP;
2378                         dorel = 0;
2379                 }
2380
2381                 /*
2382                  * If the frontend is waiting for a flush to complete,
2383                  * wake it up.
2384                  */
2385                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2386                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2387                         wakeup(&ip->flags);
2388                 }
2389
2390                 /*
2391                  * If the frontend made more changes and requested another
2392                  * flush, then try to get it running.
2393                  *
2394                  * Reflushes are aborted when the inode is errored out.
2395                  */
2396                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2397                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2398                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2399                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2400                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2401                         } else {
2402                                 hammer_flush_inode(ip, 0);
2403                         }
2404                 }
2405         }
2406
2407         /*
2408          * If we have no parent dependancies we can clear CONN_DOWN
2409          */
2410         if (TAILQ_EMPTY(&ip->target_list))
2411                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2412
2413         /*
2414          * If the inode is now clean drop the space reservation.
2415          */
2416         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2417             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2418                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2419                 --hmp->rsv_inodes;
2420         }
2421
2422         if (dorel)
2423                 hammer_rel_inode(ip, 0);
2424 }
2425
2426 /*
2427  * Called from hammer_sync_inode() to synchronize in-memory records
2428  * to the media.
2429  */
2430 static int
2431 hammer_sync_record_callback(hammer_record_t record, void *data)
2432 {
2433         hammer_cursor_t cursor = data;
2434         hammer_transaction_t trans = cursor->trans;
2435         hammer_mount_t hmp = trans->hmp;
2436         int error;
2437
2438         /*
2439          * Skip records that do not belong to the current flush.
2440          */
2441         ++hammer_stats_record_iterations;
2442         if (record->flush_state != HAMMER_FST_FLUSH)
2443                 return(0);
2444
2445 #if 1
2446         if (record->flush_group != record->ip->flush_group) {
2447                 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group ,record->ip->flush_group);
2448                 Debugger("blah2");
2449                 return(0);
2450         }
2451 #endif
2452         KKASSERT(record->flush_group == record->ip->flush_group);
2453
2454         /*
2455          * Interlock the record using the BE flag.  Once BE is set the
2456          * frontend cannot change the state of FE.
2457          *
2458          * NOTE: If FE is set prior to us setting BE we still sync the
2459          * record out, but the flush completion code converts it to 
2460          * a delete-on-disk record instead of destroying it.
2461          */
2462         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2463         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2464
2465         /*
2466          * The backend has already disposed of the record.
2467          */
2468         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2469                 error = 0;
2470                 goto done;
2471         }
2472
2473         /*
2474          * If the whole inode is being deleting all on-disk records will
2475          * be deleted very soon, we can't sync any new records to disk
2476          * because they will be deleted in the same transaction they were
2477          * created in (delete_tid == create_tid), which will assert.
2478          *
2479          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2480          * that we currently panic on.
2481          */
2482         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2483                 switch(record->type) {
2484                 case HAMMER_MEM_RECORD_DATA:
2485                         /*
2486                          * We don't have to do anything, if the record was
2487                          * committed the space will have been accounted for
2488                          * in the blockmap.
2489                          */
2490                         /* fall through */
2491                 case HAMMER_MEM_RECORD_GENERAL:
2492                         /*
2493                          * Set deleted-by-backend flag.  Do not set the
2494                          * backend committed flag, because we are throwing
2495                          * the record away.
2496                          */
2497                         record->flags |= HAMMER_RECF_DELETED_BE;
2498                         ++record->ip->rec_generation;
2499                         error = 0;
2500                         goto done;
2501                 case HAMMER_MEM_RECORD_ADD:
2502                         panic("hammer_sync_record_callback: illegal add "
2503                               "during inode deletion record %p", record);
2504                         break; /* NOT REACHED */
2505                 case HAMMER_MEM_RECORD_INODE:
2506                         panic("hammer_sync_record_callback: attempt to "
2507                               "sync inode record %p?", record);
2508                         break; /* NOT REACHED */
2509                 case HAMMER_MEM_RECORD_DEL:
2510                         /* 
2511                          * Follow through and issue the on-disk deletion
2512                          */
2513                         break;
2514                 }
2515         }
2516
2517         /*
2518          * If DELETED_FE is set special handling is needed for directory
2519          * entries.  Dependant pieces related to the directory entry may
2520          * have already been synced to disk.  If this occurs we have to
2521          * sync the directory entry and then change the in-memory record
2522          * from an ADD to a DELETE to cover the fact that it's been
2523          * deleted by the frontend.
2524          *
2525          * A directory delete covering record (MEM_RECORD_DEL) can never
2526          * be deleted by the frontend.
2527          *
2528          * Any other record type (aka DATA) can be deleted by the frontend.
2529          * XXX At the moment the flusher must skip it because there may
2530          * be another data record in the flush group for the same block,
2531          * meaning that some frontend data changes can leak into the backend's
2532          * synchronization point.
2533          */
2534         if (record->flags & HAMMER_RECF_DELETED_FE) {
2535                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2536                         /*
2537                          * Convert a front-end deleted directory-add to
2538                          * a directory-delete entry later.
2539                          */
2540                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2541                 } else {
2542                         /*
2543                          * Dispose of the record (race case).  Mark as
2544                          * deleted by backend (and not committed).
2545                          */
2546                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2547                         record->flags |= HAMMER_RECF_DELETED_BE;
2548                         ++record->ip->rec_generation;
2549                         error = 0;
2550                         goto done;
2551                 }
2552         }
2553
2554         /*
2555          * Assign the create_tid for new records.  Deletions already
2556          * have the record's entire key properly set up.
2557          */
2558         if (record->type != HAMMER_MEM_RECORD_DEL) {
2559                 record->leaf.base.create_tid = trans->tid;
2560                 record->leaf.create_ts = trans->time32;
2561         }
2562         for (;;) {
2563                 error = hammer_ip_sync_record_cursor(cursor, record);
2564                 if (error != EDEADLK)
2565                         break;
2566                 hammer_done_cursor(cursor);
2567                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2568                                            record->ip);
2569                 if (error)
2570                         break;
2571         }
2572         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2573
2574         if (error)
2575                 error = -error;
2576 done:
2577         hammer_flush_record_done(record, error);
2578
2579         /*
2580          * Do partial finalization if we have built up too many dirty
2581          * buffers.  Otherwise a buffer cache deadlock can occur when
2582          * doing things like creating tens of thousands of tiny files.
2583          *
2584          * We must release our cursor lock to avoid a 3-way deadlock
2585          * due to the exclusive sync lock the finalizer must get.
2586          *
2587          * WARNING: See warnings in hammer_unlock_cursor() function.
2588          */
2589         if (hammer_flusher_meta_limit(hmp)) {
2590                 hammer_unlock_cursor(cursor);
2591                 hammer_flusher_finalize(trans, 0);
2592                 hammer_lock_cursor(cursor);
2593         }
2594
2595         return(error);
2596 }
2597
2598 /*
2599  * Backend function called by the flusher to sync an inode to media.
2600  */
2601 int
2602 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2603 {
2604         struct hammer_cursor cursor;
2605         hammer_node_t tmp_node;
2606         hammer_record_t depend;
2607         hammer_record_t next;
2608         int error, tmp_error;
2609         u_int64_t nlinks;
2610
2611         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2612                 return(0);
2613
2614         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2615         if (error)
2616                 goto done;
2617
2618         /*
2619          * Any directory records referencing this inode which are not in
2620          * our current flush group must adjust our nlink count for the
2621          * purposes of synchronization to disk.
2622          *
2623          * Records which are in our flush group can be unlinked from our
2624          * inode now, potentially allowing the inode to be physically
2625          * deleted.
2626          *
2627          * This cannot block.
2628          */
2629         nlinks = ip->ino_data.nlinks;
2630         next = TAILQ_FIRST(&ip->target_list);
2631         while ((depend = next) != NULL) {
2632                 next = TAILQ_NEXT(depend, target_entry);
2633                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2634                     depend->flush_group == ip->flush_group) {
2635                         /*
2636                          * If this is an ADD that was deleted by the frontend
2637                          * the frontend nlinks count will have already been
2638                          * decremented, but the backend is going to sync its
2639                          * directory entry and must account for it.  The
2640                          * record will be converted to a delete-on-disk when
2641                          * it gets synced.
2642                          *
2643                          * If the ADD was not deleted by the frontend we
2644                          * can remove the dependancy from our target_list.
2645                          */
2646                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2647                                 ++nlinks;
2648                         } else {
2649                                 TAILQ_REMOVE(&ip->target_list, depend,
2650                                              target_entry);
2651                                 depend->target_ip = NULL;
2652                         }
2653                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2654                         /*
2655                          * Not part of our flush group and not deleted by
2656                          * the front-end, adjust the link count synced to
2657                          * the media (undo what the frontend did when it
2658                          * queued the record).
2659                          */
2660                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2661                         switch(depend->type) {
2662                         case HAMMER_MEM_RECORD_ADD:
2663                                 --nlinks;
2664                                 break;
2665                         case HAMMER_MEM_RECORD_DEL:
2666                                 ++nlinks;
2667                                 break;
2668                         default:
2669                                 break;
2670                         }
2671                 }
2672         }
2673
2674         /*
2675          * Set dirty if we had to modify the link count.
2676          */
2677         if (ip->sync_ino_data.nlinks != nlinks) {
2678                 KKASSERT((int64_t)nlinks >= 0);
2679                 ip->sync_ino_data.nlinks = nlinks;
2680                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2681         }
2682
2683         /*
2684          * If there is a trunction queued destroy any data past the (aligned)
2685          * truncation point.  Userland will have dealt with the buffer
2686          * containing the truncation point for us.
2687          *
2688          * We don't flush pending frontend data buffers until after we've
2689          * dealt with the truncation.
2690          */
2691         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2692                 /*
2693                  * Interlock trunc_off.  The VOP front-end may continue to
2694                  * make adjustments to it while we are blocked.
2695                  */
2696                 off_t trunc_off;
2697                 off_t aligned_trunc_off;
2698                 int blkmask;
2699
2700                 trunc_off = ip->sync_trunc_off;
2701                 blkmask = hammer_blocksize(trunc_off) - 1;
2702                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
2703
2704                 /*
2705                  * Delete any whole blocks on-media.  The front-end has
2706                  * already cleaned out any partial block and made it
2707                  * pending.  The front-end may have updated trunc_off
2708                  * while we were blocked so we only use sync_trunc_off.
2709                  *
2710                  * This operation can blow out the buffer cache, EWOULDBLOCK
2711                  * means we were unable to complete the deletion.  The
2712                  * deletion will update sync_trunc_off in that case.
2713                  */
2714                 error = hammer_ip_delete_range(&cursor, ip,
2715                                                 aligned_trunc_off,
2716                                                 0x7FFFFFFFFFFFFFFFLL, 2);
2717                 if (error == EWOULDBLOCK) {
2718                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
2719                         error = 0;
2720                         goto defer_buffer_flush;
2721                 }
2722
2723                 if (error)
2724                         goto done;
2725
2726                 /*
2727                  * Clear the truncation flag on the backend after we have
2728                  * complete the deletions.  Backend data is now good again
2729                  * (including new records we are about to sync, below).
2730                  *
2731                  * Leave sync_trunc_off intact.  As we write additional
2732                  * records the backend will update sync_trunc_off.  This
2733                  * tells the backend whether it can skip the overwrite
2734                  * test.  This should work properly even when the backend
2735                  * writes full blocks where the truncation point straddles
2736                  * the block because the comparison is against the base
2737                  * offset of the record.
2738                  */
2739                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2740                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2741         } else {
2742                 error = 0;
2743         }
2744
2745         /*
2746          * Now sync related records.  These will typically be directory
2747          * entries, records tracking direct-writes, or delete-on-disk records.
2748          */
2749         if (error == 0) {
2750                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2751                                     hammer_sync_record_callback, &cursor);
2752                 if (tmp_error < 0)
2753                         tmp_error = -error;
2754                 if (tmp_error)
2755                         error = tmp_error;
2756         }
2757         hammer_cache_node(&ip->cache[1], cursor.node);
2758
2759         /*
2760          * Re-seek for inode update, assuming our cache hasn't been ripped
2761          * out from under us.
2762          */
2763         if (error == 0) {
2764                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
2765                 if (tmp_node) {
2766                         hammer_cursor_downgrade(&cursor);
2767                         hammer_lock_sh(&tmp_node->lock);
2768                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2769                                 hammer_cursor_seek(&cursor, tmp_node, 0);
2770                         hammer_unlock(&tmp_node->lock);
2771                         hammer_rel_node(tmp_node);
2772                 }
2773                 error = 0;
2774         }
2775
2776         /*
2777          * If we are deleting the inode the frontend had better not have
2778          * any active references on elements making up the inode.
2779          *
2780          * The call to hammer_ip_delete_clean() cleans up auxillary records
2781          * but not DB or DATA records.  Those must have already been deleted
2782          * by the normal truncation mechanic.
2783          */
2784         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2785                 RB_EMPTY(&ip->rec_tree)  &&
2786             (ip->sync_flags & HAMMER_INODE_DELETING) &&
2787             (ip->flags & HAMMER_INODE_DELETED) == 0) {
2788                 int count1 = 0;
2789
2790                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2791                 if (error == 0) {
2792                         ip->flags |= HAMMER_INODE_DELETED;
2793                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
2794                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2795                         KKASSERT(RB_EMPTY(&ip->rec_tree));
2796
2797                         /*
2798                          * Set delete_tid in both the frontend and backend
2799                          * copy of the inode record.  The DELETED flag handles
2800                          * this, do not set RDIRTY.
2801                          */
2802                         ip->ino_leaf.base.delete_tid = trans->tid;
2803                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
2804                         ip->ino_leaf.delete_ts = trans->time32;
2805                         ip->sync_ino_leaf.delete_ts = trans->time32;
2806
2807
2808                         /*
2809                          * Adjust the inode count in the volume header
2810                          */
2811                         hammer_sync_lock_sh(trans);
2812                         if (ip->flags & HAMMER_INODE_ONDISK) {
2813                                 hammer_modify_volume_field(trans,
2814                                                            trans->rootvol,
2815                                                            vol0_stat_inodes);
2816                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2817                                 hammer_modify_volume_done(trans->rootvol);
2818                         }
2819                         hammer_sync_unlock(trans);
2820                 }
2821         }
2822
2823         if (error)
2824                 goto done;
2825         ip->sync_flags &= ~HAMMER_INODE_BUFS;
2826
2827 defer_buffer_flush:
2828         /*
2829          * Now update the inode's on-disk inode-data and/or on-disk record.
2830          * DELETED and ONDISK are managed only in ip->flags.
2831          *
2832          * In the case of a defered buffer flush we still update the on-disk
2833          * inode to satisfy visibility requirements if there happen to be
2834          * directory dependancies.
2835          */
2836         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
2837         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
2838                 /*
2839                  * If deleted and on-disk, don't set any additional flags.
2840                  * the delete flag takes care of things.
2841                  *
2842                  * Clear flags which may have been set by the frontend.
2843                  */
2844                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2845                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2846                                     HAMMER_INODE_DELETING);
2847                 break;
2848         case HAMMER_INODE_DELETED:
2849                 /*
2850                  * Take care of the case where a deleted inode was never
2851                  * flushed to the disk in the first place.
2852                  *
2853                  * Clear flags which may have been set by the frontend.
2854                  */
2855                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2856                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2857                                     HAMMER_INODE_DELETING);
2858                 while (RB_ROOT(&ip->rec_tree)) {
2859                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
2860                         hammer_ref(&record->lock);
2861                         KKASSERT(record->lock.refs == 1);
2862                         record->flags |= HAMMER_RECF_DELETED_BE;
2863                         ++record->ip->rec_generation;
2864                         hammer_rel_mem_record(record);
2865                 }
2866                 break;
2867         case HAMMER_INODE_ONDISK:
2868                 /*
2869                  * If already on-disk, do not set any additional flags.
2870                  */
2871                 break;
2872         default:
2873                 /*
2874                  * If not on-disk and not deleted, set DDIRTY to force
2875                  * an initial record to be written.
2876                  *
2877                  * Also set the create_tid in both the frontend and backend
2878                  * copy of the inode record.
2879                  */
2880                 ip->ino_leaf.base.create_tid = trans->tid;
2881                 ip->ino_leaf.create_ts = trans->time32;
2882                 ip->sync_ino_leaf.base.create_tid = trans->tid;
2883                 ip->sync_ino_leaf.create_ts = trans->time32;
2884                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2885                 break;
2886         }
2887
2888         /*
2889          * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
2890          * is already on-disk the old record is marked as deleted.
2891          *
2892          * If DELETED is set hammer_update_inode() will delete the existing
2893          * record without writing out a new one.
2894          *
2895          * If *ONLY* the ITIMES flag is set we can update the record in-place.
2896          */
2897         if (ip->flags & HAMMER_INODE_DELETED) {
2898                 error = hammer_update_inode(&cursor, ip);
2899         } else 
2900         if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
2901             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
2902                 error = hammer_update_itimes(&cursor, ip);
2903         } else
2904         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
2905                 error = hammer_update_inode(&cursor, ip);
2906         }
2907 done:
2908         if (error) {
2909                 hammer_critical_error(ip->hmp, ip, error,
2910                                       "while syncing inode");
2911         }
2912         hammer_done_cursor(&cursor);
2913         return(error);
2914 }
2915
2916 /*
2917  * This routine is called when the OS is no longer actively referencing
2918  * the inode (but might still be keeping it cached), or when releasing
2919  * the last reference to an inode.
2920  *
2921  * At this point if the inode's nlinks count is zero we want to destroy
2922  * it, which may mean destroying it on-media too.
2923  */
2924 void
2925 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
2926 {
2927         struct vnode *vp;
2928
2929         /*
2930          * Set the DELETING flag when the link count drops to 0 and the
2931          * OS no longer has any opens on the inode.
2932          *
2933          * The backend will clear DELETING (a mod flag) and set DELETED
2934          * (a state flag) when it is actually able to perform the
2935          * operation.
2936          *
2937          * Don't reflag the deletion if the flusher is currently syncing
2938          * one that was already flagged.  A previously set DELETING flag
2939          * may bounce around flags and sync_flags until the operation is
2940          * completely done.
2941          */
2942         if (ip->ino_data.nlinks == 0 &&
2943             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
2944                 ip->flags |= HAMMER_INODE_DELETING;
2945                 ip->flags |= HAMMER_INODE_TRUNCATED;
2946                 ip->trunc_off = 0;
2947                 vp = NULL;
2948                 if (getvp) {
2949                         if (hammer_get_vnode(ip, &vp) != 0)
2950                                 return;
2951                 }
2952
2953                 /*
2954                  * Final cleanup
2955                  */
2956                 if (ip->vp) {
2957                         vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
2958                         vnode_pager_setsize(ip->vp, 0);
2959                 }
2960                 if (getvp) {
2961                         vput(vp);
2962                 }
2963         }
2964 }
2965
2966 /*
2967  * After potentially resolving a dependancy the inode is tested
2968  * to determine whether it needs to be reflushed.
2969  */
2970 void
2971 hammer_test_inode(hammer_inode_t ip)
2972 {
2973         if (ip->flags & HAMMER_INODE_REFLUSH) {
2974                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2975                 hammer_ref(&ip->lock);
2976                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2977                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
2978                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2979                 } else {
2980                         hammer_flush_inode(ip, 0);
2981                 }
2982                 hammer_rel_inode(ip, 0);
2983         }
2984 }
2985
2986 /*
2987  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
2988  * reassociated with a vp or just before it gets freed.
2989  *
2990  * Pipeline wakeups to threads blocked due to an excessive number of
2991  * detached inodes.  This typically occurs when atime updates accumulate
2992  * while scanning a directory tree.
2993  */
2994 static void
2995 hammer_inode_wakereclaims(hammer_inode_t ip)
2996 {
2997         struct hammer_reclaim *reclaim;
2998         hammer_mount_t hmp = ip->hmp;
2999
3000         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3001                 return;
3002
3003         --hammer_count_reclaiming;
3004         --hmp->inode_reclaims;
3005         ip->flags &= ~HAMMER_INODE_RECLAIM;
3006
3007         while ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3008                 if (reclaim->count > 0 && --reclaim->count == 0) {
3009                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3010                         wakeup(reclaim);
3011                 }
3012                 if (hmp->inode_reclaims > HAMMER_RECLAIM_WAIT / 2)
3013                         break;
3014         }
3015 }
3016
3017 /*
3018  * Setup our reclaim pipeline.  We only let so many detached (and dirty)
3019  * inodes build up before we start blocking.  This routine is called
3020  * if a new inode is created or an inode is loaded from media.
3021  *
3022  * When we block we don't care *which* inode has finished reclaiming,
3023  * as lone as one does.
3024  */
3025 void
3026 hammer_inode_waitreclaims(hammer_mount_t hmp)
3027 {
3028         struct hammer_reclaim reclaim;
3029
3030         if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT)
3031                 return;
3032         reclaim.count = 1;
3033         TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3034         tsleep(&reclaim, 0, "hmrrcm", hz);
3035         if (reclaim.count > 0)
3036                 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3037 }
3038
3039 #if 0
3040
3041 /*
3042  * XXX not used, doesn't work very well due to the large batching nature
3043  * of flushes.
3044  *
3045  * A larger then normal backlog of inodes is sitting in the flusher,
3046  * enforce a general slowdown to let it catch up.  This routine is only
3047  * called on completion of a non-flusher-related transaction which
3048  * performed B-Tree node I/O.
3049  *
3050  * It is possible for the flusher to stall in a continuous load.
3051  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3052  * If the flusher is unable to catch up the inode count can bloat until
3053  * we run out of kvm.
3054  *
3055  * This is a bit of a hack.
3056  */
3057 void
3058 hammer_inode_waithard(hammer_mount_t hmp)
3059 {
3060         /*
3061          * Hysteresis.
3062          */
3063         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3064                 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT / 2 &&
3065                     hmp->count_iqueued < hmp->count_inodes / 20) {
3066                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3067                         return;
3068                 }
3069         } else {
3070                 if (hmp->inode_reclaims < HAMMER_RECLAIM_WAIT ||
3071                     hmp->count_iqueued < hmp->count_inodes / 10) {
3072                         return;
3073                 }
3074                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3075         }
3076
3077         /*
3078          * Block for one flush cycle.
3079          */
3080         hammer_flusher_wait_next(hmp);
3081 }
3082
3083 #endif