kernel - Rewrite vnode ref-counting code to improve performance
dragonfly.git: sys/vfs/hammer/hammer_inode.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36 #include <vm/vm_extern.h>
37
38 static int      hammer_unload_inode(struct hammer_inode *ip);
39 static void     hammer_free_inode(hammer_inode_t ip);
40 static void     hammer_flush_inode_core(hammer_inode_t ip,
41                                         hammer_flush_group_t flg, int flags);
42 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
43 #if 0
44 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
45 #endif
46 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
47                                         hammer_flush_group_t flg);
48 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
49                                         int depth, hammer_flush_group_t flg);
50 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
51 static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
52                                         pid_t pid);
53
54 #ifdef DEBUG_TRUNCATE
55 extern struct hammer_inode *HammerTruncIp;
56 #endif
57
58 struct krate hammer_gen_krate = { 1 };
59
60 /*
61  * RB-Tree support for inode structures
62  */
63 int
64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65 {
66         if (ip1->obj_localization < ip2->obj_localization)
67                 return(-1);
68         if (ip1->obj_localization > ip2->obj_localization)
69                 return(1);
70         if (ip1->obj_id < ip2->obj_id)
71                 return(-1);
72         if (ip1->obj_id > ip2->obj_id)
73                 return(1);
74         if (ip1->obj_asof < ip2->obj_asof)
75                 return(-1);
76         if (ip1->obj_asof > ip2->obj_asof)
77                 return(1);
78         return(0);
79 }
80
81 int
82 hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
83 {
84         if (ip1->redo_fifo_start < ip2->redo_fifo_start)
85                 return(-1);
86         if (ip1->redo_fifo_start > ip2->redo_fifo_start)
87                 return(1);
88         return(0);
89 }
90
91 /*
92  * RB-Tree support for inode structures / special LOOKUP_INFO
93  */
94 static int
95 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
96 {
97         if (info->obj_localization < ip->obj_localization)
98                 return(-1);
99         if (info->obj_localization > ip->obj_localization)
100                 return(1);
101         if (info->obj_id < ip->obj_id)
102                 return(-1);
103         if (info->obj_id > ip->obj_id)
104                 return(1);
105         if (info->obj_asof < ip->obj_asof)
106                 return(-1);
107         if (info->obj_asof > ip->obj_asof)
108                 return(1);
109         return(0);
110 }
111
112 /*
113  * Used by hammer_scan_inode_snapshots() to locate all of an object's
114  * snapshots.  Note that the asof field is not tested, which we can get
115  * away with because it is the lowest-priority field.
116  */
117 static int
118 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
119 {
120         hammer_inode_info_t info = data;
121
122         if (ip->obj_localization > info->obj_localization)
123                 return(1);
124         if (ip->obj_localization < info->obj_localization)
125                 return(-1);
126         if (ip->obj_id > info->obj_id)
127                 return(1);
128         if (ip->obj_id < info->obj_id)
129                 return(-1);
130         return(0);
131 }
132
133 /*
134  * Used by hammer_unload_pseudofs() to locate all inodes associated with
135  * a particular PFS.
136  */
137 static int
138 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
139 {
140         u_int32_t localization = *(u_int32_t *)data;
141         if (ip->obj_localization > localization)
142                 return(1);
143         if (ip->obj_localization < localization)
144                 return(-1);
145         return(0);
146 }
147
148 /*
149  * RB-Tree support for pseudofs structures
150  */
151 static int
152 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
153 {
154         if (p1->localization < p2->localization)
155                 return(-1);
156         if (p1->localization > p2->localization)
157                 return(1);
158         return(0);
159 }
160
161
162 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
163 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
164                 hammer_inode_info_cmp, hammer_inode_info_t);
165 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
166              hammer_pfs_rb_compare, u_int32_t, localization);
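
/*
 * Editor's sketch (not part of the original source): the inode RB-tree
 * generated above is keyed by (obj_localization, obj_id, obj_asof), in
 * that priority order, as encoded by hammer_ino_rb_compare() and
 * hammer_inode_info_cmp().  A cached-inode lookup fills in a
 * hammer_inode_info and uses the XLOOKUP variant generated above, exactly
 * as hammer_get_inode() does further down.  The helper name below is
 * hypothetical.
 */
#if 0
static hammer_inode_t
example_lookup_cached_inode(hammer_mount_t hmp, int64_t obj_id,
                            hammer_tid_t asof, u_int32_t localization)
{
        struct hammer_inode_info iinfo;

        iinfo.obj_id = obj_id;                  /* secondary key */
        iinfo.obj_asof = asof;                  /* lowest-priority key */
        iinfo.obj_localization = localization;  /* highest-priority key */

        /* returns NULL if the inode is not in the in-memory cache */
        return(hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo));
}
#endif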
167
168 /*
169  * The kernel is not actively referencing this vnode but is still holding
170  * it cached.
171  *
172  * This is called from the frontend.
173  *
174  * MPALMOSTSAFE
175  */
176 int
177 hammer_vop_inactive(struct vop_inactive_args *ap)
178 {
179         struct hammer_inode *ip = VTOI(ap->a_vp);
180         hammer_mount_t hmp;
181
182         /*
183          * Degenerate case
184          */
185         if (ip == NULL) {
186                 vrecycle(ap->a_vp);
187                 return(0);
188         }
189
190         /*
191          * If the inode no longer has visibility in the filesystem try to
192          * recycle it immediately, even if the inode is dirty.  Recycling
193          * it quickly allows the system to reclaim buffer cache and VM
194          * resources which can matter a lot in a heavily loaded system.
195          *
196          * This can deadlock in vfsync() if we aren't careful.
197          * 
198          * Do not queue the inode to the flusher if we still have visibility,
199          * otherwise namespace calls such as chmod will unnecessarily generate
200          * multiple inode updates.
201          */
202         if (ip->ino_data.nlinks == 0) {
203                 hmp = ip->hmp;
204                 lwkt_gettoken(&hmp->fs_token);
205                 hammer_inode_unloadable_check(ip, 0);
206                 if (ip->flags & HAMMER_INODE_MODMASK)
207                         hammer_flush_inode(ip, 0);
208                 lwkt_reltoken(&hmp->fs_token);
209                 vrecycle(ap->a_vp);
210         }
211         return(0);
212 }
213
214 /*
215  * Release the vnode association.  This is typically (but not always)
216  * the last reference on the inode.
217  *
218  * Once the association is lost we are on our own with regard to
219  * flushing the inode.
220  *
221  * We must interlock ip->vp so hammer_get_vnode() can avoid races.
222  */
223 int
224 hammer_vop_reclaim(struct vop_reclaim_args *ap)
225 {
226         struct hammer_inode *ip;
227         hammer_mount_t hmp;
228         struct vnode *vp;
229
230         vp = ap->a_vp;
231
232         if ((ip = vp->v_data) != NULL) {
233                 hmp = ip->hmp;
234                 lwkt_gettoken(&hmp->fs_token);
235                 hammer_lock_ex(&ip->lock);
236                 vp->v_data = NULL;
237                 ip->vp = NULL;
238
239                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
240                         ++hammer_count_reclaims;
241                         ++hmp->count_reclaims;
242                         ip->flags |= HAMMER_INODE_RECLAIM;
243                 }
244                 hammer_unlock(&ip->lock);
245                 vclrisdirty(vp);
246                 hammer_rel_inode(ip, 1);
247                 lwkt_reltoken(&hmp->fs_token);
248         }
249         return(0);
250 }
251
252 /*
253  * Inform the kernel that the inode is dirty.  This will be checked
254  * by vn_unlock().
255  */
256 void
257 hammer_inode_dirty(struct hammer_inode *ip)
258 {
259         struct vnode *vp;
260
261         if ((ip->flags & HAMMER_INODE_MODMASK) &&
262             (vp = ip->vp) != NULL) {
263                 vsetisdirty(vp);
264         }
265 }
266
267 /*
268  * Return a locked vnode for the specified inode.  The inode must be
269  * referenced but NOT LOCKED on entry and will remain referenced on
270  * return.
271  *
272  * Called from the frontend.
273  */
274 int
275 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
276 {
277         hammer_mount_t hmp;
278         struct vnode *vp;
279         int error = 0;
280         u_int8_t obj_type;
281
282         hmp = ip->hmp;
283
284         for (;;) {
285                 if ((vp = ip->vp) == NULL) {
286                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
287                         if (error)
288                                 break;
289                         hammer_lock_ex(&ip->lock);
290                         if (ip->vp != NULL) {
291                                 hammer_unlock(&ip->lock);
292                                 vp = *vpp;
293                                 vp->v_type = VBAD;
294                                 vx_put(vp);
295                                 continue;
296                         }
297                         hammer_ref(&ip->lock);
298                         vp = *vpp;
299                         ip->vp = vp;
300
301                         obj_type = ip->ino_data.obj_type;
302                         vp->v_type = hammer_get_vnode_type(obj_type);
303
304                         hammer_inode_wakereclaims(ip);
305
306                         switch(ip->ino_data.obj_type) {
307                         case HAMMER_OBJTYPE_CDEV:
308                         case HAMMER_OBJTYPE_BDEV:
309                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
310                                 addaliasu(vp, ip->ino_data.rmajor,
311                                           ip->ino_data.rminor);
312                                 break;
313                         case HAMMER_OBJTYPE_FIFO:
314                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
315                                 break;
316                         case HAMMER_OBJTYPE_REGFILE:
317                                 break;
318                         default:
319                                 break;
320                         }
321
322                         /*
323                          * Only mark as the root vnode if the ip is not
324                          * historical, otherwise the VFS cache will get
325                          * confused.  The other half of the special handling
326                          * is in hammer_vop_nlookupdotdot().
327                          *
328                          * Pseudo-filesystem roots can be accessed via
329                          * non-root filesystem paths and setting VROOT may
330                          * confuse the namecache.  Set VPFSROOT instead.
331                          */
332                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
333                             ip->obj_asof == hmp->asof) {
334                                 if (ip->obj_localization == 0)
335                                         vsetflags(vp, VROOT);
336                                 else
337                                         vsetflags(vp, VPFSROOT);
338                         }
339
340                         vp->v_data = (void *)ip;
341                         /* vnode locked by getnewvnode() */
342                         /* make related vnode dirty if inode dirty? */
343                         hammer_unlock(&ip->lock);
344                         if (vp->v_type == VREG) {
345                                 vinitvmio(vp, ip->ino_data.size,
346                                           hammer_blocksize(ip->ino_data.size),
347                                           hammer_blockoff(ip->ino_data.size));
348                         }
349                         break;
350                 }
351
352                 /*
353                  * Interlock vnode clearing.  This does not prevent the
354                  * vnode from going into a reclaimed state but it does
355                  * prevent it from being destroyed or reused so the vget()
356                  * will properly fail.
357                  */
358                 hammer_lock_ex(&ip->lock);
359                 if ((vp = ip->vp) == NULL) {
360                         hammer_unlock(&ip->lock);
361                         continue;
362                 }
363                 vhold(vp);
364                 hammer_unlock(&ip->lock);
365
366                 /*
367                  * loop if the vget fails (aka races), or if the vp
368                  * no longer matches ip->vp.
369                  */
370                 if (vget(vp, LK_EXCLUSIVE) == 0) {
371                         if (vp == ip->vp) {
372                                 vdrop(vp);
373                                 break;
374                         }
375                         vput(vp);
376                 }
377                 vdrop(vp);
378         }
379         *vpp = vp;
380         return(error);
381 }
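
/*
 * Editor's sketch (not part of the original source): the typical frontend
 * sequence for going from an object id to a locked vnode.
 * hammer_get_inode() returns a referenced, unlocked inode;
 * hammer_get_vnode() then attaches or revalidates the vnode and returns
 * it locked via *vpp.  The caller's inode reference can be dropped once
 * the vnode association holds its own.  The wrapper name is hypothetical
 * and error handling is abbreviated.
 */
#if 0
static int
example_lookup_vnode(hammer_transaction_t trans, hammer_inode_t dip,
                     int64_t obj_id, struct vnode **vpp)
{
        struct hammer_inode *ip;
        int error;

        ip = hammer_get_inode(trans, dip, obj_id, trans->hmp->asof,
                              dip->obj_localization, 0, &error);
        if (ip == NULL)
                return(error);
        error = hammer_get_vnode(ip, vpp);      /* *vpp returned locked */
        hammer_rel_inode(ip, 0);                /* vp holds its own ref */
        return(error);
}
#endif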
382
383 /*
384  * Locate all copies of the inode for obj_id compatible with the specified
385  * asof, reference each one, and issue the related call-back.  This routine is used
386  * for direct-io invalidation and does not create any new inodes.
387  */
388 void
389 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
390                             int (*callback)(hammer_inode_t ip, void *data),
391                             void *data)
392 {
393         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
394                                    hammer_inode_info_cmp_all_history,
395                                    callback, iinfo);
396 }
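
/*
 * Editor's sketch (not part of the original source): a caller of
 * hammer_scan_inode_snapshots() supplies a hammer_inode_info keyed by
 * obj_id/localization (asof is ignored by the _all_history comparator
 * above) plus a callback that is issued for every cached snapshot copy of
 * the object.  Returning 0 from the callback continues the scan; a
 * negative return stops it (compare hammer_unload_pseudofs_callback()
 * below).  The callback and wrapper names are hypothetical.
 */
#if 0
static int
example_snapshot_callback(hammer_inode_t ip, void *data)
{
        /* e.g. invalidate per-snapshot direct-io state for ip here */
        return(0);                              /* keep scanning */
}

static void
example_scan_all_snapshots(hammer_mount_t hmp, int64_t obj_id,
                           u_int32_t localization)
{
        struct hammer_inode_info iinfo;

        iinfo.obj_id = obj_id;
        iinfo.obj_localization = localization;
        iinfo.obj_asof = 0;                     /* not tested by this scan */

        hammer_scan_inode_snapshots(hmp, &iinfo,
                                    example_snapshot_callback, NULL);
}
#endif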
397
398 /*
399  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
400  * do not attach or detach the related vnode (use hammer_get_vnode() for
401  * that).
402  *
403  * The flags argument is only applied for newly created inodes, and only
404  * certain flags are inherited.
405  *
406  * Called from the frontend.
407  */
408 struct hammer_inode *
409 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
410                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
411                  int flags, int *errorp)
412 {
413         hammer_mount_t hmp = trans->hmp;
414         struct hammer_node_cache *cachep;
415         struct hammer_inode_info iinfo;
416         struct hammer_cursor cursor;
417         struct hammer_inode *ip;
418
419
420         /*
421          * Determine if we already have an inode cached.  If we do then
422          * we are golden.
423          *
424          * If we find an inode with no vnode we have to mark the
425          * transaction such that hammer_inode_waitreclaims() is
426          * called later on to avoid building up an infinite number
427          * of inodes.  Otherwise we can continue to add new inodes
428          * faster than they can be disposed of, even with the tsleep
429          * delay.
430          *
431          * If we find a dummy inode we return a failure so dounlink
432          * (which does another lookup) doesn't try to mess with the
433          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
434          * to ref dummy inodes.
435          */
436         iinfo.obj_id = obj_id;
437         iinfo.obj_asof = asof;
438         iinfo.obj_localization = localization;
439 loop:
440         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
441         if (ip) {
442                 if (ip->flags & HAMMER_INODE_DUMMY) {
443                         *errorp = ENOENT;
444                         return(NULL);
445                 }
446                 hammer_ref(&ip->lock);
447                 *errorp = 0;
448                 return(ip);
449         }
450
451         /*
452          * Allocate a new inode structure and deal with races later.
453          */
454         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
455         ++hammer_count_inodes;
456         ++hmp->count_inodes;
457         ip->obj_id = obj_id;
458         ip->obj_asof = iinfo.obj_asof;
459         ip->obj_localization = localization;
460         ip->hmp = hmp;
461         ip->flags = flags & HAMMER_INODE_RO;
462         ip->cache[0].ip = ip;
463         ip->cache[1].ip = ip;
464         ip->cache[2].ip = ip;
465         ip->cache[3].ip = ip;
466         if (hmp->ronly)
467                 ip->flags |= HAMMER_INODE_RO;
468         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
469                 0x7FFFFFFFFFFFFFFFLL;
470         RB_INIT(&ip->rec_tree);
471         TAILQ_INIT(&ip->target_list);
472         hammer_ref(&ip->lock);
473
474         /*
475          * Locate the on-disk inode.  If this is a PFS root we always
476          * access the current version of the root inode and (if it is not
477          * a master) always access information under it with a snapshot
478          * TID.
479          *
480          * We cache recent inode lookups in this directory in dip->cache[2].
481          * If that cache is empty we fall back to dip->cache[0], assuming
482          * the inode we are looking for is close to the directory inode.
483          */
484 retry:
485         cachep = NULL;
486         if (dip) {
487                 if (dip->cache[2].node)
488                         cachep = &dip->cache[2];
489                 else
490                         cachep = &dip->cache[0];
491         }
492         hammer_init_cursor(trans, &cursor, cachep, NULL);
493         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
494         cursor.key_beg.obj_id = ip->obj_id;
495         cursor.key_beg.key = 0;
496         cursor.key_beg.create_tid = 0;
497         cursor.key_beg.delete_tid = 0;
498         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
499         cursor.key_beg.obj_type = 0;
500
501         cursor.asof = iinfo.obj_asof;
502         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
503                        HAMMER_CURSOR_ASOF;
504
505         *errorp = hammer_btree_lookup(&cursor);
506         if (*errorp == EDEADLK) {
507                 hammer_done_cursor(&cursor);
508                 goto retry;
509         }
510
511         /*
512          * On success the B-Tree lookup will hold the appropriate
513          * buffer cache buffers and provide a pointer to the requested
514          * information.  Copy the information to the in-memory inode
515          * and cache the B-Tree node to improve future operations.
516          */
517         if (*errorp == 0) {
518                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
519                 ip->ino_data = cursor.data->inode;
520
521                 /*
522                  * cache[0] tries to cache the location of the object inode.
523                  * The assumption is that it is near the directory inode.
524                  *
525                  * cache[1] tries to cache the location of the object data.
526                  * We might have something in the governing directory from
527                  * scan optimizations (see the strategy code in
528                  * hammer_vnops.c).
529                  *
530                  * We update dip->cache[2], if possible, with the location
531                  * of the object inode for future directory shortcuts.
532                  */
533                 hammer_cache_node(&ip->cache[0], cursor.node);
534                 if (dip) {
535                         if (dip->cache[3].node) {
536                                 hammer_cache_node(&ip->cache[1],
537                                                   dip->cache[3].node);
538                         }
539                         hammer_cache_node(&dip->cache[2], cursor.node);
540                 }
541
542                 /*
543                  * The file should not contain any data past the file size
544                  * stored in the inode.  Setting save_trunc_off to the
545                  * file size instead of max reduces B-Tree lookup overheads
546                  * on append by allowing the flusher to avoid checking for
547                  * record overwrites.
548                  */
549                 ip->save_trunc_off = ip->ino_data.size;
550
551                 /*
552                  * Locate and assign the pseudofs management structure to
553                  * the inode.
554                  */
555                 if (dip && dip->obj_localization == ip->obj_localization) {
556                         ip->pfsm = dip->pfsm;
557                         hammer_ref(&ip->pfsm->lock);
558                 } else {
559                         ip->pfsm = hammer_load_pseudofs(trans,
560                                                         ip->obj_localization,
561                                                         errorp);
562                         *errorp = 0;    /* ignore ENOENT */
563                 }
564         }
565
566         /*
567          * The inode is placed on the red-black tree and will be synced to
568          * the media when flushed or by the filesystem sync.  If this races
569          * another instantiation/lookup the insertion will fail.
570          */
571         if (*errorp == 0) {
572                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
573                         hammer_free_inode(ip);
574                         hammer_done_cursor(&cursor);
575                         goto loop;
576                 }
577                 ip->flags |= HAMMER_INODE_ONDISK;
578         } else {
579                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
580                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
581                         --hmp->rsv_inodes;
582                 }
583
584                 hammer_free_inode(ip);
585                 ip = NULL;
586         }
587         hammer_done_cursor(&cursor);
588
589         /*
590          * NEWINODE is only set if the inode becomes dirty later,
591          * setting it here just leads to unnecessary stalls.
592          *
593          * trans->flags |= HAMMER_TRANSF_NEWINODE;
594          */
595         return (ip);
596 }
597
598 /*
599  * Get a dummy inode to placemark a broken directory entry.
600  */
601 struct hammer_inode *
602 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
603                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
604                  int flags, int *errorp)
605 {
606         hammer_mount_t hmp = trans->hmp;
607         struct hammer_inode_info iinfo;
608         struct hammer_inode *ip;
609
610         /*
611          * Determine if we already have an inode cached.  If we do then
612          * we are golden.
613          *
614          * If we find an inode with no vnode we have to mark the
615          * transaction such that hammer_inode_waitreclaims() is
616          * called later on to avoid building up an infinite number
617          * of inodes.  Otherwise we can continue to add new inodes
618          * faster than they can be disposed of, even with the tsleep
619          * delay.
620          *
621          * If we find a non-fake inode we return an error.  Only fake
622          * inodes can be returned by this routine.
623          */
624         iinfo.obj_id = obj_id;
625         iinfo.obj_asof = asof;
626         iinfo.obj_localization = localization;
627 loop:
628         *errorp = 0;
629         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
630         if (ip) {
631                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
632                         *errorp = ENOENT;
633                         return(NULL);
634                 }
635                 hammer_ref(&ip->lock);
636                 return(ip);
637         }
638
639         /*
640          * Allocate a new inode structure and deal with races later.
641          */
642         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
643         ++hammer_count_inodes;
644         ++hmp->count_inodes;
645         ip->obj_id = obj_id;
646         ip->obj_asof = iinfo.obj_asof;
647         ip->obj_localization = localization;
648         ip->hmp = hmp;
649         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
650         ip->cache[0].ip = ip;
651         ip->cache[1].ip = ip;
652         ip->cache[2].ip = ip;
653         ip->cache[3].ip = ip;
654         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
655                 0x7FFFFFFFFFFFFFFFLL;
656         RB_INIT(&ip->rec_tree);
657         TAILQ_INIT(&ip->target_list);
658         hammer_ref(&ip->lock);
659
660         /*
661          * Populate the dummy inode.  Leave everything zero'd out.
662          *
663          * (ip->ino_leaf and ip->ino_data)
664          *
665          * Make the dummy inode a FIFO object which most copy programs
666          * will properly ignore.
667          */
668         ip->save_trunc_off = ip->ino_data.size;
669         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
670
671         /*
672          * Locate and assign the pseudofs management structure to
673          * the inode.
674          */
675         if (dip && dip->obj_localization == ip->obj_localization) {
676                 ip->pfsm = dip->pfsm;
677                 hammer_ref(&ip->pfsm->lock);
678         } else {
679                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
680                                                 errorp);
681                 *errorp = 0;    /* ignore ENOENT */
682         }
683
684         /*
685          * The inode is placed on the red-black tree and will be synced to
686          * the media when flushed or by the filesystem sync.  If this races
687          * another instantiation/lookup the insertion will fail.
688          *
689          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
690          */
691         if (*errorp == 0) {
692                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
693                         hammer_free_inode(ip);
694                         goto loop;
695                 }
696         } else {
697                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
698                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
699                         --hmp->rsv_inodes;
700                 }
701                 hammer_free_inode(ip);
702                 ip = NULL;
703         }
704         trans->flags |= HAMMER_TRANSF_NEWINODE;
705         return (ip);
706 }
707
708 /*
709  * Return a referenced inode only if it is in our inode cache.
710  *
711  * Dummy inodes do not count.
712  */
713 struct hammer_inode *
714 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
715                   hammer_tid_t asof, u_int32_t localization)
716 {
717         hammer_mount_t hmp = trans->hmp;
718         struct hammer_inode_info iinfo;
719         struct hammer_inode *ip;
720
721         iinfo.obj_id = obj_id;
722         iinfo.obj_asof = asof;
723         iinfo.obj_localization = localization;
724
725         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
726         if (ip) {
727                 if (ip->flags & HAMMER_INODE_DUMMY)
728                         ip = NULL;
729                 else
730                         hammer_ref(&ip->lock);
731         }
732         return(ip);
733 }
734
735 /*
736  * Create a new filesystem object, returning the inode in *ipp.  The
737  * returned inode will be referenced.  The inode is created in-memory.
738  *
739  * If pfsm is non-NULL the caller wishes to create the root inode for
740  * a master PFS.
741  */
742 int
743 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
744                     struct ucred *cred,
745                     hammer_inode_t dip, const char *name, int namelen,
746                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
747 {
748         hammer_mount_t hmp;
749         hammer_inode_t ip;
750         uid_t xuid;
751         int error;
752         int64_t namekey;
753         u_int32_t dummy;
754
755         hmp = trans->hmp;
756
757         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
758         ++hammer_count_inodes;
759         ++hmp->count_inodes;
760         trans->flags |= HAMMER_TRANSF_NEWINODE;
761
762         if (pfsm) {
763                 KKASSERT(pfsm->localization != 0);
764                 ip->obj_id = HAMMER_OBJID_ROOT;
765                 ip->obj_localization = pfsm->localization;
766         } else {
767                 KKASSERT(dip != NULL);
768                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
769                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
770                 ip->obj_localization = dip->obj_localization;
771         }
772
773         KKASSERT(ip->obj_id != 0);
774         ip->obj_asof = hmp->asof;
775         ip->hmp = hmp;
776         ip->flush_state = HAMMER_FST_IDLE;
777         ip->flags = HAMMER_INODE_DDIRTY |
778                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
779         ip->cache[0].ip = ip;
780         ip->cache[1].ip = ip;
781         ip->cache[2].ip = ip;
782         ip->cache[3].ip = ip;
783
784         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
785         /* ip->save_trunc_off = 0; (already zero) */
786         RB_INIT(&ip->rec_tree);
787         TAILQ_INIT(&ip->target_list);
788
789         ip->ino_data.atime = trans->time;
790         ip->ino_data.mtime = trans->time;
791         ip->ino_data.size = 0;
792         ip->ino_data.nlinks = 0;
793
794         /*
795          * A nohistory designator on the parent directory is inherited by
796          * the child.  We will do this even for pseudo-fs creation... the
797          * sysad can turn it off.
798          */
799         if (dip) {
800                 ip->ino_data.uflags = dip->ino_data.uflags &
801                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
802         }
803
804         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
805         ip->ino_leaf.base.localization = ip->obj_localization +
806                                          HAMMER_LOCALIZE_INODE;
807         ip->ino_leaf.base.obj_id = ip->obj_id;
808         ip->ino_leaf.base.key = 0;
809         ip->ino_leaf.base.create_tid = 0;
810         ip->ino_leaf.base.delete_tid = 0;
811         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
812         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
813
814         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
815         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
816         ip->ino_data.mode = vap->va_mode;
817         ip->ino_data.ctime = trans->time;
818
819         /*
820          * If we are running version 2 or greater directory entries are
821          * inode-localized instead of data-localized.
822          */
823         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
824                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
825                         ip->ino_data.cap_flags |=
826                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
827                 }
828         }
829         if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
830                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
831                         ip->ino_data.cap_flags |=
832                                 HAMMER_INODE_CAP_DIRHASH_ALG1;
833                 }
834         }
835
836         /*
837          * Setup the ".." pointer.  This only needs to be done for directories
838          * but we do it for all objects as a recovery aid.
839          */
840         if (dip)
841                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
842 #if 0
843         /*
844          * The parent_obj_localization field only applies to pseudo-fs roots.
845          * XXX this is no longer applicable, PFSs are no longer directly
846          * tied into the parent's directory structure.
847          */
848         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
849             ip->obj_id == HAMMER_OBJID_ROOT) {
850                 ip->ino_data.ext.obj.parent_obj_localization = 
851                                                 dip->obj_localization;
852         }
853 #endif
854
855         switch(ip->ino_leaf.base.obj_type) {
856         case HAMMER_OBJTYPE_CDEV:
857         case HAMMER_OBJTYPE_BDEV:
858                 ip->ino_data.rmajor = vap->va_rmajor;
859                 ip->ino_data.rminor = vap->va_rminor;
860                 break;
861         default:
862                 break;
863         }
864
865         /*
866          * Calculate default uid/gid and overwrite with information from
867          * the vap.
868          */
869         if (dip) {
870                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
871                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
872                                              xuid, cred, &vap->va_mode);
873         } else {
874                 xuid = 0;
875         }
876         ip->ino_data.mode = vap->va_mode;
877
878         if (vap->va_vaflags & VA_UID_UUID_VALID)
879                 ip->ino_data.uid = vap->va_uid_uuid;
880         else if (vap->va_uid != (uid_t)VNOVAL)
881                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
882         else
883                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
884
885         if (vap->va_vaflags & VA_GID_UUID_VALID)
886                 ip->ino_data.gid = vap->va_gid_uuid;
887         else if (vap->va_gid != (gid_t)VNOVAL)
888                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
889         else if (dip)
890                 ip->ino_data.gid = dip->ino_data.gid;
891
892         hammer_ref(&ip->lock);
893
894         if (pfsm) {
895                 ip->pfsm = pfsm;
896                 hammer_ref(&pfsm->lock);
897                 error = 0;
898         } else if (dip->obj_localization == ip->obj_localization) {
899                 ip->pfsm = dip->pfsm;
900                 hammer_ref(&ip->pfsm->lock);
901                 error = 0;
902         } else {
903                 ip->pfsm = hammer_load_pseudofs(trans,
904                                                 ip->obj_localization,
905                                                 &error);
906                 error = 0;      /* ignore ENOENT */
907         }
908
909         if (error) {
910                 hammer_free_inode(ip);
911                 ip = NULL;
912         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
913                 panic("hammer_create_inode: duplicate obj_id %llx",
914                       (long long)ip->obj_id);
915                 /* not reached */
916                 hammer_free_inode(ip);
917         }
918         *ipp = ip;
919         return(error);
920 }
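
/*
 * Editor's sketch (not part of the original source): creating a new
 * in-memory inode under a parent directory.  hammer_mkroot_pseudofs()
 * below shows the pfsm (PFS root) variant; the variant sketched here
 * passes the parent directory inode plus the new name instead.  Real
 * callers go on to add a directory entry and obtain a vnode, which is
 * omitted here.  The wrapper name is hypothetical.
 */
#if 0
static int
example_create_object(hammer_transaction_t trans, struct ucred *cred,
                      hammer_inode_t dip, const char *name, int namelen,
                      struct vattr *vap, struct hammer_inode **ipp)
{
        int error;

        error = hammer_create_inode(trans, vap, cred, dip, name, namelen,
                                    NULL, ipp);
        /*
         * On success *ipp is a referenced in-memory inode with nlinks 0.
         * Callers normally link it into the directory next and eventually
         * drop the reference with hammer_rel_inode().
         */
        return(error);
}
#endif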
921
922 /*
923  * Final cleanup / freeing of an inode structure
924  */
925 static void
926 hammer_free_inode(hammer_inode_t ip)
927 {
928         struct hammer_mount *hmp;
929
930         hmp = ip->hmp;
931         KKASSERT(hammer_oneref(&ip->lock));
932         hammer_uncache_node(&ip->cache[0]);
933         hammer_uncache_node(&ip->cache[1]);
934         hammer_uncache_node(&ip->cache[2]);
935         hammer_uncache_node(&ip->cache[3]);
936         hammer_inode_wakereclaims(ip);
937         if (ip->objid_cache)
938                 hammer_clear_objid(ip);
939         --hammer_count_inodes;
940         --hmp->count_inodes;
941         if (ip->pfsm) {
942                 hammer_rel_pseudofs(hmp, ip->pfsm);
943                 ip->pfsm = NULL;
944         }
945         kfree(ip, hmp->m_inodes);
946         ip = NULL;
947 }
948
949 /*
950  * Retrieve pseudo-fs data.  NULL will never be returned.
951  *
952  * If an error occurs *errorp will be set and a default template is returned,
953  * otherwise *errorp is set to 0.  Typically when an error occurs it will
954  * be ENOENT.
955  */
956 hammer_pseudofs_inmem_t
957 hammer_load_pseudofs(hammer_transaction_t trans,
958                      u_int32_t localization, int *errorp)
959 {
960         hammer_mount_t hmp = trans->hmp;
961         hammer_inode_t ip;
962         hammer_pseudofs_inmem_t pfsm;
963         struct hammer_cursor cursor;
964         int bytes;
965
966 retry:
967         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
968         if (pfsm) {
969                 hammer_ref(&pfsm->lock);
970                 *errorp = 0;
971                 return(pfsm);
972         }
973
974         /*
975          * PFS records are stored in the root inode (not the PFS root inode,
976          * but the real root).  Avoid an infinite recursion if loading
977          * the PFS for the real root.
978          */
979         if (localization) {
980                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
981                                       HAMMER_MAX_TID,
982                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
983         } else {
984                 ip = NULL;
985         }
986
987         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
988         pfsm->localization = localization;
989         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
990         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
991
992         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
993         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
994                                       HAMMER_LOCALIZE_MISC;
995         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
996         cursor.key_beg.create_tid = 0;
997         cursor.key_beg.delete_tid = 0;
998         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
999         cursor.key_beg.obj_type = 0;
1000         cursor.key_beg.key = localization;
1001         cursor.asof = HAMMER_MAX_TID;
1002         cursor.flags |= HAMMER_CURSOR_ASOF;
1003
1004         if (ip)
1005                 *errorp = hammer_ip_lookup(&cursor);
1006         else
1007                 *errorp = hammer_btree_lookup(&cursor);
1008         if (*errorp == 0) {
1009                 *errorp = hammer_ip_resolve_data(&cursor);
1010                 if (*errorp == 0) {
1011                         if (cursor.data->pfsd.mirror_flags &
1012                             HAMMER_PFSD_DELETED) {
1013                                 *errorp = ENOENT;
1014                         } else {
1015                                 bytes = cursor.leaf->data_len;
1016                                 if (bytes > sizeof(pfsm->pfsd))
1017                                         bytes = sizeof(pfsm->pfsd);
1018                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
1019                         }
1020                 }
1021         }
1022         hammer_done_cursor(&cursor);
1023
1024         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1025         hammer_ref(&pfsm->lock);
1026         if (ip)
1027                 hammer_rel_inode(ip, 0);
1028         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
1029                 kfree(pfsm, hmp->m_misc);
1030                 goto retry;
1031         }
1032         return(pfsm);
1033 }
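
/*
 * Editor's sketch (not part of the original source):
 * hammer_load_pseudofs() always returns a referenced pfsm (a default
 * template plus *errorp on failure), so every call must eventually be
 * balanced by hammer_rel_pseudofs(), which frees the structure once the
 * last reference goes away.  The wrapper name is hypothetical.
 */
#if 0
static void
example_with_pseudofs(hammer_transaction_t trans, u_int32_t localization)
{
        hammer_pseudofs_inmem_t pfsm;
        int error;

        pfsm = hammer_load_pseudofs(trans, localization, &error);
        /* inspect pfsm->pfsd; error == ENOENT means the default template */
        hammer_rel_pseudofs(trans->hmp, pfsm);
}
#endif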
1034
1035 /*
1036  * Store pseudo-fs data.  The backend will automatically delete any prior
1037  * on-disk pseudo-fs data but we have to delete in-memory versions.
1038  */
1039 int
1040 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
1041 {
1042         struct hammer_cursor cursor;
1043         hammer_record_t record;
1044         hammer_inode_t ip;
1045         int error;
1046
1047         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1048                               HAMMER_DEF_LOCALIZATION, 0, &error);
1049 retry:
1050         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1051         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
1052         cursor.key_beg.localization = ip->obj_localization +
1053                                       HAMMER_LOCALIZE_MISC;
1054         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
1055         cursor.key_beg.create_tid = 0;
1056         cursor.key_beg.delete_tid = 0;
1057         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
1058         cursor.key_beg.obj_type = 0;
1059         cursor.key_beg.key = pfsm->localization;
1060         cursor.asof = HAMMER_MAX_TID;
1061         cursor.flags |= HAMMER_CURSOR_ASOF;
1062
1063         /*
1064          * Replace any in-memory version of the record.
1065          */
1066         error = hammer_ip_lookup(&cursor);
1067         if (error == 0 && hammer_cursor_inmem(&cursor)) {
1068                 record = cursor.iprec;
1069                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1070                         KKASSERT(cursor.deadlk_rec == NULL);
1071                         hammer_ref(&record->lock);
1072                         cursor.deadlk_rec = record;
1073                         error = EDEADLK;
1074                 } else {
1075                         record->flags |= HAMMER_RECF_DELETED_FE;
1076                         error = 0;
1077                 }
1078         }
1079
1080         /*
1081          * Allocate replacement general record.  The backend flush will
1082          * delete any on-disk version of the record.
1083          */
1084         if (error == 0 || error == ENOENT) {
1085                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1086                 record->type = HAMMER_MEM_RECORD_GENERAL;
1087
1088                 record->leaf.base.localization = ip->obj_localization +
1089                                                  HAMMER_LOCALIZE_MISC;
1090                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1091                 record->leaf.base.key = pfsm->localization;
1092                 record->leaf.data_len = sizeof(pfsm->pfsd);
1093                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1094                 error = hammer_ip_add_record(trans, record);
1095         }
1096         hammer_done_cursor(&cursor);
1097         if (error == EDEADLK)
1098                 goto retry;
1099         hammer_rel_inode(ip, 0);
1100         return(error);
1101 }
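
/*
 * Editor's sketch (not part of the original source): updating on-media
 * PFS data is a load / modify / save / release sequence.
 * hammer_save_pseudofs() queues a replacement general record and the
 * backend flush deletes any prior on-disk version.  The wrapper name is
 * hypothetical and the pfsd modification is left abstract.
 */
#if 0
static int
example_update_pseudofs(hammer_transaction_t trans, u_int32_t localization)
{
        hammer_pseudofs_inmem_t pfsm;
        int error;

        pfsm = hammer_load_pseudofs(trans, localization, &error);
        if (error == 0) {
                /* ... adjust fields in pfsm->pfsd here ... */
                error = hammer_save_pseudofs(trans, pfsm);
        }
        hammer_rel_pseudofs(trans->hmp, pfsm);
        return(error);
}
#endif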
1102
1103 /*
1104  * Create a root directory for a PFS if one does not already exist.
1105  *
1106  * The PFS root stands alone so we must also bump the nlinks count
1107  * to prevent it from being destroyed on release.
1108  */
1109 int
1110 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1111                        hammer_pseudofs_inmem_t pfsm)
1112 {
1113         hammer_inode_t ip;
1114         struct vattr vap;
1115         int error;
1116
1117         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1118                               pfsm->localization, 0, &error);
1119         if (ip == NULL) {
1120                 vattr_null(&vap);
1121                 vap.va_mode = 0755;
1122                 vap.va_type = VDIR;
1123                 error = hammer_create_inode(trans, &vap, cred,
1124                                             NULL, NULL, 0,
1125                                             pfsm, &ip);
1126                 if (error == 0) {
1127                         ++ip->ino_data.nlinks;
1128                         hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
1129                 }
1130         }
1131         if (ip)
1132                 hammer_rel_inode(ip, 0);
1133         return(error);
1134 }
1135
1136 /*
1137  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1138  * if we are unable to disassociate all the inodes.
1139  */
1140 static
1141 int
1142 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1143 {
1144         int res;
1145
1146         hammer_ref(&ip->lock);
1147         if (hammer_isactive(&ip->lock) == 2 && ip->vp)
1148                 vclean_unlocked(ip->vp);
1149         if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
1150                 res = 0;
1151         else
1152                 res = -1;       /* stop, someone is using the inode */
1153         hammer_rel_inode(ip, 0);
1154         return(res);
1155 }
1156
1157 int
1158 hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1159 {
1160         int res;
1161         int try;
1162
1163         for (try = res = 0; try < 4; ++try) {
1164                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1165                                            hammer_inode_pfs_cmp,
1166                                            hammer_unload_pseudofs_callback,
1167                                            &localization);
1168                 if (res == 0 && try > 1)
1169                         break;
1170                 hammer_flusher_sync(trans->hmp);
1171         }
1172         if (res != 0)
1173                 res = ENOTEMPTY;
1174         return(res);
1175 }
1176
1177
1178 /*
1179  * Release a reference on a PFS
1180  */
1181 void
1182 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1183 {
1184         hammer_rel(&pfsm->lock);
1185         if (hammer_norefs(&pfsm->lock)) {
1186                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1187                 kfree(pfsm, hmp->m_misc);
1188         }
1189 }
1190
1191 /*
1192  * Called by hammer_sync_inode().
1193  */
1194 static int
1195 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1196 {
1197         hammer_transaction_t trans = cursor->trans;
1198         hammer_record_t record;
1199         int error;
1200         int redirty;
1201
1202 retry:
1203         error = 0;
1204
1205         /*
1206          * If the inode has a presence on-disk then locate it and mark
1207          * it deleted, setting DELONDISK.
1208          *
1209          * The record may or may not be physically deleted, depending on
1210          * the retention policy.
1211          */
1212         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1213             HAMMER_INODE_ONDISK) {
1214                 hammer_normalize_cursor(cursor);
1215                 cursor->key_beg.localization = ip->obj_localization + 
1216                                                HAMMER_LOCALIZE_INODE;
1217                 cursor->key_beg.obj_id = ip->obj_id;
1218                 cursor->key_beg.key = 0;
1219                 cursor->key_beg.create_tid = 0;
1220                 cursor->key_beg.delete_tid = 0;
1221                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1222                 cursor->key_beg.obj_type = 0;
1223                 cursor->asof = ip->obj_asof;
1224                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1225                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1226                 cursor->flags |= HAMMER_CURSOR_BACKEND;
1227
1228                 error = hammer_btree_lookup(cursor);
1229                 if (hammer_debug_inode)
1230                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1231
1232                 if (error == 0) {
1233                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1234                         if (hammer_debug_inode)
1235                                 kprintf(" error %d\n", error);
1236                         if (error == 0) {
1237                                 ip->flags |= HAMMER_INODE_DELONDISK;
1238                         }
1239                         if (cursor->node)
1240                                 hammer_cache_node(&ip->cache[0], cursor->node);
1241                 }
1242                 if (error == EDEADLK) {
1243                         hammer_done_cursor(cursor);
1244                         error = hammer_init_cursor(trans, cursor,
1245                                                    &ip->cache[0], ip);
1246                         if (hammer_debug_inode)
1247                                 kprintf("IPDED %p %d\n", ip, error);
1248                         if (error == 0)
1249                                 goto retry;
1250                 }
1251         }
1252
1253         /*
1254          * Ok, write out the initial record or a new record (after deleting
1255          * the old one), unless the DELETED flag is set.  This routine will
1256          * clear DELONDISK if it writes out a record.
1257          *
1258          * Update our inode statistics if this is the first application of
1259          * the inode on-disk.
1260          */
1261         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1262                 /*
1263                  * Generate a record and write it to the media.  We clean-up
1264                  * the state before releasing so we do not have to set-up
1265                  * a flush_group.
1266                  */
1267                 record = hammer_alloc_mem_record(ip, 0);
1268                 record->type = HAMMER_MEM_RECORD_INODE;
1269                 record->flush_state = HAMMER_FST_FLUSH;
1270                 record->leaf = ip->sync_ino_leaf;
1271                 record->leaf.base.create_tid = trans->tid;
1272                 record->leaf.data_len = sizeof(ip->sync_ino_data);
1273                 record->leaf.create_ts = trans->time32;
1274                 record->data = (void *)&ip->sync_ino_data;
1275                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1276
1277                 /*
1278                  * If this flag is set we cannot sync the new file size
1279                  * because we haven't finished related truncations.  The
1280                  * inode will be flushed in another flush group to finish
1281                  * the job.
1282                  */
1283                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1284                     ip->sync_ino_data.size != ip->ino_data.size) {
1285                         redirty = 1;
1286                         ip->sync_ino_data.size = ip->ino_data.size;
1287                 } else {
1288                         redirty = 0;
1289                 }
1290
1291                 for (;;) {
1292                         error = hammer_ip_sync_record_cursor(cursor, record);
1293                         if (hammer_debug_inode)
1294                                 kprintf("GENREC %p rec %08x %d\n",      
1295                                         ip, record->flags, error);
1296                         if (error != EDEADLK)
1297                                 break;
1298                         hammer_done_cursor(cursor);
1299                         error = hammer_init_cursor(trans, cursor,
1300                                                    &ip->cache[0], ip);
1301                         if (hammer_debug_inode)
1302                                 kprintf("GENREC reinit %d\n", error);
1303                         if (error)
1304                                 break;
1305                 }
1306
1307                 /*
1308                  * Note:  The record was never on the inode's record tree
1309                  * so just wave our hands importantly and destroy it.
1310                  */
1311                 record->flags |= HAMMER_RECF_COMMITTED;
1312                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1313                 record->flush_state = HAMMER_FST_IDLE;
1314                 ++ip->rec_generation;
1315                 hammer_rel_mem_record(record);
1316
1317                 /*
1318                  * Finish up.
1319                  */
1320                 if (error == 0) {
1321                         if (hammer_debug_inode)
1322                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1323                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1324                                             HAMMER_INODE_SDIRTY |
1325                                             HAMMER_INODE_ATIME |
1326                                             HAMMER_INODE_MTIME);
1327                         ip->flags &= ~HAMMER_INODE_DELONDISK;
1328                         if (redirty)
1329                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1330
1331                         /*
1332                          * Root volume count of inodes
1333                          */
1334                         hammer_sync_lock_sh(trans);
1335                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1336                                 hammer_modify_volume_field(trans,
1337                                                            trans->rootvol,
1338                                                            vol0_stat_inodes);
1339                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1340                                 hammer_modify_volume_done(trans->rootvol);
1341                                 ip->flags |= HAMMER_INODE_ONDISK;
1342                                 if (hammer_debug_inode)
1343                                         kprintf("NOWONDISK %p\n", ip);
1344                         }
1345                         hammer_sync_unlock(trans);
1346                 }
1347         }
1348
1349         /*
1350          * If the inode has been destroyed, clean out any left-over flags
1351          * that may have been set by the frontend.
1352          */
1353         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
1354                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1355                                     HAMMER_INODE_SDIRTY |
1356                                     HAMMER_INODE_ATIME |
1357                                     HAMMER_INODE_MTIME);
1358         }
1359         return(error);
1360 }
1361
1362 /*
1363  * Update only the itimes fields.
1364  *
1365  * ATIME can be updated without generating any UNDO.  MTIME is updated
1366  * with UNDO so it is guaranteed to be synchronized properly in case of
1367  * a crash.
1368  *
1369  * Neither field is included in the B-Tree leaf element's CRC, which is how
1370  * we can get away with updating ATIME the way we do.
1371  */
1372 static int
1373 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1374 {
1375         hammer_transaction_t trans = cursor->trans;
1376         int error;
1377
1378 retry:
1379         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1380             HAMMER_INODE_ONDISK) {
1381                 return(0);
1382         }
1383
1384         hammer_normalize_cursor(cursor);
1385         cursor->key_beg.localization = ip->obj_localization + 
1386                                        HAMMER_LOCALIZE_INODE;
1387         cursor->key_beg.obj_id = ip->obj_id;
1388         cursor->key_beg.key = 0;
1389         cursor->key_beg.create_tid = 0;
1390         cursor->key_beg.delete_tid = 0;
1391         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1392         cursor->key_beg.obj_type = 0;
1393         cursor->asof = ip->obj_asof;
1394         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1395         cursor->flags |= HAMMER_CURSOR_ASOF;
1396         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1397         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1398         cursor->flags |= HAMMER_CURSOR_BACKEND;
1399
1400         error = hammer_btree_lookup(cursor);
1401         if (error == 0) {
1402                 hammer_cache_node(&ip->cache[0], cursor->node);
1403                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1404                         /*
1405                          * Updating MTIME requires an UNDO.  Just cover
1406                          * both atime and mtime.
1407                          */
1408                         hammer_sync_lock_sh(trans);
1409                         hammer_modify_buffer(trans, cursor->data_buffer,
1410                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
1411                                      HAMMER_ITIMES_BYTES);
1412                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1413                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1414                         hammer_modify_buffer_done(cursor->data_buffer);
1415                         hammer_sync_unlock(trans);
1416                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1417                         /*
1418                          * Updating atime only can be done in-place with
1419                          * no UNDO.
1420                          */
1421                         hammer_sync_lock_sh(trans);
1422                         hammer_modify_buffer(trans, cursor->data_buffer,
1423                                              NULL, 0);
1424                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1425                         hammer_modify_buffer_done(cursor->data_buffer);
1426                         hammer_sync_unlock(trans);
1427                 }
1428                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1429         }
1430         if (error == EDEADLK) {
1431                 hammer_done_cursor(cursor);
1432                 error = hammer_init_cursor(trans, cursor,
1433                                            &ip->cache[0], ip);
1434                 if (error == 0)
1435                         goto retry;
1436         }
1437         return(error);
1438 }
1439
1440 /*
1441  * Release a reference on an inode, flush as requested.
1442  *
1443  * On the last reference we queue the inode to the flusher for its final
1444  * disposition.
1445  */
1446 void
1447 hammer_rel_inode(struct hammer_inode *ip, int flush)
1448 {
1449         /*hammer_mount_t hmp = ip->hmp;*/
1450
1451         /*
1452          * Handle disposition when dropping the last ref.
1453          */
1454         for (;;) {
1455                 if (hammer_oneref(&ip->lock)) {
1456                         /*
1457                          * Determine whether on-disk action is needed for
1458                          * the inode's final disposition.
1459                          */
1460                         KKASSERT(ip->vp == NULL);
1461                         hammer_inode_unloadable_check(ip, 0);
1462                         if (ip->flags & HAMMER_INODE_MODMASK) {
1463                                 hammer_flush_inode(ip, 0);
1464                         } else if (hammer_oneref(&ip->lock)) {
1465                                 hammer_unload_inode(ip);
1466                                 break;
1467                         }
1468                 } else {
1469                         if (flush)
1470                                 hammer_flush_inode(ip, 0);
1471
1472                         /*
1473                          * The inode still has multiple refs, try to drop
1474                          * one ref.
1475                          */
1476                         KKASSERT(hammer_isactive(&ip->lock) >= 1);
1477                         if (hammer_isactive(&ip->lock) > 1) {
1478                                 hammer_rel(&ip->lock);
1479                                 break;
1480                         }
1481                 }
1482         }
1483 }
1484
1485 /*
1486  * Unload and destroy the specified inode.  Must be called with one remaining
1487  * reference.  The reference is disposed of.
1488  *
1489  * The inode must be completely clean.
1490  */
1491 static int
1492 hammer_unload_inode(struct hammer_inode *ip)
1493 {
1494         hammer_mount_t hmp = ip->hmp;
1495
1496         KASSERT(hammer_oneref(&ip->lock),
1497                 ("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
1498         KKASSERT(ip->vp == NULL);
1499         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1500         KKASSERT(ip->cursor_ip_refs == 0);
1501         KKASSERT(hammer_notlocked(&ip->lock));
1502         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1503
1504         KKASSERT(RB_EMPTY(&ip->rec_tree));
1505         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1506
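             /*
              * Detach the inode from the per-mount indexes (the redo tree
              * if queued there, and the inode RB tree) before freeing it.
              */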
1507         if (ip->flags & HAMMER_INODE_RDIRTY) {
1508                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
1509                 ip->flags &= ~HAMMER_INODE_RDIRTY;
1510         }
1511         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1512
1513         hammer_free_inode(ip);
1514         return(0);
1515 }
1516
1517 /*
1518  * Called during unmounting if a critical error occurred.  The in-memory
1519  * inode and all related structures are destroyed.
1520  *
1521  * If a critical error did not occur the unmount code calls the standard
1522  * release and asserts that the inode is gone.
1523  */
1524 int
1525 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1526 {
1527         hammer_record_t rec;
1528
1529         /*
1530          * Get rid of the inode's in-memory records, regardless of their
1531          * state, and clear the mod-mask.
1532          */
1533         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1534                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1535                 rec->target_ip = NULL;
1536                 if (rec->flush_state == HAMMER_FST_SETUP)
1537                         rec->flush_state = HAMMER_FST_IDLE;
1538         }
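             /*
              * Destroy any records still indexed under the inode.  A record
              * in FST_FLUSH already holds a reference, so only the flush
              * group's count is dropped; otherwise take a reference so each
              * record ends up with exactly one ref before being marked
              * deleted and released.
              */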
1539         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1540                 if (rec->flush_state == HAMMER_FST_FLUSH)
1541                         --rec->flush_group->refs;
1542                 else
1543                         hammer_ref(&rec->lock);
1544                 KKASSERT(hammer_oneref(&rec->lock));
1545                 rec->flush_state = HAMMER_FST_IDLE;
1546                 rec->flush_group = NULL;
1547                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1548                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1549                 ++ip->rec_generation;
1550                 hammer_rel_mem_record(rec);
1551         }
1552         ip->flags &= ~HAMMER_INODE_MODMASK;
1553         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1554         KKASSERT(ip->vp == NULL);
1555
1556         /*
1557          * Remove the inode from any flush group, force it idle.  FLUSH
1558          * and SETUP states have an inode ref.
1559          */
1560         switch(ip->flush_state) {
1561         case HAMMER_FST_FLUSH:
1562                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1563                 --ip->flush_group->refs;
1564                 ip->flush_group = NULL;
1565                 /* fall through */
1566         case HAMMER_FST_SETUP:
1567                 hammer_rel(&ip->lock);
1568                 ip->flush_state = HAMMER_FST_IDLE;
1569                 /* fall through */
1570         case HAMMER_FST_IDLE:
1571                 break;
1572         }
1573
1574         /*
1575          * There shouldn't be any associated vnode.  The unload needs at
1576          * least one ref; if we do have a vp, steal its ip ref.
1577          */
1578         if (ip->vp) {
1579                 kprintf("hammer_destroy_inode_callback: Unexpected "
1580                         "vnode association ip %p vp %p\n", ip, ip->vp);
1581                 ip->vp->v_data = NULL;
1582                 ip->vp = NULL;
1583         } else {
1584                 hammer_ref(&ip->lock);
1585         }
1586         hammer_unload_inode(ip);
1587         return(0);
1588 }
1589
1590 /*
1591  * Called on mount -u when switching from RW to RO or vice-versa.  Adjust
1592  * the read-only flag for cached inodes.
1593  *
1594  * This routine is called from a RB_SCAN().
1595  */
1596 int
1597 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1598 {
1599         hammer_mount_t hmp = ip->hmp;
1600
1601         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1602                 ip->flags |= HAMMER_INODE_RO;
1603         else
1604                 ip->flags &= ~HAMMER_INODE_RO;
1605         return(0);
1606 }
1607
1608 /*
1609  * A transaction has modified an inode, requiring updates as specified by
1610  * the passed flags.
1611  *
1612  * HAMMER_INODE_DDIRTY: Inode data has been updated, not including mtime/atime,
1613  *                      and not including size changes due to write-append
1614  *                      (but other size changes are included).
1615  * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1616  *                      write-append.
1617  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1618  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1619  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1620  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1621  */
1622 void
1623 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
1624 {
1625         /*
1626          * A ronly value of 0 or 2 does not trigger the assertion;
1627          * 2 is a special error state.
1628          */
1629         KKASSERT(ip->hmp->ronly != 1 ||
1630                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
1631                             HAMMER_INODE_SDIRTY |
1632                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1633                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
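             /*
              * The first time an inode becomes dirty it is accounted for in
              * the mount's reserved-inode count (hmp->rsv_inodes).
              */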
1634         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1635                 ip->flags |= HAMMER_INODE_RSV_INODES;
1636                 ++ip->hmp->rsv_inodes;
1637         }
1638
1639         /*
1640          * Set the NEWINODE flag in the transaction if the inode
1641          * transitions to a dirty state.  This is used to track
1642          * the load on the inode cache.
1643          */
1644         if (trans &&
1645             (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1646             (flags & HAMMER_INODE_MODMASK)) {
1647                 trans->flags |= HAMMER_TRANSF_NEWINODE;
1648         }
1649         if (flags & HAMMER_INODE_MODMASK)
1650                 hammer_inode_dirty(ip);
1651         ip->flags |= flags;
1652 }
1653
1654 /*
1655  * Attempt to quickly update the atime for a hammer inode.  Return 0 on
1656  * success, -1 on failure.
1657  *
1658  * We attempt to update the atime with only the ip lock and not the
1659  * whole filesystem lock in order to improve concurrency.  We can only
1660  * do this safely if the ATIME flag is already pending on the inode.
1661  *
1662  * This function is called via a vnops path (ip pointer is stable) without
1663  * fs_token held.
1664  */
1665 int
1666 hammer_update_atime_quick(hammer_inode_t ip)
1667 {
1668         struct timeval tv;
1669         int res = -1;
1670
1671         if ((ip->flags & HAMMER_INODE_RO) ||
1672             (ip->hmp->mp->mnt_flag & MNT_NOATIME)) {
1673                 /*
1674                  * Silently indicate success on read-only mount/snap
1675                  */
1676                 res = 0;
1677         } else if (ip->flags & HAMMER_INODE_ATIME) {
1678                 /*
1679                  * Double check with inode lock held against backend.  This
1680                  * is only safe if all we need to do is update
1681                  * ino_data.atime.
1682                  */
1683                 getmicrotime(&tv);
1684                 hammer_lock_ex(&ip->lock);
1685                 if (ip->flags & HAMMER_INODE_ATIME) {
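                             /* ino_data stores atime in microseconds */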
1686                         ip->ino_data.atime =
1687                             (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec;
1688                         res = 0;
1689                 }
1690                 hammer_unlock(&ip->lock);
1691         }
1692         return res;
1693 }
1694
1695 /*
1696  * Request that an inode be flushed.  This whole mess cannot block and may
1697  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1698  * actively flush the inode until the flush can be done.
1699  *
1700  * The inode may already be flushing, or may be in a setup state.  We can
1701  * place the inode in a flushing state if it is currently idle and flag it
1702  * to reflush if it is currently flushing.
1703  *
1704  * If upon return the inode could not be flushed due to a setup
1705  * dependency, it will be automatically flushed when the dependency
1706  * is satisfied.
1707  */
1708 void
1709 hammer_flush_inode(hammer_inode_t ip, int flags)
1710 {
1711         hammer_mount_t hmp;
1712         hammer_flush_group_t flg;
1713         int good;
1714
1715         /*
1716          * fill_flush_group is the first flush group we may be able to
1717          * continue filling; it may be open or closed, but it will always
1718          * be past the currently flushing (running) flg.
1719          *
1720          * next_flush_group is the next open flush group.
1721          */
1722         hmp = ip->hmp;
1723         while ((flg = hmp->fill_flush_group) != NULL) {
1724                 KKASSERT(flg->running == 0);
1725                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
1726                     flg->total_count <= hammer_autoflush) {
1727                         break;
1728                 }
1729                 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
1730                 hammer_flusher_async(ip->hmp, flg);
1731         }
1732         if (flg == NULL) {
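                     /*
                      * No flush group is available for filling; allocate a
                      * new one and append it to the mount's flush group list.
                      */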
1733                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1734                 flg->seq = hmp->flusher.next++;
1735                 if (hmp->next_flush_group == NULL)
1736                         hmp->next_flush_group = flg;
1737                 if (hmp->fill_flush_group == NULL)
1738                         hmp->fill_flush_group = flg;
1739                 RB_INIT(&flg->flush_tree);
1740                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1741         }
1742
1743         /*
1744          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1745          * state we have to put it back into an IDLE state so we can
1746          * drop the extra ref.
1747          *
1748          * If we have a parent dependency we must still fall through
1749          * so we can run it.
1750          */
1751         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1752                 if (ip->flush_state == HAMMER_FST_SETUP &&
1753                     TAILQ_EMPTY(&ip->target_list)) {
1754                         ip->flush_state = HAMMER_FST_IDLE;
1755                         hammer_rel_inode(ip, 0);
1756                 }
1757                 if (ip->flush_state == HAMMER_FST_IDLE)
1758                         return;
1759         }
1760
1761         /*
1762          * Our flush action will depend on the current state.
1763          */
1764         switch(ip->flush_state) {
1765         case HAMMER_FST_IDLE:
1766                 /*
1767                  * We have no dependencies and can flush immediately.  Some of
1768                  * our children may not be flushable, so we have to re-test
1769                  * with that additional knowledge.
1770                  */
1771                 hammer_flush_inode_core(ip, flg, flags);
1772                 break;
1773         case HAMMER_FST_SETUP:
1774                 /*
1775                  * Recurse upwards through dependencies via target_list
1776                  * and start their flusher actions going if possible.
1777                  *
1778                  * 'good' is our connectivity.  -1 means we have none and
1779                  * can't flush, 0 means there weren't any dependencies, and
1780                  * 1 means we have good connectivity.
1781                  */
1782                 good = hammer_setup_parent_inodes(ip, 0, flg);
1783
1784                 if (good >= 0) {
1785                         /*
1786                          * We can continue if good >= 0.  Determine how 
1787                          * many records under our inode can be flushed (and
1788                          * mark them).
1789                          */
1790                         hammer_flush_inode_core(ip, flg, flags);
1791                 } else {
1792                         /*
1793                          * Parent has no connectivity; tell it to flush
1794                          * us as soon as it does.
1795                          *
1796                          * The REFLUSH flag is also needed to trigger
1797                          * dependency wakeups.
1798                          */
1799                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1800                                      HAMMER_INODE_REFLUSH;
1801                         if (flags & HAMMER_FLUSH_SIGNAL) {
1802                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1803                                 hammer_flusher_async(ip->hmp, flg);
1804                         }
1805                 }
1806                 break;
1807         case HAMMER_FST_FLUSH:
1808                 /*
1809                  * We are already flushing, flag the inode to reflush
1810                  * if needed after it completes its current flush.
1811                  *
1812                  * The REFLUSH flag is also needed to trigger
1813                  * dependency wakeups.
1814                  */
1815                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1816                         ip->flags |= HAMMER_INODE_REFLUSH;
1817                 if (flags & HAMMER_FLUSH_SIGNAL) {
1818                         ip->flags |= HAMMER_INODE_RESIGNAL;
1819                         hammer_flusher_async(ip->hmp, flg);
1820                 }
1821                 break;
1822         }
1823 }
1824
1825 /*
1826  * Scan ip->target_list, which is a list of records owned by PARENTS of our
1827  * ip which reference our ip.
1828  *
1829  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1830  *     so for now do not ref/deref the structures.  Note that if we use the
1831  *     ref/rel code later, the rel CAN block.
1832  */
1833 static int
1834 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1835                            hammer_flush_group_t flg)
1836 {
1837         hammer_record_t depend;
1838         int good;
1839         int r;
1840
1841         /*
1842          * If we hit our recursion limit and we have parent dependencies,
1843          * we cannot continue.  Returning < 0 will cause us to be flagged
1844          * for reflush.  Returning -2 cuts off additional dependency checks
1845          * because they are likely to also hit the depth limit.
1846          *
1847          * We cannot return < 0 if there are no dependencies or there might
1848          * not be anything to wakeup (ip).
1849          */
1850         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1851                 krateprintf(&hammer_gen_krate,
1852                             "HAMMER Warning: depth limit reached on "
1853                             "setup recursion, inode %p %016llx\n",
1854                             ip, (long long)ip->obj_id);
1855                 return(-2);
1856         }
1857
1858         /*
1859          * Scan dependencies
1860          */
1861         good = 0;
1862         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1863                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1864                 KKASSERT(depend->target_ip == ip);
1865                 if (r < 0 && good == 0)
1866                         good = -1;
1867                 if (r > 0)
1868                         good = 1;
1869
1870                 /*
1871                  * If we failed due to the recursion depth limit then stop
1872                  * now.
1873                  */
1874                 if (r == -2)
1875                         break;
1876         }
1877         return(good);
1878 }
1879
1880 /*
1881  * This helper function takes a record representing the dependency between
1882  * the parent inode and child inode.
1883  *
1884  * record->ip           = parent inode
1885  * record->target_ip    = child inode
1886  * 
1887  * We are asked to recurse upwards and convert the record from SETUP
1888  * to FLUSH if possible.
1889  *
1890  * Return 1 if the record gives us connectivity
1891  *
1892  * Return 0 if the record is not relevant 
1893  *
1894  * Return -1 if we can't resolve the dependency and there is no connectivity.
1895  */
1896 static int
1897 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1898                                   hammer_flush_group_t flg)
1899 {
1900         hammer_inode_t pip;
1901         int good;
1902
1903         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1904         pip = record->ip;
1905
1906         /*
1907          * If the record is already flushing, is it in our flush group?
1908          *
1909          * If it is in our flush group but it is a general record or a 
1910          * delete-on-disk, it does not improve our connectivity (return 0),
1911          * and if the target inode is not trying to destroy itself we can't
1912          * allow the operation yet anyway (the second return -1).
1913          */
1914         if (record->flush_state == HAMMER_FST_FLUSH) {
1915                 /*
1916                  * If not in our flush group ask the parent to reflush
1917                  * us as soon as possible.
1918                  */
1919                 if (record->flush_group != flg) {
1920                         pip->flags |= HAMMER_INODE_REFLUSH;
1921                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1922                         return(-1);
1923                 }
1924
1925                 /*
1926                  * If in our flush group everything is already set up,
1927                  * just return whether the record will improve our
1928                  * visibility or not.
1929                  */
1930                 if (record->type == HAMMER_MEM_RECORD_ADD)
1931                         return(1);
1932                 return(0);
1933         }
1934
1935         /*
1936          * It must be a setup record.  Try to resolve the setup dependencies
1937          * by recursing upwards so we can place ip on the flush list.
1938          *
1939          * Limit ourselves to 20 levels of recursion to avoid blowing out
1940          * the kernel stack.  If we hit the recursion limit we can't flush
1941          * until the parent flushes.  The parent will flush independently
1942          * on its own and ultimately a deep recursion will be resolved.
1943          */
1944         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1945
1946         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1947
1948         /*
1949          * If good < 0 the parent has no connectivity and we cannot safely
1950          * flush the directory entry, which also means we can't flush our
1951          * ip.  Flag us for downward recursion once the parent's
1952          * connectivity is resolved.  Flag the parent for [re]flush or it
1953          * may not check for downward recursions.
1954          */
1955         if (good < 0) {
1956                 pip->flags |= HAMMER_INODE_REFLUSH;
1957                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1958                 return(good);
1959         }
1960
1961         /*
1962          * We are go, place the parent inode in a flushing state so we can
1963          * place its record in a flushing state.  Note that the parent
1964          * may already be flushing.  The record must be in the same flush
1965          * group as the parent.
1966          */
1967         if (pip->flush_state != HAMMER_FST_FLUSH)
1968                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1969         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1970
1971         /*
1972          * It is possible for a rename to create a loop in the recursion
1973          * and revisit a record.  This will result in the record being
1974          * placed in a flush state unexpectedly.  This check deals with
1975          * the case.
1976          */
1977         if (record->flush_state == HAMMER_FST_FLUSH) {
1978                 if (record->type == HAMMER_MEM_RECORD_ADD)
1979                         return(1);
1980                 return(0);
1981         }
1982
1983         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1984
1985 #if 0
1986         if (record->type == HAMMER_MEM_RECORD_DEL &&
1987             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1988                 /*
1989                  * Regardless of flushing state we cannot sync this path if the
1990                  * record represents a delete-on-disk but the target inode
1991                  * is not ready to sync its own deletion.
1992                  *
1993                  * XXX need to count effective nlinks to determine whether
1994                  * the flush is ok, otherwise removing a hardlink will
1995                  * just leave the DEL record to rot.
1996                  */
1997                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1998                 return(-1);
1999         } else
2000 #endif
2001         if (pip->flush_group == flg) {
2002                 /*
2003                  * Because we have not calculated nlinks yet we can just
2004                  * set records to the flush state if the parent is in
2005                  * the same flush group as we are.
2006                  */
2007                 record->flush_state = HAMMER_FST_FLUSH;
2008                 record->flush_group = flg;
2009                 ++record->flush_group->refs;
2010                 hammer_ref(&record->lock);
2011
2012                 /*
2013                  * A general directory-add contributes to our visibility.
2014                  *
2015                  * Otherwise it is probably a directory-delete or 
2016                  * delete-on-disk record and does not contribute to our
2017                  * visibility (but we can still flush it).
2018                  */
2019                 if (record->type == HAMMER_MEM_RECORD_ADD)
2020                         return(1);
2021                 return(0);
2022         } else {
2023                 /*
2024                  * If the parent is not in our flush group we cannot
2025                  * flush this record yet, there is no visibility.
2026                  * We tell the parent to reflush and mark ourselves
2027                  * so the parent knows it should flush us too.
2028                  */
2029                 pip->flags |= HAMMER_INODE_REFLUSH;
2030                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
2031                 return(-1);
2032         }
2033 }
2034
2035 /*
2036  * This is the core routine placing an inode into the FST_FLUSH state.
2037  */
2038 static void
2039 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
2040 {
2041         hammer_mount_t hmp = ip->hmp;
2042         int go_count;
2043
2044         /*
2045          * Set flush state and prevent the flusher from cycling into
2046          * the next flush group.  Do not place the ip on the list yet.
2047          * An inode coming out of the idle state gets an extra reference.
2048          */
2049         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
2050         if (ip->flush_state == HAMMER_FST_IDLE)
2051                 hammer_ref(&ip->lock);
2052         ip->flush_state = HAMMER_FST_FLUSH;
2053         ip->flush_group = flg;
2054         ++hmp->flusher.group_lock;
2055         ++hmp->count_iqueued;
2056         ++hammer_count_iqueued;
2057         ++flg->total_count;
2058         hammer_redo_fifo_start_flush(ip);
2059
2060 #if 0
2061         /*
2062          * We need to be able to vfsync/truncate from the backend.
2063          *
2064          * XXX Any truncation from the backend will acquire the vnode
2065          *     independently.
2066          */
2067         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
2068         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
2069                 ip->flags |= HAMMER_INODE_VHELD;
2070                 vref(ip->vp);
2071         }
2072 #endif
2073
2074         /*
2075          * Figure out how many in-memory records we can actually flush
2076          * (not including inode meta-data, buffers, etc).
2077          */
2078         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
2079         if (flags & HAMMER_FLUSH_RECURSION) {
2080                 /*
2081                  * If this is an upwards recursion we do not want to
2082                  * recurse down again!
2083                  */
2084                 go_count = 1;
2085 #if 0
2086         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2087                 /*
2088                  * No new records are added if we must complete a flush
2089                  * from a previous cycle, but we do have to move the records
2090                  * from the previous cycle to the current one.
2091                  */
2092 #if 0
2093                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2094                                    hammer_syncgrp_child_callback, NULL);
2095 #endif
2096                 go_count = 1;
2097 #endif
2098         } else {
2099                 /*
2100                  * Normal flush: scan records and bring them into the flush.
2101                  * Directory adds and deletes are usually skipped (they are
2102                  * grouped with the related inode rather than with the
2103                  * directory).
2104                  *
2105                  * go_count can be negative, which means the scan aborted
2106                  * due to the flush group being over-full and we should
2107                  * flush what we have.
2108                  */
2109                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2110                                    hammer_setup_child_callback, NULL);
2111         }
2112
2113         /*
2114          * This is a more involved test that includes go_count.  If we
2115          * can't flush, flag the inode and return.  If go_count is 0 we
2116          * are unable to flush any records in our rec_tree and
2117          * must ignore the XDIRTY flag.
2118          */
2119         if (go_count == 0) {
2120                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
2121                         --hmp->count_iqueued;
2122                         --hammer_count_iqueued;
2123
2124                         --flg->total_count;
2125                         ip->flush_state = HAMMER_FST_SETUP;
2126                         ip->flush_group = NULL;
2127                         if (flags & HAMMER_FLUSH_SIGNAL) {
2128                                 ip->flags |= HAMMER_INODE_REFLUSH |
2129                                              HAMMER_INODE_RESIGNAL;
2130                         } else {
2131                                 ip->flags |= HAMMER_INODE_REFLUSH;
2132                         }
2133 #if 0
2134                         if (ip->flags & HAMMER_INODE_VHELD) {
2135                                 ip->flags &= ~HAMMER_INODE_VHELD;
2136                                 vrele(ip->vp);
2137                         }
2138 #endif
2139
2140                         /*
2141                          * REFLUSH is needed to trigger dependency wakeups
2142                          * when an inode is in SETUP.
2143                          */
2144                         ip->flags |= HAMMER_INODE_REFLUSH;
2145                         if (--hmp->flusher.group_lock == 0)
2146                                 wakeup(&hmp->flusher.group_lock);
2147                         return;
2148                 }
2149         }
2150
2151         /*
2152          * Snapshot the state of the inode for the backend flusher.
2153          *
2154          * We continue to retain save_trunc_off even when all truncations
2155          * have been resolved as an optimization to determine if we can
2156          * skip the B-Tree lookup for overwrite deletions.
2157          *
2158          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2159          * and stays in ip->flags.  Once set, it stays set until the
2160          * inode is destroyed.
2161          */
2162         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2163                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2164                 ip->sync_trunc_off = ip->trunc_off;
2165                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2166                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2167                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2168
2169                 /*
2170                  * The save_trunc_off used to cache whether the B-Tree
2171                  * holds any records past that point is not used until
2172                  * after the truncation has succeeded, so we can safely
2173                  * set it now.
2174                  */
2175                 if (ip->save_trunc_off > ip->sync_trunc_off)
2176                         ip->save_trunc_off = ip->sync_trunc_off;
2177         }
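             /*
              * Transfer the remaining dirty flags and a copy of the inode
              * meta-data to the sync_ fields for the backend, then clear
              * the frontend's mod flags (TRUNCATED was handled above).
              */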
2178         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2179                            ~HAMMER_INODE_TRUNCATED);
2180         ip->sync_ino_leaf = ip->ino_leaf;
2181         ip->sync_ino_data = ip->ino_data;
2182         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2183 #ifdef DEBUG_TRUNCATE
2184         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2185                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2186 #endif
2187
2188         /*
2189          * The flusher list inherits our inode and reference.
2190          */
2191         KKASSERT(flg->running == 0);
2192         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2193         if (--hmp->flusher.group_lock == 0)
2194                 wakeup(&hmp->flusher.group_lock);
2195
2196         /*
2197          * Auto-flush the group if it grows too large.  Make sure the
2198          * inode reclaim wait pipeline continues to work.
2199          */
2200         if (flg->total_count >= hammer_autoflush ||
2201             flg->total_count >= hammer_limit_reclaims / 4) {
2202                 if (hmp->fill_flush_group == flg)
2203                         hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
2204                 hammer_flusher_async(hmp, flg);
2205         }
2206 }
2207
2208 /*
2209  * Callback for scan of ip->rec_tree.  Try to include each record in our
2210  * flush.  ip->flush_group has been set but the inode has not yet been
2211  * moved into a flushing state.
2212  *
2213  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2214  * both inodes.
2215  *
2216  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2217  * the caller from shortcutting the flush.
2218  */
2219 static int
2220 hammer_setup_child_callback(hammer_record_t rec, void *data)
2221 {
2222         hammer_flush_group_t flg;
2223         hammer_inode_t target_ip;
2224         hammer_inode_t ip;
2225         int r;
2226
2227         /*
2228          * Records deleted or committed by the backend are ignored.
2229          * Note that the flush detects deleted frontend records at
2230          * multiple points to deal with races.  This is just the first
2231          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2232          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2233          * messes up link-count calculations.
2234          *
2235          * NOTE: Don't get confused between record deletion and, say,
2236          * directory entry deletion.  The deletion of a directory entry
2237          * which is on-media has nothing to do with the record deletion
2238          * flags.
2239          */
2240         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2241                           HAMMER_RECF_COMMITTED)) {
2242                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2243                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2244                         r = 1;
2245                 } else {
2246                         r = 0;
2247                 }
2248                 return(r);
2249         }
2250
2251         /*
2252          * If the record is in an idle state it has no dependencies and
2253          * can be flushed.
2254          */
2255         ip = rec->ip;
2256         flg = ip->flush_group;
2257         r = 0;
2258
2259         switch(rec->flush_state) {
2260         case HAMMER_FST_IDLE:
2261                 /*
2262                  * The record has no setup dependency; we can flush it.
2263                  */
2264                 KKASSERT(rec->target_ip == NULL);
2265                 rec->flush_state = HAMMER_FST_FLUSH;
2266                 rec->flush_group = flg;
2267                 ++flg->refs;
2268                 hammer_ref(&rec->lock);
2269                 r = 1;
2270                 break;
2271         case HAMMER_FST_SETUP:
2272                 /*
2273                  * The record has a setup dependency.  These are typically
2274                  * directory entry adds and deletes.  Such entries will be
2275                  * flushed when their inodes are flushed so we do not
2276                  * usually have to add them to the flush here.  However,
2277                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2278                  * it is asking us to flush this record (and it).
2279                  */
2280                 target_ip = rec->target_ip;
2281                 KKASSERT(target_ip != NULL);
2282                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2283
2284                 /*
2285                  * If the target IP is already flushing in our group
2286                  * we could associate the record, but target_ip has
2287                  * already synced ino_data to sync_ino_data and we
2288                  * would also have to adjust nlinks.   Plus there are
2289                  * ordering issues for adds and deletes.
2290                  *
2291                  * Reflush downward if this is an ADD, and upward if
2292                  * this is a DEL.
2293                  */
2294                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2295                         if (rec->type == HAMMER_MEM_RECORD_ADD)
2296                                 ip->flags |= HAMMER_INODE_REFLUSH;
2297                         else
2298                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2299                         break;
2300                 } 
2301
2302                 /*
2303                  * Target IP is not yet flushing.  This can get complex
2304                  * because we have to be careful about the recursion.
2305                  *
2306                  * Directories create an issue for us in that if a flush
2307                  * of a directory is requested the expectation is to flush
2308                  * any pending directory entries, but this will cause the
2309                  * related inodes to recursively flush as well.  We can't
2310                  * really defer the operation, so just pull in as many
2311                  * records as we can.
2312                  */
2313 #if 0
2314                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2315                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2316                         /*
2317                          * We aren't reclaiming and the target ip was not
2318                          * previously prevented from flushing due to this
2319                          * record dependency.  Do not flush this record.
2320                          */
2321                         /*r = 0;*/
2322                 } else
2323 #endif
2324                 if (flg->total_count + flg->refs >
2325                            ip->hmp->undo_rec_limit) {
2326                         /*
2327                          * Our flush group is over-full and we risk blowing
2328                          * out the UNDO FIFO.  Stop the scan, flush what we
2329                          * have, then reflush the directory.
2330                          *
2331                          * The directory may be forced through multiple
2332                          * flush groups before it can be completely
2333                          * flushed.
2334                          */
2335                         ip->flags |= HAMMER_INODE_RESIGNAL |
2336                                      HAMMER_INODE_REFLUSH;
2337                         r = -1;
2338                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2339                         /*
2340                          * If the target IP is not flushing we can force
2341                          * it to flush, even if it is unable to write out
2342                          * any of its own records we have at least one in
2343                          * hand that we CAN deal with.
2344                          */
2345                         rec->flush_state = HAMMER_FST_FLUSH;
2346                         rec->flush_group = flg;
2347                         ++flg->refs;
2348                         hammer_ref(&rec->lock);
2349                         hammer_flush_inode_core(target_ip, flg,
2350                                                 HAMMER_FLUSH_RECURSION);
2351                         r = 1;
2352                 } else {
2353                         /*
2354                          * General or delete-on-disk record.
2355                          *
2356                          * XXX this needs help.  If a delete-on-disk we could
2357                          * disconnect the target.  If the target has its own
2358                          * dependencies they really need to be flushed.
2359                          *
2360                          * XXX
2361                          */
2362                         rec->flush_state = HAMMER_FST_FLUSH;
2363                         rec->flush_group = flg;
2364                         ++flg->refs;
2365                         hammer_ref(&rec->lock);
2366                         hammer_flush_inode_core(target_ip, flg,
2367                                                 HAMMER_FLUSH_RECURSION);
2368                         r = 1;
2369                 }
2370                 break;
2371         case HAMMER_FST_FLUSH:
2372                 /* 
2373                  * The record could be part of a previous flush group if the
2374                  * inode is a directory (the record being a directory entry).
2375                  * Once the flush group was closed a hammer_test_inode()
2376                  * call can cause a new flush group to be set up, placing
2377                  * the directory inode itself in a new flush group.
2378                  *
2379                  * When associated with a previous flush group we count it
2380                  * as if it were in our current flush group, since it will
2381                  * effectively be flushed by the time we flush our current
2382                  * flush group.
2383                  */
2384                 KKASSERT(
2385                     rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
2386                     rec->flush_group == flg);
2387                 r = 1;
2388                 break;
2389         }
2390         return(r);
2391 }
2392
2393 #if 0
2394 /*
2395  * This version just moves records already in a flush state to the new
2396  * flush group and that is it.
2397  */
2398 static int
2399 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2400 {
2401         hammer_inode_t ip = rec->ip;
2402
2403         switch(rec->flush_state) {
2404         case HAMMER_FST_FLUSH:
2405                 KKASSERT(rec->flush_group == ip->flush_group);
2406                 break;
2407         default:
2408                 break;
2409         }
2410         return(0);
2411 }
2412 #endif
2413
2414 /*
2415  * Wait for a previously queued flush to complete.
2416  *
2417  * If a critical error occurred we don't try to wait.
2418  */
2419 void
2420 hammer_wait_inode(hammer_inode_t ip)
2421 {
2422         /*
2423          * The inode can be in a SETUP state in which case RESIGNAL
2424          * should be set.  If RESIGNAL is not set then the previous
2425          * flush completed and a later operation placed the inode
2426          * in a passive setup state again, so we're done.
2427          *
2428          * The inode can be in a FLUSH state in which case we
2429          * can just wait for completion.
2430          */
2431         while (ip->flush_state == HAMMER_FST_FLUSH ||
2432             (ip->flush_state == HAMMER_FST_SETUP &&
2433              (ip->flags & HAMMER_INODE_RESIGNAL))) {
2434                 /*
2435                  * Don't try to flush on a critical error
2436                  */
2437                 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
2438                         break;
2439
2440                 /*
2441                  * If the inode was already being flushed its flg
2442                  * may not have been queued to the backend.  We
2443                  * have to make sure it gets queued or we can wind
2444                  * up blocked or deadlocked (particularly if we are
2445                  * the vnlru thread).
2446                  */
2447                 if (ip->flush_state == HAMMER_FST_FLUSH) {
2448                         KKASSERT(ip->flush_group);
2449                         if (ip->flush_group->closed == 0) {
2450                                 if (hammer_debug_inode) {
2451                                         kprintf("hammer: debug: forcing "
2452                                                 "async flush ip %016jx\n",
2453                                                 (intmax_t)ip->obj_id);
2454                                 }
2455                                 hammer_flusher_async(ip->hmp,
2456                                                      ip->flush_group);
2457                                 continue; /* retest */
2458                         }
2459                 }
2460
2461                 /*
2462                  * In a flush state with the flg queued to the backend
2463                  * or in a setup state with RESIGNAL set, we can safely
2464                  * wait.
2465                  */
2466                 ip->flags |= HAMMER_INODE_FLUSHW;
2467                 tsleep(&ip->flags, 0, "hmrwin", 0);
2468         }
2469
2470 #if 0
2471         /*
2472          * The inode may have been in a passive setup state,
2473          * call flush to make sure we get signaled.
2474          */
2475         if (ip->flush_state == HAMMER_FST_SETUP)
2476                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2477 #endif
2478
2479 }
2480
2481 /*
2482  * Called by the backend code when a flush has been completed.
2483  * The inode has already been removed from the flush list.
2484  *
2485  * A pipelined flush can occur, in which case we must re-enter the
2486  * inode on the list and re-copy its fields.
2487  */
2488 void
2489 hammer_flush_inode_done(hammer_inode_t ip, int error)
2490 {
2491         hammer_mount_t hmp;
2492         int dorel;
2493
2494         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2495
2496         hmp = ip->hmp;
2497
2498         /*
2499          * Auto-reflush if the backend could not completely flush
2500          * the inode.  This fixes a case where a deferred buffer flush
2501          * could cause fsync to return early.
2502          */
2503         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2504                 ip->flags |= HAMMER_INODE_REFLUSH;
2505
2506         /*
2507          * Merge left-over flags back into the frontend and fix the state.
2508          * Incomplete truncations are retained by the backend.
2509          */
2510         ip->error = error;
2511         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2512         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2513
2514         /*
2515          * The backend may have adjusted nlinks, so if the adjusted nlinks
2516  * does not match the frontend, set the frontend's DDIRTY flag again.
2517          */
2518         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2519                 ip->flags |= HAMMER_INODE_DDIRTY;
2520
2521         /*
2522          * Fix up the dirty buffer status.
2523          */
2524         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2525                 ip->flags |= HAMMER_INODE_BUFS;
2526         }
2527         hammer_redo_fifo_end_flush(ip);
2528
2529         /*
2530          * The XDIRTY flag merged back above must be consistent with
2531          * whether any in-memory records remain in the rec_tree.
2532          */
2533         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2534                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2535                  (!RB_EMPTY(&ip->rec_tree) &&
2536                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2537
2538         /*
2539          * Do not lose track of inodes which no longer have vnode
2540          * associations, otherwise they may never get flushed again.
2541          *
2542          * The reflush flag can be set superfluously, causing extra pain
2543          * for no reason.  If the inode is no longer modified it no longer
2544          * needs to be flushed.
2545          */
2546         if (ip->flags & HAMMER_INODE_MODMASK) {
2547                 if (ip->vp == NULL)
2548                         ip->flags |= HAMMER_INODE_REFLUSH;
2549         } else {
2550                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2551         }
2552         if (ip->flags & HAMMER_INODE_MODMASK)
2553                 hammer_inode_dirty(ip);
2554
2555         /*
2556          * Adjust the flush state.
2557          */
2558         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2559                 /*
2560                  * We were unable to flush out all our records, leave the
2561                  * inode in a flush state and in the current flush group.
2562                  * The flush group will be re-run.
2563                  *
2564                  * This occurs if the UNDO block gets too full or there is
2565                  * too much dirty meta-data and allows the flusher to
2566                  * finalize the UNDO block and then re-flush.
2567                  */
2568                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2569                 dorel = 0;
2570         } else {
2571                 /*
2572                  * Remove from the flush_group
2573                  */
2574                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2575                 ip->flush_group = NULL;
2576
2577 #if 0
2578                 /*
2579                  * Clean up the vnode ref and tracking counts.
2580                  */
2581                 if (ip->flags & HAMMER_INODE_VHELD) {
2582                         ip->flags &= ~HAMMER_INODE_VHELD;
2583                         vrele(ip->vp);
2584                 }
2585 #endif
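                     /* The inode is no longer queued to the flusher */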
2586                 --hmp->count_iqueued;
2587                 --hammer_count_iqueued;
2588
2589                 /*
2590                  * And adjust the state.
2591                  */
2592                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2593                         ip->flush_state = HAMMER_FST_IDLE;
2594                         dorel = 1;
2595                 } else {
2596                         ip->flush_state = HAMMER_FST_SETUP;
2597                         dorel = 0;
2598                 }
2599
2600                 /*
2601                  * If the frontend is waiting for a flush to complete,
2602                  * wake it up.
2603                  */
2604                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2605                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2606                         wakeup(&ip->flags);
2607                 }
2608
2609                 /*
2610                  * If the frontend made more changes and requested another
2611                  * flush, then try to get it running.
2612                  *
2613                  * Reflushes are aborted when the inode is errored out.
2614                  */
2615                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2616                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2617                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2618                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2619                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2620                         } else {
2621                                 hammer_flush_inode(ip, 0);
2622                         }
2623                 }
2624         }
2625
2626         /*
2627          * If we have no parent dependencies we can clear CONN_DOWN
2628          */
2629         if (TAILQ_EMPTY(&ip->target_list))
2630                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2631
2632         /*
2633          * If the inode is now clean drop the space reservation.
2634          */
2635         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2636             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2637                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2638                 --hmp->rsv_inodes;
2639         }
2640
2641         ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
2642
2643         if (dorel)
2644                 hammer_rel_inode(ip, 0);
2645 }
2646
2647 /*
2648  * Called from hammer_sync_inode() to synchronize in-memory records
2649  * to the media.
2650  */
2651 static int
2652 hammer_sync_record_callback(hammer_record_t record, void *data)
2653 {
2654         hammer_cursor_t cursor = data;
2655         hammer_transaction_t trans = cursor->trans;
2656         hammer_mount_t hmp = trans->hmp;
2657         int error;
2658
2659         /*
2660          * Skip records that do not belong to the current flush.
2661          */
2662         ++hammer_stats_record_iterations;
2663         if (record->flush_state != HAMMER_FST_FLUSH)
2664                 return(0);
2665
2666 #if 1
2667         if (record->flush_group != record->ip->flush_group) {
2668                 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2669                 if (hammer_debug_critical)
2670                         Debugger("blah2");
2671                 return(0);
2672         }
2673 #endif
2674         KKASSERT(record->flush_group == record->ip->flush_group);
2675
2676         /*
2677          * Interlock the record using the BE flag.  Once BE is set the
2678          * frontend cannot change the state of FE.
2679          *
2680          * NOTE: If FE is set prior to us setting BE we still sync the
2681          * record out, but the flush completion code converts it to 
2682          * a delete-on-disk record instead of destroying it.
2683          */
2684         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2685         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2686
2687         /*
2688          * The backend has already disposed of the record.
2689          */
2690         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2691                 error = 0;
2692                 goto done;
2693         }
2694
2695         /*
2696          * If the whole inode is being deleted and all on-disk records will
2697          * be deleted very soon, we can't sync any new records to disk
2698          * because they will be deleted in the same transaction they were
2699          * created in (delete_tid == create_tid), which will assert.
2700          *
2701          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2702          * that we currently panic on.
2703          */
2704         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2705                 switch(record->type) {
2706                 case HAMMER_MEM_RECORD_DATA:
2707                         /*
2708                          * We don't have to do anything; if the record was
2709                          * committed the space will have been accounted for
2710                          * in the blockmap.
2711                          */
2712                         /* fall through */
2713                 case HAMMER_MEM_RECORD_GENERAL:
2714                         /*
2715                          * Set deleted-by-backend flag.  Do not set the
2716                          * backend committed flag, because we are throwing
2717                          * the record away.
2718                          */
2719                         record->flags |= HAMMER_RECF_DELETED_BE;
2720                         ++record->ip->rec_generation;
2721                         error = 0;
2722                         goto done;
2723                 case HAMMER_MEM_RECORD_ADD:
2724                         panic("hammer_sync_record_callback: illegal add "
2725                               "during inode deletion record %p", record);
2726                         break; /* NOT REACHED */
2727                 case HAMMER_MEM_RECORD_INODE:
2728                         panic("hammer_sync_record_callback: attempt to "
2729                               "sync inode record %p?", record);
2730                         break; /* NOT REACHED */
2731                 case HAMMER_MEM_RECORD_DEL:
2732                         /* 
2733                          * Follow through and issue the on-disk deletion
2734                          */
2735                         break;
2736                 }
2737         }
2738
2739         /*
2740          * If DELETED_FE is set special handling is needed for directory
2741          * entries.  Dependent pieces related to the directory entry may
2742          * have already been synced to disk.  If this occurs we have to
2743          * sync the directory entry and then change the in-memory record
2744          * from an ADD to a DELETE to cover the fact that it's been
2745          * deleted by the frontend.
2746          *
2747          * A directory delete covering record (MEM_RECORD_DEL) can never
2748          * be deleted by the frontend.
2749          *
2750          * Any other record type (aka DATA) can be deleted by the frontend.
2751          * XXX At the moment the flusher must skip it because there may
2752          * be another data record in the flush group for the same block,
2753          * meaning that some frontend data changes can leak into the backend's
2754          * synchronization point.
2755          */
2756         if (record->flags & HAMMER_RECF_DELETED_FE) {
2757                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2758                         /*
2759                          * Convert a front-end deleted directory-add to
2760                          * a directory-delete entry later.
2761                          */
2762                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2763                 } else {
2764                         /*
2765                          * Dispose of the record (race case).  Mark as
2766                          * deleted by backend (and not committed).
2767                          */
2768                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2769                         record->flags |= HAMMER_RECF_DELETED_BE;
2770                         ++record->ip->rec_generation;
2771                         error = 0;
2772                         goto done;
2773                 }
2774         }
2775
2776         /*
2777          * Assign the create_tid for new records.  Deletions already
2778          * have the record's entire key properly set up.
2779          */
2780         if (record->type != HAMMER_MEM_RECORD_DEL) {
2781                 record->leaf.base.create_tid = trans->tid;
2782                 record->leaf.create_ts = trans->time32;
2783         }
2784
2785         /*
2786          * This actually moves the record to the on-media B-Tree.  We
2787          * must also generate REDO_TERM entries in the UNDO/REDO FIFO
2788          * indicating that the related REDO_WRITE(s) have been committed.
2789          *
2790          * During recovery any REDO_TERM's within the nominal recovery span
2791          * are ignored since the related meta-data is being undone, causing
2792          * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
2793          * the nominal recovery span will match against REDO_WRITEs and
2794          * prevent them from being executed (because the meta-data has
2795          * already been synchronized).
2796          */
2797         if (record->flags & HAMMER_RECF_REDO) {
2798                 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
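                /*
                 * For a DATA record leaf.base.key is the file offset of
                 * the end of the write, so (key - data_len) is the start
                 * offset covered by the REDO_TERM_WRITE entry.
                 */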
2799                 hammer_generate_redo(trans, record->ip,
2800                                      record->leaf.base.key -
2801                                          record->leaf.data_len,
2802                                      HAMMER_REDO_TERM_WRITE,
2803                                      NULL,
2804                                      record->leaf.data_len);
2805         }
2806
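        /*
         * Sync the record to the media, retrying if the cursor returns
         * EDEADLK.  On EDEADLK the cursor is torn down and re-initialized
         * from the inode cache before the retry.
         */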
2807         for (;;) {
2808                 error = hammer_ip_sync_record_cursor(cursor, record);
2809                 if (error != EDEADLK)
2810                         break;
2811                 hammer_done_cursor(cursor);
2812                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2813                                            record->ip);
2814                 if (error)
2815                         break;
2816         }
2817         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2818
2819         if (error)
2820                 error = -error;
2821 done:
2822         hammer_flush_record_done(record, error);
2823
2824         /*
2825          * Do partial finalization if we have built up too many dirty
2826          * buffers.  Otherwise a buffer cache deadlock can occur when
2827          * doing things like creating tens of thousands of tiny files.
2828          *
2829          * We must release our cursor lock to avoid a 3-way deadlock
2830          * due to the exclusive sync lock the finalizer must get.
2831          *
2832          * WARNING: See warnings in hammer_unlock_cursor() function.
2833          */
2834         if (hammer_flusher_meta_limit(hmp) ||
2835             vm_page_count_severe()) {
2836                 hammer_unlock_cursor(cursor);
2837                 hammer_flusher_finalize(trans, 0);
2838                 hammer_lock_cursor(cursor);
2839         }
2840         return(error);
2841 }
2842
2843 /*
2844  * Backend function called by the flusher to sync an inode to media.
2845  */
2846 int
2847 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2848 {
2849         struct hammer_cursor cursor;
2850         hammer_node_t tmp_node;
2851         hammer_record_t depend;
2852         hammer_record_t next;
2853         int error, tmp_error;
2854         u_int64_t nlinks;
2855
2856         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2857                 return(0);
2858
2859         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2860         if (error)
2861                 goto done;
2862
2863         /*
2864          * Any directory records referencing this inode which are not in
2865          * our current flush group must adjust our nlink count for the
2866          * purposes of synchronizing to disk.
2867          *
2868          * Records which are in our flush group can be unlinked from our
2869          * inode now, potentially allowing the inode to be physically
2870          * deleted.
2871          *
2872          * This cannot block.
2873          */
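        /*
         * Summary of the nlink adjustments made by the loop below for
         * each dependent directory record:
         *
         *      in our flush group, DELETED_FE          ++nlinks
         *      in our flush group, not DELETED_FE      drop the dependency
         *      other flush group, ADD, not DELETED_FE  --nlinks
         *      other flush group, DEL, not DELETED_FE  ++nlinks
         */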
2874         nlinks = ip->ino_data.nlinks;
2875         next = TAILQ_FIRST(&ip->target_list);
2876         while ((depend = next) != NULL) {
2877                 next = TAILQ_NEXT(depend, target_entry);
2878                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2879                     depend->flush_group == ip->flush_group) {
2880                         /*
2881                          * If this is an ADD that was deleted by the frontend
2882                          * the frontend nlinks count will have already been
2883                          * decremented, but the backend is going to sync its
2884                          * directory entry and must account for it.  The
2885                          * record will be converted to a delete-on-disk when
2886                          * it gets synced.
2887                          *
2888                          * If the ADD was not deleted by the frontend we
2889          * can remove the dependency from our target_list.
2890                          */
2891                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2892                                 ++nlinks;
2893                         } else {
2894                                 TAILQ_REMOVE(&ip->target_list, depend,
2895                                              target_entry);
2896                                 depend->target_ip = NULL;
2897                         }
2898                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2899                         /*
2900                          * Not part of our flush group and not deleted by
2901                          * the front-end, adjust the link count synced to
2902                          * the media (undo what the frontend did when it
2903                          * queued the record).
2904                          */
2905                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2906                         switch(depend->type) {
2907                         case HAMMER_MEM_RECORD_ADD:
2908                                 --nlinks;
2909                                 break;
2910                         case HAMMER_MEM_RECORD_DEL:
2911                                 ++nlinks;
2912                                 break;
2913                         default:
2914                                 break;
2915                         }
2916                 }
2917         }
2918
2919         /*
2920          * Set dirty if we had to modify the link count.
2921          */
2922         if (ip->sync_ino_data.nlinks != nlinks) {
2923                 KKASSERT((int64_t)nlinks >= 0);
2924                 ip->sync_ino_data.nlinks = nlinks;
2925                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2926         }
2927
2928         /*
2929          * If there is a truncation queued, destroy any data past the (aligned)
2930          * truncation point.  Userland will have dealt with the buffer
2931          * containing the truncation point for us.
2932          *
2933          * We don't flush pending frontend data buffers until after we've
2934          * dealt with the truncation.
2935          */
2936         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2937                 /*
2938                  * Interlock trunc_off.  The VOP front-end may continue to
2939                  * make adjustments to it while we are blocked.
2940                  */
2941                 off_t trunc_off;
2942                 off_t aligned_trunc_off;
2943                 int blkmask;
2944
2945                 trunc_off = ip->sync_trunc_off;
2946                 blkmask = hammer_blocksize(trunc_off) - 1;
2947                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
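                /*
                 * e.g. assuming a 16KB block at the truncation point
                 * (blkmask 0x3FFF), a trunc_off of 0x5000 rounds up to
                 * an aligned_trunc_off of 0x8000.
                 */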
2948
2949                 /*
2950                  * Delete any whole blocks on-media.  The front-end has
2951                  * already cleaned out any partial block and made it
2952                  * pending.  The front-end may have updated trunc_off
2953                  * while we were blocked so we only use sync_trunc_off.
2954                  *
2955                  * This operation can blow out the buffer cache; EWOULDBLOCK
2956                  * means we were unable to complete the deletion.  The
2957                  * deletion will update sync_trunc_off in that case.
2958                  */
2959                 error = hammer_ip_delete_range(&cursor, ip,
2960                                                 aligned_trunc_off,
2961                                                 0x7FFFFFFFFFFFFFFFLL, 2);
2962                 if (error == EWOULDBLOCK) {
2963                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
2964                         error = 0;
2965                         goto defer_buffer_flush;
2966                 }
2967
2968                 if (error)
2969                         goto done;
2970
2971                 /*
2972                  * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
2973                  *
2974                  * XXX we do this even if we did not previously generate
2975                  * a REDO_TRUNC record.  This operation may enclose the
2976                  * range of multiple prior truncation entries in the REDO
2977                  * log.
2978                  */
2979                 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
2980                     (ip->flags & HAMMER_INODE_RDIRTY)) {
2981                         hammer_generate_redo(trans, ip, aligned_trunc_off,
2982                                              HAMMER_REDO_TERM_TRUNC,
2983                                              NULL, 0);
2984                 }
2985
2986                 /*
2987                  * Clear the truncation flag on the backend after we have
2988                  * completed the deletions.  Backend data is now good again
2989                  * (including new records we are about to sync, below).
2990                  *
2991                  * Leave sync_trunc_off intact.  As we write additional
2992                  * records the backend will update sync_trunc_off.  This
2993                  * tells the backend whether it can skip the overwrite
2994                  * test.  This should work properly even when the backend
2995                  * writes full blocks where the truncation point straddles
2996                  * the block because the comparison is against the base
2997                  * offset of the record.
2998                  */
2999                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3000                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
3001         } else {
3002                 error = 0;
3003         }
3004
3005         /*
3006          * Now sync related records.  These will typically be directory
3007          * entries, records tracking direct-writes, or delete-on-disk records.
3008          */
3009         if (error == 0) {
3010                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
3011                                     hammer_sync_record_callback, &cursor);
3012                 if (tmp_error < 0)
3013                         tmp_error = -tmp_error;
3014                 if (tmp_error)
3015                         error = tmp_error;
3016         }
3017         hammer_cache_node(&ip->cache[1], cursor.node);
3018
3019         /*
3020          * Re-seek for inode update, assuming our cache hasn't been ripped
3021          * out from under us.
3022          */
3023         if (error == 0) {
3024                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
3025                 if (tmp_node) {
3026                         hammer_cursor_downgrade(&cursor);
3027                         hammer_lock_sh(&tmp_node->lock);
3028                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
3029                                 hammer_cursor_seek(&cursor, tmp_node, 0);
3030                         hammer_unlock(&tmp_node->lock);
3031                         hammer_rel_node(tmp_node);
3032                 }
3033                 error = 0;
3034         }
3035
3036         /*
3037          * If we are deleting the inode the frontend had better not have
3038          * any active references on elements making up the inode.
3039          *
3040          * The call to hammer_ip_delete_clean() cleans up auxiliary records
3041          * but not DB or DATA records.  Those must have already been deleted
3042          * by the normal truncation mechanic.
3043          */
3044         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
3045                 RB_EMPTY(&ip->rec_tree)  &&
3046             (ip->sync_flags & HAMMER_INODE_DELETING) &&
3047             (ip->flags & HAMMER_INODE_DELETED) == 0) {
3048                 int count1 = 0;
3049
3050                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
3051                 if (error == 0) {
3052                         ip->flags |= HAMMER_INODE_DELETED;
3053                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
3054                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3055                         KKASSERT(RB_EMPTY(&ip->rec_tree));
3056
3057                         /*
3058                          * Set delete_tid in both the frontend and backend
3059                          * copy of the inode record.  The DELETED flag handles
3060                          * this; do not set DDIRTY.
3061                          */
3062                         ip->ino_leaf.base.delete_tid = trans->tid;
3063                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
3064                         ip->ino_leaf.delete_ts = trans->time32;
3065                         ip->sync_ino_leaf.delete_ts = trans->time32;
3066
3067
3068                         /*
3069                          * Adjust the inode count in the volume header
3070                          */
3071                         hammer_sync_lock_sh(trans);
3072                         if (ip->flags & HAMMER_INODE_ONDISK) {
3073                                 hammer_modify_volume_field(trans,
3074                                                            trans->rootvol,
3075                                                            vol0_stat_inodes);
3076                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
3077                                 hammer_modify_volume_done(trans->rootvol);
3078                         }
3079                         hammer_sync_unlock(trans);
3080                 }
3081         }
3082
3083         if (error)
3084                 goto done;
3085         ip->sync_flags &= ~HAMMER_INODE_BUFS;
3086
3087 defer_buffer_flush:
3088         /*
3089          * Now update the inode's on-disk inode-data and/or on-disk record.
3090          * DELETED and ONDISK are managed only in ip->flags.
3091          *
3092          * In the case of a deferred buffer flush we still update the on-disk
3093          * inode to satisfy visibility requirements if there happen to be
3094          * directory dependencies.
3095          */
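        /*
         * The four cases below:
         *
         *      DELETED+ONDISK  delete the existing on-disk inode record
         *      DELETED only    inode never made it to media, throw away
         *                      any remaining in-memory records
         *      ONDISK only     normal update of the existing record
         *      neither         first sync, force DDIRTY so an initial
         *                      inode record is written
         */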
3096         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
3097         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
3098                 /*
3099                  * If deleted and on-disk, don't set any additional flags.
3100                  * The delete flag takes care of things.
3101                  *
3102                  * Clear flags which may have been set by the frontend.
3103                  */
3104                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3105                                     HAMMER_INODE_SDIRTY |
3106                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3107                                     HAMMER_INODE_DELETING);
3108                 break;
3109         case HAMMER_INODE_DELETED:
3110                 /*
3111                  * Take care of the case where a deleted inode was never
3112                  * flushed to the disk in the first place.
3113                  *
3114                  * Clear flags which may have been set by the frontend.
3115                  */
3116                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3117                                     HAMMER_INODE_SDIRTY |
3118                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3119                                     HAMMER_INODE_DELETING);
3120                 while (RB_ROOT(&ip->rec_tree)) {
3121                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
3122                         hammer_ref(&record->lock);
3123                         KKASSERT(hammer_oneref(&record->lock));
3124                         record->flags |= HAMMER_RECF_DELETED_BE;
3125                         ++record->ip->rec_generation;
3126                         hammer_rel_mem_record(record);
3127                 }
3128                 break;
3129         case HAMMER_INODE_ONDISK:
3130                 /*
3131                  * If already on-disk, do not set any additional flags.
3132                  */
3133                 break;
3134         default:
3135                 /*
3136                  * If not on-disk and not deleted, set DDIRTY to force
3137                  * an initial record to be written.
3138                  *
3139                  * Also set the create_tid in both the frontend and backend
3140                  * copy of the inode record.
3141                  */
3142                 ip->ino_leaf.base.create_tid = trans->tid;
3143                 ip->ino_leaf.create_ts = trans->time32;
3144                 ip->sync_ino_leaf.base.create_tid = trans->tid;
3145                 ip->sync_ino_leaf.create_ts = trans->time32;
3146                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
3147                 break;
3148         }
3149
3150         /*
3151          * If DDIRTY or SDIRTY is set, write out a new record.
3152          * If the inode is already on-disk the old record is marked as
3153          * deleted.
3154          *
3155          * If DELETED is set hammer_update_inode() will delete the existing
3156          * record without writing out a new one.
3157          *
3158          * If *ONLY* the ITIMES flag is set we can update the record in-place.
3159          */
3160         if (ip->flags & HAMMER_INODE_DELETED) {
3161                 error = hammer_update_inode(&cursor, ip);
3162         } else 
3163         if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
3164             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
3165                 error = hammer_update_itimes(&cursor, ip);
3166         } else
3167         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
3168                               HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
3169                 error = hammer_update_inode(&cursor, ip);
3170         }
3171 done:
3172         if (ip->flags & HAMMER_INODE_MODMASK)
3173                 hammer_inode_dirty(ip);
3174         if (error) {
3175                 hammer_critical_error(ip->hmp, ip, error,
3176                                       "while syncing inode");
3177         }
3178         hammer_done_cursor(&cursor);
3179         return(error);
3180 }
3181
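#if 0
/*
 * Illustrative sketch only, not compiled: roughly how the flusher drives
 * hammer_sync_inode() for an inode in the current flush group.  The real
 * sequence lives in hammer_flusher.c; the function name below is made up
 * for illustration and flow control, locking and statistics are omitted.
 */
static void
example_flush_one_inode(hammer_transaction_t trans, hammer_inode_t ip)
{
        int error;

        /* backend sync; critical errors are reported by the sync itself */
        error = hammer_sync_inode(trans, ip);

        /* requeue or finalize the inode's flush state */
        hammer_flush_inode_done(ip, error);
}
#endif
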
3182 /*
3183  * This routine is called when the OS is no longer actively referencing
3184  * the inode (but might still be keeping it cached), or when releasing
3185  * the last reference to an inode.
3186  *
3187  * At this point if the inode's nlinks count is zero we want to destroy
3188  * it, which may mean destroying it on-media too.
3189  */
3190 void
3191 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
3192 {
3193         struct vnode *vp;
3194
3195         /*
3196          * Set the DELETING flag when the link count drops to 0 and the
3197          * OS no longer has any opens on the inode.
3198          *
3199          * The backend will clear DELETING (a mod flag) and set DELETED
3200          * (a state flag) when it is actually able to perform the
3201          * operation.
3202          *
3203          * Don't reflag the deletion if the flusher is currently syncing
3204          * one that was already flagged.  A previously set DELETING flag
3205          * may bounce around flags and sync_flags until the operation is
3206          * completely done.
3207          *
3208          * Do not attempt to modify a snapshot inode (one set to read-only).
3209          */
3210         if (ip->ino_data.nlinks == 0 &&
3211             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
3212                 ip->flags |= HAMMER_INODE_DELETING;
3213                 ip->flags |= HAMMER_INODE_TRUNCATED;
3214                 ip->trunc_off = 0;
3215                 vp = NULL;
3216                 if (getvp) {
3217                         if (hammer_get_vnode(ip, &vp) != 0)
3218                                 return;
3219                 }
3220
3221                 /*
3222                  * Final cleanup
3223                  */
3224                 if (ip->vp)
3225                         nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0);
3226                 if (ip->flags & HAMMER_INODE_MODMASK)
3227                         hammer_inode_dirty(ip);
3228                 if (getvp)
3229                         vput(vp);
3230         }
3231 }
3232
3233 /*
3234  * After potentially resolving a dependency the inode is tested
3235  * to determine whether it needs to be reflushed.
3236  */
3237 void
3238 hammer_test_inode(hammer_inode_t ip)
3239 {
3240         if (ip->flags & HAMMER_INODE_REFLUSH) {
3241                 ip->flags &= ~HAMMER_INODE_REFLUSH;
3242                 hammer_ref(&ip->lock);
3243                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
3244                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
3245                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
3246                 } else {
3247                         hammer_flush_inode(ip, 0);
3248                 }
3249                 hammer_rel_inode(ip, 0);
3250         }
3251 }
3252
3253 /*
3254  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
3255  * reassociated with a vp or just before it gets freed.
3256  *
3257  * Pipeline wakeups to threads blocked due to an excessive number of
3258  * detached inodes.  This typically occurs when atime updates accumulate
3259  * while scanning a directory tree.
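 *
 * Waiters queue a hammer_reclaim structure with a count of 1 on
 * hmp->reclaim_list (see hammer_inode_waitreclaims() below); each inode
 * leaving RECLAIM state decrements the head waiter's count and wakes the
 * waiter up once the count reaches zero.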
3260  */
3261 static void
3262 hammer_inode_wakereclaims(hammer_inode_t ip)
3263 {
3264         struct hammer_reclaim *reclaim;
3265         hammer_mount_t hmp = ip->hmp;
3266
3267         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3268                 return;
3269
3270         --hammer_count_reclaims;
3271         --hmp->count_reclaims;
3272         ip->flags &= ~HAMMER_INODE_RECLAIM;
3273
3274         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3275                 KKASSERT(reclaim->count > 0);
3276                 if (--reclaim->count == 0) {
3277                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3278                         wakeup(reclaim);
3279                 }
3280         }
3281 }
3282
3283 /*
3284  * Setup our reclaim pipeline.  We only let so many detached (and dirty)
3285  * inodes build up before we start blocking.  This routine is called
3286  * if a new inode is created or an inode is loaded from media.
3287  *
3288  * When we block we don't care *which* inode has finished reclaiming,
3289  * as long as one does.
3290  *
3291  * The reclaim pipeline is primarily governed by the auto-flush which is
3292  * 1/4 hammer_limit_reclaims.  We don't want to block if the count is
3293  * less than 1/2 hammer_limit_reclaims.  From 1/2 to full count is
3294  * dynamically governed.
3295  */
3296 void
3297 hammer_inode_waitreclaims(hammer_transaction_t trans)
3298 {
3299         hammer_mount_t hmp = trans->hmp;
3300         struct hammer_reclaim reclaim;
3301         int lower_limit;
3302
3303         /*
3304          * Track inode load.  Delay if the number of reclaiming inodes is
3305          * between 2/4 and 4/4 of hammer_limit_reclaims, scaled per-process.
3306          */
3307         if (curthread->td_proc) {
3308                 struct hammer_inostats *stats;
3309
3310                 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
3311                 ++stats->count;
3312
3313                 if (stats->count > hammer_limit_reclaims / 2)
3314                         stats->count = hammer_limit_reclaims / 2;
3315                 lower_limit = hammer_limit_reclaims - stats->count;
3316                 if (hammer_debug_general & 0x10000) {
3317                         kprintf("pid %5d limit %d\n",
3318                                 (int)curthread->td_proc->p_pid, lower_limit);
3319                 }
3320         } else {
3321                 lower_limit = hammer_limit_reclaims * 3 / 4;
3322         }
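        /*
         * Example, using an illustrative hammer_limit_reclaims of 4000:
         * a process which recently loaded 2000 or more inodes gets a
         * lower_limit of 2000 and blocks once half the reclaim limit is
         * reached, while a mostly idle process does not block until the
         * full limit is hit.
         */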
3323         if (hmp->count_reclaims >= lower_limit) {
3324                 reclaim.count = 1;
3325                 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3326                 tsleep(&reclaim, 0, "hmrrcm", hz);
3327                 if (reclaim.count > 0)
3328                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3329         }
3330 }
3331
3332 /*
3333  * Keep track of reclaim statistics on a per-pid basis using a loose
3334  * 4-way set associative hash table.  Collisions inherit the count of
3335  * the previous entry.
3336  *
3337  * NOTE: We want to be careful here to limit the chain size.  If the chain
3338  *       size is too large a pid will spread its stats out over too many
3339  *       entries under certain types of heavy filesystem activity and
3340  *       wind up not delaying long enough.
3341  */
3342 static
3343 struct hammer_inostats *
3344 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
3345 {
3346         struct hammer_inostats *stats;
3347         int delta;
3348         int chain;
3349         static volatile int iterator;   /* we don't care about MP races */
3350
3351         /*
3352          * Chain up to 4 times to find our entry.
3353          */
3354         for (chain = 0; chain < 4; ++chain) {
3355                 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
3356                 if (stats->pid == pid)
3357                         break;
3358         }
3359
3360         /*
3361          * Replace one of the four chaining entries with our new entry.
3362          */
3363         if (chain == 4) {
3364                 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
3365                                        HAMMER_INOSTATS_HMASK];
3366                 stats->pid = pid;
3367         }
3368
3369         /*
3370          * Decay the entry
3371          */
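        /*
         * The decay below is roughly exponential: after hz ticks (one
         * second) the count is halved (count * hz / (hz + hz)), and an
         * entry idle for more than a minute is reset to zero.
         */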
3372         if (stats->count && stats->ltick != ticks) {
3373                 delta = ticks - stats->ltick;
3374                 stats->ltick = ticks;
3375                 if (delta <= 0 || delta > hz * 60)
3376                         stats->count = 0;
3377                 else
3378                         stats->count = stats->count * hz / (hz + delta);
3379         }
3380         if (hammer_debug_general & 0x10000)
3381                 kprintf("pid %5d stats %d\n", (int)pid, stats->count);
3382         return (stats);
3383 }
3384
3385 #if 0
3386
3387 /*
3388  * XXX not used, doesn't work very well due to the large batching nature
3389  * of flushes.
3390  *
3391  * A larger than normal backlog of inodes is sitting in the flusher,
3392  * enforce a general slowdown to let it catch up.  This routine is only
3393  * called on completion of a non-flusher-related transaction which
3394  * performed B-Tree node I/O.
3395  *
3396  * It is possible for the flusher to stall in a continuous load.
3397  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3398  * If the flusher is unable to catch up the inode count can bloat until
3399  * we run out of kvm.
3400  *
3401  * This is a bit of a hack.
3402  */
3403 void
3404 hammer_inode_waithard(hammer_mount_t hmp)
3405 {
3406         /*
3407          * Hysteresis.
3408          */
3409         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3410                 if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
3411                     hmp->count_iqueued < hmp->count_inodes / 20) {
3412                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3413                         return;
3414                 }
3415         } else {
3416                 if (hmp->count_reclaims < hammer_limit_reclaims ||
3417                     hmp->count_iqueued < hmp->count_inodes / 10) {
3418                         return;
3419                 }
3420                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3421         }
3422
3423         /*
3424          * Block for one flush cycle.
3425          */
3426         hammer_flusher_wait_next(hmp);
3427 }
3428
3429 #endif