/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <vm/vm_page2.h>

#include "hammer.h"

static int      hammer_unload_inode(struct hammer_inode *ip);
static void     hammer_free_inode(hammer_inode_t ip);
static void     hammer_flush_inode_core(hammer_inode_t ip,
                                        hammer_flush_group_t flg, int flags);
static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
#if 0
static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
#endif
static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
                                        hammer_flush_group_t flg);
static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
                                        int depth, hammer_flush_group_t flg);
static void     hammer_inode_wakereclaims(hammer_inode_t ip);
static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
                                        pid_t pid);
static struct hammer_inode *__hammer_find_inode(hammer_transaction_t trans,
                                        int64_t obj_id, hammer_tid_t asof,
                                        uint32_t localization);

struct krate hammer_gen_krate = { 1 };

/*
 * RB-Tree support for inode structures
 */
int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
        if (ip1->obj_localization < ip2->obj_localization)
                return(-1);
        if (ip1->obj_localization > ip2->obj_localization)
                return(1);
        if (ip1->obj_id < ip2->obj_id)
                return(-1);
        if (ip1->obj_id > ip2->obj_id)
                return(1);
        if (ip1->obj_asof < ip2->obj_asof)
                return(-1);
        if (ip1->obj_asof > ip2->obj_asof)
                return(1);
        return(0);
}
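
/*
 * Note: inodes therefore sort on (localization, obj_id, asof), in
 * that order of priority, so all as-of snapshots of a given object
 * within a PFS wind up adjacent in the tree.
 */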

int
hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
        if (ip1->redo_fifo_start < ip2->redo_fifo_start)
                return(-1);
        if (ip1->redo_fifo_start > ip2->redo_fifo_start)
                return(1);
        return(0);
}

/*
 * RB-Tree support for inode structures / special LOOKUP_INFO
 */
static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
        if (info->obj_localization < ip->obj_localization)
                return(-1);
        if (info->obj_localization > ip->obj_localization)
                return(1);
        if (info->obj_id < ip->obj_id)
                return(-1);
        if (info->obj_id > ip->obj_id)
                return(1);
        if (info->obj_asof < ip->obj_asof)
                return(-1);
        if (info->obj_asof > ip->obj_asof)
                return(1);
        return(0);
}

/*
 * Used by hammer_scan_inode_snapshots() to locate all of an object's
 * snapshots.  Note that the asof field is not tested, which we can get
 * away with because it is the lowest-priority field.
 */
static int
hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
{
        hammer_inode_info_t info = data;

        if (ip->obj_localization > info->obj_localization)
                return(1);
        if (ip->obj_localization < info->obj_localization)
                return(-1);
        if (ip->obj_id > info->obj_id)
                return(1);
        if (ip->obj_id < info->obj_id)
                return(-1);
        return(0);
}

/*
 * Used by hammer_unload_pseudofs() to locate all inodes associated with
 * a particular PFS.
 */
static int
hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
{
        uint32_t localization = *(uint32_t *)data;
        if (ip->obj_localization > localization)
                return(1);
        if (ip->obj_localization < localization)
                return(-1);
        return(0);
}

/*
 * RB-Tree support for pseudofs structures
 */
static int
hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
{
        if (p1->localization < p2->localization)
                return(-1);
        if (p1->localization > p2->localization)
                return(1);
        return(0);
}

RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
                hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
             hammer_pfs_rb_compare, uint32_t, localization);
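
/*
 * The RB_GENERATE*() macros expand to the type-safe tree functions
 * used throughout this file: hammer_ino_rb_tree_RB_SCAN(),
 * hammer_ino_rb_tree_RB_LOOKUP_INFO() (from the XLOOKUP variant),
 * and the RB_INSERT()/RB_REMOVE()/RB_LOOKUP() entry points for
 * both trees.
 */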

/*
 * The kernel is not actively referencing this vnode but is still holding
 * it cached.
 *
 * This is called from the frontend.
 *
 * MPALMOSTSAFE
 */
int
hammer_vop_inactive(struct vop_inactive_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        hammer_mount_t hmp;

        /*
         * Degenerate case
         */
        if (ip == NULL) {
                vrecycle(ap->a_vp);
                return(0);
        }

        /*
         * If the inode no longer has visibility in the filesystem try to
         * recycle it immediately, even if the inode is dirty.  Recycling
         * it quickly allows the system to reclaim buffer cache and VM
         * resources which can matter a lot in a heavily loaded system.
         *
         * This can deadlock in vfsync() if we aren't careful.
         *
         * Do not queue the inode to the flusher if we still have visibility,
         * otherwise namespace calls such as chmod will unnecessarily generate
         * multiple inode updates.
         */
        if (ip->ino_data.nlinks == 0) {
                hmp = ip->hmp;
                lwkt_gettoken(&hmp->fs_token);
                hammer_inode_unloadable_check(ip, 0);
                if (ip->flags & HAMMER_INODE_MODMASK)
                        hammer_flush_inode(ip, 0);
                lwkt_reltoken(&hmp->fs_token);
                vrecycle(ap->a_vp);
        }
        return(0);
}

/*
 * Release the vnode association.  This is typically (but not always)
 * the last reference on the inode.
 *
 * Once the association is lost we are on our own with regards to
 * flushing the inode.
 *
 * We must interlock ip->vp so hammer_get_vnode() can avoid races.
 */
int
hammer_vop_reclaim(struct vop_reclaim_args *ap)
{
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct vnode *vp;

        vp = ap->a_vp;

        if ((ip = vp->v_data) != NULL) {
                hmp = ip->hmp;
                lwkt_gettoken(&hmp->fs_token);
                hammer_lock_ex(&ip->lock);
                vp->v_data = NULL;
                ip->vp = NULL;

                if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
                        ++hammer_count_reclaims;
                        ++hmp->count_reclaims;
                        ip->flags |= HAMMER_INODE_RECLAIM;
                }
                hammer_unlock(&ip->lock);
                vclrisdirty(vp);
                hammer_rel_inode(ip, 1);
                lwkt_reltoken(&hmp->fs_token);
        }
        return(0);
}

/*
 * Inform the kernel that the inode is dirty.  This will be checked
 * by vn_unlock().
 *
 * Theoretically, in order to reclaim a vnode, hammer_vop_reclaim()
 * must be called, which will interlock against our inode lock, so
 * if VRECLAIMED is not set vp->v_mount (as used by vsetisdirty())
 * should be stable without having to acquire any new locks.
 */
void
hammer_inode_dirty(struct hammer_inode *ip)
{
        struct vnode *vp;

        if ((ip->flags & HAMMER_INODE_MODMASK) &&
            (vp = ip->vp) != NULL &&
            (vp->v_flag & (VRECLAIMED | VISDIRTY)) == 0) {
                vsetisdirty(vp);
        }
}

/*
 * Return a locked vnode for the specified inode.  The inode must be
 * referenced but NOT LOCKED on entry and will remain referenced on
 * return.
 *
 * Called from the frontend.
 */
int
hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
{
        hammer_mount_t hmp;
        struct vnode *vp;
        int error = 0;
        uint8_t obj_type;

        hmp = ip->hmp;

        for (;;) {
                if ((vp = ip->vp) == NULL) {
                        error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
                        if (error)
                                break;
                        hammer_lock_ex(&ip->lock);
                        if (ip->vp != NULL) {
                                hammer_unlock(&ip->lock);
                                vp = *vpp;
                                vp->v_type = VBAD;
                                vx_put(vp);
                                continue;
                        }
                        hammer_ref(&ip->lock);
                        vp = *vpp;
                        ip->vp = vp;

                        obj_type = ip->ino_data.obj_type;
                        vp->v_type = hammer_get_vnode_type(obj_type);

                        hammer_inode_wakereclaims(ip);

                        switch(ip->ino_data.obj_type) {
                        case HAMMER_OBJTYPE_CDEV:
                        case HAMMER_OBJTYPE_BDEV:
                                vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
                                addaliasu(vp, ip->ino_data.rmajor,
                                          ip->ino_data.rminor);
                                break;
                        case HAMMER_OBJTYPE_FIFO:
                                vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
                                break;
                        case HAMMER_OBJTYPE_REGFILE:
                                break;
                        default:
                                break;
                        }

                        /*
                         * Only mark as the root vnode if the ip is not
                         * historical, otherwise the VFS cache will get
                         * confused.  The other half of the special handling
                         * is in hammer_vop_nlookupdotdot().
                         *
                         * Pseudo-filesystem roots can be accessed via
                         * non-root filesystem paths and setting VROOT may
                         * confuse the namecache.  Set VPFSROOT instead.
                         */
                        if (ip->obj_id == HAMMER_OBJID_ROOT) {
                                if (ip->obj_asof == hmp->asof) {
                                        if (ip->obj_localization ==
                                                HAMMER_DEF_LOCALIZATION)
                                                vsetflags(vp, VROOT);
                                        else
                                                vsetflags(vp, VPFSROOT);
                                } else {
                                        vsetflags(vp, VPFSROOT);
                                }
                        }

                        vp->v_data = (void *)ip;
                        /* vnode locked by getnewvnode() */
                        /* make related vnode dirty if inode dirty? */
                        hammer_unlock(&ip->lock);
                        if (vp->v_type == VREG) {
                                vinitvmio(vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size));
                        }
                        break;
                }

                /*
                 * Interlock vnode clearing.  This does not prevent the
                 * vnode from going into a reclaimed state but it does
                 * prevent it from being destroyed or reused so the vget()
                 * will properly fail.
                 */
                hammer_lock_ex(&ip->lock);
                if ((vp = ip->vp) == NULL) {
                        hammer_unlock(&ip->lock);
                        continue;
                }
                vhold(vp);
                hammer_unlock(&ip->lock);

                /*
                 * loop if the vget fails (aka races), or if the vp
                 * no longer matches ip->vp.
                 */
                if (vget(vp, LK_EXCLUSIVE) == 0) {
                        if (vp == ip->vp) {
                                vdrop(vp);
                                break;
                        }
                        vput(vp);
                }
                vdrop(vp);
        }
        *vpp = vp;
        return(error);
}

/*
 * Locate all copies of the inode for obj_id compatible with the specified
 * asof, reference them, and issue the related call-back.  This routine is used
 * for direct-io invalidation and does not create any new inodes.
 */
void
hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
                            int (*callback)(hammer_inode_t ip, void *data),
                            void *data)
{
        hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
                                   hammer_inode_info_cmp_all_history,
                                   callback, iinfo);
}

/*
 * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
 * do not attach or detach the related vnode (use hammer_get_vnode() for
 * that).
 *
 * The flags argument is only applied for newly created inodes, and only
 * certain flags are inherited.
 *
 * Called from the frontend.
 */
struct hammer_inode *
hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
                 int64_t obj_id, hammer_tid_t asof, uint32_t localization,
                 int flags, int *errorp)
{
        hammer_mount_t hmp = trans->hmp;
        struct hammer_node_cache *cachep;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;

        /*
         * Determine if we already have an inode cached.  If we do then
         * we are golden.
         *
         * If we find an inode with no vnode we have to mark the
         * transaction such that hammer_inode_waitreclaims() is
         * called later on to avoid building up an infinite number
         * of inodes.  Otherwise we can continue to add new inodes
         * faster than they can be disposed of, even with the tsleep
         * delay.
         *
         * If we find a dummy inode we return a failure so dounlink
         * (which does another lookup) doesn't try to mess with the
         * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
         * to ref dummy inodes.
         */
loop:
        *errorp = 0;
        ip = __hammer_find_inode(trans, obj_id, asof, localization);
        if (ip) {
                if (ip->flags & HAMMER_INODE_DUMMY) {
                        *errorp = ENOENT;
                        return(NULL);
                }
                hammer_ref(&ip->lock);
                return(ip);
        }

        /*
         * Allocate a new inode structure and deal with races later.
         */
        ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ++hmp->count_inodes;
        ip->obj_id = obj_id;
        ip->obj_asof = asof;
        ip->obj_localization = localization;
        ip->hmp = hmp;
        ip->flags = flags & HAMMER_INODE_RO;
        ip->cache[0].ip = ip;
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
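        /*
         * Each B-Tree node hint set up above back-references the inode
         * so the hints can be dropped via hammer_uncache_node() when
         * the inode is freed; the roles of the individual hints are
         * described where they are primed below.
         */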
        if (hmp->ronly)
                ip->flags |= HAMMER_INODE_RO;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
                0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->target_list);
        hammer_ref(&ip->lock);

        /*
         * Locate the on-disk inode.  If this is a PFS root we always
         * access the current version of the root inode and (if it is not
         * a master) always access information under it with a snapshot
         * TID.
         *
         * We cache recent inode lookups in this directory in dip->cache[2].
         * If we can't find it we assume the inode we are looking for is
         * close to the directory inode.
         */
retry:
        cachep = NULL;
        if (dip) {
                if (dip->cache[2].node)
                        cachep = &dip->cache[2];
                else
                        cachep = &dip->cache[0];
        }
        hammer_init_cursor(trans, &cursor, cachep, NULL);
        cursor.key_beg.localization = localization | HAMMER_LOCALIZE_INODE;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.key = 0;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
        cursor.key_beg.obj_type = 0;

        cursor.asof = asof;
        cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
                       HAMMER_CURSOR_ASOF;

        *errorp = hammer_btree_lookup(&cursor);
        if (*errorp == EDEADLK) {
                hammer_done_cursor(&cursor);
                goto retry;
        }

        /*
         * On success the B-Tree lookup will hold the appropriate
         * buffer cache buffers and provide a pointer to the requested
         * information.  Copy the information to the in-memory inode
         * and cache the B-Tree node to improve future operations.
         */
        if (*errorp == 0) {
                ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
                ip->ino_data = cursor.data->inode;

                /*
                 * cache[0] tries to cache the location of the object inode.
                 * The assumption is that it is near the directory inode.
                 *
                 * cache[1] tries to cache the location of the object data.
                 * We might have something in the governing directory from
                 * scan optimizations (see the strategy code in
                 * hammer_vnops.c).
                 *
                 * We update dip->cache[2], if possible, with the location
                 * of the object inode for future directory shortcuts.
                 */
                hammer_cache_node(&ip->cache[0], cursor.node);
                if (dip) {
                        if (dip->cache[3].node) {
                                hammer_cache_node(&ip->cache[1],
                                                  dip->cache[3].node);
                        }
                        hammer_cache_node(&dip->cache[2], cursor.node);
                }

                /*
                 * The file should not contain any data past the file size
                 * stored in the inode.  Setting save_trunc_off to the
                 * file size instead of max reduces B-Tree lookup overheads
                 * on append by allowing the flusher to avoid checking for
                 * record overwrites.
                 */
                ip->save_trunc_off = ip->ino_data.size;

                /*
                 * Locate and assign the pseudofs management structure to
                 * the inode.
                 */
                if (dip && dip->obj_localization == ip->obj_localization) {
                        ip->pfsm = dip->pfsm;
                        hammer_ref(&ip->pfsm->lock);
                } else {
                        ip->pfsm = hammer_load_pseudofs(trans,
                                                        ip->obj_localization,
                                                        errorp);
                        *errorp = 0;    /* ignore ENOENT */
                }
        }

        /*
         * The inode is placed on the red-black tree and will be synced to
         * the media when flushed or by the filesystem sync.  If this races
         * another instantiation/lookup the insertion will fail.
         */
        if (*errorp == 0) {
                if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                        hammer_free_inode(ip);
                        hammer_done_cursor(&cursor);
                        goto loop;
                }
                ip->flags |= HAMMER_INODE_ONDISK;
        } else {
                if (ip->flags & HAMMER_INODE_RSV_INODES) {
                        ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
                        --hmp->rsv_inodes;
                }

                hammer_free_inode(ip);
                ip = NULL;
        }
        hammer_done_cursor(&cursor);

        /*
         * NEWINODE is only set if the inode becomes dirty later;
         * setting it here just leads to unnecessary stalls.
         *
         * trans->flags |= HAMMER_TRANSF_NEWINODE;
         */
        return (ip);
}
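
/*
 * A minimal usage sketch (hypothetical caller, error handling
 * abbreviated) showing how the frontend typically pairs
 * hammer_get_inode() with hammer_get_vnode() during name resolution:
 *
 *      ip = hammer_get_inode(&trans, dip, obj_id, asof,
 *                            localization, flags, &error);
 *      if (ip) {
 *              error = hammer_get_vnode(ip, &vp);
 *              hammer_rel_inode(ip, 0);
 *      }
 */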

/*
 * Get a dummy inode to placemark a broken directory entry.
 */
struct hammer_inode *
hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
                 int64_t obj_id, hammer_tid_t asof, uint32_t localization,
                 int flags, int *errorp)
{
        hammer_mount_t hmp = trans->hmp;
        struct hammer_inode *ip;

        /*
         * Determine if we already have an inode cached.  If we do then
         * we are golden.
         *
         * If we find an inode with no vnode we have to mark the
         * transaction such that hammer_inode_waitreclaims() is
         * called later on to avoid building up an infinite number
         * of inodes.  Otherwise we can continue to add new inodes
         * faster than they can be disposed of, even with the tsleep
         * delay.
         *
         * If we find a non-fake inode we return an error.  Only fake
         * inodes can be returned by this routine.
         */
loop:
        *errorp = 0;
        ip = __hammer_find_inode(trans, obj_id, asof, localization);
        if (ip) {
                if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
                        *errorp = ENOENT;
                        return(NULL);
                }
                hammer_ref(&ip->lock);
                return(ip);
        }

        /*
         * Allocate a new inode structure and deal with races later.
         */
        ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ++hmp->count_inodes;
        ip->obj_id = obj_id;
        ip->obj_asof = asof;
        ip->obj_localization = localization;
        ip->hmp = hmp;
        ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
        ip->cache[0].ip = ip;
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;
        ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
                0x7FFFFFFFFFFFFFFFLL;
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->target_list);
        hammer_ref(&ip->lock);

        /*
         * Populate the dummy inode.  Leave everything zero'd out.
         *
         * (ip->ino_leaf and ip->ino_data)
         *
         * Make the dummy inode a FIFO object which most copy programs
         * will properly ignore.
         */
        ip->save_trunc_off = ip->ino_data.size;
        ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;

        /*
         * Locate and assign the pseudofs management structure to
         * the inode.
         */
        if (dip && dip->obj_localization == ip->obj_localization) {
                ip->pfsm = dip->pfsm;
                hammer_ref(&ip->pfsm->lock);
        } else {
                ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
                                                errorp);
                *errorp = 0;    /* ignore ENOENT */
        }

        /*
         * The inode is placed on the red-black tree and will be synced to
         * the media when flushed or by the filesystem sync.  If this races
         * another instantiation/lookup the insertion will fail.
         *
         * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
         */
        if (*errorp == 0) {
                if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                        hammer_free_inode(ip);
                        goto loop;
                }
        } else {
                if (ip->flags & HAMMER_INODE_RSV_INODES) {
                        ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
                        --hmp->rsv_inodes;
                }
                hammer_free_inode(ip);
                ip = NULL;
        }
        trans->flags |= HAMMER_TRANSF_NEWINODE;
        return (ip);
}

/*
 * Return a referenced inode only if it is in our inode cache.
 * Dummy inodes do not count.
 */
struct hammer_inode *
hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
                  hammer_tid_t asof, uint32_t localization)
{
        struct hammer_inode *ip;

        ip = __hammer_find_inode(trans, obj_id, asof, localization);
        if (ip) {
                if (ip->flags & HAMMER_INODE_DUMMY)
                        ip = NULL;
                else
                        hammer_ref(&ip->lock);
        }
        return(ip);
}

/*
 * Return an inode only if it is in our inode cache.  Unlike
 * hammer_find_inode(), this function does not add a reference to it.
 */
static struct hammer_inode *
__hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
                  hammer_tid_t asof, uint32_t localization)
{
        hammer_mount_t hmp = trans->hmp;
        struct hammer_inode_info iinfo;
        struct hammer_inode *ip;

        iinfo.obj_id = obj_id;
        iinfo.obj_asof = asof;
        iinfo.obj_localization = localization;

        ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);

        return(ip);
}

/*
 * Create a new filesystem object, returning the inode in *ipp.  The
 * returned inode will be referenced.  The inode is created in-memory.
 *
 * If pfsm is non-NULL the caller wishes to create the root inode for
 * a non-root PFS.
 */
int
hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
                    struct ucred *cred,
                    hammer_inode_t dip, const char *name, int namelen,
                    hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
{
        hammer_mount_t hmp;
        hammer_inode_t ip;
        uid_t xuid;
        int error;
        int64_t namekey;
        uint32_t dummy;

        hmp = trans->hmp;

        /*
         * Disallow the creation of new inodes in directories which
         * have been deleted.  In HAMMER, this will cause a record
         * syncing assertion later on in the flush code.
         */
        if (dip && dip->ino_data.nlinks == 0) {
                *ipp = NULL;
                return (EINVAL);
        }

        /*
         * Allocate inode
         */
        ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
        ++hammer_count_inodes;
        ++hmp->count_inodes;
        trans->flags |= HAMMER_TRANSF_NEWINODE;

        if (pfsm) {
                KKASSERT(pfsm->localization != HAMMER_DEF_LOCALIZATION);
                ip->obj_id = HAMMER_OBJID_ROOT;
                ip->obj_localization = pfsm->localization;
        } else {
                KKASSERT(dip != NULL);
                namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
                ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
                ip->obj_localization = dip->obj_localization;
        }

        KKASSERT(ip->obj_id != 0);
        ip->obj_asof = hmp->asof;
        ip->hmp = hmp;
        ip->flush_state = HAMMER_FST_IDLE;
        ip->flags = HAMMER_INODE_DDIRTY |
                    HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
        ip->cache[0].ip = ip;
        ip->cache[1].ip = ip;
        ip->cache[2].ip = ip;
        ip->cache[3].ip = ip;

        ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
        /* ip->save_trunc_off = 0; (already zero) */
        RB_INIT(&ip->rec_tree);
        TAILQ_INIT(&ip->target_list);

        ip->ino_data.atime = trans->time;
        ip->ino_data.mtime = trans->time;
        ip->ino_data.size = 0;
        ip->ino_data.nlinks = 0;

        /*
         * A nohistory designator on the parent directory is inherited by
         * the child.  We will do this even for pseudo-fs creation... the
         * sysad can turn it off.
         */
        if (dip) {
                ip->ino_data.uflags = dip->ino_data.uflags &
                                      (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
        }

        ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
        ip->ino_leaf.base.localization = ip->obj_localization |
                                         HAMMER_LOCALIZE_INODE;
        ip->ino_leaf.base.obj_id = ip->obj_id;
        ip->ino_leaf.base.key = 0;
        ip->ino_leaf.base.create_tid = 0;
        ip->ino_leaf.base.delete_tid = 0;
        ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
        ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);

        ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
        ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
        ip->ino_data.mode = vap->va_mode;
        ip->ino_data.ctime = trans->time;

        /*
         * If we are running version 2 or greater, directory entries are
         * inode-localized instead of data-localized.
         */
        if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
                if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
                        ip->ino_data.cap_flags |=
                                HAMMER_INODE_CAP_DIR_LOCAL_INO;
                }
        }
        if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
                if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
                        ip->ino_data.cap_flags |=
                                HAMMER_INODE_CAP_DIRHASH_ALG1;
                }
        }

        /*
         * Set up the ".." pointer.  This only needs to be done for
         * directories, but we do it for all objects as a recovery aid
         * if dip exists.
         * The inode is probably a PFS root if dip is NULL.
         */
        if (dip)
                ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;

        switch(ip->ino_leaf.base.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                ip->ino_data.rmajor = vap->va_rmajor;
                ip->ino_data.rminor = vap->va_rminor;
                break;
        default:
                break;
        }

        /*
         * Calculate default uid/gid and overwrite with information from
         * the vap.
         */
        if (dip) {
                xuid = hammer_to_unix_xid(&dip->ino_data.uid);
                xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
                                             xuid, cred, &vap->va_mode);
        } else {
                xuid = 0;
        }
        ip->ino_data.mode = vap->va_mode;

        if (vap->va_vaflags & VA_UID_UUID_VALID)
                ip->ino_data.uid = vap->va_uid_uuid;
        else if (vap->va_uid != (uid_t)VNOVAL)
                hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
        else
                hammer_guid_to_uuid(&ip->ino_data.uid, xuid);

        if (vap->va_vaflags & VA_GID_UUID_VALID)
                ip->ino_data.gid = vap->va_gid_uuid;
        else if (vap->va_gid != (gid_t)VNOVAL)
                hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
        else if (dip)
                ip->ino_data.gid = dip->ino_data.gid;

        hammer_ref(&ip->lock);

        if (pfsm) {
                ip->pfsm = pfsm;
                hammer_ref(&pfsm->lock);
                error = 0;
        } else if (dip->obj_localization == ip->obj_localization) {
                ip->pfsm = dip->pfsm;
                hammer_ref(&ip->pfsm->lock);
                error = 0;
        } else {
                ip->pfsm = hammer_load_pseudofs(trans,
                                                ip->obj_localization,
                                                &error);
                error = 0;      /* ignore ENOENT */
        }

        if (error) {
                hammer_free_inode(ip);
                ip = NULL;
        } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
                hpanic("duplicate obj_id %llx", (long long)ip->obj_id);
                /* not reached */
                hammer_free_inode(ip);
        }
        *ipp = ip;
        return(error);
}
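
/*
 * For an in-tree usage example see hammer_mkroot_pseudofs() below,
 * which builds a vattr (VDIR, mode 0755) and passes a pfsm to
 * hammer_create_inode() to create a PFS root directory.
 */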

/*
 * Final cleanup / freeing of an inode structure
 */
static void
hammer_free_inode(hammer_inode_t ip)
{
        struct hammer_mount *hmp;

        hmp = ip->hmp;
        KKASSERT(hammer_oneref(&ip->lock));
        hammer_uncache_node(&ip->cache[0]);
        hammer_uncache_node(&ip->cache[1]);
        hammer_uncache_node(&ip->cache[2]);
        hammer_uncache_node(&ip->cache[3]);
        hammer_inode_wakereclaims(ip);
        if (ip->objid_cache)
                hammer_clear_objid(ip);
        --hammer_count_inodes;
        --hmp->count_inodes;
        if (ip->pfsm) {
                hammer_rel_pseudofs(hmp, ip->pfsm);
                ip->pfsm = NULL;
        }
        kfree(ip, hmp->m_inodes);
}

/*
 * Retrieve pseudo-fs data.  NULL will never be returned.
 *
 * If an error occurs *errorp will be set and a default template is returned,
 * otherwise *errorp is set to 0.  Typically when an error occurs it will
 * be ENOENT.
 */
hammer_pseudofs_inmem_t
hammer_load_pseudofs(hammer_transaction_t trans,
                     uint32_t localization, int *errorp)
{
        hammer_mount_t hmp = trans->hmp;
        hammer_inode_t ip;
        hammer_pseudofs_inmem_t pfsm;
        struct hammer_cursor cursor;
        int bytes;

retry:
        pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
        if (pfsm) {
                hammer_ref(&pfsm->lock);
                *errorp = 0;
                return(pfsm);
        }

        /*
         * PFS records are associated with the root inode (not the PFS root
         * inode, but the real root).  Avoid an infinite recursion if loading
         * the PFS for the real root.
         */
        if (localization) {
                ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
                                      HAMMER_MAX_TID,
                                      HAMMER_DEF_LOCALIZATION, 0, errorp);
        } else {
                ip = NULL;
        }

        pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
        pfsm->localization = localization;
        pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
        pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;

        hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
        cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION |
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = localization;
        cursor.asof = HAMMER_MAX_TID;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        if (ip)
                *errorp = hammer_ip_lookup(&cursor);
        else
                *errorp = hammer_btree_lookup(&cursor);
        if (*errorp == 0) {
                *errorp = hammer_ip_resolve_data(&cursor);
                if (*errorp == 0) {
                        if (cursor.data->pfsd.mirror_flags &
                            HAMMER_PFSD_DELETED) {
                                *errorp = ENOENT;
                        } else {
                                bytes = cursor.leaf->data_len;
                                if (bytes > sizeof(pfsm->pfsd))
                                        bytes = sizeof(pfsm->pfsd);
                                bcopy(cursor.data, &pfsm->pfsd, bytes);
                        }
                }
        }
        hammer_done_cursor(&cursor);

        pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
        hammer_ref(&pfsm->lock);
        if (ip)
                hammer_rel_inode(ip, 0);
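        /*
         * If the insert races a concurrent load of the same PFS the
         * other thread's copy wins; throw ours away and pick up the
         * winning copy via the lookup at the top.
         */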
        if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
                kfree(pfsm, hmp->m_misc);
                goto retry;
        }
        return(pfsm);
}

/*
 * Store pseudo-fs data.  The backend will automatically delete any prior
 * on-disk pseudo-fs data but we have to delete in-memory versions.
 */
int
hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
{
        struct hammer_cursor cursor;
        hammer_record_t record;
        hammer_inode_t ip;
        int error;

        /*
         * PFS records are associated with the root inode (not the PFS root
         * inode, but the real root).
         */
        ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
                              HAMMER_DEF_LOCALIZATION, 0, &error);
retry:
        pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
        hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
        cursor.key_beg.localization = ip->obj_localization |
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = pfsm->localization;
        cursor.asof = HAMMER_MAX_TID;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        /*
         * Replace any in-memory version of the record.
         */
        error = hammer_ip_lookup(&cursor);
        if (error == 0 && hammer_cursor_inmem(&cursor)) {
                record = cursor.iprec;
                if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
                        KKASSERT(cursor.deadlk_rec == NULL);
                        hammer_ref(&record->lock);
                        cursor.deadlk_rec = record;
                        error = EDEADLK;
                } else {
                        record->flags |= HAMMER_RECF_DELETED_FE;
                        error = 0;
                }
        }

        /*
         * Allocate replacement general record.  The backend flush will
         * delete any on-disk version of the record.
         */
        if (error == 0 || error == ENOENT) {
                record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
                record->type = HAMMER_MEM_RECORD_GENERAL;

                record->leaf.base.localization = ip->obj_localization |
                                                 HAMMER_LOCALIZE_MISC;
                record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
                record->leaf.base.key = pfsm->localization;
                record->leaf.data_len = sizeof(pfsm->pfsd);
                bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
                error = hammer_ip_add_record(trans, record);
        }
        hammer_done_cursor(&cursor);
        if (error == EDEADLK)
                goto retry;
        hammer_rel_inode(ip, 0);
        return(error);
}
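
/*
 * Note the EDEADLK handling above: if the in-memory record is
 * interlocked by the backend, the record is parked in
 * cursor.deadlk_rec and we loop back to retry after tearing the
 * cursor down.
 */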

/*
 * Create a root directory for a PFS if one does not already exist.
 *
 * The PFS root stands alone so we must also bump the nlinks count
 * to prevent it from being destroyed on release.
 */
int
hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
                       hammer_pseudofs_inmem_t pfsm)
{
        hammer_inode_t ip;
        struct vattr vap;
        int error;

        ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
                              pfsm->localization, 0, &error);
        if (ip == NULL) {
                vattr_null(&vap);
                vap.va_mode = 0755;
                vap.va_type = VDIR;
                error = hammer_create_inode(trans, &vap, cred,
                                            NULL, NULL, 0,
                                            pfsm, &ip);
                if (error == 0) {
                        ++ip->ino_data.nlinks;
                        hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (ip)
                hammer_rel_inode(ip, 0);
        return(error);
}

/*
 * Unload any vnodes & inodes associated with a PFS, returning
 * ENOTEMPTY if we are unable to disassociate all the inodes.
 */
static
int
hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
{
        int res;

        hammer_ref(&ip->lock);
        if (ip->vp && (ip->vp->v_flag & VPFSROOT)) {
                /*
                 * The hammer pfs-upgrade directive itself might have the
                 * root of the pfs open.  Just allow it.
                 */
                res = 0;
        } else {
                /*
                 * Don't allow any subdirectories or files to be open.
                 */
                if (hammer_isactive(&ip->lock) == 2 && ip->vp)
                        vclean_unlocked(ip->vp);
                if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
                        res = 0;
                else
                        res = -1;       /* stop, someone is using the inode */
        }
        hammer_rel_inode(ip, 0);
        return(res);
}

int
hammer_unload_pseudofs(hammer_transaction_t trans, uint32_t localization)
{
        int res;
        int try;

        for (try = res = 0; try < 4; ++try) {
                res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
                                           hammer_inode_pfs_cmp,
                                           hammer_unload_pseudofs_callback,
                                           &localization);
                if (res == 0 && try > 1)
                        break;
                hammer_flusher_sync(trans->hmp);
        }
        if (res != 0)
                res = ENOTEMPTY;
        return(res);
}

/*
 * Release a reference on a PFS
 */
void
hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
{
        hammer_rel(&pfsm->lock);
        if (hammer_norefs(&pfsm->lock)) {
                RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
                kfree(pfsm, hmp->m_misc);
        }
}

/*
 * Called by hammer_sync_inode().
 */
static int
hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        hammer_record_t record;
        int error;
        int redirty;

retry:
        error = 0;

        /*
         * If the inode has a presence on-disk then locate it and mark
         * it deleted, setting DELONDISK.
         *
         * The record may or may not be physically deleted, depending on
         * the retention policy.
         */
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
            HAMMER_INODE_ONDISK) {
                hammer_normalize_cursor(cursor);
                cursor->key_beg.localization = ip->obj_localization |
                                               HAMMER_LOCALIZE_INODE;
                cursor->key_beg.obj_id = ip->obj_id;
                cursor->key_beg.key = 0;
                cursor->key_beg.create_tid = 0;
                cursor->key_beg.delete_tid = 0;
                cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
                cursor->key_beg.obj_type = 0;
                cursor->asof = ip->obj_asof;
                cursor->flags &= ~HAMMER_CURSOR_INITMASK;
                cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
                cursor->flags |= HAMMER_CURSOR_BACKEND;

                error = hammer_btree_lookup(cursor);
                if (hammer_debug_inode)
                        hdkprintf("IPDEL %p %08x %d\n", ip, ip->flags, error);

                if (error == 0) {
                        error = hammer_ip_delete_record(cursor, ip, trans->tid);
                        if (hammer_debug_inode)
                                hdkprintf("error %d\n", error);
                        if (error == 0) {
                                ip->flags |= HAMMER_INODE_DELONDISK;
                        }
                        if (cursor->node)
                                hammer_cache_node(&ip->cache[0], cursor->node);
                }
                if (error == EDEADLK) {
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                hdkprintf("IPDED %p %d\n", ip, error);
                        if (error == 0)
                                goto retry;
                }
        }

        /*
         * Ok, write out the initial record or a new record (after deleting
         * the old one), unless the DELETED flag is set.  This routine will
         * clear DELONDISK if it writes out a record.
         *
         * Update our inode statistics if this is the first application of
         * the inode on-disk.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
                /*
                 * Generate a record and write it to the media.  We clean-up
                 * the state before releasing so we do not have to set-up
                 * a flush_group.
                 */
                record = hammer_alloc_mem_record(ip, 0);
                record->type = HAMMER_MEM_RECORD_INODE;
                record->flush_state = HAMMER_FST_FLUSH;
                record->leaf = ip->sync_ino_leaf;
                record->leaf.base.create_tid = trans->tid;
                record->leaf.data_len = sizeof(ip->sync_ino_data);
                record->leaf.create_ts = trans->time32;
                record->data = (void *)&ip->sync_ino_data;
                record->flags |= HAMMER_RECF_INTERLOCK_BE;

                /*
                 * If this flag is set we cannot sync the new file size
                 * because we haven't finished related truncations.  The
                 * inode will be flushed in another flush group to finish
                 * the job.
                 */
                if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
                    ip->sync_ino_data.size != ip->ino_data.size) {
                        redirty = 1;
                        ip->sync_ino_data.size = ip->ino_data.size;
                } else {
                        redirty = 0;
                }

                for (;;) {
                        error = hammer_ip_sync_record_cursor(cursor, record);
                        if (hammer_debug_inode)
                                hdkprintf("GENREC %p rec %08x %d\n",
                                        ip, record->flags, error);
                        if (error != EDEADLK)
                                break;
                        hammer_done_cursor(cursor);
                        error = hammer_init_cursor(trans, cursor,
                                                   &ip->cache[0], ip);
                        if (hammer_debug_inode)
                                hdkprintf("GENREC reinit %d\n", error);
                        if (error)
                                break;
                }

                /*
                 * Note:  The record was never on the inode's record tree
                 * so just wave our hands importantly and destroy it.
                 */
                record->flags |= HAMMER_RECF_COMMITTED;
                record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
                record->flush_state = HAMMER_FST_IDLE;
                ++ip->rec_generation;
                hammer_rel_mem_record(record);

                /*
                 * Finish up.
                 */
                if (error == 0) {
                        if (hammer_debug_inode)
                                hdkprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
                        ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
                                            HAMMER_INODE_SDIRTY |
                                            HAMMER_INODE_ATIME |
                                            HAMMER_INODE_MTIME);
                        ip->flags &= ~HAMMER_INODE_DELONDISK;
                        if (redirty)
                                ip->sync_flags |= HAMMER_INODE_DDIRTY;

                        /*
                         * Root volume count of inodes
                         */
                        hammer_sync_lock_sh(trans);
                        if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
                                hammer_modify_volume_field(trans,
                                                           trans->rootvol,
                                                           vol0_stat_inodes);
                                ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
                                hammer_modify_volume_done(trans->rootvol);
                                ip->flags |= HAMMER_INODE_ONDISK;
                                if (hammer_debug_inode)
                                        hdkprintf("NOWONDISK %p\n", ip);
                        }
                        hammer_sync_unlock(trans);
                }
        }

        /*
         * If the inode has been destroyed, clean out any left-over flags
         * that may have been set by the frontend.
         */
        if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
                ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
                                    HAMMER_INODE_SDIRTY |
                                    HAMMER_INODE_ATIME |
                                    HAMMER_INODE_MTIME);
        }
        return(error);
}

/*
 * Update only the itimes fields.
 *
 * ATIME can be updated without generating any UNDO.  MTIME is updated
 * with UNDO so it is guaranteed to be synchronized properly in case of
 * a crash.
 *
 * Neither field is included in the B-Tree leaf element's CRC, which is how
 * we can get away with updating ATIME the way we do.
 */
static int
hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
{
        hammer_transaction_t trans = cursor->trans;
        int error;

retry:
        if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
            HAMMER_INODE_ONDISK) {
                return(0);
        }

        hammer_normalize_cursor(cursor);
        cursor->key_beg.localization = ip->obj_localization |
                                       HAMMER_LOCALIZE_INODE;
        cursor->key_beg.obj_id = ip->obj_id;
        cursor->key_beg.key = 0;
        cursor->key_beg.create_tid = 0;
        cursor->key_beg.delete_tid = 0;
        cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
        cursor->key_beg.obj_type = 0;
        cursor->asof = ip->obj_asof;
        cursor->flags &= ~HAMMER_CURSOR_INITMASK;
        cursor->flags |= HAMMER_CURSOR_ASOF;
        cursor->flags |= HAMMER_CURSOR_GET_LEAF;
        cursor->flags |= HAMMER_CURSOR_GET_DATA;
        cursor->flags |= HAMMER_CURSOR_BACKEND;

        error = hammer_btree_lookup(cursor);
        if (error == 0) {
                hammer_cache_node(&ip->cache[0], cursor->node);
                if (ip->sync_flags & HAMMER_INODE_MTIME) {
                        /*
                         * Updating MTIME requires an UNDO.  Just cover
                         * both atime and mtime.
                         */
                        hammer_sync_lock_sh(trans);
                        hammer_modify_buffer(trans, cursor->data_buffer,
                                &cursor->data->inode.mtime,
                                sizeof(cursor->data->inode.atime) +
                                sizeof(cursor->data->inode.mtime));
                        cursor->data->inode.atime = ip->sync_ino_data.atime;
                        cursor->data->inode.mtime = ip->sync_ino_data.mtime;
                        hammer_modify_buffer_done(cursor->data_buffer);
                        hammer_sync_unlock(trans);
                } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
                        /*
                         * Updating atime only can be done in-place with
                         * no UNDO.
                         */
                        hammer_sync_lock_sh(trans);
                        hammer_modify_buffer_noundo(trans, cursor->data_buffer);
                        cursor->data->inode.atime = ip->sync_ino_data.atime;
                        hammer_modify_buffer_done(cursor->data_buffer);
                        hammer_sync_unlock(trans);
                }
                ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
        }
        if (error == EDEADLK) {
                hammer_done_cursor(cursor);
1463                 error = hammer_init_cursor(trans, cursor, &ip->cache[0], ip);
1464                 if (error == 0)
1465                         goto retry;
1466         }
1467         return(error);
1468 }
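
/*
 * The cursor setup above is the standard pattern for locating an inode's
 * own B-Tree record.  A hypothetical factoring of that pattern (the
 * helper name is not in the source; the body simply mirrors the key_beg
 * assignments in hammer_update_itimes()):
 */
#if 0
static __inline void
hammer_setup_inode_key_sketch(hammer_cursor_t cursor, hammer_inode_t ip)
{
        cursor->key_beg.localization = ip->obj_localization |
                                       HAMMER_LOCALIZE_INODE;
        cursor->key_beg.obj_id = ip->obj_id;    /* the inode itself */
        cursor->key_beg.key = 0;
        cursor->key_beg.create_tid = 0;
        cursor->key_beg.delete_tid = 0;
        cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
        cursor->key_beg.obj_type = 0;
        cursor->asof = ip->obj_asof;            /* as-of TID for snapshots */
}
#endif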
1469
1470 /*
1471  * Release a reference on an inode, flush as requested.
1472  *
1473  * On the last reference we queue the inode to the flusher for its final
1474  * disposition.
1475  */
1476 void
1477 hammer_rel_inode(struct hammer_inode *ip, int flush)
1478 {
1479         /*
1480          * Handle disposition when dropping the last ref.
1481          */
1482         for (;;) {
1483                 if (hammer_oneref(&ip->lock)) {
1484                         /*
1485                          * Determine whether on-disk action is needed for
1486                          * the inode's final disposition.
1487                          */
1488                         KKASSERT(ip->vp == NULL);
1489                         hammer_inode_unloadable_check(ip, 0);
1490                         if (ip->flags & HAMMER_INODE_MODMASK) {
1491                                 hammer_flush_inode(ip, 0);
1492                         } else if (hammer_oneref(&ip->lock)) {
1493                                 hammer_unload_inode(ip);
1494                                 break;
1495                         }
1496                 } else {
1497                         if (flush)
1498                                 hammer_flush_inode(ip, 0);
1499
1500                         /*
1501                          * The inode still has multiple refs, try to drop
1502                          * one ref.
1503                          */
1504                         KKASSERT(hammer_isactive(&ip->lock) >= 1);
1505                         if (hammer_isactive(&ip->lock) > 1) {
1506                                 hammer_rel(&ip->lock);
1507                                 break;
1508                         }
1509                 }
1510         }
1511 }
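
/*
 * A hedged usage sketch of the lookup/release pairing as a caller might
 * write it.  The hammer_get_inode() argument list shown here is assumed
 * and should be checked against hammer.h; the point is only that each
 * successful get is balanced by a hammer_rel_inode().
 */
#if 0
        hammer_inode_t ip;
        int error;

        ip = hammer_get_inode(trans, NULL, obj_id, asof, localization,
                              0, &error);
        if (ip) {
                /* ... operate on the inode ... */
                hammer_rel_inode(ip, 0);        /* may queue a final flush */
        }
#endif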
1512
1513 /*
1514  * Unload and destroy the specified inode.  Must be called with one remaining
1515  * reference.  The reference is disposed of.
1516  *
1517  * The inode must be completely clean.
1518  */
1519 static int
1520 hammer_unload_inode(struct hammer_inode *ip)
1521 {
1522         hammer_mount_t hmp = ip->hmp;
1523
1524         KASSERT(hammer_oneref(&ip->lock),
1525                 ("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
1526         KKASSERT(ip->vp == NULL);
1527         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1528         KKASSERT(ip->cursor_ip_refs == 0);
1529         KKASSERT(hammer_notlocked(&ip->lock));
1530         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1531
1532         KKASSERT(RB_EMPTY(&ip->rec_tree));
1533         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1534
1535         if (ip->flags & HAMMER_INODE_RDIRTY) {
1536                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
1537                 ip->flags &= ~HAMMER_INODE_RDIRTY;
1538         }
1539         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1540
1541         hammer_free_inode(ip);
1542         return(0);
1543 }
1544
1545 /*
1546  * Called during unmounting if a critical error occurred.  The in-memory
1547  * inode and all related structures are destroyed.
1548  *
1549  * If a critical error did not occur the unmount code calls the standard
1550  * release and asserts that the inode is gone.
1551  */
1552 int
1553 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1554 {
1555         hammer_record_t rec;
1556
1557         /*
1558          * Get rid of the inode's in-memory records, regardless of their
1559          * state, and clear the mod-mask.
1560          */
1561         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1562                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1563                 rec->target_ip = NULL;
1564                 if (rec->flush_state == HAMMER_FST_SETUP)
1565                         rec->flush_state = HAMMER_FST_IDLE;
1566         }
1567         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1568                 if (rec->flush_state == HAMMER_FST_FLUSH)
1569                         --rec->flush_group->refs;
1570                 else
1571                         hammer_ref(&rec->lock);
1572                 KKASSERT(hammer_oneref(&rec->lock));
1573                 rec->flush_state = HAMMER_FST_IDLE;
1574                 rec->flush_group = NULL;
1575                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1576                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1577                 ++ip->rec_generation;
1578                 hammer_rel_mem_record(rec);
1579         }
1580         ip->flags &= ~HAMMER_INODE_MODMASK;
1581         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1582         KKASSERT(ip->vp == NULL);
1583
1584         /*
1585          * Remove the inode from any flush group, force it idle.  FLUSH
1586          * and SETUP states have an inode ref.
1587          */
1588         switch(ip->flush_state) {
1589         case HAMMER_FST_FLUSH:
1590                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1591                 --ip->flush_group->refs;
1592                 ip->flush_group = NULL;
1593                 /* fall through */
1594         case HAMMER_FST_SETUP:
1595                 hammer_rel(&ip->lock);
1596                 ip->flush_state = HAMMER_FST_IDLE;
1597                 /* fall through */
1598         case HAMMER_FST_IDLE:
1599                 break;
1600         }
1601
1602         /*
1603          * There shouldn't be any associated vnode.  The unload needs at
1604          * least one ref; if we do have a vp, steal its ip ref.
1605          */
1606         if (ip->vp) {
1607                 hdkprintf("Unexpected vnode association ip %p vp %p\n",
1608                         ip, ip->vp);
1609                 ip->vp->v_data = NULL;
1610                 ip->vp = NULL;
1611         } else {
1612                 hammer_ref(&ip->lock);
1613         }
1614         hammer_unload_inode(ip);
1615         return(0);
1616 }
1617
1618 /*
1619  * Called on mount -u when switching from RW to RO or vice versa.  Adjust
1620  * the read-only flag for cached inodes.
1621  *
1622  * This routine is called from a RB_SCAN().
1623  */
1624 int
1625 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1626 {
1627         hammer_mount_t hmp = ip->hmp;
1628
1629         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1630                 ip->flags |= HAMMER_INODE_RO;
1631         else
1632                 ip->flags &= ~HAMMER_INODE_RO;
1633         return(0);
1634 }
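
/*
 * A sketch of the presumed RB_SCAN() call site in the mount-update path
 * (the exact invocation lives outside this file and is assumed here):
 */
#if 0
        RB_SCAN(hammer_ino_rb_tree, &hmp->rb_inos_root, NULL,
                hammer_reload_inode, NULL);
#endif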
1635
1636 /*
1637  * A transaction has modified an inode, requiring updates as specified by
1638  * the passed flags.
1639  *
1640  * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
1641  *                      and not including size changes due to write-append
1642  *                      (but other size changes are included).
1643  * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1644  *                      write-append.
1645  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1646  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1647  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1648  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1649  */
1650 void
1651 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
1652 {
1653         /*
1654          * A ronly value of 0 or 2 does not trigger the assertion;
1655          * 2 is a special error state.
1656          */
1657         KKASSERT(ip->hmp->ronly != 1 ||
1658                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
1659                             HAMMER_INODE_SDIRTY |
1660                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1661                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1662         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1663                 ip->flags |= HAMMER_INODE_RSV_INODES;
1664                 ++ip->hmp->rsv_inodes;
1665         }
1666
1667         /*
1668          * Set the NEWINODE flag in the transaction if the inode
1669          * transitions to a dirty state.  This is used to track
1670          * the load on the inode cache.
1671          */
1672         if (trans &&
1673             (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1674             (flags & HAMMER_INODE_MODMASK)) {
1675                 trans->flags |= HAMMER_TRANSF_NEWINODE;
1676         }
1677         if (flags & HAMMER_INODE_MODMASK)
1678                 hammer_inode_dirty(ip);
1679         ip->flags |= flags;
1680 }
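
/*
 * Hedged caller sketches showing how the flag distinctions above are
 * meant to be used.  The surrounding assignments are illustrative only;
 * ino_data.mode and ino_data.size are real fields, new_mode/new_eof are
 * placeholders:
 */
#if 0
        /* chmod-style attribute change: data dirty, no size implications */
        ip->ino_data.mode = new_mode;
        hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);

        /* write-append: size change uses SDIRTY, dirty buffers use BUFS */
        ip->ino_data.size = new_eof;
        hammer_modify_inode(trans, ip,
                            HAMMER_INODE_SDIRTY | HAMMER_INODE_BUFS);
#endif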
1681
1682 /*
1683  * Attempt to quickly update the atime for a hammer inode.  Return 0 on
1684  * success, -1 on failure.
1685  *
1686  * We attempt to update the atime with only the ip lock and not the
1687  * whole filesystem lock in order to improve concurrency.  We can only
1688  * do this safely if the ATIME flag is already pending on the inode.
1689  *
1690  * This function is called via a vnops path (ip pointer is stable) without
1691  * fs_token held.
1692  */
1693 int
1694 hammer_update_atime_quick(hammer_inode_t ip)
1695 {
1696         struct timeval tv;
1697         int res = -1;
1698
1699         if ((ip->flags & HAMMER_INODE_RO) ||
1700             (ip->hmp->mp->mnt_flag & MNT_NOATIME)) {
1701                 /*
1702                  * Silently indicate success on read-only mount/snap
1703                  */
1704                 res = 0;
1705         } else if (ip->flags & HAMMER_INODE_ATIME) {
1706                 /*
1707                  * Double check with inode lock held against backend.  This
1708                  * is only safe if all we need to do is update
1709                  * ino_data.atime.
1710                  */
1711                 getmicrotime(&tv);
1712                 hammer_lock_ex(&ip->lock);
1713                 if (ip->flags & HAMMER_INODE_ATIME) {
1714                         ip->ino_data.atime =
1715                             (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec;
1716                         res = 0;
1717                 }
1718                 hammer_unlock(&ip->lock);
1719         }
1720         return res;
1721 }
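
/*
 * The atime encoding used above is a plain 64-bit count of microseconds.
 * Restated as a stand-alone helper (the name is hypothetical and not
 * part of the source):
 */
#if 0
static __inline uint64_t
timeval_to_hammer_time(struct timeval *tv)
{
        /* matches the ino_data.atime assignment in the function above */
        return ((uint64_t)tv->tv_sec * 1000000ULL + (uint64_t)tv->tv_usec);
}
#endif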
1722
1723 /*
1724  * Request that an inode be flushed.  This whole mess cannot block and may
1725  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1726  * actively flush the inode until the flush can be done.
1727  *
1728  * The inode may already be flushing, or may be in a setup state.  We can
1729  * place the inode in a flushing state if it is currently idle and flag it
1730  * to reflush if it is currently flushing.
1731  *
1732  * Upon return if the inode could not be flushed due to a setup
1733  * dependency, then it will be automatically flushed when the dependency
1734  * is satisfied.
1735  */
1736 void
1737 hammer_flush_inode(hammer_inode_t ip, int flags)
1738 {
1739         hammer_mount_t hmp;
1740         hammer_flush_group_t flg;
1741         int good;
1742
1743         /*
1744          * fill_flush_group is the first flush group we may be able to
1745          * continue filling; it may be open or closed, but it will always
1746          * be past the currently flushing (running) flg.
1747          *
1748          * next_flush_group is the next open flush group.
1749          */
1750         hmp = ip->hmp;
1751         while ((flg = hmp->fill_flush_group) != NULL) {
1752                 KKASSERT(flg->running == 0);
1753                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
1754                     flg->total_count <= hammer_autoflush) {
1755                         break;
1756                 }
1757                 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
1758                 hammer_flusher_async(ip->hmp, flg);
1759         }
1760         if (flg == NULL) {
1761                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1762                 flg->seq = hmp->flusher.next++;
1763                 if (hmp->next_flush_group == NULL)
1764                         hmp->next_flush_group = flg;
1765                 if (hmp->fill_flush_group == NULL)
1766                         hmp->fill_flush_group = flg;
1767                 RB_INIT(&flg->flush_tree);
1768                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1769         }
1770
1771         /*
1772          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1773          * state we have to put it back into an IDLE state so we can
1774          * drop the extra ref.
1775          *
1776          * If we have a parent dependency we must still fall through
1777          * so we can run it.
1778          */
1779         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1780                 if (ip->flush_state == HAMMER_FST_SETUP &&
1781                     TAILQ_EMPTY(&ip->target_list)) {
1782                         ip->flush_state = HAMMER_FST_IDLE;
1783                         hammer_rel_inode(ip, 0);
1784                 }
1785                 if (ip->flush_state == HAMMER_FST_IDLE)
1786                         return;
1787         }
1788
1789         /*
1790          * Our flush action will depend on the current state.
1791          */
1792         switch(ip->flush_state) {
1793         case HAMMER_FST_IDLE:
1794                 /*
1795                  * We have no dependencies and can flush immediately.  Some of
1796                  * our children may not be flushable, so we have to re-test
1797                  * with that additional knowledge.
1798                  */
1799                 hammer_flush_inode_core(ip, flg, flags);
1800                 break;
1801         case HAMMER_FST_SETUP:
1802                 /*
1803                  * Recurse upwards through dependencies via target_list
1804                  * and start their flusher actions going if possible.
1805                  *
1806                  * 'good' is our connectivity.  -1 means we have none and
1807                  * can't flush, 0 means there weren't any dependencies, and
1808                  * 1 means we have good connectivity.
1809                  */
1810                 good = hammer_setup_parent_inodes(ip, 0, flg);
1811
1812                 if (good >= 0) {
1813                         /*
1814                          * We can continue if good >= 0.  Determine how
1815                          * many records under our inode can be flushed (and
1816                          * mark them).
1817                          */
1818                         hammer_flush_inode_core(ip, flg, flags);
1819                 } else {
1820                         /*
1821                          * Parent has no connectivity, tell it to flush
1822                          * us as soon as it does.
1823                          *
1824                          * The REFLUSH flag is also needed to trigger
1825                  * dependency wakeups.
1826                          */
1827                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1828                                      HAMMER_INODE_REFLUSH;
1829                         if (flags & HAMMER_FLUSH_SIGNAL) {
1830                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1831                                 hammer_flusher_async(ip->hmp, flg);
1832                         }
1833                 }
1834                 break;
1835         case HAMMER_FST_FLUSH:
1836                 /*
1837                  * We are already flushing, flag the inode to reflush
1838                  * if needed after it completes its current flush.
1839                  *
1840                  * The REFLUSH flag is also needed to trigger
1841                  * dependency wakeups.
1842                  */
1843                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1844                         ip->flags |= HAMMER_INODE_REFLUSH;
1845                 if (flags & HAMMER_FLUSH_SIGNAL) {
1846                         ip->flags |= HAMMER_INODE_RESIGNAL;
1847                         hammer_flusher_async(ip->hmp, flg);
1848                 }
1849                 break;
1850         }
1851 }
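
/*
 * Hedged sketch of the canonical front-end pairing: request a signaled
 * flush, then block until it completes.  This is approximately what an
 * fsync-style path would do (the exact vop wiring is not shown here):
 */
#if 0
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        hammer_wait_inode(ip);
#endif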
1852
1853 /*
1854  * Scan ip->target_list, which is a list of records owned by PARENTS of our
1855  * ip which reference our ip.
1856  *
1857  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1858  *     so for now do not ref/deref the structures.  Note that if we use the
1859  *     ref/rel code later, the rel CAN block.
1860  */
1861 static int
1862 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1863                            hammer_flush_group_t flg)
1864 {
1865         hammer_record_t depend;
1866         int good;
1867         int r;
1868
1869         /*
1870          * If we hit our recursion limit and we have parent dependencies
1871          * we cannot continue.  Returning < 0 will cause us to be flagged
1872          * for reflush.  Returning -2 cuts off additional dependency checks
1873          * because they are likely to also hit the depth limit.
1874          *
1875          * We cannot return < 0 if there are no dependencies or there might
1876          * not be anything to wakeup (ip).
1877          */
1878         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1879                 if (hammer_debug_general & 0x10000)
1880                         hkrateprintf(&hammer_gen_krate,
1881                             "Warning: depth limit reached on "
1882                             "setup recursion, inode %p %016llx\n",
1883                             ip, (long long)ip->obj_id);
1884                 return(-2);
1885         }
1886
1887         /*
1888          * Scan dependencies
1889          */
1890         good = 0;
1891         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1892                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1893                 KKASSERT(depend->target_ip == ip);
1894                 if (r < 0 && good == 0)
1895                         good = -1;
1896                 if (r > 0)
1897                         good = 1;
1898
1899                 /*
1900                  * If we failed due to the recursion depth limit then stop
1901                  * now.
1902                  */
1903                 if (r == -2)
1904                         break;
1905         }
1906         return(good);
1907 }
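
/*
 * The 'good' accumulation in the loop above reduces to a small merge
 * rule: any positive connectivity wins, and a failure is recorded only
 * while no connectivity has been seen yet.  A hypothetical restatement
 * (not in the source):
 */
#if 0
static __inline int
merge_connectivity(int good, int r)
{
        if (r > 0)
                return(1);
        if (r < 0 && good == 0)
                return(-1);
        return(good);
}
#endif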
1908
1909 /*
1910  * This helper function takes a record representing the dependency between
1911  * the parent inode and child inode.
1912  *
1913  * record               = record in question (*rec in below)
1914  * record->ip           = parent inode (*pip in below)
1915  * record->target_ip    = child inode (*ip in below)
1916  *
1917  * *pip--------------\
1918  *    ^               \rec_tree
1919  *     \               \
1920  *      \ip            /\\\\\ rbtree of recs from parent inode's view
1921  *       \            //\\\\\\
1922  *        \          / ........
1923  *         \        /
1924  *          \------*rec------target_ip------>*ip
1925  *               ...target_entry<----...----->target_list<---...
1926  *                                            list of recs from inode's view
1927  *
1928  * We are asked to recurse upwards and convert the record from SETUP
1929  * to FLUSH if possible.
1930  *
1931  * Return 1 if the record gives us connectivity
1932  *
1933  * Return 0 if the record is not relevant
1934  *
1935  * Return -1 if we can't resolve the dependency and there is no connectivity.
1936  */
1937 static int
1938 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1939                                   hammer_flush_group_t flg)
1940 {
1941         hammer_inode_t pip;
1942         int good;
1943
1944         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1945         pip = record->ip;
1946
1947         /*
1948          * If the record is already flushing, is it in our flush group?
1949          *
1950          * If it is in our flush group but it is a general record or a
1951          * delete-on-disk, it does not improve our connectivity (return 0),
1952          * and if the target inode is not trying to destroy itself we can't
1953          * allow the operation yet anyway (the second return -1).
1954          */
1955         if (record->flush_state == HAMMER_FST_FLUSH) {
1956                 /*
1957                  * If not in our flush group ask the parent to reflush
1958                  * us as soon as possible.
1959                  */
1960                 if (record->flush_group != flg) {
1961                         pip->flags |= HAMMER_INODE_REFLUSH;
1962                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1963                         return(-1);
1964                 }
1965
1966                 /*
1967                  * If in our flush group everything is already set up,
1968                  * just return whether the record will improve our
1969                  * visibility or not.
1970                  */
1971                 if (record->type == HAMMER_MEM_RECORD_ADD)
1972                         return(1);
1973                 return(0);
1974         }
1975
1976         /*
1977          * It must be a setup record.  Try to resolve the setup dependencies
1978          * by recursing upwards so we can place ip on the flush list.
1979          *
1980          * Limit ourselves to 20 levels of recursion to avoid blowing out
1981          * the kernel stack.  If we hit the recursion limit we can't flush
1982          * until the parent flushes.  The parent will flush independently
1983          * on its own and ultimately a deep recursion will be resolved.
1984          */
1985         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1986
1987         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1988
1989         /*
1990          * If good < 0 the parent has no connectivity and we cannot safely
1991          * flush the directory entry, which also means we can't flush our
1992          * ip.  Flag us for downward recursion once the parent's
1993          * connectivity is resolved.  Flag the parent for [re]flush or it
1994          * may not check for downward recursions.
1995          */
1996         if (good < 0) {
1997                 pip->flags |= HAMMER_INODE_REFLUSH;
1998                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1999                 return(good);
2000         }
2001
2002         /*
2003          * We are go, place the parent inode in a flushing state so we can
2004          * place its record in a flushing state.  Note that the parent
2005          * may already be flushing.  The record must be in the same flush
2006          * group as the parent.
2007          */
2008         if (pip->flush_state != HAMMER_FST_FLUSH)
2009                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
2010         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
2011
2012         /*
2013          * It is possible for a rename to create a loop in the recursion
2014          * and revisit a record.  This will result in the record being
2015          * placed in a flush state unexpectedly.  This check deals with
2016          * the case.
2017          */
2018         if (record->flush_state == HAMMER_FST_FLUSH) {
2019                 if (record->type == HAMMER_MEM_RECORD_ADD)
2020                         return(1);
2021                 return(0);
2022         }
2023
2024         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
2025
2026 #if 0
2027         if (record->type == HAMMER_MEM_RECORD_DEL &&
2028             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
2029                 /*
2030                  * Regardless of flushing state we cannot sync this path if the
2031                  * record represents a delete-on-disk but the target inode
2032                  * is not ready to sync its own deletion.
2033                  *
2034                  * XXX need to count effective nlinks to determine whether
2035                  * the flush is ok, otherwise removing a hardlink will
2036                  * just leave the DEL record to rot.
2037                  */
2038                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
2039                 return(-1);
2040         } else
2041 #endif
2042         if (pip->flush_group == flg) {
2043                 /*
2044                  * Because we have not calculated nlinks yet we can just
2045                  * set records to the flush state if the parent is in
2046                  * the same flush group as we are.
2047                  */
2048                 record->flush_state = HAMMER_FST_FLUSH;
2049                 record->flush_group = flg;
2050                 ++record->flush_group->refs;
2051                 hammer_ref(&record->lock);
2052
2053                 /*
2054                  * A general directory-add contributes to our visibility.
2055                  *
2056                  * Otherwise it is probably a directory-delete or
2057                  * delete-on-disk record and does not contribute to our
2058                  * visibility (but we can still flush it).
2059                  */
2060                 if (record->type == HAMMER_MEM_RECORD_ADD)
2061                         return(1);
2062                 return(0);
2063         } else {
2064                 /*
2065                  * If the parent is not in our flush group we cannot
2066                  * flush this record yet; there is no visibility.
2067                  * We tell the parent to reflush and mark ourselves
2068                  * so the parent knows it should flush us too.
2069                  */
2070                 pip->flags |= HAMMER_INODE_REFLUSH;
2071                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
2072                 return(-1);
2073         }
2074 }
2075
2076 /*
2077  * This is the core routine placing an inode into the FST_FLUSH state.
2078  */
2079 static void
2080 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
2081 {
2082         hammer_mount_t hmp = ip->hmp;
2083         int go_count;
2084
2085         /*
2086          * Set flush state and prevent the flusher from cycling into
2087          * the next flush group.  Do not place the ip on the list yet.
2088          * Inodes not in the idle state get an extra reference.
2089          */
2090         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
2091         if (ip->flush_state == HAMMER_FST_IDLE)
2092                 hammer_ref(&ip->lock);
2093         ip->flush_state = HAMMER_FST_FLUSH;
2094         ip->flush_group = flg;
2095         ++hmp->flusher.group_lock;
2096         ++hmp->count_iqueued;
2097         ++hammer_count_iqueued;
2098         ++flg->total_count;
2099         hammer_redo_fifo_start_flush(ip);
2100
2101 #if 0
2102         /*
2103          * We need to be able to vfsync/truncate from the backend.
2104          *
2105          * XXX Any truncation from the backend will acquire the vnode
2106          *     independently.
2107          */
2108         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
2109         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
2110                 ip->flags |= HAMMER_INODE_VHELD;
2111                 vref(ip->vp);
2112         }
2113 #endif
2114
2115         /*
2116          * Figure out how many in-memory records we can actually flush
2117          * (not including inode meta-data, buffers, etc).
2118          */
2119         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
2120         if (flags & HAMMER_FLUSH_RECURSION) {
2121                 /*
2122                  * If this is an upwards recursion we do not want to
2123                  * recurse down again!
2124                  */
2125                 go_count = 1;
2126 #if 0
2127         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2128                 /*
2129                  * No new records are added if we must complete a flush
2130                  * from a previous cycle, but we do have to move the records
2131                  * from the previous cycle to the current one.
2132                  */
2133 #if 0
2134                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2135                                    hammer_syncgrp_child_callback, NULL);
2136 #endif
2137                 go_count = 1;
2138 #endif
2139         } else {
2140                 /*
2141                  * Normal flush, scan records and bring them into the flush.
2142                  * Directory adds and deletes are usually skipped (they are
2143                  * grouped with the related inode rather than with the
2144                  * directory).
2145                  *
2146                  * go_count can be negative, which means the scan aborted
2147                  * due to the flush group being over-full and we should
2148                  * flush what we have.
2149                  */
2150                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2151                                    hammer_setup_child_callback, NULL);
2152         }
2153
2154         /*
2155          * This is a more involved test that includes go_count.  If we
2156          * can't flush, flag the inode and return.  If go_count is 0 we
2157          * are unable to flush any records in our rec_tree and
2158          * must ignore the XDIRTY flag.
2159          */
2160         if (go_count == 0) {
2161                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
2162                         --hmp->count_iqueued;
2163                         --hammer_count_iqueued;
2164
2165                         --flg->total_count;
2166                         ip->flush_state = HAMMER_FST_SETUP;
2167                         ip->flush_group = NULL;
2168                         if (flags & HAMMER_FLUSH_SIGNAL) {
2169                                 ip->flags |= HAMMER_INODE_REFLUSH |
2170                                              HAMMER_INODE_RESIGNAL;
2171                         } else {
2172                                 ip->flags |= HAMMER_INODE_REFLUSH;
2173                         }
2174 #if 0
2175                         if (ip->flags & HAMMER_INODE_VHELD) {
2176                                 ip->flags &= ~HAMMER_INODE_VHELD;
2177                                 vrele(ip->vp);
2178                         }
2179 #endif
2180
2181                         /*
2182                          * REFLUSH is needed to trigger dependency wakeups
2183                          * when an inode is in SETUP.
2184                          */
2185                         ip->flags |= HAMMER_INODE_REFLUSH;
2186                         if (--hmp->flusher.group_lock == 0)
2187                                 wakeup(&hmp->flusher.group_lock);
2188                         return;
2189                 }
2190         }
2191
2192         /*
2193          * Snapshot the state of the inode for the backend flusher.
2194          *
2195          * We continue to retain save_trunc_off even when all truncations
2196          * have been resolved as an optimization to determine if we can
2197          * skip the B-Tree lookup for overwrite deletions.
2198          *
2199          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2200          * and stays in ip->flags.  Once set, it stays set until the
2201          * inode is destroyed.
2202          */
2203         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2204                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2205                 ip->sync_trunc_off = ip->trunc_off;
2206                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2207                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2208                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2209
2210                 /*
2211                  * The save_trunc_off used to cache whether the B-Tree
2212                  * holds any records past that point is not used until
2213                  * after the truncation has succeeded, so we can safely
2214                  * set it now.
2215                  */
2216                 if (ip->save_trunc_off > ip->sync_trunc_off)
2217                         ip->save_trunc_off = ip->sync_trunc_off;
2218         }
2219         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2220                            ~HAMMER_INODE_TRUNCATED);
2221         ip->sync_ino_leaf = ip->ino_leaf;
2222         ip->sync_ino_data = ip->ino_data;
2223         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2224
2225         /*
2226          * The flusher list inherits our inode and reference.
2227          */
2228         KKASSERT(flg->running == 0);
2229         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2230         if (--hmp->flusher.group_lock == 0)
2231                 wakeup(&hmp->flusher.group_lock);
2232
2233         /*
2234          * Auto-flush the group if it grows too large.  Make sure the
2235          * inode reclaim wait pipeline continues to work.
2236          */
2237         if (flg->total_count >= hammer_autoflush ||
2238             flg->total_count >= hammer_limit_reclaims / 4) {
2239                 if (hmp->fill_flush_group == flg)
2240                         hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
2241                 hammer_flusher_async(hmp, flg);
2242         }
2243 }
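
/*
 * The auto-flush test above reads as a simple fullness predicate over
 * the flush group.  A hypothetical restatement (the helper name is not
 * in the source):
 */
#if 0
static __inline int
hammer_flg_is_full(hammer_flush_group_t flg)
{
        return (flg->total_count >= hammer_autoflush ||
                flg->total_count >= hammer_limit_reclaims / 4);
}
#endif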
2244
2245 /*
2246  * Callback for scan of ip->rec_tree.  Try to include each record in our
2247  * flush.  ip->flush_group has been set but the inode has not yet been
2248  * moved into a flushing state.
2249  *
2250  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2251  * both inodes.
2252  *
2253  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2254  * the caller from shortcutting the flush.
2255  */
2256 static int
2257 hammer_setup_child_callback(hammer_record_t rec, void *data)
2258 {
2259         hammer_flush_group_t flg;
2260         hammer_inode_t target_ip;
2261         hammer_inode_t ip;
2262         int r;
2263
2264         /*
2265          * Records deleted or committed by the backend are ignored.
2266          * Note that the flush detects deleted frontend records at
2267          * multiple points to deal with races.  This is just the first
2268          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2269          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2270          * messes up link-count calculations.
2271          *
2272          * NOTE: Don't get confused between record deletion and, say,
2273          * directory entry deletion.  The deletion of a directory entry
2274          * which is on-media has nothing to do with the record deletion
2275          * flags.
2276          */
2277         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2278                           HAMMER_RECF_COMMITTED)) {
2279                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2280                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2281                         r = 1;
2282                 } else {
2283                         r = 0;
2284                 }
2285                 return(r);
2286         }
2287
2288         /*
2289          * If the record is in an idle state it has no dependencies and
2290          * can be flushed.
2291          */
2292         ip = rec->ip;
2293         flg = ip->flush_group;
2294         r = 0;
2295
2296         switch(rec->flush_state) {
2297         case HAMMER_FST_IDLE:
2298                 /*
2299                  * The record has no setup dependency; we can flush it.
2300                  */
2301                 KKASSERT(rec->target_ip == NULL);
2302                 rec->flush_state = HAMMER_FST_FLUSH;
2303                 rec->flush_group = flg;
2304                 ++flg->refs;
2305                 hammer_ref(&rec->lock);
2306                 r = 1;
2307                 break;
2308         case HAMMER_FST_SETUP:
2309                 /*
2310                  * The record has a setup dependency.  These are typically
2311                  * directory entry adds and deletes.  Such entries will be
2312                  * flushed when their inodes are flushed so we do not
2313                  * usually have to add them to the flush here.  However,
2314                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2315                  * it is asking us to flush this record (and it).
2316                  */
2317                 target_ip = rec->target_ip;
2318                 KKASSERT(target_ip != NULL);
2319                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2320
2321                 /*
2322                  * If the target IP is already flushing in our group
2323                  * we could associate the record, but target_ip has
2324                  * already synced ino_data to sync_ino_data and we
2325                  * would also have to adjust nlinks.   Plus there are
2326                  * ordering issues for adds and deletes.
2327                  *
2328                  * Reflush downward if this is an ADD, and upward if
2329                  * this is a DEL.
2330                  */
2331                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2332                         if (rec->type == HAMMER_MEM_RECORD_ADD)
2333                                 ip->flags |= HAMMER_INODE_REFLUSH;
2334                         else
2335                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2336                         break;
2337                 }
2338
2339                 /*
2340                  * Target IP is not yet flushing.  This can get complex
2341                  * because we have to be careful about the recursion.
2342                  *
2343                  * Directories create an issue for us in that if a flush
2344                  * of a directory is requested the expectation is to flush
2345                  * any pending directory entries, but this will cause the
2346                  * related inodes to recursively flush as well.  We can't
2347                  * really defer the operation so just get as many as we
2348                  * can into this flush and pick up the rest on a reflush.
2349                  */
2350 #if 0
2351                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2352                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2353                         /*
2354                          * We aren't reclaiming and the target ip was not
2355                          * previously prevented from flushing due to this
2356                          * record dependency.  Do not flush this record.
2357                          */
2358                         /*r = 0;*/
2359                 } else
2360 #endif
2361                 if (flg->total_count + flg->refs >
2362                            ip->hmp->undo_rec_limit) {
2363                         /*
2364                          * Our flush group is over-full and we risk blowing
2365                          * out the UNDO FIFO.  Stop the scan, flush what we
2366                          * have, then reflush the directory.
2367                          *
2368                          * The directory may be forced through multiple
2369                          * flush groups before it can be completely
2370                          * flushed.
2371                          */
2372                         ip->flags |= HAMMER_INODE_RESIGNAL |
2373                                      HAMMER_INODE_REFLUSH;
2374                         r = -1;
2375                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2376                         /*
2377                          * If the target IP is not flushing we can force
2378                          * it to flush; even if it is unable to write out
2379                          * any of its own records, we have at least one in
2380                          * hand that we CAN deal with.
2381                          */
2382                         rec->flush_state = HAMMER_FST_FLUSH;
2383                         rec->flush_group = flg;
2384                         ++flg->refs;
2385                         hammer_ref(&rec->lock);
2386                         hammer_flush_inode_core(target_ip, flg,
2387                                                 HAMMER_FLUSH_RECURSION);
2388                         r = 1;
2389                 } else {
2390                         /*
2391                          * General or delete-on-disk record.
2392                          *
2393                          * XXX this needs help.  If a delete-on-disk we could
2394                          * disconnect the target.  If the target has its own
2395                          * dependencies they really need to be flushed.
2396                          *
2397                          * XXX
2398                          */
2399                         rec->flush_state = HAMMER_FST_FLUSH;
2400                         rec->flush_group = flg;
2401                         ++flg->refs;
2402                         hammer_ref(&rec->lock);
2403                         hammer_flush_inode_core(target_ip, flg,
2404                                                 HAMMER_FLUSH_RECURSION);
2405                         r = 1;
2406                 }
2407                 break;
2408         case HAMMER_FST_FLUSH:
2409                 /*
2410                  * The record could be part of a previous flush group if the
2411                  * inode is a directory (the record being a directory entry).
2412                  * Once the flush group was closed a hammer_test_inode()
2413                  * function can cause a new flush group to be setup, placing
2414                  * the directory inode itself in a new flush group.
2415                  *
2416                  * When associated with a previous flush group we count it
2417                  * as if it were in our current flush group, since it will
2418                  * effectively be flushed by the time we flush our current
2419                  * flush group.
2420                  */
2421                 KKASSERT(
2422                     rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
2423                     rec->flush_group == flg);
2424                 r = 1;
2425                 break;
2426         }
2427         return(r);
2428 }
2429
2430 #if 0
2431 /*
2432  * This version just moves records already in a flush state to the new
2433  * flush group and that is it.
2434  */
2435 static int
2436 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2437 {
2438         hammer_inode_t ip = rec->ip;
2439
2440         switch(rec->flush_state) {
2441         case HAMMER_FST_FLUSH:
2442                 KKASSERT(rec->flush_group == ip->flush_group);
2443                 break;
2444         default:
2445                 break;
2446         }
2447         return(0);
2448 }
2449 #endif
2450
2451 /*
2452  * Wait for a previously queued flush to complete.
2453  *
2454  * If a critical error occurred we don't try to wait.
2455  */
2456 void
2457 hammer_wait_inode(hammer_inode_t ip)
2458 {
2459         /*
2460          * The inode can be in a SETUP state in which case RESIGNAL
2461          * should be set.  If RESIGNAL is not set then the previous
2462          * flush completed and a later operation placed the inode
2463          * in a passive setup state again, so we're done.
2464          *
2465          * The inode can be in a FLUSH state in which case we
2466          * can just wait for completion.
2467          */
2468         while (ip->flush_state == HAMMER_FST_FLUSH ||
2469             (ip->flush_state == HAMMER_FST_SETUP &&
2470              (ip->flags & HAMMER_INODE_RESIGNAL))) {
2471                 /*
2472                  * Don't try to flush on a critical error
2473                  */
2474                 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
2475                         break;
2476
2477                 /*
2478                  * If the inode was already being flushed its flg
2479                  * may not have been queued to the backend.  We
2480                  * have to make sure it gets queued or we can wind
2481                  * up blocked or deadlocked (particularly if we are
2482                  * the vnlru thread).
2483                  */
2484                 if (ip->flush_state == HAMMER_FST_FLUSH) {
2485                         KKASSERT(ip->flush_group);
2486                         if (ip->flush_group->closed == 0) {
2487                                 if (hammer_debug_inode) {
2488                                         hkprintf("debug: forcing "
2489                                                 "async flush ip %016jx\n",
2490                                                 (intmax_t)ip->obj_id);
2491                                 }
2492                                 hammer_flusher_async(ip->hmp, ip->flush_group);
2493                                 continue; /* retest */
2494                         }
2495                 }
2496
2497                 /*
2498                  * In a flush state with the flg queued to the backend
2499                  * or in a setup state with RESIGNAL set, we can safely
2500                  * wait.
2501                  */
2502                 ip->flags |= HAMMER_INODE_FLUSHW;
2503                 tsleep(&ip->flags, 0, "hmrwin", 0);
2504         }
2505
2506 #if 0
2507         /*
2508          * The inode may have been in a passive setup state,
2509          * call flush to make sure we get signaled.
2510          */
2511         if (ip->flush_state == HAMMER_FST_SETUP)
2512                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2513 #endif
2514
2515 }
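
/*
 * The sleep/wakeup interlock used above, condensed: the waiter flags the
 * inode and sleeps on &ip->flags, while hammer_flush_inode_done() below
 * clears the flag and wakes the channel.  Both halves side by side:
 */
#if 0
        /* waiter (this function) */
        ip->flags |= HAMMER_INODE_FLUSHW;
        tsleep(&ip->flags, 0, "hmrwin", 0);

        /* waker (backend flush completion) */
        if (ip->flags & HAMMER_INODE_FLUSHW) {
                ip->flags &= ~HAMMER_INODE_FLUSHW;
                wakeup(&ip->flags);
        }
#endif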
2516
2517 /*
2518  * Called by the backend code when a flush has been completed.
2519  * The inode has already been removed from the flush list.
2520  *
2521  * A pipelined flush can occur, in which case we must re-enter the
2522  * inode on the list and re-copy its fields.
2523  */
2524 void
2525 hammer_flush_inode_done(hammer_inode_t ip, int error)
2526 {
2527         hammer_mount_t hmp;
2528         int dorel;
2529
2530         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2531
2532         hmp = ip->hmp;
2533
2534         /*
2535          * Auto-reflush if the backend could not completely flush
2536          * the inode.  This fixes a case where a deferred buffer flush
2537          * could cause fsync to return early.
2538          */
2539         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2540                 ip->flags |= HAMMER_INODE_REFLUSH;
2541
2542         /*
2543          * Merge left-over flags back into the frontend and fix the state.
2544          * Incomplete truncations are retained by the backend.
2545          */
2546         ip->error = error;
2547         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2548         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2549
2550         /*
2551          * The backend may have adjusted nlinks, so if the adjusted nlinks
2552  * does not match the frontend, set the frontend's DDIRTY flag again.
2553          */
2554         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2555                 ip->flags |= HAMMER_INODE_DDIRTY;
2556
2557         /*
2558          * Fix up the dirty buffer status.
2559          */
2560         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2561                 ip->flags |= HAMMER_INODE_BUFS;
2562         }
2563         hammer_redo_fifo_end_flush(ip);
2564
2565         /*
2566          * Re-set the XDIRTY flag if some of the inode's in-memory records
2567          * could not be flushed.
2568          */
2569         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2570                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2571                  (!RB_EMPTY(&ip->rec_tree) &&
2572                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2573
2574         /*
2575          * Do not lose track of inodes which no longer have vnode
2576          * associations, otherwise they may never get flushed again.
2577          *
2578          * The reflush flag can be set superfluously, causing extra pain
2579          * for no reason.  If the inode is no longer modified it no longer
2580          * needs to be flushed.
2581          */
2582         if (ip->flags & HAMMER_INODE_MODMASK) {
2583                 if (ip->vp == NULL)
2584                         ip->flags |= HAMMER_INODE_REFLUSH;
2585         } else {
2586                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2587         }
2588
2589         /*
2590          * The fs token is held but the inode lock is not held.  Because this
2591          * is a backend flush it is possible that the vnode has no references
2592          * and a reclaim race can occur inside vsetisdirty() if/when it blocks.
2593          *
2594          * Therefore, we must lock the inode around this particular dirtying
2595          * operation.  We don't have to do so around other dirtying operations
2596          * where the vnode is implicitly or explicitly held.
2597          */
2598         if (ip->flags & HAMMER_INODE_MODMASK) {
2599                 hammer_lock_ex(&ip->lock);
2600                 hammer_inode_dirty(ip);
2601                 hammer_unlock(&ip->lock);
2602         }
2603
2604         /*
2605          * Adjust the flush state.
2606          */
2607         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2608                 /*
2609                  * We were unable to flush out all our records, leave the
2610                  * inode in a flush state and in the current flush group.
2611                  * The flush group will be re-run.
2612                  *
2613                  * This occurs if the UNDO block gets too full or there is
2614                  * too much dirty meta-data and allows the flusher to
2615                  * finalize the UNDO block and then re-flush.
2616                  */
2617                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2618                 dorel = 0;
2619         } else {
2620                 /*
2621                  * Remove from the flush_group
2622                  */
2623                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2624                 ip->flush_group = NULL;
2625
2626 #if 0
2627                 /*
2628                  * Clean up the vnode ref and tracking counts.
2629                  */
2630                 if (ip->flags & HAMMER_INODE_VHELD) {
2631                         ip->flags &= ~HAMMER_INODE_VHELD;
2632                         vrele(ip->vp);
2633                 }
2634 #endif
2635                 --hmp->count_iqueued;
2636                 --hammer_count_iqueued;
2637
2638                 /*
2639                  * And adjust the state.
2640                  */
2641                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2642                         ip->flush_state = HAMMER_FST_IDLE;
2643                         dorel = 1;
2644                 } else {
2645                         ip->flush_state = HAMMER_FST_SETUP;
2646                         dorel = 0;
2647                 }
2648
2649                 /*
2650                  * If the frontend is waiting for a flush to complete,
2651                  * wake it up.
2652                  */
2653                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2654                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2655                         wakeup(&ip->flags);
2656                 }
2657
2658                 /*
2659                  * If the frontend made more changes and requested another
2660                  * flush, then try to get it running.
2661                  *
2662                  * Reflushes are aborted when the inode is errored out.
2663                  */
2664                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2665                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2666                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2667                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2668                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2669                         } else {
2670                                 hammer_flush_inode(ip, 0);
2671                         }
2672                 }
2673         }
2674
2675         /*
2676          * If we have no parent dependencies we can clear CONN_DOWN
2677          */
2678         if (TAILQ_EMPTY(&ip->target_list))
2679                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2680
2681         /*
2682          * If the inode is now clean drop the space reservation.
2683          */
2684         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2685             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2686                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2687                 --hmp->rsv_inodes;
2688         }
2689
2690         ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
2691
2692         if (dorel)
2693                 hammer_rel_inode(ip, 0);
2694 }
2695
2696 /*
2697  * Called from hammer_sync_inode() to synchronize in-memory records
2698  * to the media.
2699  */
2700 static int
2701 hammer_sync_record_callback(hammer_record_t record, void *data)
2702 {
2703         hammer_cursor_t cursor = data;
2704         hammer_transaction_t trans = cursor->trans;
2705         hammer_mount_t hmp = trans->hmp;
2706         int error;
2707
2708         /*
2709          * Skip records that do not belong to the current flush.
2710          */
2711         ++hammer_stats_record_iterations;
2712         if (record->flush_state != HAMMER_FST_FLUSH)
2713                 return(0);
2714
2715         if (record->flush_group != record->ip->flush_group) {
2716                 hdkprintf("rec %p ip %p bad flush group %p %p\n",
2717                         record,
2718                         record->ip,
2719                         record->flush_group,
2720                         record->ip->flush_group);
2721                 if (hammer_debug_critical)
2722                         Debugger("blah2");
2723                 return(0);
2724         }
2725         KKASSERT(record->flush_group == record->ip->flush_group);
2726
2727         /*
2728          * Interlock the record using the BE flag.  Once BE is set the
2729          * frontend cannot change the state of FE.
2730          *
2731          * NOTE: If FE is set prior to us setting BE we still sync the
2732          * record out, but the flush completion code converts it to
2733          * a delete-on-disk record instead of destroying it.
2734          */
2735         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2736         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2737
2738         /*
2739          * The backend has already disposed of the record.
2740          */
2741         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2742                 error = 0;
2743                 goto done;
2744         }
2745
2746         /*
2747          * If the whole inode is being deleted and all on-disk records will
2748          * be deleted very soon, we can't sync any new records to disk
2749          * because they will be deleted in the same transaction they were
2750          * created in (delete_tid == create_tid), which will assert.
2751          *
2752          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2753          * that we currently panic on.
2754          */
2755         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2756                 switch(record->type) {
2757                 case HAMMER_MEM_RECORD_DATA:
2758                         /*
2759                          * We don't have to do anything; if the record was
2760                          * committed the space will have been accounted for
2761                          * in the blockmap.
2762                          */
2763                         /* fall through */
2764                 case HAMMER_MEM_RECORD_GENERAL:
2765                         /*
2766                          * Set deleted-by-backend flag.  Do not set the
2767                          * backend committed flag, because we are throwing
2768                          * the record away.
2769                          */
2770                         record->flags |= HAMMER_RECF_DELETED_BE;
2771                         ++record->ip->rec_generation;
2772                         error = 0;
2773                         goto done;
2774                 case HAMMER_MEM_RECORD_ADD:
2775                         hpanic("illegal add during inode deletion record %p",
2776                                 record);
2777                         break; /* NOT REACHED */
2778                 case HAMMER_MEM_RECORD_INODE:
2779                         hpanic("attempt to sync inode record %p?", record);
2780                         break; /* NOT REACHED */
2781                 case HAMMER_MEM_RECORD_DEL:
2782                         /*
2783                          * Follow through and issue the on-disk deletion
2784                          */
2785                         break;
2786                 }
2787         }
2788
2789         /*
2790          * If DELETED_FE is set special handling is needed for directory
2791          * entries.  Dependent pieces related to the directory entry may
2792          * have already been synced to disk.  If this occurs we have to
2793          * sync the directory entry and then change the in-memory record
2794          * from an ADD to a DELETE to cover the fact that it's been
2795          * deleted by the frontend.
2796          *
2797          * A directory delete covering record (MEM_RECORD_DEL) can never
2798          * be deleted by the frontend.
2799          *
2800          * Any other record type (aka DATA) can be deleted by the frontend.
2801          * XXX At the moment the flusher must skip it because there may
2802          * be another data record in the flush group for the same block,
2803          * meaning that some frontend data changes can leak into the backend's
2804          * synchronization point.
2805          */
2806         if (record->flags & HAMMER_RECF_DELETED_FE) {
2807                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2808                         /*
2809                          * Convert a front-end deleted directory-add to
2810                          * a directory-delete entry later.
2811                          */
2812                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2813                 } else {
2814                         /*
2815                          * Dispose of the record (race case).  Mark as
2816                          * deleted by backend (and not committed).
2817                          */
2818                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2819                         record->flags |= HAMMER_RECF_DELETED_BE;
2820                         ++record->ip->rec_generation;
2821                         error = 0;
2822                         goto done;
2823                 }
2824         }
2825
2826         /*
2827          * Assign the create_tid for new records.  Deletions already
2828          * have the record's entire key properly set up.
2829          */
2830         if (record->type != HAMMER_MEM_RECORD_DEL) {
2831                 record->leaf.base.create_tid = trans->tid;
2832                 record->leaf.create_ts = trans->time32;
2833         }
2834
2835         /*
2836          * This actually moves the record to the on-media B-Tree.  We
2837          * must also generate REDO_TERM entries in the UNDO/REDO FIFO
2838          * indicating that the related REDO_WRITE(s) have been committed.
2839          *
2840          * During recovery any REDO_TERMs within the nominal recovery span
2841          * are ignored since the related meta-data is being undone, causing
2842          * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
2843          * the nominal recovery span will match against REDO_WRITEs and
2844          * prevent them from being executed (because the meta-data has
2845          * already been synchronized).
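               *
               * NOTE: leaf.base.key for a DATA record is the file offset of
               * the end of the data, so key - data_len below is the starting
               * offset covered by the REDO_TERM_WRITE.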
2846          */
2847         if (record->flags & HAMMER_RECF_REDO) {
2848                 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
2849                 hammer_generate_redo(trans, record->ip,
2850                                      record->leaf.base.key -
2851                                          record->leaf.data_len,
2852                                      HAMMER_REDO_TERM_WRITE,
2853                                      NULL,
2854                                      record->leaf.data_len);
2855         }
2856
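              /*
               * Push the record out to the media B-Tree.  An EDEADLK from
               * the cursor code requires tearing the cursor down and
               * re-initializing it before the operation can be retried.
               */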
2857         for (;;) {
2858                 error = hammer_ip_sync_record_cursor(cursor, record);
2859                 if (error != EDEADLK)
2860                         break;
2861                 hammer_done_cursor(cursor);
2862                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2863                                            record->ip);
2864                 if (error)
2865                         break;
2866         }
2867         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2868
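              /*
               * Return errors negated; a negative callback return aborts
               * the RB_SCAN and hammer_sync_inode() flips the sign back
               * to a positive errno.
               */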
2869         if (error)
2870                 error = -error;
2871 done:
2872         hammer_flush_record_done(record, error);
2873
2874         /*
2875          * Do partial finalization if we have built up too many dirty
2876          * buffers.  Otherwise a buffer cache deadlock can occur when
2877          * doing things like creating tens of thousands of tiny files.
2878          *
2879          * We must release our cursor lock to avoid a 3-way deadlock
2880          * due to the exclusive sync lock the finalizer must get.
2881          *
2882          * WARNING: See warnings in hammer_unlock_cursor() function.
2883          */
2884         if (hammer_flusher_meta_limit(hmp) ||
2885             vm_page_count_severe()) {
2886                 hammer_unlock_cursor(cursor);
2887                 hammer_flusher_finalize(trans, 0);
2888                 hammer_lock_cursor(cursor);
2889         }
2890         return(error);
2891 }
2892
2893 /*
2894  * Backend function called by the flusher to sync an inode to media.
2895  */
2896 int
2897 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2898 {
2899         struct hammer_cursor cursor;
2900         hammer_node_t tmp_node;
2901         hammer_record_t depend;
2902         hammer_record_t next;
2903         int error, tmp_error;
2904         uint64_t nlinks;
2905
2906         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2907                 return(0);
2908
2909         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2910         if (error)
2911                 goto done;
2912
2913         /*
2914          * Any directory records referencing this inode which are not in
2915          * our current flush group force us to adjust our nlinks count for
2916          * the purposes of synchronizing to disk.
2917          *
2918          * Records which are in our flush group can be unlinked from our
2919          * inode now, potentially allowing the inode to be physically
2920          * deleted.
2921          *
2922          * This cannot block.
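               *
               * For example, an ADD sitting in a later flush group has
               * already been counted in the frontend's ino_data.nlinks and
               * is subtracted back out of the nlinks value synced here.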
2923          */
2924         nlinks = ip->ino_data.nlinks;
2925         next = TAILQ_FIRST(&ip->target_list);
2926         while ((depend = next) != NULL) {
2927                 next = TAILQ_NEXT(depend, target_entry);
2928                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2929                     depend->flush_group == ip->flush_group) {
2930                         /*
2931                          * If this is an ADD that was deleted by the frontend
2932                          * the frontend nlinks count will have already been
2933                          * decremented, but the backend is going to sync its
2934                          * directory entry and must account for it.  The
2935                          * record will be converted to a delete-on-disk when
2936                          * it gets synced.
2937                          *
2938                          * If the ADD was not deleted by the frontend we
2939                          * can remove the dependency from our target_list.
2940                          */
2941                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2942                                 ++nlinks;
2943                         } else {
2944                                 TAILQ_REMOVE(&ip->target_list, depend,
2945                                              target_entry);
2946                                 depend->target_ip = NULL;
2947                         }
2948                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2949                         /*
2950                          * Not part of our flush group and not deleted by
2951                          * the front-end, adjust the link count synced to
2952                          * the media (undo what the frontend did when it
2953                          * queued the record).
2954                          */
2955                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2956                         switch(depend->type) {
2957                         case HAMMER_MEM_RECORD_ADD:
2958                                 --nlinks;
2959                                 break;
2960                         case HAMMER_MEM_RECORD_DEL:
2961                                 ++nlinks;
2962                                 break;
2963                         default:
2964                                 break;
2965                         }
2966                 }
2967         }
2968
2969         /*
2970          * Set dirty if we had to modify the link count.
2971          */
2972         if (ip->sync_ino_data.nlinks != nlinks) {
2973                 KKASSERT((int64_t)nlinks >= 0);
2974                 ip->sync_ino_data.nlinks = nlinks;
2975                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2976         }
2977
2978         /*
2979          * If there is a truncation queued, destroy any data past the (aligned)
2980          * truncation point.  Userland will have dealt with the buffer
2981          * containing the truncation point for us.
2982          *
2983          * We don't flush pending frontend data buffers until after we've
2984          * dealt with the truncation.
2985          */
2986         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2987                 /*
2988                  * Interlock trunc_off.  The VOP front-end may continue to
2989                  * make adjustments to it while we are blocked.
2990                  */
2991                 off_t trunc_off;
2992                 off_t aligned_trunc_off;
2993                 int blkmask;
2994
2995                 trunc_off = ip->sync_trunc_off;
2996                 blkmask = hammer_blocksize(trunc_off) - 1;
2997                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
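                      /*
                       * e.g. with a 16K block, blkmask is 0x3fff and a
                       * trunc_off of 0x4100 rounds up to an
                       * aligned_trunc_off of 0x8000.
                       */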
2998
2999                 /*
3000                  * Delete any whole blocks on-media.  The front-end has
3001                  * already cleaned out any partial block and made it
3002                  * pending.  The front-end may have updated trunc_off
3003                  * while we were blocked so we only use sync_trunc_off.
3004                  *
3005                  * This operation can blow out the buffer cache; EWOULDBLOCK
3006                  * means we were unable to complete the deletion.  The
3007                  * deletion will update sync_trunc_off in that case.
3008                  */
3009                 error = hammer_ip_delete_range(&cursor, ip,
3010                                                 aligned_trunc_off,
3011                                                 0x7FFFFFFFFFFFFFFFLL, 2);
3012                 if (error == EWOULDBLOCK) {
3013                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
3014                         error = 0;
3015                         goto defer_buffer_flush;
3016                 }
3017
3018                 if (error)
3019                         goto done;
3020
3021                 /*
3022                  * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
3023                  *
3024                  * XXX we do this even if we did not previously generate
3025                  * a REDO_TRUNC record.  This operation may enclose the
3026                  * range for multiple prior truncation entries in the REDO
3027                  * log.
3028                  */
3029                 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
3030                     (ip->flags & HAMMER_INODE_RDIRTY)) {
3031                         hammer_generate_redo(trans, ip, aligned_trunc_off,
3032                                              HAMMER_REDO_TERM_TRUNC,
3033                                              NULL, 0);
3034                 }
3035
3036                 /*
3037                  * Clear the truncation flag on the backend after we have
3038                  * completed the deletions.  Backend data is now good again
3039                  * (including new records we are about to sync, below).
3040                  *
3041                  * Leave sync_trunc_off intact.  As we write additional
3042                  * records the backend will update sync_trunc_off.  This
3043                  * tells the backend whether it can skip the overwrite
3044                  * test.  This should work properly even when the backend
3045                  * writes full blocks where the truncation point straddles
3046                  * the block because the comparison is against the base
3047                  * offset of the record.
3048                  */
3049                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3050                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
3051         } else {
3052                 error = 0;
3053         }
3054
3055         /*
3056          * Now sync related records.  These will typically be directory
3057          * entries, records tracking direct-writes, or delete-on-disk records.
3058          */
3059         if (error == 0) {
3060                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
3061                                     hammer_sync_record_callback, &cursor);
3062                 if (tmp_error < 0)
3063                         tmp_error = -tmp_error;
3064                 if (tmp_error)
3065                         error = tmp_error;
3066         }
3067         hammer_cache_node(&ip->cache[1], cursor.node);
3068
3069         /*
3070          * Re-seek for inode update, assuming our cache hasn't been ripped
3071          * out from under us.
3072          */
3073         if (error == 0) {
3074                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
3075                 if (tmp_node) {
3076                         hammer_cursor_downgrade(&cursor);
3077                         hammer_lock_sh(&tmp_node->lock);
3078                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
3079                                 hammer_cursor_seek(&cursor, tmp_node, 0);
3080                         hammer_unlock(&tmp_node->lock);
3081                         hammer_rel_node(tmp_node);
3082                 }
3083                 error = 0;
3084         }
3085
3086         /*
3087          * If we are deleting the inode the frontend had better not have
3088          * any active references on elements making up the inode.
3089          *
3090          * The call to hammer_ip_delete_clean() cleans up auxiliary records
3091          * but not DB or DATA records.  Those must have already been deleted
3092          * by the normal truncation mechanic.
3093          */
3094         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
3095             RB_EMPTY(&ip->rec_tree) &&
3096             (ip->sync_flags & HAMMER_INODE_DELETING) &&
3097             (ip->flags & HAMMER_INODE_DELETED) == 0) {
3098                 int count1 = 0;
3099
3100                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
3101                 if (error == 0) {
3102                         ip->flags |= HAMMER_INODE_DELETED;
3103                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
3104                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3105                         KKASSERT(RB_EMPTY(&ip->rec_tree));
3106
3107                         /*
3108                          * Set delete_tid in both the frontend and backend
3109                          * copy of the inode record.  The DELETED flag handles
3110                          * this, do not set DDIRTY.
3111                          * this; do not set DDIRTY.
3112                         ip->ino_leaf.base.delete_tid = trans->tid;
3113                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
3114                         ip->ino_leaf.delete_ts = trans->time32;
3115                         ip->sync_ino_leaf.delete_ts = trans->time32;
3116
3118                         /*
3119                          * Adjust the inode count in the volume header
3120                          */
3121                         hammer_sync_lock_sh(trans);
3122                         if (ip->flags & HAMMER_INODE_ONDISK) {
3123                                 hammer_modify_volume_field(trans,
3124                                                            trans->rootvol,
3125                                                            vol0_stat_inodes);
3126                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
3127                                 hammer_modify_volume_done(trans->rootvol);
3128                         }
3129                         hammer_sync_unlock(trans);
3130                 }
3131         }
3132
3133         if (error)
3134                 goto done;
3135         ip->sync_flags &= ~HAMMER_INODE_BUFS;
3136
3137 defer_buffer_flush:
3138         /*
3139          * Now update the inode's on-disk inode-data and/or on-disk record.
3140          * DELETED and ONDISK are managed only in ip->flags.
3141          *
3142          * In the case of a deferred buffer flush we still update the on-disk
3143          * inode to satisfy visibility requirements if there happen to be
3144          * directory dependencies.
3145          */
3146         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
3147         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
3148                 /*
3149                  * If deleted and on-disk, don't set any additional flags.
3150                  * The delete flag takes care of things.
3151                  *
3152                  * Clear flags which may have been set by the frontend.
3153                  */
3154                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3155                                     HAMMER_INODE_SDIRTY |
3156                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3157                                     HAMMER_INODE_DELETING);
3158                 break;
3159         case HAMMER_INODE_DELETED:
3160                 /*
3161                  * Take care of the case where a deleted inode was never
3162                  * flushed to the disk in the first place.
3163                  *
3164                  * Clear flags which may have been set by the frontend.
3165                  */
3166                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3167                                     HAMMER_INODE_SDIRTY |
3168                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3169                                     HAMMER_INODE_DELETING);
3170                 while (RB_ROOT(&ip->rec_tree)) {
3171                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
3172                         hammer_ref(&record->lock);
3173                         KKASSERT(hammer_oneref(&record->lock));
3174                         record->flags |= HAMMER_RECF_DELETED_BE;
3175                         ++record->ip->rec_generation;
3176                         hammer_rel_mem_record(record);
3177                 }
3178                 break;
3179         case HAMMER_INODE_ONDISK:
3180                 /*
3181                  * If already on-disk, do not set any additional flags.
3182                  */
3183                 break;
3184         default:
3185                 /*
3186                  * If not on-disk and not deleted, set DDIRTY to force
3187                  * an initial record to be written.
3188                  *
3189                  * Also set the create_tid in both the frontend and backend
3190                  * copy of the inode record.
3191                  */
3192                 ip->ino_leaf.base.create_tid = trans->tid;
3193                 ip->ino_leaf.create_ts = trans->time32;
3194                 ip->sync_ino_leaf.base.create_tid = trans->tid;
3195                 ip->sync_ino_leaf.create_ts = trans->time32;
3196                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
3197                 break;
3198         }
3199
3200         /*
3201          * If DDIRTY or SDIRTY is set, write out a new record.
3202          * If the inode is already on-disk the old record is marked as
3203          * deleted.
3204          *
3205          * If DELETED is set hammer_update_inode() will delete the existing
3206          * record without writing out a new one.
3207          */
3208         if (ip->flags & HAMMER_INODE_DELETED) {
3209                 error = hammer_update_inode(&cursor, ip);
3210         } else
3211         if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
3212             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
3213                 error = hammer_update_itimes(&cursor, ip);
3214         } else
3215         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
3216                               HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
3217                 error = hammer_update_inode(&cursor, ip);
3218         }
3219 done:
3220         if (ip->flags & HAMMER_INODE_MODMASK)
3221                 hammer_inode_dirty(ip);
3222         if (error) {
3223                 hammer_critical_error(ip->hmp, ip, error,
3224                                       "while syncing inode");
3225         }
3226         hammer_done_cursor(&cursor);
3227         return(error);
3228 }
3229
3230 /*
3231  * This routine is called when the OS is no longer actively referencing
3232  * the inode (but might still be keeping it cached), or when releasing
3233  * the last reference to an inode.
3234  *
3235  * At this point if the inode's nlinks count is zero we want to destroy
3236  * it, which may mean destroying it on-media too.
3237  */
3238 void
3239 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
3240 {
3241         struct vnode *vp;
3242
3243         /*
3244          * Set the DELETING flag when the link count drops to 0 and the
3245          * OS no longer has any opens on the inode.
3246          *
3247          * The backend will clear DELETING (a mod flag) and set DELETED
3248          * (a state flag) when it is actually able to perform the
3249          * operation.
3250          *
3251          * Don't reflag the deletion if the flusher is currently syncing
3252          * one that was already flagged.  A previously set DELETING flag
3253          * may bounce around flags and sync_flags until the operation is
3254          * completely done.
3255          *
3256          * Do not attempt to modify a snapshot inode (one set to read-only).
3257          */
3258         if (ip->ino_data.nlinks == 0 &&
3259             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
3260                 ip->flags |= HAMMER_INODE_DELETING;
3261                 ip->flags |= HAMMER_INODE_TRUNCATED;
3262                 ip->trunc_off = 0;
3263                 vp = NULL;
3264                 if (getvp) {
3265                         if (hammer_get_vnode(ip, &vp) != 0)
3266                                 return;
3267                 }
3268
3269                 /*
3270                  * Final cleanup
3271                  */
3272                 if (ip->vp)
3273                         nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0);
3274                 if (ip->flags & HAMMER_INODE_MODMASK)
3275                         hammer_inode_dirty(ip);
3276                 if (getvp)
3277                         vput(vp);
3278         }
3279 }
3280
3281 /*
3282  * After potentially resolving a dependency the inode is tested
3283  * to determine whether it needs to be reflushed.
3284  */
3285 void
3286 hammer_test_inode(hammer_inode_t ip)
3287 {
3288         if (ip->flags & HAMMER_INODE_REFLUSH) {
3289                 ip->flags &= ~HAMMER_INODE_REFLUSH;
3290                 hammer_ref(&ip->lock);
3291                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
3292                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
3293                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
3294                 } else {
3295                         hammer_flush_inode(ip, 0);
3296                 }
3297                 hammer_rel_inode(ip, 0);
3298         }
3299 }
3300
3301 /*
3302  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
3303  * reassociated with a vp or just before it gets freed.
3304  *
3305  * Pipeline wakeups to threads blocked due to an excessive number of
3306  * detached inodes.  This typically occurs when atime updates accumulate
3307  * while scanning a directory tree.
3308  */
3309 static void
3310 hammer_inode_wakereclaims(hammer_inode_t ip)
3311 {
3312         struct hammer_reclaim *reclaim;
3313         hammer_mount_t hmp = ip->hmp;
3314
3315         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3316                 return;
3317
3318         --hammer_count_reclaims;
3319         --hmp->count_reclaims;
3320         ip->flags &= ~HAMMER_INODE_RECLAIM;
3321
3322         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3323                 KKASSERT(reclaim->count > 0);
3324                 if (--reclaim->count == 0) {
3325                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3326                         wakeup(reclaim);
3327                 }
3328         }
3329 }
3330
3331 /*
3332  * Set up our reclaim pipeline.  We only let so many detached (and dirty)
3333  * inodes build up before we start blocking.  This routine is called
3334  * if a new inode is created or an inode is loaded from media.
3335  *
3336  * When we block we don't care *which* inode has finished reclaiming,
3337  * as long as one does.
3338  *
3339  * The reclaim pipeline is primarily governed by the auto-flush, which
3340  * triggers at 1/4 hammer_limit_reclaims.  We don't want to block while
3341  * the count is less than 1/2 hammer_limit_reclaims; from 1/2 up to the
3342  * full count the blocking threshold is dynamically governed.
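      *
      * For illustration only: if hammer_limit_reclaims were 4000, the
      * auto-flush would kick in around 1000 reclaiming inodes and this
      * routine would begin blocking somewhere between 2000 and 4000,
      * depending on the calling process's recent inode load.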
3343  */
3344 void
3345 hammer_inode_waitreclaims(hammer_transaction_t trans)
3346 {
3347         hammer_mount_t hmp = trans->hmp;
3348         struct hammer_reclaim reclaim;
3349         int lower_limit;
3350
3351         /*
3352          * Track inode load; delay if the number of reclaiming inodes is
3353          * between 2/4 and 4/4 hammer_limit_reclaims, depending on per-pid load.
3354          */
3355         if (curthread->td_proc) {
3356                 struct hammer_inostats *stats;
3357
3358                 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
3359                 ++stats->count;
3360
3361                 if (stats->count > hammer_limit_reclaims / 2)
3362                         stats->count = hammer_limit_reclaims / 2;
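                     /*
                      * stats->count is capped at half the limit above, so
                      * lower_limit ranges from 1/2 hammer_limit_reclaims
                      * (heavy recent per-pid load) up to the full limit.
                      */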
3363                 lower_limit = hammer_limit_reclaims - stats->count;
3364                 if (hammer_debug_general & 0x10000) {
3365                         hdkprintf("pid %5d limit %d\n",
3366                                 (int)curthread->td_proc->p_pid, lower_limit);
3367                 }
3368         } else {
3369                 lower_limit = hammer_limit_reclaims * 3 / 4;
3370         }
3371         if (hmp->count_reclaims >= lower_limit) {
3372                 reclaim.count = 1;
3373                 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3374                 tsleep(&reclaim, 0, "hmrrcm", hz);
3375                 if (reclaim.count > 0)
3376                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3377         }
3378 }
3379
3380 /*
3381  * Keep track of reclaim statistics on a per-pid basis using a loose
3382  * 4-way set associative hash table.  Collisions inherit the count of
3383  * the previous entry.
3384  *
3385  * NOTE: We want to be careful here to limit the chain size.  If the chain
3386  *       size is too large a pid will spread its stats out over too many
3387  *       entries under certain types of heavy filesystem activity and
3388  *       wind up not delaying long enough.
3389  */
3390 static
3391 struct hammer_inostats *
3392 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
3393 {
3394         struct hammer_inostats *stats;
3395         int delta;
3396         int chain;
3397         static volatile int iterator;   /* we don't care about MP races */
3398
3399         /*
3400          * Chain up to 4 times to find our entry.
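               * (a 4-way set-associative probe: slots (pid + 0..3) &
               * HAMMER_INOSTATS_HMASK)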
3401          */
3402         for (chain = 0; chain < 4; ++chain) {
3403                 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
3404                 if (stats->pid == pid)
3405                         break;
3406         }
3407
3408         /*
3409          * Replace one of the four chaining entries with our new entry.
3410          */
3411         if (chain == 4) {
3412                 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
3413                                        HAMMER_INOSTATS_HMASK];
3414                 stats->pid = pid;
3415         }
3416
3417         /*
3418          * Decay the entry
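               *
               * The decay is roughly exponential in elapsed time: e.g. a
               * delta of one second (hz ticks) halves the count, and an
               * entry idle for more than a minute is zeroed outright.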
3419          */
3420         if (stats->count && stats->ltick != ticks) {
3421                 delta = ticks - stats->ltick;
3422                 stats->ltick = ticks;
3423                 if (delta <= 0 || delta > hz * 60)
3424                         stats->count = 0;
3425                 else
3426                         stats->count = stats->count * hz / (hz + delta);
3427         }
3428         if (hammer_debug_general & 0x10000)
3429                 hdkprintf("pid %5d stats %d\n", (int)pid, stats->count);
3430         return (stats);
3431 }
3432
3433 #if 0
3434
3435 /*
3436  * XXX not used, doesn't work very well due to the large batching nature
3437  * of flushes.
3438  *
3439  * A larger than normal backlog of inodes is sitting in the flusher;
3440  * enforce a general slowdown to let it catch up.  This routine is only
3441  * called on completion of a non-flusher-related transaction which
3442  * performed B-Tree node I/O.
3443  *
3444  * It is possible for the flusher to stall under a continuous load.
3445  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3446  * If the flusher is unable to catch up the inode count can bloat until
3447  * we run out of kvm.
3448  *
3449  * This is a bit of a hack.
3450  */
3451 void
3452 hammer_inode_waithard(hammer_mount_t hmp)
3453 {
3454         /*
3455          * Hysteresis.
3456          */
3457         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3458                 if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
3459                     hmp->count_iqueued < hmp->count_inodes / 20) {
3460                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3461                         return;
3462                 }
3463         } else {
3464                 if (hmp->count_reclaims < hammer_limit_reclaims ||
3465                     hmp->count_iqueued < hmp->count_inodes / 10) {
3466                         return;
3467                 }
3468                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3469         }
3470
3471         /*
3472          * Block for one flush cycle.
3473          */
3474         hammer_flusher_wait_next(hmp);
3475 }
3476
3477 #endif