Fix more wrong sizeof() usages, part 1/x
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include "hammer.h"
36 #include <vm/vm_extern.h>
37
38 static int      hammer_unload_inode(struct hammer_inode *ip);
39 static void     hammer_free_inode(hammer_inode_t ip);
40 static void     hammer_flush_inode_core(hammer_inode_t ip,
41                                         hammer_flush_group_t flg, int flags);
42 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
43 #if 0
44 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
45 #endif
46 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
47                                         hammer_flush_group_t flg);
48 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
49                                         int depth, hammer_flush_group_t flg);
50 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
51 static struct hammer_inostats *hammer_inode_inostats(hammer_mount_t hmp,
52                                         pid_t pid);
53
54 #ifdef DEBUG_TRUNCATE
55 extern struct hammer_inode *HammerTruncIp;
56 #endif
57
58 /*
59  * RB-Tree support for inode structures
60  */
61 int
62 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
63 {
64         if (ip1->obj_localization < ip2->obj_localization)
65                 return(-1);
66         if (ip1->obj_localization > ip2->obj_localization)
67                 return(1);
68         if (ip1->obj_id < ip2->obj_id)
69                 return(-1);
70         if (ip1->obj_id > ip2->obj_id)
71                 return(1);
72         if (ip1->obj_asof < ip2->obj_asof)
73                 return(-1);
74         if (ip1->obj_asof > ip2->obj_asof)
75                 return(1);
76         return(0);
77 }
78
79 int
80 hammer_redo_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
81 {
82         if (ip1->redo_fifo_start < ip2->redo_fifo_start)
83                 return(-1);
84         if (ip1->redo_fifo_start > ip2->redo_fifo_start)
85                 return(1);
86         return(0);
87 }
88
89 /*
90  * RB-Tree support for inode structures / special LOOKUP_INFO
91  */
92 static int
93 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
94 {
95         if (info->obj_localization < ip->obj_localization)
96                 return(-1);
97         if (info->obj_localization > ip->obj_localization)
98                 return(1);
99         if (info->obj_id < ip->obj_id)
100                 return(-1);
101         if (info->obj_id > ip->obj_id)
102                 return(1);
103         if (info->obj_asof < ip->obj_asof)
104                 return(-1);
105         if (info->obj_asof > ip->obj_asof)
106                 return(1);
107         return(0);
108 }
109
110 /*
111  * Used by hammer_scan_inode_snapshots() to locate all of an object's
112  * snapshots.  Note that the asof field is not tested, which we can get
113  * away with because it is the lowest-priority field.
114  */
115 static int
116 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
117 {
118         hammer_inode_info_t info = data;
119
120         if (ip->obj_localization > info->obj_localization)
121                 return(1);
122         if (ip->obj_localization < info->obj_localization)
123                 return(-1);
124         if (ip->obj_id > info->obj_id)
125                 return(1);
126         if (ip->obj_id < info->obj_id)
127                 return(-1);
128         return(0);
129 }
130
131 /*
132  * Used by hammer_unload_pseudofs() to locate all inodes associated with
133  * a particular PFS.
134  */
135 static int
136 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
137 {
138         u_int32_t localization = *(u_int32_t *)data;
139         if (ip->obj_localization > localization)
140                 return(1);
141         if (ip->obj_localization < localization)
142                 return(-1);
143         return(0);
144 }
145
146 /*
147  * RB-Tree support for pseudofs structures
148  */
149 static int
150 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
151 {
152         if (p1->localization < p2->localization)
153                 return(-1);
154         if (p1->localization > p2->localization)
155                 return(1);
156         return(0);
157 }
158
159
160 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
161 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
162                 hammer_inode_info_cmp, hammer_inode_info_t);
163 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
164              hammer_pfs_rb_compare, u_int32_t, localization);
165
166 /*
167  * The kernel is not actively referencing this vnode but is still holding
168  * it cached.
169  *
170  * This is called from the frontend.
171  *
172  * MPALMOSTSAFE
173  */
174 int
175 hammer_vop_inactive(struct vop_inactive_args *ap)
176 {
177         struct hammer_inode *ip = VTOI(ap->a_vp);
178         hammer_mount_t hmp;
179
180         /*
181          * Degenerate case
182          */
183         if (ip == NULL) {
184                 vrecycle(ap->a_vp);
185                 return(0);
186         }
187
188         /*
189          * If the inode no longer has visibility in the filesystem try to
190          * recycle it immediately, even if the inode is dirty.  Recycling
191          * it quickly allows the system to reclaim buffer cache and VM
192          * resources which can matter a lot in a heavily loaded system.
193          *
194          * This can deadlock in vfsync() if we aren't careful.
195          * 
196          * Do not queue the inode to the flusher if we still have visibility,
197          * otherwise namespace calls such as chmod will unnecessarily generate
198          * multiple inode updates.
199          */
200         if (ip->ino_data.nlinks == 0) {
201                 hmp = ip->hmp;
202                 lwkt_gettoken(&hmp->fs_token);
203                 hammer_inode_unloadable_check(ip, 0);
204                 if (ip->flags & HAMMER_INODE_MODMASK)
205                         hammer_flush_inode(ip, 0);
206                 lwkt_reltoken(&hmp->fs_token);
207                 vrecycle(ap->a_vp);
208         }
209         return(0);
210 }
211
212 /*
213  * Release the vnode association.  This is typically (but not always)
214  * the last reference on the inode.
215  *
216  * Once the association is lost we are on our own with regards to
217  * flushing the inode.
218  *
219  * We must interlock ip->vp so hammer_get_vnode() can avoid races.
220  */
221 int
222 hammer_vop_reclaim(struct vop_reclaim_args *ap)
223 {
224         struct hammer_inode *ip;
225         hammer_mount_t hmp;
226         struct vnode *vp;
227
228         vp = ap->a_vp;
229
230         if ((ip = vp->v_data) != NULL) {
231                 hmp = ip->hmp;
232                 lwkt_gettoken(&hmp->fs_token);
233                 hammer_lock_ex(&ip->lock);
234                 vp->v_data = NULL;
235                 ip->vp = NULL;
236
237                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
238                         ++hammer_count_reclaims;
239                         ++hmp->count_reclaims;
240                         ip->flags |= HAMMER_INODE_RECLAIM;
241                 }
242                 hammer_unlock(&ip->lock);
243                 hammer_rel_inode(ip, 1);
244                 lwkt_reltoken(&hmp->fs_token);
245         }
246         return(0);
247 }
248
249 /*
250  * Return a locked vnode for the specified inode.  The inode must be
251  * referenced but NOT LOCKED on entry and will remain referenced on
252  * return.
253  *
254  * Called from the frontend.
255  */
256 int
257 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
258 {
259         hammer_mount_t hmp;
260         struct vnode *vp;
261         int error = 0;
262         u_int8_t obj_type;
263
264         hmp = ip->hmp;
265
266         for (;;) {
267                 if ((vp = ip->vp) == NULL) {
268                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
269                         if (error)
270                                 break;
271                         hammer_lock_ex(&ip->lock);
272                         if (ip->vp != NULL) {
273                                 hammer_unlock(&ip->lock);
274                                 vp = *vpp;
275                                 vp->v_type = VBAD;
276                                 vx_put(vp);
277                                 continue;
278                         }
279                         hammer_ref(&ip->lock);
280                         vp = *vpp;
281                         ip->vp = vp;
282
283                         obj_type = ip->ino_data.obj_type;
284                         vp->v_type = hammer_get_vnode_type(obj_type);
285
286                         hammer_inode_wakereclaims(ip);
287
288                         switch(ip->ino_data.obj_type) {
289                         case HAMMER_OBJTYPE_CDEV:
290                         case HAMMER_OBJTYPE_BDEV:
291                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
292                                 addaliasu(vp, ip->ino_data.rmajor,
293                                           ip->ino_data.rminor);
294                                 break;
295                         case HAMMER_OBJTYPE_FIFO:
296                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
297                                 break;
298                         case HAMMER_OBJTYPE_REGFILE:
299                                 break;
300                         default:
301                                 break;
302                         }
303
304                         /*
305                          * Only mark as the root vnode if the ip is not
306                          * historical, otherwise the VFS cache will get
307                          * confused.  The other half of the special handling
308                          * is in hammer_vop_nlookupdotdot().
309                          *
310                          * Pseudo-filesystem roots can be accessed via
311                          * non-root filesystem paths and setting VROOT may
312                          * confuse the namecache.  Set VPFSROOT instead.
313                          */
314                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
315                             ip->obj_asof == hmp->asof) {
316                                 if (ip->obj_localization == 0)
317                                         vsetflags(vp, VROOT);
318                                 else
319                                         vsetflags(vp, VPFSROOT);
320                         }
321
322                         vp->v_data = (void *)ip;
323                         /* vnode locked by getnewvnode() */
324                         /* make related vnode dirty if inode dirty? */
325                         hammer_unlock(&ip->lock);
326                         if (vp->v_type == VREG) {
327                                 vinitvmio(vp, ip->ino_data.size,
328                                           hammer_blocksize(ip->ino_data.size),
329                                           hammer_blockoff(ip->ino_data.size));
330                         }
331                         break;
332                 }
333
334                 /*
335                  * Interlock vnode clearing.  This does not prevent the
336                  * vnode from going into a reclaimed state but it does
337                  * prevent it from being destroyed or reused so the vget()
338                  * will properly fail.
339                  */
340                 hammer_lock_ex(&ip->lock);
341                 if ((vp = ip->vp) == NULL) {
342                         hammer_unlock(&ip->lock);
343                         continue;
344                 }
345                 vhold_interlocked(vp);
346                 hammer_unlock(&ip->lock);
347
348                 /*
349                  * loop if the vget fails (aka races), or if the vp
350                  * no longer matches ip->vp.
351                  */
352                 if (vget(vp, LK_EXCLUSIVE) == 0) {
353                         if (vp == ip->vp) {
354                                 vdrop(vp);
355                                 break;
356                         }
357                         vput(vp);
358                 }
359                 vdrop(vp);
360         }
361         *vpp = vp;
362         return(error);
363 }
364
365 /*
366  * Locate all copies of the inode for obj_id compatible with the specified
367  * asof, reference, and issue the related call-back.  This routine is used
368  * for direct-io invalidation and does not create any new inodes.
369  */
370 void
371 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
372                             int (*callback)(hammer_inode_t ip, void *data),
373                             void *data)
374 {
375         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
376                                    hammer_inode_info_cmp_all_history,
377                                    callback, iinfo);
378 }
379
380 /*
381  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
382  * do not attach or detach the related vnode (use hammer_get_vnode() for
383  * that).
384  *
385  * The flags argument is only applied for newly created inodes, and only
386  * certain flags are inherited.
387  *
388  * Called from the frontend.
389  */
390 struct hammer_inode *
391 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
392                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
393                  int flags, int *errorp)
394 {
395         hammer_mount_t hmp = trans->hmp;
396         struct hammer_node_cache *cachep;
397         struct hammer_inode_info iinfo;
398         struct hammer_cursor cursor;
399         struct hammer_inode *ip;
400
401
402         /*
403          * Determine if we already have an inode cached.  If we do then
404          * we are golden.
405          *
406          * If we find an inode with no vnode we have to mark the
407          * transaction such that hammer_inode_waitreclaims() is
408          * called later on to avoid building up an infinite number
409          * of inodes.  Otherwise we can continue to * add new inodes
410          * faster then they can be disposed of, even with the tsleep
411          * delay.
412          *
413          * If we find a dummy inode we return a failure so dounlink
414          * (which does another lookup) doesn't try to mess with the
415          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
416          * to ref dummy inodes.
417          */
418         iinfo.obj_id = obj_id;
419         iinfo.obj_asof = asof;
420         iinfo.obj_localization = localization;
421 loop:
422         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
423         if (ip) {
424                 if (ip->flags & HAMMER_INODE_DUMMY) {
425                         *errorp = ENOENT;
426                         return(NULL);
427                 }
428                 hammer_ref(&ip->lock);
429                 *errorp = 0;
430                 return(ip);
431         }
432
433         /*
434          * Allocate a new inode structure and deal with races later.
435          */
436         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
437         ++hammer_count_inodes;
438         ++hmp->count_inodes;
439         ip->obj_id = obj_id;
440         ip->obj_asof = iinfo.obj_asof;
441         ip->obj_localization = localization;
442         ip->hmp = hmp;
443         ip->flags = flags & HAMMER_INODE_RO;
444         ip->cache[0].ip = ip;
445         ip->cache[1].ip = ip;
446         ip->cache[2].ip = ip;
447         ip->cache[3].ip = ip;
448         if (hmp->ronly)
449                 ip->flags |= HAMMER_INODE_RO;
450         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
451                 0x7FFFFFFFFFFFFFFFLL;
452         RB_INIT(&ip->rec_tree);
453         TAILQ_INIT(&ip->target_list);
454         hammer_ref(&ip->lock);
455
456         /*
457          * Locate the on-disk inode.  If this is a PFS root we always
458          * access the current version of the root inode and (if it is not
459          * a master) always access information under it with a snapshot
460          * TID.
461          *
462          * We cache recent inode lookups in this directory in dip->cache[2].
463          * If we can't find it we assume the inode we are looking for is
464          * close to the directory inode.
465          */
466 retry:
467         cachep = NULL;
468         if (dip) {
469                 if (dip->cache[2].node)
470                         cachep = &dip->cache[2];
471                 else
472                         cachep = &dip->cache[0];
473         }
474         hammer_init_cursor(trans, &cursor, cachep, NULL);
475         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
476         cursor.key_beg.obj_id = ip->obj_id;
477         cursor.key_beg.key = 0;
478         cursor.key_beg.create_tid = 0;
479         cursor.key_beg.delete_tid = 0;
480         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
481         cursor.key_beg.obj_type = 0;
482
483         cursor.asof = iinfo.obj_asof;
484         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
485                        HAMMER_CURSOR_ASOF;
486
487         *errorp = hammer_btree_lookup(&cursor);
488         if (*errorp == EDEADLK) {
489                 hammer_done_cursor(&cursor);
490                 goto retry;
491         }
492
493         /*
494          * On success the B-Tree lookup will hold the appropriate
495          * buffer cache buffers and provide a pointer to the requested
496          * information.  Copy the information to the in-memory inode
497          * and cache the B-Tree node to improve future operations.
498          */
499         if (*errorp == 0) {
500                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
501                 ip->ino_data = cursor.data->inode;
502
503                 /*
504                  * cache[0] tries to cache the location of the object inode.
505                  * The assumption is that it is near the directory inode.
506                  *
507                  * cache[1] tries to cache the location of the object data.
508                  * We might have something in the governing directory from
509                  * scan optimizations (see the strategy code in
510                  * hammer_vnops.c).
511                  *
512                  * We update dip->cache[2], if possible, with the location
513                  * of the object inode for future directory shortcuts.
514                  */
515                 hammer_cache_node(&ip->cache[0], cursor.node);
516                 if (dip) {
517                         if (dip->cache[3].node) {
518                                 hammer_cache_node(&ip->cache[1],
519                                                   dip->cache[3].node);
520                         }
521                         hammer_cache_node(&dip->cache[2], cursor.node);
522                 }
523
524                 /*
525                  * The file should not contain any data past the file size
526                  * stored in the inode.  Setting save_trunc_off to the
527                  * file size instead of max reduces B-Tree lookup overheads
528                  * on append by allowing the flusher to avoid checking for
529                  * record overwrites.
530                  */
531                 ip->save_trunc_off = ip->ino_data.size;
532
533                 /*
534                  * Locate and assign the pseudofs management structure to
535                  * the inode.
536                  */
537                 if (dip && dip->obj_localization == ip->obj_localization) {
538                         ip->pfsm = dip->pfsm;
539                         hammer_ref(&ip->pfsm->lock);
540                 } else {
541                         ip->pfsm = hammer_load_pseudofs(trans,
542                                                         ip->obj_localization,
543                                                         errorp);
544                         *errorp = 0;    /* ignore ENOENT */
545                 }
546         }
547
548         /*
549          * The inode is placed on the red-black tree and will be synced to
550          * the media when flushed or by the filesystem sync.  If this races
551          * another instantiation/lookup the insertion will fail.
552          */
553         if (*errorp == 0) {
554                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
555                         hammer_free_inode(ip);
556                         hammer_done_cursor(&cursor);
557                         goto loop;
558                 }
559                 ip->flags |= HAMMER_INODE_ONDISK;
560         } else {
561                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
562                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
563                         --hmp->rsv_inodes;
564                 }
565
566                 hammer_free_inode(ip);
567                 ip = NULL;
568         }
569         hammer_done_cursor(&cursor);
570
571         /*
572          * NEWINODE is only set if the inode becomes dirty later,
573          * setting it here just leads to unnecessary stalls.
574          *
575          * trans->flags |= HAMMER_TRANSF_NEWINODE;
576          */
577         return (ip);
578 }
579
580 /*
581  * Get a dummy inode to placemark a broken directory entry.
582  */
583 struct hammer_inode *
584 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
585                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
586                  int flags, int *errorp)
587 {
588         hammer_mount_t hmp = trans->hmp;
589         struct hammer_inode_info iinfo;
590         struct hammer_inode *ip;
591
592         /*
593          * Determine if we already have an inode cached.  If we do then
594          * we are golden.
595          *
596          * If we find an inode with no vnode we have to mark the
597          * transaction such that hammer_inode_waitreclaims() is
598          * called later on to avoid building up an infinite number
599          * of inodes.  Otherwise we can continue to * add new inodes
600          * faster then they can be disposed of, even with the tsleep
601          * delay.
602          *
603          * If we find a non-fake inode we return an error.  Only fake
604          * inodes can be returned by this routine.
605          */
606         iinfo.obj_id = obj_id;
607         iinfo.obj_asof = asof;
608         iinfo.obj_localization = localization;
609 loop:
610         *errorp = 0;
611         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
612         if (ip) {
613                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
614                         *errorp = ENOENT;
615                         return(NULL);
616                 }
617                 hammer_ref(&ip->lock);
618                 return(ip);
619         }
620
621         /*
622          * Allocate a new inode structure and deal with races later.
623          */
624         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
625         ++hammer_count_inodes;
626         ++hmp->count_inodes;
627         ip->obj_id = obj_id;
628         ip->obj_asof = iinfo.obj_asof;
629         ip->obj_localization = localization;
630         ip->hmp = hmp;
631         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
632         ip->cache[0].ip = ip;
633         ip->cache[1].ip = ip;
634         ip->cache[2].ip = ip;
635         ip->cache[3].ip = ip;
636         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
637                 0x7FFFFFFFFFFFFFFFLL;
638         RB_INIT(&ip->rec_tree);
639         TAILQ_INIT(&ip->target_list);
640         hammer_ref(&ip->lock);
641
642         /*
643          * Populate the dummy inode.  Leave everything zero'd out.
644          *
645          * (ip->ino_leaf and ip->ino_data)
646          *
647          * Make the dummy inode a FIFO object which most copy programs
648          * will properly ignore.
649          */
650         ip->save_trunc_off = ip->ino_data.size;
651         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
652
653         /*
654          * Locate and assign the pseudofs management structure to
655          * the inode.
656          */
657         if (dip && dip->obj_localization == ip->obj_localization) {
658                 ip->pfsm = dip->pfsm;
659                 hammer_ref(&ip->pfsm->lock);
660         } else {
661                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
662                                                 errorp);
663                 *errorp = 0;    /* ignore ENOENT */
664         }
665
666         /*
667          * The inode is placed on the red-black tree and will be synced to
668          * the media when flushed or by the filesystem sync.  If this races
669          * another instantiation/lookup the insertion will fail.
670          *
671          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
672          */
673         if (*errorp == 0) {
674                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
675                         hammer_free_inode(ip);
676                         goto loop;
677                 }
678         } else {
679                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
680                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
681                         --hmp->rsv_inodes;
682                 }
683                 hammer_free_inode(ip);
684                 ip = NULL;
685         }
686         trans->flags |= HAMMER_TRANSF_NEWINODE;
687         return (ip);
688 }
689
690 /*
691  * Return a referenced inode only if it is in our inode cache.
692  *
693  * Dummy inodes do not count.
694  */
695 struct hammer_inode *
696 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
697                   hammer_tid_t asof, u_int32_t localization)
698 {
699         hammer_mount_t hmp = trans->hmp;
700         struct hammer_inode_info iinfo;
701         struct hammer_inode *ip;
702
703         iinfo.obj_id = obj_id;
704         iinfo.obj_asof = asof;
705         iinfo.obj_localization = localization;
706
707         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
708         if (ip) {
709                 if (ip->flags & HAMMER_INODE_DUMMY)
710                         ip = NULL;
711                 else
712                         hammer_ref(&ip->lock);
713         }
714         return(ip);
715 }
716
717 /*
718  * Create a new filesystem object, returning the inode in *ipp.  The
719  * returned inode will be referenced.  The inode is created in-memory.
720  *
721  * If pfsm is non-NULL the caller wishes to create the root inode for
722  * a master PFS.
723  */
724 int
725 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
726                     struct ucred *cred,
727                     hammer_inode_t dip, const char *name, int namelen,
728                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
729 {
730         hammer_mount_t hmp;
731         hammer_inode_t ip;
732         uid_t xuid;
733         int error;
734         int64_t namekey;
735         u_int32_t dummy;
736
737         hmp = trans->hmp;
738
739         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
740         ++hammer_count_inodes;
741         ++hmp->count_inodes;
742         trans->flags |= HAMMER_TRANSF_NEWINODE;
743
744         if (pfsm) {
745                 KKASSERT(pfsm->localization != 0);
746                 ip->obj_id = HAMMER_OBJID_ROOT;
747                 ip->obj_localization = pfsm->localization;
748         } else {
749                 KKASSERT(dip != NULL);
750                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
751                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
752                 ip->obj_localization = dip->obj_localization;
753         }
754
755         KKASSERT(ip->obj_id != 0);
756         ip->obj_asof = hmp->asof;
757         ip->hmp = hmp;
758         ip->flush_state = HAMMER_FST_IDLE;
759         ip->flags = HAMMER_INODE_DDIRTY |
760                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
761         ip->cache[0].ip = ip;
762         ip->cache[1].ip = ip;
763         ip->cache[2].ip = ip;
764         ip->cache[3].ip = ip;
765
766         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
767         /* ip->save_trunc_off = 0; (already zero) */
768         RB_INIT(&ip->rec_tree);
769         TAILQ_INIT(&ip->target_list);
770
771         ip->ino_data.atime = trans->time;
772         ip->ino_data.mtime = trans->time;
773         ip->ino_data.size = 0;
774         ip->ino_data.nlinks = 0;
775
776         /*
777          * A nohistory designator on the parent directory is inherited by
778          * the child.  We will do this even for pseudo-fs creation... the
779          * sysad can turn it off.
780          */
781         if (dip) {
782                 ip->ino_data.uflags = dip->ino_data.uflags &
783                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
784         }
785
786         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
787         ip->ino_leaf.base.localization = ip->obj_localization +
788                                          HAMMER_LOCALIZE_INODE;
789         ip->ino_leaf.base.obj_id = ip->obj_id;
790         ip->ino_leaf.base.key = 0;
791         ip->ino_leaf.base.create_tid = 0;
792         ip->ino_leaf.base.delete_tid = 0;
793         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
794         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
795
796         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
797         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
798         ip->ino_data.mode = vap->va_mode;
799         ip->ino_data.ctime = trans->time;
800
801         /*
802          * If we are running version 2 or greater directory entries are
803          * inode-localized instead of data-localized.
804          */
805         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
806                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
807                         ip->ino_data.cap_flags |=
808                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
809                 }
810         }
811         if (trans->hmp->version >= HAMMER_VOL_VERSION_SIX) {
812                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
813                         ip->ino_data.cap_flags |=
814                                 HAMMER_INODE_CAP_DIRHASH_ALG1;
815                 }
816         }
817
818         /*
819          * Setup the ".." pointer.  This only needs to be done for directories
820          * but we do it for all objects as a recovery aid.
821          */
822         if (dip)
823                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
824 #if 0
825         /*
826          * The parent_obj_localization field only applies to pseudo-fs roots.
827          * XXX this is no longer applicable, PFSs are no longer directly
828          * tied into the parent's directory structure.
829          */
830         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
831             ip->obj_id == HAMMER_OBJID_ROOT) {
832                 ip->ino_data.ext.obj.parent_obj_localization = 
833                                                 dip->obj_localization;
834         }
835 #endif
836
837         switch(ip->ino_leaf.base.obj_type) {
838         case HAMMER_OBJTYPE_CDEV:
839         case HAMMER_OBJTYPE_BDEV:
840                 ip->ino_data.rmajor = vap->va_rmajor;
841                 ip->ino_data.rminor = vap->va_rminor;
842                 break;
843         default:
844                 break;
845         }
846
847         /*
848          * Calculate default uid/gid and overwrite with information from
849          * the vap.
850          */
851         if (dip) {
852                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
853                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
854                                              xuid, cred, &vap->va_mode);
855         } else {
856                 xuid = 0;
857         }
858         ip->ino_data.mode = vap->va_mode;
859
860         if (vap->va_vaflags & VA_UID_UUID_VALID)
861                 ip->ino_data.uid = vap->va_uid_uuid;
862         else if (vap->va_uid != (uid_t)VNOVAL)
863                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
864         else
865                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
866
867         if (vap->va_vaflags & VA_GID_UUID_VALID)
868                 ip->ino_data.gid = vap->va_gid_uuid;
869         else if (vap->va_gid != (gid_t)VNOVAL)
870                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
871         else if (dip)
872                 ip->ino_data.gid = dip->ino_data.gid;
873
874         hammer_ref(&ip->lock);
875
876         if (pfsm) {
877                 ip->pfsm = pfsm;
878                 hammer_ref(&pfsm->lock);
879                 error = 0;
880         } else if (dip->obj_localization == ip->obj_localization) {
881                 ip->pfsm = dip->pfsm;
882                 hammer_ref(&ip->pfsm->lock);
883                 error = 0;
884         } else {
885                 ip->pfsm = hammer_load_pseudofs(trans,
886                                                 ip->obj_localization,
887                                                 &error);
888                 error = 0;      /* ignore ENOENT */
889         }
890
891         if (error) {
892                 hammer_free_inode(ip);
893                 ip = NULL;
894         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
895                 panic("hammer_create_inode: duplicate obj_id %llx",
896                       (long long)ip->obj_id);
897                 /* not reached */
898                 hammer_free_inode(ip);
899         }
900         *ipp = ip;
901         return(error);
902 }
903
904 /*
905  * Final cleanup / freeing of an inode structure
906  */
907 static void
908 hammer_free_inode(hammer_inode_t ip)
909 {
910         struct hammer_mount *hmp;
911
912         hmp = ip->hmp;
913         KKASSERT(hammer_oneref(&ip->lock));
914         hammer_uncache_node(&ip->cache[0]);
915         hammer_uncache_node(&ip->cache[1]);
916         hammer_uncache_node(&ip->cache[2]);
917         hammer_uncache_node(&ip->cache[3]);
918         hammer_inode_wakereclaims(ip);
919         if (ip->objid_cache)
920                 hammer_clear_objid(ip);
921         --hammer_count_inodes;
922         --hmp->count_inodes;
923         if (ip->pfsm) {
924                 hammer_rel_pseudofs(hmp, ip->pfsm);
925                 ip->pfsm = NULL;
926         }
927         kfree(ip, hmp->m_inodes);
928         ip = NULL;
929 }
930
931 /*
932  * Retrieve pseudo-fs data.  NULL will never be returned.
933  *
934  * If an error occurs *errorp will be set and a default template is returned,
935  * otherwise *errorp is set to 0.  Typically when an error occurs it will
936  * be ENOENT.
937  */
938 hammer_pseudofs_inmem_t
939 hammer_load_pseudofs(hammer_transaction_t trans,
940                      u_int32_t localization, int *errorp)
941 {
942         hammer_mount_t hmp = trans->hmp;
943         hammer_inode_t ip;
944         hammer_pseudofs_inmem_t pfsm;
945         struct hammer_cursor cursor;
946         int bytes;
947
948 retry:
949         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
950         if (pfsm) {
951                 hammer_ref(&pfsm->lock);
952                 *errorp = 0;
953                 return(pfsm);
954         }
955
956         /*
957          * PFS records are stored in the root inode (not the PFS root inode,
958          * but the real root).  Avoid an infinite recursion if loading
959          * the PFS for the real root.
960          */
961         if (localization) {
962                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
963                                       HAMMER_MAX_TID,
964                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
965         } else {
966                 ip = NULL;
967         }
968
969         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
970         pfsm->localization = localization;
971         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
972         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
973
974         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
975         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
976                                       HAMMER_LOCALIZE_MISC;
977         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
978         cursor.key_beg.create_tid = 0;
979         cursor.key_beg.delete_tid = 0;
980         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
981         cursor.key_beg.obj_type = 0;
982         cursor.key_beg.key = localization;
983         cursor.asof = HAMMER_MAX_TID;
984         cursor.flags |= HAMMER_CURSOR_ASOF;
985
986         if (ip)
987                 *errorp = hammer_ip_lookup(&cursor);
988         else
989                 *errorp = hammer_btree_lookup(&cursor);
990         if (*errorp == 0) {
991                 *errorp = hammer_ip_resolve_data(&cursor);
992                 if (*errorp == 0) {
993                         if (cursor.data->pfsd.mirror_flags &
994                             HAMMER_PFSD_DELETED) {
995                                 *errorp = ENOENT;
996                         } else {
997                                 bytes = cursor.leaf->data_len;
998                                 if (bytes > sizeof(pfsm->pfsd))
999                                         bytes = sizeof(pfsm->pfsd);
1000                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
1001                         }
1002                 }
1003         }
1004         hammer_done_cursor(&cursor);
1005
1006         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1007         hammer_ref(&pfsm->lock);
1008         if (ip)
1009                 hammer_rel_inode(ip, 0);
1010         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
1011                 kfree(pfsm, hmp->m_misc);
1012                 goto retry;
1013         }
1014         return(pfsm);
1015 }
1016
1017 /*
1018  * Store pseudo-fs data.  The backend will automatically delete any prior
1019  * on-disk pseudo-fs data but we have to delete in-memory versions.
1020  */
1021 int
1022 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
1023 {
1024         struct hammer_cursor cursor;
1025         hammer_record_t record;
1026         hammer_inode_t ip;
1027         int error;
1028
1029         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1030                               HAMMER_DEF_LOCALIZATION, 0, &error);
1031 retry:
1032         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
1033         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
1034         cursor.key_beg.localization = ip->obj_localization +
1035                                       HAMMER_LOCALIZE_MISC;
1036         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
1037         cursor.key_beg.create_tid = 0;
1038         cursor.key_beg.delete_tid = 0;
1039         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
1040         cursor.key_beg.obj_type = 0;
1041         cursor.key_beg.key = pfsm->localization;
1042         cursor.asof = HAMMER_MAX_TID;
1043         cursor.flags |= HAMMER_CURSOR_ASOF;
1044
1045         /*
1046          * Replace any in-memory version of the record.
1047          */
1048         error = hammer_ip_lookup(&cursor);
1049         if (error == 0 && hammer_cursor_inmem(&cursor)) {
1050                 record = cursor.iprec;
1051                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1052                         KKASSERT(cursor.deadlk_rec == NULL);
1053                         hammer_ref(&record->lock);
1054                         cursor.deadlk_rec = record;
1055                         error = EDEADLK;
1056                 } else {
1057                         record->flags |= HAMMER_RECF_DELETED_FE;
1058                         error = 0;
1059                 }
1060         }
1061
1062         /*
1063          * Allocate replacement general record.  The backend flush will
1064          * delete any on-disk version of the record.
1065          */
1066         if (error == 0 || error == ENOENT) {
1067                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1068                 record->type = HAMMER_MEM_RECORD_GENERAL;
1069
1070                 record->leaf.base.localization = ip->obj_localization +
1071                                                  HAMMER_LOCALIZE_MISC;
1072                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1073                 record->leaf.base.key = pfsm->localization;
1074                 record->leaf.data_len = sizeof(pfsm->pfsd);
1075                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1076                 error = hammer_ip_add_record(trans, record);
1077         }
1078         hammer_done_cursor(&cursor);
1079         if (error == EDEADLK)
1080                 goto retry;
1081         hammer_rel_inode(ip, 0);
1082         return(error);
1083 }
1084
1085 /*
1086  * Create a root directory for a PFS if one does not alredy exist.
1087  *
1088  * The PFS root stands alone so we must also bump the nlinks count
1089  * to prevent it from being destroyed on release.
1090  */
1091 int
1092 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1093                        hammer_pseudofs_inmem_t pfsm)
1094 {
1095         hammer_inode_t ip;
1096         struct vattr vap;
1097         int error;
1098
1099         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1100                               pfsm->localization, 0, &error);
1101         if (ip == NULL) {
1102                 vattr_null(&vap);
1103                 vap.va_mode = 0755;
1104                 vap.va_type = VDIR;
1105                 error = hammer_create_inode(trans, &vap, cred,
1106                                             NULL, NULL, 0,
1107                                             pfsm, &ip);
1108                 if (error == 0) {
1109                         ++ip->ino_data.nlinks;
1110                         hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY);
1111                 }
1112         }
1113         if (ip)
1114                 hammer_rel_inode(ip, 0);
1115         return(error);
1116 }
1117
1118 /*
1119  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1120  * if we are unable to disassociate all the inodes.
1121  */
1122 static
1123 int
1124 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1125 {
1126         int res;
1127
1128         hammer_ref(&ip->lock);
1129         if (hammer_isactive(&ip->lock) == 2 && ip->vp)
1130                 vclean_unlocked(ip->vp);
1131         if (hammer_isactive(&ip->lock) == 1 && ip->vp == NULL)
1132                 res = 0;
1133         else
1134                 res = -1;       /* stop, someone is using the inode */
1135         hammer_rel_inode(ip, 0);
1136         return(res);
1137 }
1138
1139 int
1140 hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1141 {
1142         int res;
1143         int try;
1144
1145         for (try = res = 0; try < 4; ++try) {
1146                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1147                                            hammer_inode_pfs_cmp,
1148                                            hammer_unload_pseudofs_callback,
1149                                            &localization);
1150                 if (res == 0 && try > 1)
1151                         break;
1152                 hammer_flusher_sync(trans->hmp);
1153         }
1154         if (res != 0)
1155                 res = ENOTEMPTY;
1156         return(res);
1157 }
1158
1159
1160 /*
1161  * Release a reference on a PFS
1162  */
1163 void
1164 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1165 {
1166         hammer_rel(&pfsm->lock);
1167         if (hammer_norefs(&pfsm->lock)) {
1168                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1169                 kfree(pfsm, hmp->m_misc);
1170         }
1171 }
1172
1173 /*
1174  * Called by hammer_sync_inode().
1175  */
1176 static int
1177 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1178 {
1179         hammer_transaction_t trans = cursor->trans;
1180         hammer_record_t record;
1181         int error;
1182         int redirty;
1183
1184 retry:
1185         error = 0;
1186
1187         /*
1188          * If the inode has a presence on-disk then locate it and mark
1189          * it deleted, setting DELONDISK.
1190          *
1191          * The record may or may not be physically deleted, depending on
1192          * the retention policy.
1193          */
1194         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1195             HAMMER_INODE_ONDISK) {
1196                 hammer_normalize_cursor(cursor);
1197                 cursor->key_beg.localization = ip->obj_localization + 
1198                                                HAMMER_LOCALIZE_INODE;
1199                 cursor->key_beg.obj_id = ip->obj_id;
1200                 cursor->key_beg.key = 0;
1201                 cursor->key_beg.create_tid = 0;
1202                 cursor->key_beg.delete_tid = 0;
1203                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1204                 cursor->key_beg.obj_type = 0;
1205                 cursor->asof = ip->obj_asof;
1206                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1207                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1208                 cursor->flags |= HAMMER_CURSOR_BACKEND;
1209
1210                 error = hammer_btree_lookup(cursor);
1211                 if (hammer_debug_inode)
1212                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1213
1214                 if (error == 0) {
1215                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1216                         if (hammer_debug_inode)
1217                                 kprintf(" error %d\n", error);
1218                         if (error == 0) {
1219                                 ip->flags |= HAMMER_INODE_DELONDISK;
1220                         }
1221                         if (cursor->node)
1222                                 hammer_cache_node(&ip->cache[0], cursor->node);
1223                 }
1224                 if (error == EDEADLK) {
1225                         hammer_done_cursor(cursor);
1226                         error = hammer_init_cursor(trans, cursor,
1227                                                    &ip->cache[0], ip);
1228                         if (hammer_debug_inode)
1229                                 kprintf("IPDED %p %d\n", ip, error);
1230                         if (error == 0)
1231                                 goto retry;
1232                 }
1233         }
1234
1235         /*
1236          * Ok, write out the initial record or a new record (after deleting
1237          * the old one), unless the DELETED flag is set.  This routine will
1238          * clear DELONDISK if it writes out a record.
1239          *
1240          * Update our inode statistics if this is the first application of
1241          * the inode on-disk.
1242          */
1243         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1244                 /*
1245                  * Generate a record and write it to the media.  We clean-up
1246                  * the state before releasing so we do not have to set-up
1247                  * a flush_group.
1248                  */
1249                 record = hammer_alloc_mem_record(ip, 0);
1250                 record->type = HAMMER_MEM_RECORD_INODE;
1251                 record->flush_state = HAMMER_FST_FLUSH;
1252                 record->leaf = ip->sync_ino_leaf;
1253                 record->leaf.base.create_tid = trans->tid;
1254                 record->leaf.data_len = sizeof(ip->sync_ino_data);
1255                 record->leaf.create_ts = trans->time32;
1256                 record->data = (void *)&ip->sync_ino_data;
1257                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1258
1259                 /*
1260                  * If this flag is set we cannot sync the new file size
1261                  * because we haven't finished related truncations.  The
1262                  * inode will be flushed in another flush group to finish
1263                  * the job.
1264                  */
1265                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1266                     ip->sync_ino_data.size != ip->ino_data.size) {
1267                         redirty = 1;
1268                         ip->sync_ino_data.size = ip->ino_data.size;
1269                 } else {
1270                         redirty = 0;
1271                 }
1272
1273                 for (;;) {
1274                         error = hammer_ip_sync_record_cursor(cursor, record);
1275                         if (hammer_debug_inode)
1276                                 kprintf("GENREC %p rec %08x %d\n",      
1277                                         ip, record->flags, error);
1278                         if (error != EDEADLK)
1279                                 break;
1280                         hammer_done_cursor(cursor);
1281                         error = hammer_init_cursor(trans, cursor,
1282                                                    &ip->cache[0], ip);
1283                         if (hammer_debug_inode)
1284                                 kprintf("GENREC reinit %d\n", error);
1285                         if (error)
1286                                 break;
1287                 }
1288
1289                 /*
1290                  * Note:  The record was never on the inode's record tree
1291                  * so just wave our hands importantly and destroy it.
1292                  */
1293                 record->flags |= HAMMER_RECF_COMMITTED;
1294                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1295                 record->flush_state = HAMMER_FST_IDLE;
1296                 ++ip->rec_generation;
1297                 hammer_rel_mem_record(record);
1298
1299                 /*
1300                  * Finish up.
1301                  */
1302                 if (error == 0) {
1303                         if (hammer_debug_inode)
1304                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1305                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1306                                             HAMMER_INODE_SDIRTY |
1307                                             HAMMER_INODE_ATIME |
1308                                             HAMMER_INODE_MTIME);
1309                         ip->flags &= ~HAMMER_INODE_DELONDISK;
1310                         if (redirty)
1311                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1312
1313                         /*
1314                          * Root volume count of inodes
1315                          */
1316                         hammer_sync_lock_sh(trans);
1317                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1318                                 hammer_modify_volume_field(trans,
1319                                                            trans->rootvol,
1320                                                            vol0_stat_inodes);
1321                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1322                                 hammer_modify_volume_done(trans->rootvol);
1323                                 ip->flags |= HAMMER_INODE_ONDISK;
1324                                 if (hammer_debug_inode)
1325                                         kprintf("NOWONDISK %p\n", ip);
1326                         }
1327                         hammer_sync_unlock(trans);
1328                 }
1329         }
1330
1331         /*
1332          * If the inode has been destroyed, clean out any left-over flags
1333          * that may have been set by the frontend.
1334          */
1335         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
1336                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1337                                     HAMMER_INODE_SDIRTY |
1338                                     HAMMER_INODE_ATIME |
1339                                     HAMMER_INODE_MTIME);
1340         }
1341         return(error);
1342 }
1343
1344 /*
1345  * Update only the itimes fields.
1346  *
1347  * ATIME can be updated without generating any UNDO.  MTIME is updated
1348  * with UNDO so it is guaranteed to be synchronized properly in case of
1349  * a crash.
1350  *
1351  * Neither field is included in the B-Tree leaf element's CRC, which is how
1352  * we can get away with updating ATIME the way we do.
1353  */
1354 static int
1355 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1356 {
1357         hammer_transaction_t trans = cursor->trans;
1358         int error;
1359
1360 retry:
1361         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1362             HAMMER_INODE_ONDISK) {
1363                 return(0);
1364         }
1365
1366         hammer_normalize_cursor(cursor);
1367         cursor->key_beg.localization = ip->obj_localization + 
1368                                        HAMMER_LOCALIZE_INODE;
1369         cursor->key_beg.obj_id = ip->obj_id;
1370         cursor->key_beg.key = 0;
1371         cursor->key_beg.create_tid = 0;
1372         cursor->key_beg.delete_tid = 0;
1373         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1374         cursor->key_beg.obj_type = 0;
1375         cursor->asof = ip->obj_asof;
1376         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1377         cursor->flags |= HAMMER_CURSOR_ASOF;
1378         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1379         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1380         cursor->flags |= HAMMER_CURSOR_BACKEND;
1381
1382         error = hammer_btree_lookup(cursor);
1383         if (error == 0) {
1384                 hammer_cache_node(&ip->cache[0], cursor->node);
1385                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1386                         /*
1387                          * Updating MTIME requires an UNDO.  Just cover
1388                          * both atime and mtime.
1389                          */
1390                         hammer_sync_lock_sh(trans);
1391                         hammer_modify_buffer(trans, cursor->data_buffer,
1392                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
1393                                      HAMMER_ITIMES_BYTES);
1394                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1395                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1396                         hammer_modify_buffer_done(cursor->data_buffer);
1397                         hammer_sync_unlock(trans);
1398                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1399                         /*
1400                          * Updating atime only can be done in-place with
1401                          * no UNDO.
1402                          */
1403                         hammer_sync_lock_sh(trans);
1404                         hammer_modify_buffer(trans, cursor->data_buffer,
1405                                              NULL, 0);
1406                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1407                         hammer_modify_buffer_done(cursor->data_buffer);
1408                         hammer_sync_unlock(trans);
1409                 }
1410                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1411         }
1412         if (error == EDEADLK) {
1413                 hammer_done_cursor(cursor);
1414                 error = hammer_init_cursor(trans, cursor,
1415                                            &ip->cache[0], ip);
1416                 if (error == 0)
1417                         goto retry;
1418         }
1419         return(error);
1420 }
1421
1422 /*
1423  * Release a reference on an inode, flush as requested.
1424  *
1425  * On the last reference we queue the inode to the flusher for its final
1426  * disposition.
1427  */
1428 void
1429 hammer_rel_inode(struct hammer_inode *ip, int flush)
1430 {
1431         /*hammer_mount_t hmp = ip->hmp;*/
1432
1433         /*
1434          * Handle disposition when dropping the last ref.
1435          */
1436         for (;;) {
1437                 if (hammer_oneref(&ip->lock)) {
1438                         /*
1439                          * Determine whether on-disk action is needed for
1440                          * the inode's final disposition.
1441                          */
1442                         KKASSERT(ip->vp == NULL);
1443                         hammer_inode_unloadable_check(ip, 0);
1444                         if (ip->flags & HAMMER_INODE_MODMASK) {
1445                                 hammer_flush_inode(ip, 0);
1446                         } else if (hammer_oneref(&ip->lock)) {
1447                                 hammer_unload_inode(ip);
1448                                 break;
1449                         }
1450                 } else {
1451                         if (flush)
1452                                 hammer_flush_inode(ip, 0);
1453
1454                         /*
1455                          * The inode still has multiple refs, try to drop
1456                          * one ref.
1457                          */
1458                         KKASSERT(hammer_isactive(&ip->lock) >= 1);
1459                         if (hammer_isactive(&ip->lock) > 1) {
1460                                 hammer_rel(&ip->lock);
1461                                 break;
1462                         }
1463                 }
1464         }
1465 }
1466
1467 /*
1468  * Unload and destroy the specified inode.  Must be called with one remaining
1469  * reference.  The reference is disposed of.
1470  *
1471  * The inode must be completely clean.
1472  */
1473 static int
1474 hammer_unload_inode(struct hammer_inode *ip)
1475 {
1476         hammer_mount_t hmp = ip->hmp;
1477
1478         KASSERT(hammer_oneref(&ip->lock),
1479                 ("hammer_unload_inode: %d refs", hammer_isactive(&ip->lock)));
1480         KKASSERT(ip->vp == NULL);
1481         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1482         KKASSERT(ip->cursor_ip_refs == 0);
1483         KKASSERT(hammer_notlocked(&ip->lock));
1484         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1485
1486         KKASSERT(RB_EMPTY(&ip->rec_tree));
1487         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1488
1489         if (ip->flags & HAMMER_INODE_RDIRTY) {
1490                 RB_REMOVE(hammer_redo_rb_tree, &hmp->rb_redo_root, ip);
1491                 ip->flags &= ~HAMMER_INODE_RDIRTY;
1492         }
1493         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1494
1495         hammer_free_inode(ip);
1496         return(0);
1497 }
1498
1499 /*
1500  * Called during unmounting if a critical error occured.  The in-memory
1501  * inode and all related structures are destroyed.
1502  *
1503  * If a critical error did not occur the unmount code calls the standard
1504  * release and asserts that the inode is gone.
1505  */
1506 int
1507 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1508 {
1509         hammer_record_t rec;
1510
1511         /*
1512          * Get rid of the inodes in-memory records, regardless of their
1513          * state, and clear the mod-mask.
1514          */
1515         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1516                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1517                 rec->target_ip = NULL;
1518                 if (rec->flush_state == HAMMER_FST_SETUP)
1519                         rec->flush_state = HAMMER_FST_IDLE;
1520         }
1521         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1522                 if (rec->flush_state == HAMMER_FST_FLUSH)
1523                         --rec->flush_group->refs;
1524                 else
1525                         hammer_ref(&rec->lock);
1526                 KKASSERT(hammer_oneref(&rec->lock));
1527                 rec->flush_state = HAMMER_FST_IDLE;
1528                 rec->flush_group = NULL;
1529                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1530                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1531                 ++ip->rec_generation;
1532                 hammer_rel_mem_record(rec);
1533         }
1534         ip->flags &= ~HAMMER_INODE_MODMASK;
1535         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1536         KKASSERT(ip->vp == NULL);
1537
1538         /*
1539          * Remove the inode from any flush group, force it idle.  FLUSH
1540          * and SETUP states have an inode ref.
1541          */
1542         switch(ip->flush_state) {
1543         case HAMMER_FST_FLUSH:
1544                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1545                 --ip->flush_group->refs;
1546                 ip->flush_group = NULL;
1547                 /* fall through */
1548         case HAMMER_FST_SETUP:
1549                 hammer_rel(&ip->lock);
1550                 ip->flush_state = HAMMER_FST_IDLE;
1551                 /* fall through */
1552         case HAMMER_FST_IDLE:
1553                 break;
1554         }
1555
1556         /*
1557          * There shouldn't be any associated vnode.  The unload needs at
1558          * least one ref, if we do have a vp steal its ip ref.
1559          */
1560         if (ip->vp) {
1561                 kprintf("hammer_destroy_inode_callback: Unexpected "
1562                         "vnode association ip %p vp %p\n", ip, ip->vp);
1563                 ip->vp->v_data = NULL;
1564                 ip->vp = NULL;
1565         } else {
1566                 hammer_ref(&ip->lock);
1567         }
1568         hammer_unload_inode(ip);
1569         return(0);
1570 }
1571
1572 /*
1573  * Called on mount -u when switching from RW to RO or vise-versa.  Adjust
1574  * the read-only flag for cached inodes.
1575  *
1576  * This routine is called from a RB_SCAN().
1577  */
1578 int
1579 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1580 {
1581         hammer_mount_t hmp = ip->hmp;
1582
1583         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1584                 ip->flags |= HAMMER_INODE_RO;
1585         else
1586                 ip->flags &= ~HAMMER_INODE_RO;
1587         return(0);
1588 }
1589
1590 /*
1591  * A transaction has modified an inode, requiring updates as specified by
1592  * the passed flags.
1593  *
1594  * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
1595  *                      and not including size changes due to write-append
1596  *                      (but other size changes are included).
1597  * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
1598  *                      write-append.
1599  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1600  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1601  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1602  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1603  */
1604 void
1605 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
1606 {
1607         /* 
1608          * ronly of 0 or 2 does not trigger assertion.
1609          * 2 is a special error state 
1610          */
1611         KKASSERT(ip->hmp->ronly != 1 ||
1612                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
1613                             HAMMER_INODE_SDIRTY |
1614                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1615                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1616         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1617                 ip->flags |= HAMMER_INODE_RSV_INODES;
1618                 ++ip->hmp->rsv_inodes;
1619         }
1620
1621         /*
1622          * Set the NEWINODE flag in the transaction if the inode
1623          * transitions to a dirty state.  This is used to track
1624          * the load on the inode cache.
1625          */
1626         if (trans &&
1627             (ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1628             (flags & HAMMER_INODE_MODMASK)) {
1629                 trans->flags |= HAMMER_TRANSF_NEWINODE;
1630         }
1631
1632         ip->flags |= flags;
1633 }
1634
1635 /*
1636  * Attempt to quickly update the atime for a hammer inode.  Return 0 on
1637  * success, -1 on failure.
1638  *
1639  * We attempt to update the atime with only the ip lock and not the
1640  * whole filesystem lock in order to improve concurrency.  We can only
1641  * do this safely if the ATIME flag is already pending on the inode.
1642  *
1643  * This function is called via a vnops path (ip pointer is stable) without
1644  * fs_token held.
1645  */
1646 int
1647 hammer_update_atime_quick(hammer_inode_t ip)
1648 {
1649         struct timeval tv;
1650         int res = -1;
1651
1652         if ((ip->flags & HAMMER_INODE_RO) ||
1653             (ip->hmp->mp->mnt_flag & MNT_NOATIME)) {
1654                 /*
1655                  * Silently indicate success on read-only mount/snap
1656                  */
1657                 res = 0;
1658         } else if (ip->flags & HAMMER_INODE_ATIME) {
1659                 /*
1660                  * Double check with inode lock held against backend.  This
1661                  * is only safe if all we need to do is update
1662                  * ino_data.atime.
1663                  */
1664                 getmicrotime(&tv);
1665                 hammer_lock_ex(&ip->lock);
1666                 if (ip->flags & HAMMER_INODE_ATIME) {
1667                         ip->ino_data.atime =
1668                             (unsigned long)tv.tv_sec * 1000000ULL + tv.tv_usec;
1669                         res = 0;
1670                 }
1671                 hammer_unlock(&ip->lock);
1672         }
1673         return res;
1674 }
1675
1676 /*
1677  * Request that an inode be flushed.  This whole mess cannot block and may
1678  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1679  * actively flush the inode until the flush can be done.
1680  *
1681  * The inode may already be flushing, or may be in a setup state.  We can
1682  * place the inode in a flushing state if it is currently idle and flag it
1683  * to reflush if it is currently flushing.
1684  *
1685  * Upon return if the inode could not be flushed due to a setup
1686  * dependancy, then it will be automatically flushed when the dependancy
1687  * is satisfied.
1688  */
1689 void
1690 hammer_flush_inode(hammer_inode_t ip, int flags)
1691 {
1692         hammer_mount_t hmp;
1693         hammer_flush_group_t flg;
1694         int good;
1695
1696         /*
1697          * fill_flush_group is the first flush group we may be able to
1698          * continue filling, it may be open or closed but it will always
1699          * be past the currently flushing (running) flg.
1700          *
1701          * next_flush_group is the next open flush group.
1702          */
1703         hmp = ip->hmp;
1704         while ((flg = hmp->fill_flush_group) != NULL) {
1705                 KKASSERT(flg->running == 0);
1706                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit &&
1707                     flg->total_count <= hammer_autoflush) {
1708                         break;
1709                 }
1710                 hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
1711                 hammer_flusher_async(ip->hmp, flg);
1712         }
1713         if (flg == NULL) {
1714                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1715                 flg->seq = hmp->flusher.next++;
1716                 if (hmp->next_flush_group == NULL)
1717                         hmp->next_flush_group = flg;
1718                 if (hmp->fill_flush_group == NULL)
1719                         hmp->fill_flush_group = flg;
1720                 RB_INIT(&flg->flush_tree);
1721                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1722         }
1723
1724         /*
1725          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1726          * state we have to put it back into an IDLE state so we can
1727          * drop the extra ref.
1728          *
1729          * If we have a parent dependancy we must still fall through
1730          * so we can run it.
1731          */
1732         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1733                 if (ip->flush_state == HAMMER_FST_SETUP &&
1734                     TAILQ_EMPTY(&ip->target_list)) {
1735                         ip->flush_state = HAMMER_FST_IDLE;
1736                         hammer_rel_inode(ip, 0);
1737                 }
1738                 if (ip->flush_state == HAMMER_FST_IDLE)
1739                         return;
1740         }
1741
1742         /*
1743          * Our flush action will depend on the current state.
1744          */
1745         switch(ip->flush_state) {
1746         case HAMMER_FST_IDLE:
1747                 /*
1748                  * We have no dependancies and can flush immediately.  Some
1749                  * our children may not be flushable so we have to re-test
1750                  * with that additional knowledge.
1751                  */
1752                 hammer_flush_inode_core(ip, flg, flags);
1753                 break;
1754         case HAMMER_FST_SETUP:
1755                 /*
1756                  * Recurse upwards through dependancies via target_list
1757                  * and start their flusher actions going if possible.
1758                  *
1759                  * 'good' is our connectivity.  -1 means we have none and
1760                  * can't flush, 0 means there weren't any dependancies, and
1761                  * 1 means we have good connectivity.
1762                  */
1763                 good = hammer_setup_parent_inodes(ip, 0, flg);
1764
1765                 if (good >= 0) {
1766                         /*
1767                          * We can continue if good >= 0.  Determine how 
1768                          * many records under our inode can be flushed (and
1769                          * mark them).
1770                          */
1771                         hammer_flush_inode_core(ip, flg, flags);
1772                 } else {
1773                         /*
1774                          * Parent has no connectivity, tell it to flush
1775                          * us as soon as it does.
1776                          *
1777                          * The REFLUSH flag is also needed to trigger
1778                          * dependancy wakeups.
1779                          */
1780                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1781                                      HAMMER_INODE_REFLUSH;
1782                         if (flags & HAMMER_FLUSH_SIGNAL) {
1783                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1784                                 hammer_flusher_async(ip->hmp, flg);
1785                         }
1786                 }
1787                 break;
1788         case HAMMER_FST_FLUSH:
1789                 /*
1790                  * We are already flushing, flag the inode to reflush
1791                  * if needed after it completes its current flush.
1792                  *
1793                  * The REFLUSH flag is also needed to trigger
1794                  * dependancy wakeups.
1795                  */
1796                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1797                         ip->flags |= HAMMER_INODE_REFLUSH;
1798                 if (flags & HAMMER_FLUSH_SIGNAL) {
1799                         ip->flags |= HAMMER_INODE_RESIGNAL;
1800                         hammer_flusher_async(ip->hmp, flg);
1801                 }
1802                 break;
1803         }
1804 }
1805
1806 /*
1807  * Scan ip->target_list, which is a list of records owned by PARENTS to our
1808  * ip which reference our ip.
1809  *
1810  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1811  *     so for now do not ref/deref the structures.  Note that if we use the
1812  *     ref/rel code later, the rel CAN block.
1813  */
1814 static int
1815 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1816                            hammer_flush_group_t flg)
1817 {
1818         hammer_record_t depend;
1819         int good;
1820         int r;
1821
1822         /*
1823          * If we hit our recursion limit and we have parent dependencies
1824          * We cannot continue.  Returning < 0 will cause us to be flagged
1825          * for reflush.  Returning -2 cuts off additional dependency checks
1826          * because they are likely to also hit the depth limit.
1827          *
1828          * We cannot return < 0 if there are no dependencies or there might
1829          * not be anything to wakeup (ip).
1830          */
1831         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1832                 kprintf("HAMMER Warning: depth limit reached on "
1833                         "setup recursion, inode %p %016llx\n",
1834                         ip, (long long)ip->obj_id);
1835                 return(-2);
1836         }
1837
1838         /*
1839          * Scan dependencies
1840          */
1841         good = 0;
1842         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1843                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1844                 KKASSERT(depend->target_ip == ip);
1845                 if (r < 0 && good == 0)
1846                         good = -1;
1847                 if (r > 0)
1848                         good = 1;
1849
1850                 /*
1851                  * If we failed due to the recursion depth limit then stop
1852                  * now.
1853                  */
1854                 if (r == -2)
1855                         break;
1856         }
1857         return(good);
1858 }
1859
1860 /*
1861  * This helper function takes a record representing the dependancy between
1862  * the parent inode and child inode.
1863  *
1864  * record->ip           = parent inode
1865  * record->target_ip    = child inode
1866  * 
1867  * We are asked to recurse upwards and convert the record from SETUP
1868  * to FLUSH if possible.
1869  *
1870  * Return 1 if the record gives us connectivity
1871  *
1872  * Return 0 if the record is not relevant 
1873  *
1874  * Return -1 if we can't resolve the dependancy and there is no connectivity.
1875  */
1876 static int
1877 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1878                                   hammer_flush_group_t flg)
1879 {
1880         hammer_mount_t hmp;
1881         hammer_inode_t pip;
1882         int good;
1883
1884         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1885         pip = record->ip;
1886         hmp = pip->hmp;
1887
1888         /*
1889          * If the record is already flushing, is it in our flush group?
1890          *
1891          * If it is in our flush group but it is a general record or a 
1892          * delete-on-disk, it does not improve our connectivity (return 0),
1893          * and if the target inode is not trying to destroy itself we can't
1894          * allow the operation yet anyway (the second return -1).
1895          */
1896         if (record->flush_state == HAMMER_FST_FLUSH) {
1897                 /*
1898                  * If not in our flush group ask the parent to reflush
1899                  * us as soon as possible.
1900                  */
1901                 if (record->flush_group != flg) {
1902                         pip->flags |= HAMMER_INODE_REFLUSH;
1903                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1904                         return(-1);
1905                 }
1906
1907                 /*
1908                  * If in our flush group everything is already set up,
1909                  * just return whether the record will improve our
1910                  * visibility or not.
1911                  */
1912                 if (record->type == HAMMER_MEM_RECORD_ADD)
1913                         return(1);
1914                 return(0);
1915         }
1916
1917         /*
1918          * It must be a setup record.  Try to resolve the setup dependancies
1919          * by recursing upwards so we can place ip on the flush list.
1920          *
1921          * Limit ourselves to 20 levels of recursion to avoid blowing out
1922          * the kernel stack.  If we hit the recursion limit we can't flush
1923          * until the parent flushes.  The parent will flush independantly
1924          * on its own and ultimately a deep recursion will be resolved.
1925          */
1926         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1927
1928         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1929
1930         /*
1931          * If good < 0 the parent has no connectivity and we cannot safely
1932          * flush the directory entry, which also means we can't flush our
1933          * ip.  Flag us for downward recursion once the parent's
1934          * connectivity is resolved.  Flag the parent for [re]flush or it
1935          * may not check for downward recursions.
1936          */
1937         if (good < 0) {
1938                 pip->flags |= HAMMER_INODE_REFLUSH;
1939                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1940                 return(good);
1941         }
1942
1943         /*
1944          * We are go, place the parent inode in a flushing state so we can
1945          * place its record in a flushing state.  Note that the parent
1946          * may already be flushing.  The record must be in the same flush
1947          * group as the parent.
1948          */
1949         if (pip->flush_state != HAMMER_FST_FLUSH)
1950                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1951         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1952
1953         /*
1954          * It is possible for a rename to create a loop in the recursion
1955          * and revisit a record.  This will result in the record being
1956          * placed in a flush state unexpectedly.  This check deals with
1957          * the case.
1958          */
1959         if (record->flush_state == HAMMER_FST_FLUSH) {
1960                 if (record->type == HAMMER_MEM_RECORD_ADD)
1961                         return(1);
1962                 return(0);
1963         }
1964
1965         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1966
1967 #if 0
1968         if (record->type == HAMMER_MEM_RECORD_DEL &&
1969             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1970                 /*
1971                  * Regardless of flushing state we cannot sync this path if the
1972                  * record represents a delete-on-disk but the target inode
1973                  * is not ready to sync its own deletion.
1974                  *
1975                  * XXX need to count effective nlinks to determine whether
1976                  * the flush is ok, otherwise removing a hardlink will
1977                  * just leave the DEL record to rot.
1978                  */
1979                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1980                 return(-1);
1981         } else
1982 #endif
1983         if (pip->flush_group == flg) {
1984                 /*
1985                  * Because we have not calculated nlinks yet we can just
1986                  * set records to the flush state if the parent is in
1987                  * the same flush group as we are.
1988                  */
1989                 record->flush_state = HAMMER_FST_FLUSH;
1990                 record->flush_group = flg;
1991                 ++record->flush_group->refs;
1992                 hammer_ref(&record->lock);
1993
1994                 /*
1995                  * A general directory-add contributes to our visibility.
1996                  *
1997                  * Otherwise it is probably a directory-delete or 
1998                  * delete-on-disk record and does not contribute to our
1999                  * visbility (but we can still flush it).
2000                  */
2001                 if (record->type == HAMMER_MEM_RECORD_ADD)
2002                         return(1);
2003                 return(0);
2004         } else {
2005                 /*
2006                  * If the parent is not in our flush group we cannot
2007                  * flush this record yet, there is no visibility.
2008                  * We tell the parent to reflush and mark ourselves
2009                  * so the parent knows it should flush us too.
2010                  */
2011                 pip->flags |= HAMMER_INODE_REFLUSH;
2012                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
2013                 return(-1);
2014         }
2015 }
2016
2017 /*
2018  * This is the core routine placing an inode into the FST_FLUSH state.
2019  */
2020 static void
2021 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
2022 {
2023         hammer_mount_t hmp = ip->hmp;
2024         int go_count;
2025
2026         /*
2027          * Set flush state and prevent the flusher from cycling into
2028          * the next flush group.  Do not place the ip on the list yet.
2029          * Inodes not in the idle state get an extra reference.
2030          */
2031         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
2032         if (ip->flush_state == HAMMER_FST_IDLE)
2033                 hammer_ref(&ip->lock);
2034         ip->flush_state = HAMMER_FST_FLUSH;
2035         ip->flush_group = flg;
2036         ++hmp->flusher.group_lock;
2037         ++hmp->count_iqueued;
2038         ++hammer_count_iqueued;
2039         ++flg->total_count;
2040         hammer_redo_fifo_start_flush(ip);
2041
2042 #if 0
2043         /*
2044          * We need to be able to vfsync/truncate from the backend.
2045          *
2046          * XXX Any truncation from the backend will acquire the vnode
2047          *     independently.
2048          */
2049         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
2050         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
2051                 ip->flags |= HAMMER_INODE_VHELD;
2052                 vref(ip->vp);
2053         }
2054 #endif
2055
2056         /*
2057          * Figure out how many in-memory records we can actually flush
2058          * (not including inode meta-data, buffers, etc).
2059          */
2060         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
2061         if (flags & HAMMER_FLUSH_RECURSION) {
2062                 /*
2063                  * If this is a upwards recursion we do not want to
2064                  * recurse down again!
2065                  */
2066                 go_count = 1;
2067 #if 0
2068         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2069                 /*
2070                  * No new records are added if we must complete a flush
2071                  * from a previous cycle, but we do have to move the records
2072                  * from the previous cycle to the current one.
2073                  */
2074 #if 0
2075                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2076                                    hammer_syncgrp_child_callback, NULL);
2077 #endif
2078                 go_count = 1;
2079 #endif
2080         } else {
2081                 /*
2082                  * Normal flush, scan records and bring them into the flush.
2083                  * Directory adds and deletes are usually skipped (they are
2084                  * grouped with the related inode rather then with the
2085                  * directory).
2086                  *
2087                  * go_count can be negative, which means the scan aborted
2088                  * due to the flush group being over-full and we should
2089                  * flush what we have.
2090                  */
2091                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2092                                    hammer_setup_child_callback, NULL);
2093         }
2094
2095         /*
2096          * This is a more involved test that includes go_count.  If we
2097          * can't flush, flag the inode and return.  If go_count is 0 we
2098          * were are unable to flush any records in our rec_tree and
2099          * must ignore the XDIRTY flag.
2100          */
2101         if (go_count == 0) {
2102                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
2103                         --hmp->count_iqueued;
2104                         --hammer_count_iqueued;
2105
2106                         --flg->total_count;
2107                         ip->flush_state = HAMMER_FST_SETUP;
2108                         ip->flush_group = NULL;
2109                         if (flags & HAMMER_FLUSH_SIGNAL) {
2110                                 ip->flags |= HAMMER_INODE_REFLUSH |
2111                                              HAMMER_INODE_RESIGNAL;
2112                         } else {
2113                                 ip->flags |= HAMMER_INODE_REFLUSH;
2114                         }
2115 #if 0
2116                         if (ip->flags & HAMMER_INODE_VHELD) {
2117                                 ip->flags &= ~HAMMER_INODE_VHELD;
2118                                 vrele(ip->vp);
2119                         }
2120 #endif
2121
2122                         /*
2123                          * REFLUSH is needed to trigger dependancy wakeups
2124                          * when an inode is in SETUP.
2125                          */
2126                         ip->flags |= HAMMER_INODE_REFLUSH;
2127                         if (--hmp->flusher.group_lock == 0)
2128                                 wakeup(&hmp->flusher.group_lock);
2129                         return;
2130                 }
2131         }
2132
2133         /*
2134          * Snapshot the state of the inode for the backend flusher.
2135          *
2136          * We continue to retain save_trunc_off even when all truncations
2137          * have been resolved as an optimization to determine if we can
2138          * skip the B-Tree lookup for overwrite deletions.
2139          *
2140          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2141          * and stays in ip->flags.  Once set, it stays set until the
2142          * inode is destroyed.
2143          */
2144         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2145                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2146                 ip->sync_trunc_off = ip->trunc_off;
2147                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2148                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2149                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2150
2151                 /*
2152                  * The save_trunc_off used to cache whether the B-Tree
2153                  * holds any records past that point is not used until
2154                  * after the truncation has succeeded, so we can safely
2155                  * set it now.
2156                  */
2157                 if (ip->save_trunc_off > ip->sync_trunc_off)
2158                         ip->save_trunc_off = ip->sync_trunc_off;
2159         }
2160         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2161                            ~HAMMER_INODE_TRUNCATED);
2162         ip->sync_ino_leaf = ip->ino_leaf;
2163         ip->sync_ino_data = ip->ino_data;
2164         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2165 #ifdef DEBUG_TRUNCATE
2166         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2167                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2168 #endif
2169
2170         /*
2171          * The flusher list inherits our inode and reference.
2172          */
2173         KKASSERT(flg->running == 0);
2174         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2175         if (--hmp->flusher.group_lock == 0)
2176                 wakeup(&hmp->flusher.group_lock);
2177
2178         /*
2179          * Auto-flush the group if it grows too large.  Make sure the
2180          * inode reclaim wait pipeline continues to work.
2181          */
2182         if (flg->total_count >= hammer_autoflush ||
2183             flg->total_count >= hammer_limit_reclaims / 4) {
2184                 if (hmp->fill_flush_group == flg)
2185                         hmp->fill_flush_group = TAILQ_NEXT(flg, flush_entry);
2186                 hammer_flusher_async(hmp, flg);
2187         }
2188 }
2189
2190 /*
2191  * Callback for scan of ip->rec_tree.  Try to include each record in our
2192  * flush.  ip->flush_group has been set but the inode has not yet been
2193  * moved into a flushing state.
2194  *
2195  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2196  * both inodes.
2197  *
2198  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2199  * the caller from shortcutting the flush.
2200  */
2201 static int
2202 hammer_setup_child_callback(hammer_record_t rec, void *data)
2203 {
2204         hammer_flush_group_t flg;
2205         hammer_inode_t target_ip;
2206         hammer_inode_t ip;
2207         int r;
2208
2209         /*
2210          * Records deleted or committed by the backend are ignored.
2211          * Note that the flush detects deleted frontend records at
2212          * multiple points to deal with races.  This is just the first
2213          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2214          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2215          * messes up link-count calculations.
2216          *
2217          * NOTE: Don't get confused between record deletion and, say,
2218          * directory entry deletion.  The deletion of a directory entry
2219          * which is on-media has nothing to do with the record deletion
2220          * flags.
2221          */
2222         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2223                           HAMMER_RECF_COMMITTED)) {
2224                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2225                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2226                         r = 1;
2227                 } else {
2228                         r = 0;
2229                 }
2230                 return(r);
2231         }
2232
2233         /*
2234          * If the record is in an idle state it has no dependancies and
2235          * can be flushed.
2236          */
2237         ip = rec->ip;
2238         flg = ip->flush_group;
2239         r = 0;
2240
2241         switch(rec->flush_state) {
2242         case HAMMER_FST_IDLE:
2243                 /*
2244                  * The record has no setup dependancy, we can flush it.
2245                  */
2246                 KKASSERT(rec->target_ip == NULL);
2247                 rec->flush_state = HAMMER_FST_FLUSH;
2248                 rec->flush_group = flg;
2249                 ++flg->refs;
2250                 hammer_ref(&rec->lock);
2251                 r = 1;
2252                 break;
2253         case HAMMER_FST_SETUP:
2254                 /*
2255                  * The record has a setup dependancy.  These are typically
2256                  * directory entry adds and deletes.  Such entries will be
2257                  * flushed when their inodes are flushed so we do not
2258                  * usually have to add them to the flush here.  However,
2259                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2260                  * it is asking us to flush this record (and it).
2261                  */
2262                 target_ip = rec->target_ip;
2263                 KKASSERT(target_ip != NULL);
2264                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2265
2266                 /*
2267                  * If the target IP is already flushing in our group
2268                  * we could associate the record, but target_ip has
2269                  * already synced ino_data to sync_ino_data and we
2270                  * would also have to adjust nlinks.   Plus there are
2271                  * ordering issues for adds and deletes.
2272                  *
2273                  * Reflush downward if this is an ADD, and upward if
2274                  * this is a DEL.
2275                  */
2276                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2277                         if (rec->type == HAMMER_MEM_RECORD_ADD)
2278                                 ip->flags |= HAMMER_INODE_REFLUSH;
2279                         else
2280                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2281                         break;
2282                 } 
2283
2284                 /*
2285                  * Target IP is not yet flushing.  This can get complex
2286                  * because we have to be careful about the recursion.
2287                  *
2288                  * Directories create an issue for us in that if a flush
2289                  * of a directory is requested the expectation is to flush
2290                  * any pending directory entries, but this will cause the
2291                  * related inodes to recursively flush as well.  We can't
2292                  * really defer the operation so just get as many as we
2293                  * can and
2294                  */
2295 #if 0
2296                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2297                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2298                         /*
2299                          * We aren't reclaiming and the target ip was not
2300                          * previously prevented from flushing due to this
2301                          * record dependancy.  Do not flush this record.
2302                          */
2303                         /*r = 0;*/
2304                 } else
2305 #endif
2306                 if (flg->total_count + flg->refs >
2307                            ip->hmp->undo_rec_limit) {
2308                         /*
2309                          * Our flush group is over-full and we risk blowing
2310                          * out the UNDO FIFO.  Stop the scan, flush what we
2311                          * have, then reflush the directory.
2312                          *
2313                          * The directory may be forced through multiple
2314                          * flush groups before it can be completely
2315                          * flushed.
2316                          */
2317                         ip->flags |= HAMMER_INODE_RESIGNAL |
2318                                      HAMMER_INODE_REFLUSH;
2319                         r = -1;
2320                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2321                         /*
2322                          * If the target IP is not flushing we can force
2323                          * it to flush, even if it is unable to write out
2324                          * any of its own records we have at least one in
2325                          * hand that we CAN deal with.
2326                          */
2327                         rec->flush_state = HAMMER_FST_FLUSH;
2328                         rec->flush_group = flg;
2329                         ++flg->refs;
2330                         hammer_ref(&rec->lock);
2331                         hammer_flush_inode_core(target_ip, flg,
2332                                                 HAMMER_FLUSH_RECURSION);
2333                         r = 1;
2334                 } else {
2335                         /*
2336                          * General or delete-on-disk record.
2337                          *
2338                          * XXX this needs help.  If a delete-on-disk we could
2339                          * disconnect the target.  If the target has its own
2340                          * dependancies they really need to be flushed.
2341                          *
2342                          * XXX
2343                          */
2344                         rec->flush_state = HAMMER_FST_FLUSH;
2345                         rec->flush_group = flg;
2346                         ++flg->refs;
2347                         hammer_ref(&rec->lock);
2348                         hammer_flush_inode_core(target_ip, flg,
2349                                                 HAMMER_FLUSH_RECURSION);
2350                         r = 1;
2351                 }
2352                 break;
2353         case HAMMER_FST_FLUSH:
2354                 /* 
2355                  * The record could be part of a previous flush group if the
2356                  * inode is a directory (the record being a directory entry).
2357                  * Once the flush group was closed a hammer_test_inode()
2358                  * function can cause a new flush group to be setup, placing
2359                  * the directory inode itself in a new flush group.
2360                  *
2361                  * When associated with a previous flush group we count it
2362                  * as if it were in our current flush group, since it will
2363                  * effectively be flushed by the time we flush our current
2364                  * flush group.
2365                  */
2366                 KKASSERT(
2367                     rec->ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY ||
2368                     rec->flush_group == flg);
2369                 r = 1;
2370                 break;
2371         }
2372         return(r);
2373 }
2374
2375 #if 0
2376 /*
2377  * This version just moves records already in a flush state to the new
2378  * flush group and that is it.
2379  */
2380 static int
2381 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2382 {
2383         hammer_inode_t ip = rec->ip;
2384
2385         switch(rec->flush_state) {
2386         case HAMMER_FST_FLUSH:
2387                 KKASSERT(rec->flush_group == ip->flush_group);
2388                 break;
2389         default:
2390                 break;
2391         }
2392         return(0);
2393 }
2394 #endif
2395
2396 /*
2397  * Wait for a previously queued flush to complete.
2398  *
2399  * If a critical error occured we don't try to wait.
2400  */
2401 void
2402 hammer_wait_inode(hammer_inode_t ip)
2403 {
2404         /*
2405          * The inode can be in a SETUP state in which case RESIGNAL
2406          * should be set.  If RESIGNAL is not set then the previous
2407          * flush completed and a later operation placed the inode
2408          * in a passive setup state again, so we're done.
2409          *
2410          * The inode can be in a FLUSH state in which case we
2411          * can just wait for completion.
2412          */
2413         while (ip->flush_state == HAMMER_FST_FLUSH ||
2414             (ip->flush_state == HAMMER_FST_SETUP &&
2415              (ip->flags & HAMMER_INODE_RESIGNAL))) {
2416                 /*
2417                  * Don't try to flush on a critical error
2418                  */
2419                 if (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
2420                         break;
2421
2422                 /*
2423                  * If the inode was already being flushed its flg
2424                  * may not have been queued to the backend.  We
2425                  * have to make sure it gets queued or we can wind
2426                  * up blocked or deadlocked (particularly if we are
2427                  * the vnlru thread).
2428                  */
2429                 if (ip->flush_state == HAMMER_FST_FLUSH) {
2430                         KKASSERT(ip->flush_group);
2431                         if (ip->flush_group->closed == 0) {
2432                                 if (hammer_debug_inode) {
2433                                         kprintf("hammer: debug: forcing "
2434                                                 "async flush ip %016jx\n",
2435                                                 (intmax_t)ip->obj_id);
2436                                 }
2437                                 hammer_flusher_async(ip->hmp,
2438                                                      ip->flush_group);
2439                                 continue; /* retest */
2440                         }
2441                 }
2442
2443                 /*
2444                  * In a flush state with the flg queued to the backend
2445                  * or in a setup state with RESIGNAL set, we can safely
2446                  * wait.
2447                  */
2448                 ip->flags |= HAMMER_INODE_FLUSHW;
2449                 tsleep(&ip->flags, 0, "hmrwin", 0);
2450         }
2451
2452 #if 0
2453         /*
2454          * The inode may have been in a passive setup state,
2455          * call flush to make sure we get signaled.
2456          */
2457         if (ip->flush_state == HAMMER_FST_SETUP)
2458                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2459 #endif
2460
2461 }
2462
2463 /*
2464  * Called by the backend code when a flush has been completed.
2465  * The inode has already been removed from the flush list.
2466  *
2467  * A pipelined flush can occur, in which case we must re-enter the
2468  * inode on the list and re-copy its fields.
2469  */
2470 void
2471 hammer_flush_inode_done(hammer_inode_t ip, int error)
2472 {
2473         hammer_mount_t hmp;
2474         int dorel;
2475
2476         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2477
2478         hmp = ip->hmp;
2479
2480         /*
2481          * Auto-reflush if the backend could not completely flush
2482          * the inode.  This fixes a case where a deferred buffer flush
2483          * could cause fsync to return early.
2484          */
2485         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2486                 ip->flags |= HAMMER_INODE_REFLUSH;
2487
2488         /*
2489          * Merge left-over flags back into the frontend and fix the state.
2490          * Incomplete truncations are retained by the backend.
2491          */
2492         ip->error = error;
2493         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2494         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2495
2496         /*
2497          * The backend may have adjusted nlinks, so if the adjusted nlinks
2498          * does not match the fronttend set the frontend's DDIRTY flag again.
2499          */
2500         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2501                 ip->flags |= HAMMER_INODE_DDIRTY;
2502
2503         /*
2504          * Fix up the dirty buffer status.
2505          */
2506         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2507                 ip->flags |= HAMMER_INODE_BUFS;
2508         }
2509         hammer_redo_fifo_end_flush(ip);
2510
2511         /*
2512          * Re-set the XDIRTY flag if some of the inode's in-memory records
2513          * could not be flushed.
2514          */
2515         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2516                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2517                  (!RB_EMPTY(&ip->rec_tree) &&
2518                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2519
2520         /*
2521          * Do not lose track of inodes which no longer have vnode
2522          * assocations, otherwise they may never get flushed again.
2523          *
2524          * The reflush flag can be set superfluously, causing extra pain
2525          * for no reason.  If the inode is no longer modified it no longer
2526          * needs to be flushed.
2527          */
2528         if (ip->flags & HAMMER_INODE_MODMASK) {
2529                 if (ip->vp == NULL)
2530                         ip->flags |= HAMMER_INODE_REFLUSH;
2531         } else {
2532                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2533         }
2534
2535         /*
2536          * Adjust the flush state.
2537          */
2538         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2539                 /*
2540                  * We were unable to flush out all our records, leave the
2541                  * inode in a flush state and in the current flush group.
2542                  * The flush group will be re-run.
2543                  *
2544                  * This occurs if the UNDO block gets too full or there is
2545                  * too much dirty meta-data and allows the flusher to
2546                  * finalize the UNDO block and then re-flush.
2547                  */
2548                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2549                 dorel = 0;
2550         } else {
2551                 /*
2552                  * Remove from the flush_group
2553                  */
2554                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2555                 ip->flush_group = NULL;
2556
2557 #if 0
2558                 /*
2559                  * Clean up the vnode ref and tracking counts.
2560                  */
2561                 if (ip->flags & HAMMER_INODE_VHELD) {
2562                         ip->flags &= ~HAMMER_INODE_VHELD;
2563                         vrele(ip->vp);
2564                 }
2565 #endif
2566                 --hmp->count_iqueued;
2567                 --hammer_count_iqueued;
2568
2569                 /*
2570                  * And adjust the state.
2571                  */
2572                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2573                         ip->flush_state = HAMMER_FST_IDLE;
2574                         dorel = 1;
2575                 } else {
2576                         ip->flush_state = HAMMER_FST_SETUP;
2577                         dorel = 0;
2578                 }
2579
2580                 /*
2581                  * If the frontend is waiting for a flush to complete,
2582                  * wake it up.
2583                  */
2584                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2585                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2586                         wakeup(&ip->flags);
2587                 }
2588
2589                 /*
2590                  * If the frontend made more changes and requested another
2591                  * flush, then try to get it running.
2592                  *
2593                  * Reflushes are aborted when the inode is errored out.
2594                  */
2595                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2596                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2597                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2598                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2599                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2600                         } else {
2601                                 hammer_flush_inode(ip, 0);
2602                         }
2603                 }
2604         }
2605
2606         /*
2607          * If we have no parent dependancies we can clear CONN_DOWN
2608          */
2609         if (TAILQ_EMPTY(&ip->target_list))
2610                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2611
2612         /*
2613          * If the inode is now clean drop the space reservation.
2614          */
2615         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2616             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2617                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2618                 --hmp->rsv_inodes;
2619         }
2620
2621         ip->flags &= ~HAMMER_INODE_SLAVEFLUSH;
2622
2623         if (dorel)
2624                 hammer_rel_inode(ip, 0);
2625 }
2626
2627 /*
2628  * Called from hammer_sync_inode() to synchronize in-memory records
2629  * to the media.
2630  */
2631 static int
2632 hammer_sync_record_callback(hammer_record_t record, void *data)
2633 {
2634         hammer_cursor_t cursor = data;
2635         hammer_transaction_t trans = cursor->trans;
2636         hammer_mount_t hmp = trans->hmp;
2637         int error;
2638
2639         /*
2640          * Skip records that do not belong to the current flush.
2641          */
2642         ++hammer_stats_record_iterations;
2643         if (record->flush_state != HAMMER_FST_FLUSH)
2644                 return(0);
2645
2646 #if 1
2647         if (record->flush_group != record->ip->flush_group) {
2648                 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group ,record->ip->flush_group);
2649                 if (hammer_debug_critical)
2650                         Debugger("blah2");
2651                 return(0);
2652         }
2653 #endif
2654         KKASSERT(record->flush_group == record->ip->flush_group);
2655
2656         /*
2657          * Interlock the record using the BE flag.  Once BE is set the
2658          * frontend cannot change the state of FE.
2659          *
2660          * NOTE: If FE is set prior to us setting BE we still sync the
2661          * record out, but the flush completion code converts it to 
2662          * a delete-on-disk record instead of destroying it.
2663          */
2664         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2665         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2666
2667         /*
2668          * The backend has already disposed of the record.
2669          */
2670         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2671                 error = 0;
2672                 goto done;
2673         }
2674
2675         /*
2676          * If the whole inode is being deleted and all on-disk records will
2677          * be deleted very soon, we can't sync any new records to disk
2678          * because they will be deleted in the same transaction they were
2679          * created in (delete_tid == create_tid), which will assert.
2680          *
2681          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2682          * that we currently panic on.
2683          */
2684         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2685                 switch(record->type) {
2686                 case HAMMER_MEM_RECORD_DATA:
2687                         /*
2688                          * We don't have to do anything, if the record was
2689                          * committed the space will have been accounted for
2690                          * in the blockmap.
2691                          */
2692                         /* fall through */
2693                 case HAMMER_MEM_RECORD_GENERAL:
2694                         /*
2695                          * Set deleted-by-backend flag.  Do not set the
2696                          * backend committed flag, because we are throwing
2697                          * the record away.
2698                          */
2699                         record->flags |= HAMMER_RECF_DELETED_BE;
2700                         ++record->ip->rec_generation;
2701                         error = 0;
2702                         goto done;
2703                 case HAMMER_MEM_RECORD_ADD:
2704                         panic("hammer_sync_record_callback: illegal add "
2705                               "during inode deletion record %p", record);
2706                         break; /* NOT REACHED */
2707                 case HAMMER_MEM_RECORD_INODE:
2708                         panic("hammer_sync_record_callback: attempt to "
2709                               "sync inode record %p?", record);
2710                         break; /* NOT REACHED */
2711                 case HAMMER_MEM_RECORD_DEL:
2712                         /* 
2713                          * Follow through and issue the on-disk deletion
2714                          */
2715                         break;
2716                 }
2717         }
2718
2719         /*
2720          * If DELETED_FE is set special handling is needed for directory
2721          * entries.  Dependant pieces related to the directory entry may
2722          * have already been synced to disk.  If this occurs we have to
2723          * sync the directory entry and then change the in-memory record
2724          * from an ADD to a DELETE to cover the fact that it's been
2725          * deleted by the frontend.
2726          *
2727          * A directory delete covering record (MEM_RECORD_DEL) can never
2728          * be deleted by the frontend.
2729          *
2730          * Any other record type (aka DATA) can be deleted by the frontend.
2731          * XXX At the moment the flusher must skip it because there may
2732          * be another data record in the flush group for the same block,
2733          * meaning that some frontend data changes can leak into the backend's
2734          * synchronization point.
2735          */
2736         if (record->flags & HAMMER_RECF_DELETED_FE) {
2737                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2738                         /*
2739                          * Convert a front-end deleted directory-add to
2740                          * a directory-delete entry later.
2741                          */
2742                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2743                 } else {
2744                         /*
2745                          * Dispose of the record (race case).  Mark as
2746                          * deleted by backend (and not committed).
2747                          */
2748                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2749                         record->flags |= HAMMER_RECF_DELETED_BE;
2750                         ++record->ip->rec_generation;
2751                         error = 0;
2752                         goto done;
2753                 }
2754         }
2755
2756         /*
2757          * Assign the create_tid for new records.  Deletions already
2758          * have the record's entire key properly set up.
2759          */
2760         if (record->type != HAMMER_MEM_RECORD_DEL) {
2761                 record->leaf.base.create_tid = trans->tid;
2762                 record->leaf.create_ts = trans->time32;
2763         }
2764
2765         /*
2766          * This actually moves the record to the on-media B-Tree.  We
2767          * must also generate REDO_TERM entries in the UNDO/REDO FIFO
2768          * indicating that the related REDO_WRITE(s) have been committed.
2769          *
2770          * During recovery any REDO_TERM's within the nominal recovery span
2771          * are ignored since the related meta-data is being undone, causing
2772          * any matching REDO_WRITEs to execute.  The REDO_TERMs outside
2773          * the nominal recovery span will match against REDO_WRITEs and
2774          * prevent them from being executed (because the meta-data has
2775          * already been synchronized).
2776          */
2777         if (record->flags & HAMMER_RECF_REDO) {
2778                 KKASSERT(record->type == HAMMER_MEM_RECORD_DATA);
2779                 hammer_generate_redo(trans, record->ip,
2780                                      record->leaf.base.key -
2781                                          record->leaf.data_len,
2782                                      HAMMER_REDO_TERM_WRITE,
2783                                      NULL,
2784                                      record->leaf.data_len);
2785         }
2786
2787         for (;;) {
2788                 error = hammer_ip_sync_record_cursor(cursor, record);
2789                 if (error != EDEADLK)
2790                         break;
2791                 hammer_done_cursor(cursor);
2792                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2793                                            record->ip);
2794                 if (error)
2795                         break;
2796         }
2797         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2798
2799         if (error)
2800                 error = -error;
2801 done:
2802         hammer_flush_record_done(record, error);
2803
2804         /*
2805          * Do partial finalization if we have built up too many dirty
2806          * buffers.  Otherwise a buffer cache deadlock can occur when
2807          * doing things like creating tens of thousands of tiny files.
2808          *
2809          * We must release our cursor lock to avoid a 3-way deadlock
2810          * due to the exclusive sync lock the finalizer must get.
2811          *
2812          * WARNING: See warnings in hammer_unlock_cursor() function.
2813          */
2814         if (hammer_flusher_meta_limit(hmp) ||
2815             vm_page_count_severe()) {
2816                 hammer_unlock_cursor(cursor);
2817                 hammer_flusher_finalize(trans, 0);
2818                 hammer_lock_cursor(cursor);
2819         }
2820         return(error);
2821 }
2822
2823 /*
2824  * Backend function called by the flusher to sync an inode to media.
2825  */
2826 int
2827 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2828 {
2829         struct hammer_cursor cursor;
2830         hammer_node_t tmp_node;
2831         hammer_record_t depend;
2832         hammer_record_t next;
2833         int error, tmp_error;
2834         u_int64_t nlinks;
2835
2836         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2837                 return(0);
2838
2839         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2840         if (error)
2841                 goto done;
2842
2843         /*
2844          * Any directory records referencing this inode which are not in
2845          * our current flush group must adjust our nlink count for the
2846          * purposes of synchronizating to disk.
2847          *
2848          * Records which are in our flush group can be unlinked from our
2849          * inode now, potentially allowing the inode to be physically
2850          * deleted.
2851          *
2852          * This cannot block.
2853          */
2854         nlinks = ip->ino_data.nlinks;
2855         next = TAILQ_FIRST(&ip->target_list);
2856         while ((depend = next) != NULL) {
2857                 next = TAILQ_NEXT(depend, target_entry);
2858                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2859                     depend->flush_group == ip->flush_group) {
2860                         /*
2861                          * If this is an ADD that was deleted by the frontend
2862                          * the frontend nlinks count will have already been
2863                          * decremented, but the backend is going to sync its
2864                          * directory entry and must account for it.  The
2865                          * record will be converted to a delete-on-disk when
2866                          * it gets synced.
2867                          *
2868                          * If the ADD was not deleted by the frontend we
2869                          * can remove the dependancy from our target_list.
2870                          */
2871                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2872                                 ++nlinks;
2873                         } else {
2874                                 TAILQ_REMOVE(&ip->target_list, depend,
2875                                              target_entry);
2876                                 depend->target_ip = NULL;
2877                         }
2878                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2879                         /*
2880                          * Not part of our flush group and not deleted by
2881                          * the front-end, adjust the link count synced to
2882                          * the media (undo what the frontend did when it
2883                          * queued the record).
2884                          */
2885                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2886                         switch(depend->type) {
2887                         case HAMMER_MEM_RECORD_ADD:
2888                                 --nlinks;
2889                                 break;
2890                         case HAMMER_MEM_RECORD_DEL:
2891                                 ++nlinks;
2892                                 break;
2893                         default:
2894                                 break;
2895                         }
2896                 }
2897         }
2898
2899         /*
2900          * Set dirty if we had to modify the link count.
2901          */
2902         if (ip->sync_ino_data.nlinks != nlinks) {
2903                 KKASSERT((int64_t)nlinks >= 0);
2904                 ip->sync_ino_data.nlinks = nlinks;
2905                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2906         }
2907
2908         /*
2909          * If there is a trunction queued destroy any data past the (aligned)
2910          * truncation point.  Userland will have dealt with the buffer
2911          * containing the truncation point for us.
2912          *
2913          * We don't flush pending frontend data buffers until after we've
2914          * dealt with the truncation.
2915          */
2916         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2917                 /*
2918                  * Interlock trunc_off.  The VOP front-end may continue to
2919                  * make adjustments to it while we are blocked.
2920                  */
2921                 off_t trunc_off;
2922                 off_t aligned_trunc_off;
2923                 int blkmask;
2924
2925                 trunc_off = ip->sync_trunc_off;
2926                 blkmask = hammer_blocksize(trunc_off) - 1;
2927                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
2928
2929                 /*
2930                  * Delete any whole blocks on-media.  The front-end has
2931                  * already cleaned out any partial block and made it
2932                  * pending.  The front-end may have updated trunc_off
2933                  * while we were blocked so we only use sync_trunc_off.
2934                  *
2935                  * This operation can blow out the buffer cache, EWOULDBLOCK
2936                  * means we were unable to complete the deletion.  The
2937                  * deletion will update sync_trunc_off in that case.
2938                  */
2939                 error = hammer_ip_delete_range(&cursor, ip,
2940                                                 aligned_trunc_off,
2941                                                 0x7FFFFFFFFFFFFFFFLL, 2);
2942                 if (error == EWOULDBLOCK) {
2943                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
2944                         error = 0;
2945                         goto defer_buffer_flush;
2946                 }
2947
2948                 if (error)
2949                         goto done;
2950
2951                 /*
2952                  * Generate a REDO_TERM_TRUNC entry in the UNDO/REDO FIFO.
2953                  *
2954                  * XXX we do this even if we did not previously generate
2955                  * a REDO_TRUNC record.  This operation may enclosed the
2956                  * range for multiple prior truncation entries in the REDO
2957                  * log.
2958                  */
2959                 if (trans->hmp->version >= HAMMER_VOL_VERSION_FOUR &&
2960                     (ip->flags & HAMMER_INODE_RDIRTY)) {
2961                         hammer_generate_redo(trans, ip, aligned_trunc_off,
2962                                              HAMMER_REDO_TERM_TRUNC,
2963                                              NULL, 0);
2964                 }
2965
2966                 /*
2967                  * Clear the truncation flag on the backend after we have
2968                  * completed the deletions.  Backend data is now good again
2969                  * (including new records we are about to sync, below).
2970                  *
2971                  * Leave sync_trunc_off intact.  As we write additional
2972                  * records the backend will update sync_trunc_off.  This
2973                  * tells the backend whether it can skip the overwrite
2974                  * test.  This should work properly even when the backend
2975                  * writes full blocks where the truncation point straddles
2976                  * the block because the comparison is against the base
2977                  * offset of the record.
2978                  */
2979                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2980                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2981         } else {
2982                 error = 0;
2983         }
2984
2985         /*
2986          * Now sync related records.  These will typically be directory
2987          * entries, records tracking direct-writes, or delete-on-disk records.
2988          */
2989         if (error == 0) {
2990                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2991                                     hammer_sync_record_callback, &cursor);
2992                 if (tmp_error < 0)
2993                         tmp_error = -error;
2994                 if (tmp_error)
2995                         error = tmp_error;
2996         }
2997         hammer_cache_node(&ip->cache[1], cursor.node);
2998
2999         /*
3000          * Re-seek for inode update, assuming our cache hasn't been ripped
3001          * out from under us.
3002          */
3003         if (error == 0) {
3004                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
3005                 if (tmp_node) {
3006                         hammer_cursor_downgrade(&cursor);
3007                         hammer_lock_sh(&tmp_node->lock);
3008                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
3009                                 hammer_cursor_seek(&cursor, tmp_node, 0);
3010                         hammer_unlock(&tmp_node->lock);
3011                         hammer_rel_node(tmp_node);
3012                 }
3013                 error = 0;
3014         }
3015
3016         /*
3017          * If we are deleting the inode the frontend had better not have
3018          * any active references on elements making up the inode.
3019          *
3020          * The call to hammer_ip_delete_clean() cleans up auxillary records
3021          * but not DB or DATA records.  Those must have already been deleted
3022          * by the normal truncation mechanic.
3023          */
3024         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
3025                 RB_EMPTY(&ip->rec_tree)  &&
3026             (ip->sync_flags & HAMMER_INODE_DELETING) &&
3027             (ip->flags & HAMMER_INODE_DELETED) == 0) {
3028                 int count1 = 0;
3029
3030                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
3031                 if (error == 0) {
3032                         ip->flags |= HAMMER_INODE_DELETED;
3033                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
3034                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
3035                         KKASSERT(RB_EMPTY(&ip->rec_tree));
3036
3037                         /*
3038                          * Set delete_tid in both the frontend and backend
3039                          * copy of the inode record.  The DELETED flag handles
3040                          * this, do not set DDIRTY.
3041                          */
3042                         ip->ino_leaf.base.delete_tid = trans->tid;
3043                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
3044                         ip->ino_leaf.delete_ts = trans->time32;
3045                         ip->sync_ino_leaf.delete_ts = trans->time32;
3046
3047
3048                         /*
3049                          * Adjust the inode count in the volume header
3050                          */
3051                         hammer_sync_lock_sh(trans);
3052                         if (ip->flags & HAMMER_INODE_ONDISK) {
3053                                 hammer_modify_volume_field(trans,
3054                                                            trans->rootvol,
3055                                                            vol0_stat_inodes);
3056                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
3057                                 hammer_modify_volume_done(trans->rootvol);
3058                         }
3059                         hammer_sync_unlock(trans);
3060                 }
3061         }
3062
3063         if (error)
3064                 goto done;
3065         ip->sync_flags &= ~HAMMER_INODE_BUFS;
3066
3067 defer_buffer_flush:
3068         /*
3069          * Now update the inode's on-disk inode-data and/or on-disk record.
3070          * DELETED and ONDISK are managed only in ip->flags.
3071          *
3072          * In the case of a defered buffer flush we still update the on-disk
3073          * inode to satisfy visibility requirements if there happen to be
3074          * directory dependancies.
3075          */
3076         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
3077         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
3078                 /*
3079                  * If deleted and on-disk, don't set any additional flags.
3080                  * the delete flag takes care of things.
3081                  *
3082                  * Clear flags which may have been set by the frontend.
3083                  */
3084                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3085                                     HAMMER_INODE_SDIRTY |
3086                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3087                                     HAMMER_INODE_DELETING);
3088                 break;
3089         case HAMMER_INODE_DELETED:
3090                 /*
3091                  * Take care of the case where a deleted inode was never
3092                  * flushed to the disk in the first place.
3093                  *
3094                  * Clear flags which may have been set by the frontend.
3095                  */
3096                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
3097                                     HAMMER_INODE_SDIRTY |
3098                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
3099                                     HAMMER_INODE_DELETING);
3100                 while (RB_ROOT(&ip->rec_tree)) {
3101                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
3102                         hammer_ref(&record->lock);
3103                         KKASSERT(hammer_oneref(&record->lock));
3104                         record->flags |= HAMMER_RECF_DELETED_BE;
3105                         ++record->ip->rec_generation;
3106                         hammer_rel_mem_record(record);
3107                 }
3108                 break;
3109         case HAMMER_INODE_ONDISK:
3110                 /*
3111                  * If already on-disk, do not set any additional flags.
3112                  */
3113                 break;
3114         default:
3115                 /*
3116                  * If not on-disk and not deleted, set DDIRTY to force
3117                  * an initial record to be written.
3118                  *
3119                  * Also set the create_tid in both the frontend and backend
3120                  * copy of the inode record.
3121                  */
3122                 ip->ino_leaf.base.create_tid = trans->tid;
3123                 ip->ino_leaf.create_ts = trans->time32;
3124                 ip->sync_ino_leaf.base.create_tid = trans->tid;
3125                 ip->sync_ino_leaf.create_ts = trans->time32;
3126                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
3127                 break;
3128         }
3129
3130         /*
3131          * If DDIRTY or SDIRTY is set, write out a new record.
3132          * If the inode is already on-disk the old record is marked as
3133          * deleted.
3134          *
3135          * If DELETED is set hammer_update_inode() will delete the existing
3136          * record without writing out a new one.
3137          *
3138          * If *ONLY* the ITIMES flag is set we can update the record in-place.
3139          */
3140         if (ip->flags & HAMMER_INODE_DELETED) {
3141                 error = hammer_update_inode(&cursor, ip);
3142         } else 
3143         if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
3144             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
3145                 error = hammer_update_itimes(&cursor, ip);
3146         } else
3147         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
3148                               HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
3149                 error = hammer_update_inode(&cursor, ip);
3150         }
3151 done:
3152         if (error) {
3153                 hammer_critical_error(ip->hmp, ip, error,
3154                                       "while syncing inode");
3155         }
3156         hammer_done_cursor(&cursor);
3157         return(error);
3158 }
3159
3160 /*
3161  * This routine is called when the OS is no longer actively referencing
3162  * the inode (but might still be keeping it cached), or when releasing
3163  * the last reference to an inode.
3164  *
3165  * At this point if the inode's nlinks count is zero we want to destroy
3166  * it, which may mean destroying it on-media too.
3167  */
3168 void
3169 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
3170 {
3171         struct vnode *vp;
3172
3173         /*
3174          * Set the DELETING flag when the link count drops to 0 and the
3175          * OS no longer has any opens on the inode.
3176          *
3177          * The backend will clear DELETING (a mod flag) and set DELETED
3178          * (a state flag) when it is actually able to perform the
3179          * operation.
3180          *
3181          * Don't reflag the deletion if the flusher is currently syncing
3182          * one that was already flagged.  A previously set DELETING flag
3183          * may bounce around flags and sync_flags until the operation is
3184          * completely done.
3185          *
3186          * Do not attempt to modify a snapshot inode (one set to read-only).
3187          */
3188         if (ip->ino_data.nlinks == 0 &&
3189             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_RO|HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
3190                 ip->flags |= HAMMER_INODE_DELETING;
3191                 ip->flags |= HAMMER_INODE_TRUNCATED;
3192                 ip->trunc_off = 0;
3193                 vp = NULL;
3194                 if (getvp) {
3195                         if (hammer_get_vnode(ip, &vp) != 0)
3196                                 return;
3197                 }
3198
3199                 /*
3200                  * Final cleanup
3201                  */
3202                 if (ip->vp)
3203                         nvtruncbuf(ip->vp, 0, HAMMER_BUFSIZE, 0, 0);
3204                 if (getvp)
3205                         vput(vp);
3206         }
3207 }
3208
3209 /*
3210  * After potentially resolving a dependancy the inode is tested
3211  * to determine whether it needs to be reflushed.
3212  */
3213 void
3214 hammer_test_inode(hammer_inode_t ip)
3215 {
3216         if (ip->flags & HAMMER_INODE_REFLUSH) {
3217                 ip->flags &= ~HAMMER_INODE_REFLUSH;
3218                 hammer_ref(&ip->lock);
3219                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
3220                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
3221                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
3222                 } else {
3223                         hammer_flush_inode(ip, 0);
3224                 }
3225                 hammer_rel_inode(ip, 0);
3226         }
3227 }
3228
3229 /*
3230  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
3231  * reassociated with a vp or just before it gets freed.
3232  *
3233  * Pipeline wakeups to threads blocked due to an excessive number of
3234  * detached inodes.  This typically occurs when atime updates accumulate
3235  * while scanning a directory tree.
3236  */
3237 static void
3238 hammer_inode_wakereclaims(hammer_inode_t ip)
3239 {
3240         struct hammer_reclaim *reclaim;
3241         hammer_mount_t hmp = ip->hmp;
3242
3243         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
3244                 return;
3245
3246         --hammer_count_reclaims;
3247         --hmp->count_reclaims;
3248         ip->flags &= ~HAMMER_INODE_RECLAIM;
3249
3250         if ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3251                 KKASSERT(reclaim->count > 0);
3252                 if (--reclaim->count == 0) {
3253                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3254                         wakeup(reclaim);
3255                 }
3256         }
3257 }
3258
3259 /*
3260  * Setup our reclaim pipeline.  We only let so many detached (and dirty)
3261  * inodes build up before we start blocking.  This routine is called
3262  * if a new inode is created or an inode is loaded from media.
3263  *
3264  * When we block we don't care *which* inode has finished reclaiming,
3265  * as long as one does.
3266  *
3267  * The reclaim pipeline is primarily governed by the auto-flush which is
3268  * 1/4 hammer_limit_reclaims.  We don't want to block if the count is
3269  * less than 1/2 hammer_limit_reclaims.  From 1/2 to full count is
3270  * dynamically governed.
3271  */
3272 void
3273 hammer_inode_waitreclaims(hammer_transaction_t trans)
3274 {
3275         hammer_mount_t hmp = trans->hmp;
3276         struct hammer_reclaim reclaim;
3277         int lower_limit;
3278
3279         /*
3280          * Track inode load, delay if the number of reclaiming inodes is
3281          * between 2/4 and 4/4 hammer_limit_reclaims, depending.
3282          */
3283         if (curthread->td_proc) {
3284                 struct hammer_inostats *stats;
3285
3286                 stats = hammer_inode_inostats(hmp, curthread->td_proc->p_pid);
3287                 ++stats->count;
3288
3289                 if (stats->count > hammer_limit_reclaims / 2)
3290                         stats->count = hammer_limit_reclaims / 2;
3291                 lower_limit = hammer_limit_reclaims - stats->count;
3292                 if (hammer_debug_general & 0x10000) {
3293                         kprintf("pid %5d limit %d\n",
3294                                 (int)curthread->td_proc->p_pid, lower_limit);
3295                 }
3296         } else {
3297                 lower_limit = hammer_limit_reclaims * 3 / 4;
3298         }
3299         if (hmp->count_reclaims >= lower_limit) {
3300                 reclaim.count = 1;
3301                 TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3302                 tsleep(&reclaim, 0, "hmrrcm", hz);
3303                 if (reclaim.count > 0)
3304                         TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3305         }
3306 }
3307
3308 /*
3309  * Keep track of reclaim statistics on a per-pid basis using a loose
3310  * 4-way set associative hash table.  Collisions inherit the count of
3311  * the previous entry.
3312  *
3313  * NOTE: We want to be careful here to limit the chain size.  If the chain
3314  *       size is too large a pid will spread its stats out over too many
3315  *       entries under certain types of heavy filesystem activity and
3316  *       wind up not delaying long enough.
3317  */
3318 static
3319 struct hammer_inostats *
3320 hammer_inode_inostats(hammer_mount_t hmp, pid_t pid)
3321 {
3322         struct hammer_inostats *stats;
3323         int delta;
3324         int chain;
3325         static volatile int iterator;   /* we don't care about MP races */
3326
3327         /*
3328          * Chain up to 4 times to find our entry.
3329          */
3330         for (chain = 0; chain < 4; ++chain) {
3331                 stats = &hmp->inostats[(pid + chain) & HAMMER_INOSTATS_HMASK];
3332                 if (stats->pid == pid)
3333                         break;
3334         }
3335
3336         /*
3337          * Replace one of the four chaining entries with our new entry.
3338          */
3339         if (chain == 4) {
3340                 stats = &hmp->inostats[(pid + (iterator++ & 3)) &
3341                                        HAMMER_INOSTATS_HMASK];
3342                 stats->pid = pid;
3343         }
3344
3345         /*
3346          * Decay the entry
3347          */
3348         if (stats->count && stats->ltick != ticks) {
3349                 delta = ticks - stats->ltick;
3350                 stats->ltick = ticks;
3351                 if (delta <= 0 || delta > hz * 60)
3352                         stats->count = 0;
3353                 else
3354                         stats->count = stats->count * hz / (hz + delta);
3355         }
3356         if (hammer_debug_general & 0x10000)
3357                 kprintf("pid %5d stats %d\n", (int)pid, stats->count);
3358         return (stats);
3359 }
3360
3361 #if 0
3362
3363 /*
3364  * XXX not used, doesn't work very well due to the large batching nature
3365  * of flushes.
3366  *
3367  * A larger then normal backlog of inodes is sitting in the flusher,
3368  * enforce a general slowdown to let it catch up.  This routine is only
3369  * called on completion of a non-flusher-related transaction which
3370  * performed B-Tree node I/O.
3371  *
3372  * It is possible for the flusher to stall in a continuous load.
3373  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3374  * If the flusher is unable to catch up the inode count can bloat until
3375  * we run out of kvm.
3376  *
3377  * This is a bit of a hack.
3378  */
3379 void
3380 hammer_inode_waithard(hammer_mount_t hmp)
3381 {
3382         /*
3383          * Hysteresis.
3384          */
3385         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3386                 if (hmp->count_reclaims < hammer_limit_reclaims / 2 &&
3387                     hmp->count_iqueued < hmp->count_inodes / 20) {
3388                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3389                         return;
3390                 }
3391         } else {
3392                 if (hmp->count_reclaims < hammer_limit_reclaims ||
3393                     hmp->count_iqueued < hmp->count_inodes / 10) {
3394                         return;
3395                 }
3396                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3397         }
3398
3399         /*
3400          * Block for one flush cycle.
3401          */
3402         hammer_flusher_wait_next(hmp);
3403 }
3404
3405 #endif