1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.114 2008/09/24 00:53:51 dillon Exp $
35  */
36
37 #include "hammer.h"
38 #include <vm/vm_extern.h>
39 #include <sys/buf.h>
40 #include <sys/buf2.h>
41
42 static int      hammer_unload_inode(struct hammer_inode *ip);
43 static void     hammer_free_inode(hammer_inode_t ip);
44 static void     hammer_flush_inode_core(hammer_inode_t ip,
45                                         hammer_flush_group_t flg, int flags);
46 static int      hammer_setup_child_callback(hammer_record_t rec, void *data);
47 #if 0
48 static int      hammer_syncgrp_child_callback(hammer_record_t rec, void *data);
49 #endif
50 static int      hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
51                                         hammer_flush_group_t flg);
52 static int      hammer_setup_parent_inodes_helper(hammer_record_t record,
53                                         int depth, hammer_flush_group_t flg);
54 static void     hammer_inode_wakereclaims(hammer_inode_t ip);
55
56 #ifdef DEBUG_TRUNCATE
57 extern struct hammer_inode *HammerTruncIp;
58 #endif
59
60 /*
61  * RB-Tree support for inode structures
62  */
63 int
64 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
65 {
66         if (ip1->obj_localization < ip2->obj_localization)
67                 return(-1);
68         if (ip1->obj_localization > ip2->obj_localization)
69                 return(1);
70         if (ip1->obj_id < ip2->obj_id)
71                 return(-1);
72         if (ip1->obj_id > ip2->obj_id)
73                 return(1);
74         if (ip1->obj_asof < ip2->obj_asof)
75                 return(-1);
76         if (ip1->obj_asof > ip2->obj_asof)
77                 return(1);
78         return(0);
79 }
80
81 /*
82  * RB-Tree support for inode structures / special LOOKUP_INFO
83  */
84 static int
85 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
86 {
87         if (info->obj_localization < ip->obj_localization)
88                 return(-1);
89         if (info->obj_localization > ip->obj_localization)
90                 return(1);
91         if (info->obj_id < ip->obj_id)
92                 return(-1);
93         if (info->obj_id > ip->obj_id)
94                 return(1);
95         if (info->obj_asof < ip->obj_asof)
96                 return(-1);
97         if (info->obj_asof > ip->obj_asof)
98                 return(1);
99         return(0);
100 }
101
102 /*
103  * Used by hammer_scan_inode_snapshots() to locate all of an object's
104  * snapshots.  Note that the asof field is not tested, which we can get
105  * away with because it is the lowest-priority field.
106  */
107 static int
108 hammer_inode_info_cmp_all_history(hammer_inode_t ip, void *data)
109 {
110         hammer_inode_info_t info = data;
111
112         if (ip->obj_localization > info->obj_localization)
113                 return(1);
114         if (ip->obj_localization < info->obj_localization)
115                 return(-1);
116         if (ip->obj_id > info->obj_id)
117                 return(1);
118         if (ip->obj_id < info->obj_id)
119                 return(-1);
120         return(0);
121 }
122
123 /*
124  * Used by hammer_unload_pseudofs() to locate all inodes associated with
125  * a particular PFS.
126  */
127 static int
128 hammer_inode_pfs_cmp(hammer_inode_t ip, void *data)
129 {
130         u_int32_t localization = *(u_int32_t *)data;
131         if (ip->obj_localization > localization)
132                 return(1);
133         if (ip->obj_localization < localization)
134                 return(-1);
135         return(0);
136 }
137
138 /*
139  * RB-Tree support for pseudofs structures
140  */
141 static int
142 hammer_pfs_rb_compare(hammer_pseudofs_inmem_t p1, hammer_pseudofs_inmem_t p2)
143 {
144         if (p1->localization < p2->localization)
145                 return(-1);
146         if (p1->localization > p2->localization)
147                 return(1);
148         return(0);
149 }
150
151
152 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
153 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
154                 hammer_inode_info_cmp, hammer_inode_info_t);
155 RB_GENERATE2(hammer_pfs_rb_tree, hammer_pseudofs_inmem, rb_node,
156              hammer_pfs_rb_compare, u_int32_t, localization);
157
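/*
 * The RB_GENERATE*() macros above expand into the RB-tree support
 * functions referenced throughout this file, such as
 * hammer_ino_rb_tree_RB_LOOKUP_INFO() and hammer_ino_rb_tree_RB_SCAN().
 */
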
158 /*
159  * The kernel is not actively referencing this vnode but is still holding
160  * it cached.
161  *
162  * This is called from the frontend.
163  */
164 int
165 hammer_vop_inactive(struct vop_inactive_args *ap)
166 {
167         struct hammer_inode *ip = VTOI(ap->a_vp);
168
169         /*
170          * Degenerate case
171          */
172         if (ip == NULL) {
173                 vrecycle(ap->a_vp);
174                 return(0);
175         }
176
177         /*
178          * If the inode no longer has visibility in the filesystem try to
179          * recycle it immediately, even if the inode is dirty.  Recycling
180          * it quickly allows the system to reclaim buffer cache and VM
181          * resources which can matter a lot in a heavily loaded system.
182          *
183          * This can deadlock in vfsync() if we aren't careful.
184          * 
185          * Do not queue the inode to the flusher if we still have visibility,
186          * otherwise namespace calls such as chmod will unnecessarily generate
187          * multiple inode updates.
188          */
189         hammer_inode_unloadable_check(ip, 0);
190         if (ip->ino_data.nlinks == 0) {
191                 if (ip->flags & HAMMER_INODE_MODMASK)
192                         hammer_flush_inode(ip, 0);
193                 vrecycle(ap->a_vp);
194         }
195         return(0);
196 }
197
198 /*
199  * Release the vnode association.  This is typically (but not always)
200  * the last reference on the inode.
201  *
202  * Once the association is lost we are on our own with regards to
203  * flushing the inode.
204  */
205 int
206 hammer_vop_reclaim(struct vop_reclaim_args *ap)
207 {
208         struct hammer_inode *ip;
209         hammer_mount_t hmp;
210         struct vnode *vp;
211
212         vp = ap->a_vp;
213
214         if ((ip = vp->v_data) != NULL) {
215                 hmp = ip->hmp;
216                 vp->v_data = NULL;
217                 ip->vp = NULL;
218
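                    /*
                     * The vnode association is being lost.  Account for the
                     * inode entering the reclaim state; these counters are
                     * used to throttle the instantiation of new inodes
                     * (see hammer_inode_waitreclaims()).
                     */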
219                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
220                         ++hammer_count_reclaiming;
221                         ++hmp->inode_reclaims;
222                         ip->flags |= HAMMER_INODE_RECLAIM;
223                 }
224                 hammer_rel_inode(ip, 1);
225         }
226         return(0);
227 }
228
229 /*
230  * Return a locked vnode for the specified inode.  The inode must be
231  * referenced but NOT LOCKED on entry and will remain referenced on
232  * return.
233  *
234  * Called from the frontend.
235  */
236 int
237 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
238 {
239         hammer_mount_t hmp;
240         struct vnode *vp;
241         int error = 0;
242         u_int8_t obj_type;
243
244         hmp = ip->hmp;
245
246         for (;;) {
247                 if ((vp = ip->vp) == NULL) {
248                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
249                         if (error)
250                                 break;
251                         hammer_lock_ex(&ip->lock);
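                            /*
                             * Another thread may have associated a vnode with
                             * the inode while we were allocating one.  If so,
                             * discard the new vnode and retry with the
                             * existing association.
                             */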
252                         if (ip->vp != NULL) {
253                                 hammer_unlock(&ip->lock);
254                                 vp = *vpp;
255                                 vp->v_type = VBAD;
256                                 vx_put(vp);
257                                 continue;
258                         }
259                         hammer_ref(&ip->lock);
260                         vp = *vpp;
261                         ip->vp = vp;
262
263                         obj_type = ip->ino_data.obj_type;
264                         vp->v_type = hammer_get_vnode_type(obj_type);
265
266                         hammer_inode_wakereclaims(ip);
267
268                         switch(ip->ino_data.obj_type) {
269                         case HAMMER_OBJTYPE_CDEV:
270                         case HAMMER_OBJTYPE_BDEV:
271                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
272                                 addaliasu(vp, ip->ino_data.rmajor,
273                                           ip->ino_data.rminor);
274                                 break;
275                         case HAMMER_OBJTYPE_FIFO:
276                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
277                                 break;
278                         case HAMMER_OBJTYPE_REGFILE:
279                                 break;
280                         default:
281                                 break;
282                         }
283
284                         /*
285                          * Only mark as the root vnode if the ip is not
286                          * historical, otherwise the VFS cache will get
287                          * confused.  The other half of the special handling
288                          * is in hammer_vop_nlookupdotdot().
289                          *
290                          * Pseudo-filesystem roots can be accessed via
291                          * non-root filesystem paths and setting VROOT may
292                          * confuse the namecache.  Set VPFSROOT instead.
293                          */
294                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
295                             ip->obj_asof == hmp->asof) {
296                                 if (ip->obj_localization == 0)
297                                         vp->v_flag |= VROOT;
298                                 else
299                                         vp->v_flag |= VPFSROOT;
300                         }
301
302                         vp->v_data = (void *)ip;
303                         /* vnode locked by getnewvnode() */
304                         /* make related vnode dirty if inode dirty? */
305                         hammer_unlock(&ip->lock);
306                         if (vp->v_type == VREG)
307                                 vinitvmio(vp, ip->ino_data.size);
308                         break;
309                 }
310
311                 /*
312                  * loop if the vget fails (aka races), or if the vp
313                  * no longer matches ip->vp.
314                  */
315                 if (vget(vp, LK_EXCLUSIVE) == 0) {
316                         if (vp == ip->vp)
317                                 break;
318                         vput(vp);
319                 }
320         }
321         *vpp = vp;
322         return(error);
323 }
324
325 /*
326  * Locate all copies of the inode for obj_id compatible with the specified
327  * asof, reference each one, and issue the related call-back.  This routine
328  * is used for direct-io invalidation and does not create any new inodes.
329  */
330 void
331 hammer_scan_inode_snapshots(hammer_mount_t hmp, hammer_inode_info_t iinfo,
332                             int (*callback)(hammer_inode_t ip, void *data),
333                             void *data)
334 {
335         hammer_ino_rb_tree_RB_SCAN(&hmp->rb_inos_root,
336                                    hammer_inode_info_cmp_all_history,
337                                    callback, iinfo);
338 }
339
340 /*
341  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
342  * do not attach or detach the related vnode (use hammer_get_vnode() for
343  * that).
344  *
345  * The flags argument is only applied for newly created inodes, and only
346  * certain flags are inherited.
347  *
348  * Called from the frontend.
349  */
350 struct hammer_inode *
351 hammer_get_inode(hammer_transaction_t trans, hammer_inode_t dip,
352                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
353                  int flags, int *errorp)
354 {
355         hammer_mount_t hmp = trans->hmp;
356         struct hammer_node_cache *cachep;
357         struct hammer_inode_info iinfo;
358         struct hammer_cursor cursor;
359         struct hammer_inode *ip;
360
361
362         /*
363          * Determine if we already have an inode cached.  If we do then
364          * we are golden.
365          *
366          * If we find an inode with no vnode we have to mark the
367          * transaction such that hammer_inode_waitreclaims() is
368          * called later on to avoid building up an infinite number
369          * of inodes.  Otherwise we can continue to * add new inodes
370          * faster then they can be disposed of, even with the tsleep
371          * delay.
372          *
373          * If we find a dummy inode we return a failure so dounlink
374          * (which does another lookup) doesn't try to mess with the
375          * link count.  hammer_vop_nresolve() uses hammer_get_dummy_inode()
376          * to ref dummy inodes.
377          */
378         iinfo.obj_id = obj_id;
379         iinfo.obj_asof = asof;
380         iinfo.obj_localization = localization;
381 loop:
382         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
383         if (ip) {
384                 if (ip->flags & HAMMER_INODE_DUMMY) {
385                         *errorp = ENOENT;
386                         return(NULL);
387                 }
388                 hammer_ref(&ip->lock);
389                 *errorp = 0;
390                 return(ip);
391         }
392
393         /*
394          * Allocate a new inode structure and deal with races later.
395          */
396         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
397         ++hammer_count_inodes;
398         ++hmp->count_inodes;
399         ip->obj_id = obj_id;
400         ip->obj_asof = iinfo.obj_asof;
401         ip->obj_localization = localization;
402         ip->hmp = hmp;
403         ip->flags = flags & HAMMER_INODE_RO;
404         ip->cache[0].ip = ip;
405         ip->cache[1].ip = ip;
406         ip->cache[2].ip = ip;
407         ip->cache[3].ip = ip;
408         if (hmp->ronly)
409                 ip->flags |= HAMMER_INODE_RO;
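            /*
             * Initialize the truncation offsets to the maximum 64-bit
             * offset, indicating that no truncation is pending.
             */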
410         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
411                 0x7FFFFFFFFFFFFFFFLL;
412         RB_INIT(&ip->rec_tree);
413         TAILQ_INIT(&ip->target_list);
414         hammer_ref(&ip->lock);
415
416         /*
417          * Locate the on-disk inode.  If this is a PFS root we always
418          * access the current version of the root inode and (if it is not
419          * a master) always access information under it with a snapshot
420          * TID.
421          *
422          * We cache recent inode lookups in this directory in dip->cache[2].
423          * If we can't find it we assume the inode we are looking for is
424          * close to the directory inode.
425          */
426 retry:
427         cachep = NULL;
428         if (dip) {
429                 if (dip->cache[2].node)
430                         cachep = &dip->cache[2];
431                 else
432                         cachep = &dip->cache[0];
433         }
434         hammer_init_cursor(trans, &cursor, cachep, NULL);
435         cursor.key_beg.localization = localization + HAMMER_LOCALIZE_INODE;
436         cursor.key_beg.obj_id = ip->obj_id;
437         cursor.key_beg.key = 0;
438         cursor.key_beg.create_tid = 0;
439         cursor.key_beg.delete_tid = 0;
440         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
441         cursor.key_beg.obj_type = 0;
442
443         cursor.asof = iinfo.obj_asof;
444         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
445                        HAMMER_CURSOR_ASOF;
446
447         *errorp = hammer_btree_lookup(&cursor);
448         if (*errorp == EDEADLK) {
449                 hammer_done_cursor(&cursor);
450                 goto retry;
451         }
452
453         /*
454          * On success the B-Tree lookup will hold the appropriate
455          * buffer cache buffers and provide a pointer to the requested
456          * information.  Copy the information to the in-memory inode
457          * and cache the B-Tree node to improve future operations.
458          */
459         if (*errorp == 0) {
460                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
461                 ip->ino_data = cursor.data->inode;
462
463                 /*
464                  * cache[0] tries to cache the location of the object inode.
465                  * The assumption is that it is near the directory inode.
466                  *
467                  * cache[1] tries to cache the location of the object data.
468                  * We might have something in the governing directory from
469                  * scan optimizations (see the strategy code in
470                  * hammer_vnops.c).
471                  *
472                  * We update dip->cache[2], if possible, with the location
473                  * of the object inode for future directory shortcuts.
474                  */
475                 hammer_cache_node(&ip->cache[0], cursor.node);
476                 if (dip) {
477                         if (dip->cache[3].node) {
478                                 hammer_cache_node(&ip->cache[1],
479                                                   dip->cache[3].node);
480                         }
481                         hammer_cache_node(&dip->cache[2], cursor.node);
482                 }
483
484                 /*
485                  * The file should not contain any data past the file size
486                  * stored in the inode.  Setting save_trunc_off to the
487                  * file size instead of max reduces B-Tree lookup overheads
488                  * on append by allowing the flusher to avoid checking for
489                  * record overwrites.
490                  */
491                 ip->save_trunc_off = ip->ino_data.size;
492
493                 /*
494                  * Locate and assign the pseudofs management structure to
495                  * the inode.
496                  */
497                 if (dip && dip->obj_localization == ip->obj_localization) {
498                         ip->pfsm = dip->pfsm;
499                         hammer_ref(&ip->pfsm->lock);
500                 } else {
501                         ip->pfsm = hammer_load_pseudofs(trans,
502                                                         ip->obj_localization,
503                                                         errorp);
504                         *errorp = 0;    /* ignore ENOENT */
505                 }
506         }
507
508         /*
509          * The inode is placed on the red-black tree and will be synced to
510          * the media when flushed or by the filesystem sync.  If this races
511          * another instantiation/lookup the insertion will fail.
512          */
513         if (*errorp == 0) {
514                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
515                         hammer_free_inode(ip);
516                         hammer_done_cursor(&cursor);
517                         goto loop;
518                 }
519                 ip->flags |= HAMMER_INODE_ONDISK;
520         } else {
521                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
522                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
523                         --hmp->rsv_inodes;
524                 }
525
526                 hammer_free_inode(ip);
527                 ip = NULL;
528         }
529         hammer_done_cursor(&cursor);
530         trans->flags |= HAMMER_TRANSF_NEWINODE;
531         return (ip);
532 }
533
534 /*
535  * Get a dummy inode to placemark a broken directory entry.
536  */
537 struct hammer_inode *
538 hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
539                  int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
540                  int flags, int *errorp)
541 {
542         hammer_mount_t hmp = trans->hmp;
543         struct hammer_inode_info iinfo;
544         struct hammer_inode *ip;
545
546         /*
547          * Determine if we already have an inode cached.  If we do then
548          * we are golden.
549          *
550          * If we find an inode with no vnode we have to mark the
551          * transaction such that hammer_inode_waitreclaims() is
552          * called later on to avoid building up an infinite number
553          * of inodes.  Otherwise we can continue to add new inodes
554          * faster than they can be disposed of, even with the tsleep
555          * delay.
556          *
557          * If we find a non-fake inode we return an error.  Only fake
558          * inodes can be returned by this routine.
559          */
560         iinfo.obj_id = obj_id;
561         iinfo.obj_asof = asof;
562         iinfo.obj_localization = localization;
563 loop:
564         *errorp = 0;
565         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
566         if (ip) {
567                 if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
568                         *errorp = ENOENT;
569                         return(NULL);
570                 }
571                 hammer_ref(&ip->lock);
572                 return(ip);
573         }
574
575         /*
576          * Allocate a new inode structure and deal with races later.
577          */
578         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
579         ++hammer_count_inodes;
580         ++hmp->count_inodes;
581         ip->obj_id = obj_id;
582         ip->obj_asof = iinfo.obj_asof;
583         ip->obj_localization = localization;
584         ip->hmp = hmp;
585         ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
586         ip->cache[0].ip = ip;
587         ip->cache[1].ip = ip;
588         ip->cache[2].ip = ip;
589         ip->cache[3].ip = ip;
590         ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
591                 0x7FFFFFFFFFFFFFFFLL;
592         RB_INIT(&ip->rec_tree);
593         TAILQ_INIT(&ip->target_list);
594         hammer_ref(&ip->lock);
595
596         /*
597          * Populate the dummy inode.  Leave everything zero'd out.
598          *
599          * (ip->ino_leaf and ip->ino_data)
600          *
601          * Make the dummy inode a FIFO object which most copy programs
602          * will properly ignore.
603          */
604         ip->save_trunc_off = ip->ino_data.size;
605         ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
606
607         /*
608          * Locate and assign the pseudofs management structure to
609          * the inode.
610          */
611         if (dip && dip->obj_localization == ip->obj_localization) {
612                 ip->pfsm = dip->pfsm;
613                 hammer_ref(&ip->pfsm->lock);
614         } else {
615                 ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
616                                                 errorp);
617                 *errorp = 0;    /* ignore ENOENT */
618         }
619
620         /*
621          * The inode is placed on the red-black tree and will be synced to
622          * the media when flushed or by the filesystem sync.  If this races
623          * another instantiation/lookup the insertion will fail.
624          *
625          * NOTE: Do not set HAMMER_INODE_ONDISK.  The inode is a fake.
626          */
627         if (*errorp == 0) {
628                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
629                         hammer_free_inode(ip);
630                         goto loop;
631                 }
632         } else {
633                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
634                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
635                         --hmp->rsv_inodes;
636                 }
637                 hammer_free_inode(ip);
638                 ip = NULL;
639         }
640         trans->flags |= HAMMER_TRANSF_NEWINODE;
641         return (ip);
642 }
643
644 /*
645  * Return a referenced inode only if it is in our inode cache.
646  *
647  * Dummy inodes do not count.
648  */
649 struct hammer_inode *
650 hammer_find_inode(hammer_transaction_t trans, int64_t obj_id,
651                   hammer_tid_t asof, u_int32_t localization)
652 {
653         hammer_mount_t hmp = trans->hmp;
654         struct hammer_inode_info iinfo;
655         struct hammer_inode *ip;
656
657         iinfo.obj_id = obj_id;
658         iinfo.obj_asof = asof;
659         iinfo.obj_localization = localization;
660
661         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
662         if (ip) {
663                 if (ip->flags & HAMMER_INODE_DUMMY)
664                         ip = NULL;
665                 else
666                         hammer_ref(&ip->lock);
667         }
668         return(ip);
669 }
670
671 /*
672  * Create a new filesystem object, returning the inode in *ipp.  The
673  * returned inode will be referenced.  The inode is created in-memory.
674  *
675  * If pfsm is non-NULL the caller wishes to create the root inode for
676  * a master PFS.
677  */
678 int
679 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
680                     struct ucred *cred,
681                     hammer_inode_t dip, const char *name, int namelen,
682                     hammer_pseudofs_inmem_t pfsm, struct hammer_inode **ipp)
683 {
684         hammer_mount_t hmp;
685         hammer_inode_t ip;
686         uid_t xuid;
687         int error;
688         int64_t namekey;
689         u_int32_t dummy;
690
691         hmp = trans->hmp;
692
693         ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
694         ++hammer_count_inodes;
695         ++hmp->count_inodes;
696         trans->flags |= HAMMER_TRANSF_NEWINODE;
697
698         if (pfsm) {
699                 KKASSERT(pfsm->localization != 0);
700                 ip->obj_id = HAMMER_OBJID_ROOT;
701                 ip->obj_localization = pfsm->localization;
702         } else {
703                 KKASSERT(dip != NULL);
704                 namekey = hammer_directory_namekey(dip, name, namelen, &dummy);
705                 ip->obj_id = hammer_alloc_objid(hmp, dip, namekey);
706                 ip->obj_localization = dip->obj_localization;
707         }
708
709         KKASSERT(ip->obj_id != 0);
710         ip->obj_asof = hmp->asof;
711         ip->hmp = hmp;
712         ip->flush_state = HAMMER_FST_IDLE;
713         ip->flags = HAMMER_INODE_DDIRTY |
714                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME;
715         ip->cache[0].ip = ip;
716         ip->cache[1].ip = ip;
717         ip->cache[2].ip = ip;
718         ip->cache[3].ip = ip;
719
720         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
721         /* ip->save_trunc_off = 0; (already zero) */
722         RB_INIT(&ip->rec_tree);
723         TAILQ_INIT(&ip->target_list);
724
725         ip->ino_data.atime = trans->time;
726         ip->ino_data.mtime = trans->time;
727         ip->ino_data.size = 0;
728         ip->ino_data.nlinks = 0;
729
730         /*
731          * A nohistory designator on the parent directory is inherited by
732          * the child.  We will do this even for pseudo-fs creation... the
733          * sysad can turn it off.
734          */
735         if (dip) {
736                 ip->ino_data.uflags = dip->ino_data.uflags &
737                                       (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
738         }
739
740         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
741         ip->ino_leaf.base.localization = ip->obj_localization +
742                                          HAMMER_LOCALIZE_INODE;
743         ip->ino_leaf.base.obj_id = ip->obj_id;
744         ip->ino_leaf.base.key = 0;
745         ip->ino_leaf.base.create_tid = 0;
746         ip->ino_leaf.base.delete_tid = 0;
747         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
748         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
749
750         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
751         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
752         ip->ino_data.mode = vap->va_mode;
753         ip->ino_data.ctime = trans->time;
754
755         /*
756          * If we are running version 2 or greater directory entries are
757          * inode-localized instead of data-localized.
758          */
759         if (trans->hmp->version >= HAMMER_VOL_VERSION_TWO) {
760                 if (ip->ino_leaf.base.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
761                         ip->ino_data.cap_flags |=
762                                 HAMMER_INODE_CAP_DIR_LOCAL_INO;
763                 }
764         }
765
766         /*
767          * Setup the ".." pointer.  This only needs to be done for directories
768          * but we do it for all objects as a recovery aid.
769          */
770         if (dip)
771                 ip->ino_data.parent_obj_id = dip->ino_leaf.base.obj_id;
772 #if 0
773         /*
774          * The parent_obj_localization field only applies to pseudo-fs roots.
775          * XXX this is no longer applicable, PFSs are no longer directly
776          * tied into the parent's directory structure.
777          */
778         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY &&
779             ip->obj_id == HAMMER_OBJID_ROOT) {
780                 ip->ino_data.ext.obj.parent_obj_localization = 
781                                                 dip->obj_localization;
782         }
783 #endif
784
785         switch(ip->ino_leaf.base.obj_type) {
786         case HAMMER_OBJTYPE_CDEV:
787         case HAMMER_OBJTYPE_BDEV:
788                 ip->ino_data.rmajor = vap->va_rmajor;
789                 ip->ino_data.rminor = vap->va_rminor;
790                 break;
791         default:
792                 break;
793         }
794
795         /*
796          * Calculate default uid/gid and overwrite with information from
797          * the vap.
798          */
799         if (dip) {
800                 xuid = hammer_to_unix_xid(&dip->ino_data.uid);
801                 xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode,
802                                              xuid, cred, &vap->va_mode);
803         } else {
804                 xuid = 0;
805         }
806         ip->ino_data.mode = vap->va_mode;
807
808         if (vap->va_vaflags & VA_UID_UUID_VALID)
809                 ip->ino_data.uid = vap->va_uid_uuid;
810         else if (vap->va_uid != (uid_t)VNOVAL)
811                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
812         else
813                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
814
815         if (vap->va_vaflags & VA_GID_UUID_VALID)
816                 ip->ino_data.gid = vap->va_gid_uuid;
817         else if (vap->va_gid != (gid_t)VNOVAL)
818                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
819         else if (dip)
820                 ip->ino_data.gid = dip->ino_data.gid;
821
822         hammer_ref(&ip->lock);
823
824         if (pfsm) {
825                 ip->pfsm = pfsm;
826                 hammer_ref(&pfsm->lock);
827                 error = 0;
828         } else if (dip->obj_localization == ip->obj_localization) {
829                 ip->pfsm = dip->pfsm;
830                 hammer_ref(&ip->pfsm->lock);
831                 error = 0;
832         } else {
833                 ip->pfsm = hammer_load_pseudofs(trans,
834                                                 ip->obj_localization,
835                                                 &error);
836                 error = 0;      /* ignore ENOENT */
837         }
838
839         if (error) {
840                 hammer_free_inode(ip);
841                 ip = NULL;
842         } else if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
843                 panic("hammer_create_inode: duplicate obj_id %llx",
844                       (long long)ip->obj_id);
845                 /* not reached */
846                 hammer_free_inode(ip);
847         }
848         *ipp = ip;
849         return(error);
850 }
851
852 /*
853  * Final cleanup / freeing of an inode structure
854  */
855 static void
856 hammer_free_inode(hammer_inode_t ip)
857 {
858         struct hammer_mount *hmp;
859
860         hmp = ip->hmp;
861         KKASSERT(ip->lock.refs == 1);
862         hammer_uncache_node(&ip->cache[0]);
863         hammer_uncache_node(&ip->cache[1]);
864         hammer_uncache_node(&ip->cache[2]);
865         hammer_uncache_node(&ip->cache[3]);
866         hammer_inode_wakereclaims(ip);
867         if (ip->objid_cache)
868                 hammer_clear_objid(ip);
869         --hammer_count_inodes;
870         --hmp->count_inodes;
871         if (ip->pfsm) {
872                 hammer_rel_pseudofs(hmp, ip->pfsm);
873                 ip->pfsm = NULL;
874         }
875         kfree(ip, hmp->m_inodes);
876         ip = NULL;
877 }
878
879 /*
880  * Retrieve pseudo-fs data.  NULL will never be returned.
881  *
882  * If an error occurs *errorp will be set and a default template is returned,
883  * otherwise *errorp is set to 0.  Typically when an error occurs it will
884  * be ENOENT.
885  */
886 hammer_pseudofs_inmem_t
887 hammer_load_pseudofs(hammer_transaction_t trans,
888                      u_int32_t localization, int *errorp)
889 {
890         hammer_mount_t hmp = trans->hmp;
891         hammer_inode_t ip;
892         hammer_pseudofs_inmem_t pfsm;
893         struct hammer_cursor cursor;
894         int bytes;
895
896 retry:
897         pfsm = RB_LOOKUP(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, localization);
898         if (pfsm) {
899                 hammer_ref(&pfsm->lock);
900                 *errorp = 0;
901                 return(pfsm);
902         }
903
904         /*
905          * PFS records are stored in the root inode (not the PFS root inode,
906          * but the real root).  Avoid an infinite recursion if loading
907          * the PFS for the real root.
908          */
909         if (localization) {
910                 ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT,
911                                       HAMMER_MAX_TID,
912                                       HAMMER_DEF_LOCALIZATION, 0, errorp);
913         } else {
914                 ip = NULL;
915         }
916
917         pfsm = kmalloc(sizeof(*pfsm), hmp->m_misc, M_WAITOK | M_ZERO);
918         pfsm->localization = localization;
919         pfsm->pfsd.unique_uuid = trans->rootvol->ondisk->vol_fsid;
920         pfsm->pfsd.shared_uuid = pfsm->pfsd.unique_uuid;
921
922         hammer_init_cursor(trans, &cursor, (ip ? &ip->cache[1] : NULL), ip);
923         cursor.key_beg.localization = HAMMER_DEF_LOCALIZATION +
924                                       HAMMER_LOCALIZE_MISC;
925         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
926         cursor.key_beg.create_tid = 0;
927         cursor.key_beg.delete_tid = 0;
928         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
929         cursor.key_beg.obj_type = 0;
930         cursor.key_beg.key = localization;
931         cursor.asof = HAMMER_MAX_TID;
932         cursor.flags |= HAMMER_CURSOR_ASOF;
933
934         if (ip)
935                 *errorp = hammer_ip_lookup(&cursor);
936         else
937                 *errorp = hammer_btree_lookup(&cursor);
938         if (*errorp == 0) {
939                 *errorp = hammer_ip_resolve_data(&cursor);
940                 if (*errorp == 0) {
941                         if (cursor.data->pfsd.mirror_flags &
942                             HAMMER_PFSD_DELETED) {
943                                 *errorp = ENOENT;
944                         } else {
945                                 bytes = cursor.leaf->data_len;
946                                 if (bytes > sizeof(pfsm->pfsd))
947                                         bytes = sizeof(pfsm->pfsd);
948                                 bcopy(cursor.data, &pfsm->pfsd, bytes);
949                         }
950                 }
951         }
952         hammer_done_cursor(&cursor);
953
954         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
955         hammer_ref(&pfsm->lock);
956         if (ip)
957                 hammer_rel_inode(ip, 0);
958         if (RB_INSERT(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm)) {
959                 kfree(pfsm, hmp->m_misc);
960                 goto retry;
961         }
962         return(pfsm);
963 }
964
965 /*
966  * Store pseudo-fs data.  The backend will automatically delete any prior
967  * on-disk pseudo-fs data but we have to delete in-memory versions.
968  */
969 int
970 hammer_save_pseudofs(hammer_transaction_t trans, hammer_pseudofs_inmem_t pfsm)
971 {
972         struct hammer_cursor cursor;
973         hammer_record_t record;
974         hammer_inode_t ip;
975         int error;
976
977         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
978                               HAMMER_DEF_LOCALIZATION, 0, &error);
979 retry:
980         pfsm->fsid_udev = hammer_fsid_to_udev(&pfsm->pfsd.shared_uuid);
981         hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
982         cursor.key_beg.localization = ip->obj_localization +
983                                       HAMMER_LOCALIZE_MISC;
984         cursor.key_beg.obj_id = HAMMER_OBJID_ROOT;
985         cursor.key_beg.create_tid = 0;
986         cursor.key_beg.delete_tid = 0;
987         cursor.key_beg.rec_type = HAMMER_RECTYPE_PFS;
988         cursor.key_beg.obj_type = 0;
989         cursor.key_beg.key = pfsm->localization;
990         cursor.asof = HAMMER_MAX_TID;
991         cursor.flags |= HAMMER_CURSOR_ASOF;
992
993         /*
994          * Replace any in-memory version of the record.
995          */
996         error = hammer_ip_lookup(&cursor);
997         if (error == 0 && hammer_cursor_inmem(&cursor)) {
998                 record = cursor.iprec;
999                 if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
1000                         KKASSERT(cursor.deadlk_rec == NULL);
1001                         hammer_ref(&record->lock);
1002                         cursor.deadlk_rec = record;
1003                         error = EDEADLK;
1004                 } else {
1005                         record->flags |= HAMMER_RECF_DELETED_FE;
1006                         error = 0;
1007                 }
1008         }
1009
1010         /*
1011          * Allocate replacement general record.  The backend flush will
1012          * delete any on-disk version of the record.
1013          */
1014         if (error == 0 || error == ENOENT) {
1015                 record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd));
1016                 record->type = HAMMER_MEM_RECORD_GENERAL;
1017
1018                 record->leaf.base.localization = ip->obj_localization +
1019                                                  HAMMER_LOCALIZE_MISC;
1020                 record->leaf.base.rec_type = HAMMER_RECTYPE_PFS;
1021                 record->leaf.base.key = pfsm->localization;
1022                 record->leaf.data_len = sizeof(pfsm->pfsd);
1023                 bcopy(&pfsm->pfsd, record->data, sizeof(pfsm->pfsd));
1024                 error = hammer_ip_add_record(trans, record);
1025         }
1026         hammer_done_cursor(&cursor);
1027         if (error == EDEADLK)
1028                 goto retry;
1029         hammer_rel_inode(ip, 0);
1030         return(error);
1031 }
1032
1033 /*
1034  * Create a root directory for a PFS if one does not already exist.
1035  *
1036  * The PFS root stands alone so we must also bump the nlinks count
1037  * to prevent it from being destroyed on release.
1038  */
1039 int
1040 hammer_mkroot_pseudofs(hammer_transaction_t trans, struct ucred *cred,
1041                        hammer_pseudofs_inmem_t pfsm)
1042 {
1043         hammer_inode_t ip;
1044         struct vattr vap;
1045         int error;
1046
1047         ip = hammer_get_inode(trans, NULL, HAMMER_OBJID_ROOT, HAMMER_MAX_TID,
1048                               pfsm->localization, 0, &error);
1049         if (ip == NULL) {
1050                 vattr_null(&vap);
1051                 vap.va_mode = 0755;
1052                 vap.va_type = VDIR;
1053                 error = hammer_create_inode(trans, &vap, cred,
1054                                             NULL, NULL, 0,
1055                                             pfsm, &ip);
1056                 if (error == 0) {
1057                         ++ip->ino_data.nlinks;
1058                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1059                 }
1060         }
1061         if (ip)
1062                 hammer_rel_inode(ip, 0);
1063         return(error);
1064 }
1065
1066 /*
1067  * Unload any vnodes & inodes associated with a PFS, return ENOTEMPTY
1068  * if we are unable to disassociate all the inodes.
1069  */
1070 static
1071 int
1072 hammer_unload_pseudofs_callback(hammer_inode_t ip, void *data)
1073 {
1074         int res;
1075
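             /*
              * Take a temporary reference.  If we hold the only other
              * reference and a vnode is still attached, try to clean the
              * vnode out so the inode can be released.
              */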
1076         hammer_ref(&ip->lock);
1077         if (ip->lock.refs == 2 && ip->vp)
1078                 vclean_unlocked(ip->vp);
1079         if (ip->lock.refs == 1 && ip->vp == NULL)
1080                 res = 0;
1081         else
1082                 res = -1;       /* stop, someone is using the inode */
1083         hammer_rel_inode(ip, 0);
1084         return(res);
1085 }
1086
1087 int
1088 hammer_unload_pseudofs(hammer_transaction_t trans, u_int32_t localization)
1089 {
1090         int res;
1091         int try;
1092
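             /*
              * Make several passes over the PFS's in-memory inodes, syncing
              * the flusher between passes so inodes with dirty state or
              * extra references get a chance to be flushed and released.
              */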
1093         for (try = res = 0; try < 4; ++try) {
1094                 res = hammer_ino_rb_tree_RB_SCAN(&trans->hmp->rb_inos_root,
1095                                            hammer_inode_pfs_cmp,
1096                                            hammer_unload_pseudofs_callback,
1097                                            &localization);
1098                 if (res == 0 && try > 1)
1099                         break;
1100                 hammer_flusher_sync(trans->hmp);
1101         }
1102         if (res != 0)
1103                 res = ENOTEMPTY;
1104         return(res);
1105 }
1106
1107
1108 /*
1109  * Release a reference on a PFS
1110  */
1111 void
1112 hammer_rel_pseudofs(hammer_mount_t hmp, hammer_pseudofs_inmem_t pfsm)
1113 {
1114         hammer_unref(&pfsm->lock);
1115         if (pfsm->lock.refs == 0) {
1116                 RB_REMOVE(hammer_pfs_rb_tree, &hmp->rb_pfsm_root, pfsm);
1117                 kfree(pfsm, hmp->m_misc);
1118         }
1119 }
1120
1121 /*
1122  * Called by hammer_sync_inode().
1123  */
1124 static int
1125 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
1126 {
1127         hammer_transaction_t trans = cursor->trans;
1128         hammer_record_t record;
1129         int error;
1130         int redirty;
1131
1132 retry:
1133         error = 0;
1134
1135         /*
1136          * If the inode has a presence on-disk then locate it and mark
1137          * it deleted, setting DELONDISK.
1138          *
1139          * The record may or may not be physically deleted, depending on
1140          * the retention policy.
1141          */
1142         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
1143             HAMMER_INODE_ONDISK) {
1144                 hammer_normalize_cursor(cursor);
1145                 cursor->key_beg.localization = ip->obj_localization + 
1146                                                HAMMER_LOCALIZE_INODE;
1147                 cursor->key_beg.obj_id = ip->obj_id;
1148                 cursor->key_beg.key = 0;
1149                 cursor->key_beg.create_tid = 0;
1150                 cursor->key_beg.delete_tid = 0;
1151                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1152                 cursor->key_beg.obj_type = 0;
1153                 cursor->asof = ip->obj_asof;
1154                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1155                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
1156                 cursor->flags |= HAMMER_CURSOR_BACKEND;
1157
1158                 error = hammer_btree_lookup(cursor);
1159                 if (hammer_debug_inode)
1160                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
1161
1162                 if (error == 0) {
1163                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
1164                         if (hammer_debug_inode)
1165                                 kprintf(" error %d\n", error);
1166                         if (error == 0) {
1167                                 ip->flags |= HAMMER_INODE_DELONDISK;
1168                         }
1169                         if (cursor->node)
1170                                 hammer_cache_node(&ip->cache[0], cursor->node);
1171                 }
1172                 if (error == EDEADLK) {
1173                         hammer_done_cursor(cursor);
1174                         error = hammer_init_cursor(trans, cursor,
1175                                                    &ip->cache[0], ip);
1176                         if (hammer_debug_inode)
1177                                 kprintf("IPDED %p %d\n", ip, error);
1178                         if (error == 0)
1179                                 goto retry;
1180                 }
1181         }
1182
1183         /*
1184          * Ok, write out the initial record or a new record (after deleting
1185          * the old one), unless the DELETED flag is set.  This routine will
1186          * clear DELONDISK if it writes out a record.
1187          *
1188          * Update our inode statistics if this is the first application of
1189          * the inode on-disk.
1190          */
1191         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
1192                 /*
1193                  * Generate a record and write it to the media.  We clean-up
1194                  * the state before releasing so we do not have to set-up
1195                  * a flush_group.
1196                  */
1197                 record = hammer_alloc_mem_record(ip, 0);
1198                 record->type = HAMMER_MEM_RECORD_INODE;
1199                 record->flush_state = HAMMER_FST_FLUSH;
1200                 record->leaf = ip->sync_ino_leaf;
1201                 record->leaf.base.create_tid = trans->tid;
1202                 record->leaf.data_len = sizeof(ip->sync_ino_data);
1203                 record->leaf.create_ts = trans->time32;
1204                 record->data = (void *)&ip->sync_ino_data;
1205                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
1206
1207                 /*
1208                  * If this flag is set we cannot sync the new file size
1209                  * because we haven't finished related truncations.  The
1210                  * inode will be flushed in another flush group to finish
1211                  * the job.
1212                  */
1213                 if ((ip->flags & HAMMER_INODE_WOULDBLOCK) &&
1214                     ip->sync_ino_data.size != ip->ino_data.size) {
1215                         redirty = 1;
1216                         ip->sync_ino_data.size = ip->ino_data.size;
1217                 } else {
1218                         redirty = 0;
1219                 }
1220
1221                 for (;;) {
1222                         error = hammer_ip_sync_record_cursor(cursor, record);
1223                         if (hammer_debug_inode)
1224                                 kprintf("GENREC %p rec %08x %d\n",      
1225                                         ip, record->flags, error);
1226                         if (error != EDEADLK)
1227                                 break;
1228                         hammer_done_cursor(cursor);
1229                         error = hammer_init_cursor(trans, cursor,
1230                                                    &ip->cache[0], ip);
1231                         if (hammer_debug_inode)
1232                                 kprintf("GENREC reinit %d\n", error);
1233                         if (error)
1234                                 break;
1235                 }
1236
1237                 /*
1238                  * Note:  The record was never on the inode's record tree
1239                  * so just wave our hands importantly and destroy it.
1240                  */
1241                 record->flags |= HAMMER_RECF_COMMITTED;
1242                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
1243                 record->flush_state = HAMMER_FST_IDLE;
1244                 ++ip->rec_generation;
1245                 hammer_rel_mem_record(record);
1246
1247                 /*
1248                  * Finish up.
1249                  */
1250                 if (error == 0) {
1251                         if (hammer_debug_inode)
1252                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
1253                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1254                                             HAMMER_INODE_ATIME |
1255                                             HAMMER_INODE_MTIME);
1256                         ip->flags &= ~HAMMER_INODE_DELONDISK;
1257                         if (redirty)
1258                                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1259
1260                         /*
1261                          * Root volume count of inodes
1262                          */
1263                         hammer_sync_lock_sh(trans);
1264                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
1265                                 hammer_modify_volume_field(trans,
1266                                                            trans->rootvol,
1267                                                            vol0_stat_inodes);
1268                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1269                                 hammer_modify_volume_done(trans->rootvol);
1270                                 ip->flags |= HAMMER_INODE_ONDISK;
1271                                 if (hammer_debug_inode)
1272                                         kprintf("NOWONDISK %p\n", ip);
1273                         }
1274                         hammer_sync_unlock(trans);
1275                 }
1276         }
1277
1278         /*
1279          * If the inode has been destroyed, clean out any left-over flags
1280          * that may have been set by the frontend.
1281          */
1282         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
1283                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
1284                                     HAMMER_INODE_ATIME |
1285                                     HAMMER_INODE_MTIME);
1286         }
1287         return(error);
1288 }
1289
1290 /*
1291  * Update only the itimes fields.
1292  *
1293  * ATIME can be updated without generating any UNDO.  MTIME is updated
1294  * with UNDO so it is guaranteed to be synchronized properly in case of
1295  * a crash.
1296  *
1297  * Neither field is included in the B-Tree leaf element's CRC, which is how
1298  * we can get away with updating ATIME the way we do.
1299  */
1300 static int
1301 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
1302 {
1303         hammer_transaction_t trans = cursor->trans;
1304         int error;
1305
1306 retry:
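             /*
              * There is nothing to update on-media unless the inode has an
              * on-disk presence and has not already been deleted on-disk.
              */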
1307         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) !=
1308             HAMMER_INODE_ONDISK) {
1309                 return(0);
1310         }
1311
1312         hammer_normalize_cursor(cursor);
1313         cursor->key_beg.localization = ip->obj_localization + 
1314                                        HAMMER_LOCALIZE_INODE;
1315         cursor->key_beg.obj_id = ip->obj_id;
1316         cursor->key_beg.key = 0;
1317         cursor->key_beg.create_tid = 0;
1318         cursor->key_beg.delete_tid = 0;
1319         cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
1320         cursor->key_beg.obj_type = 0;
1321         cursor->asof = ip->obj_asof;
1322         cursor->flags &= ~HAMMER_CURSOR_INITMASK;
1323         cursor->flags |= HAMMER_CURSOR_ASOF;
1324         cursor->flags |= HAMMER_CURSOR_GET_LEAF;
1325         cursor->flags |= HAMMER_CURSOR_GET_DATA;
1326         cursor->flags |= HAMMER_CURSOR_BACKEND;
1327
1328         error = hammer_btree_lookup(cursor);
1329         if (error == 0) {
1330                 hammer_cache_node(&ip->cache[0], cursor->node);
1331                 if (ip->sync_flags & HAMMER_INODE_MTIME) {
1332                         /*
1333                          * Updating MTIME requires an UNDO.  Just cover
1334                          * both atime and mtime.
1335                          */
1336                         hammer_sync_lock_sh(trans);
1337                         hammer_modify_buffer(trans, cursor->data_buffer,
1338                                      HAMMER_ITIMES_BASE(&cursor->data->inode),
1339                                      HAMMER_ITIMES_BYTES);
1340                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1341                         cursor->data->inode.mtime = ip->sync_ino_data.mtime;
1342                         hammer_modify_buffer_done(cursor->data_buffer);
1343                         hammer_sync_unlock(trans);
1344                 } else if (ip->sync_flags & HAMMER_INODE_ATIME) {
1345                         /*
1346                          * Updating atime only can be done in-place with
1347                          * no UNDO.
1348                          */
1349                         hammer_sync_lock_sh(trans);
1350                         hammer_modify_buffer(trans, cursor->data_buffer,
1351                                              NULL, 0);
1352                         cursor->data->inode.atime = ip->sync_ino_data.atime;
1353                         hammer_modify_buffer_done(cursor->data_buffer);
1354                         hammer_sync_unlock(trans);
1355                 }
1356                 ip->sync_flags &= ~(HAMMER_INODE_ATIME | HAMMER_INODE_MTIME);
1357         }
1358         if (error == EDEADLK) {
1359                 hammer_done_cursor(cursor);
1360                 error = hammer_init_cursor(trans, cursor,
1361                                            &ip->cache[0], ip);
1362                 if (error == 0)
1363                         goto retry;
1364         }
1365         return(error);
1366 }
1367
1368 /*
1369  * Release a reference on an inode, flush as requested.
1370  *
1371  * On the last reference we queue the inode to the flusher for its final
1372  * disposition.
1373  */
1374 void
1375 hammer_rel_inode(struct hammer_inode *ip, int flush)
1376 {
1377         /*hammer_mount_t hmp = ip->hmp;*/
1378
1379         /*
1380          * Handle disposition when dropping the last ref.
1381          */
1382         for (;;) {
1383                 if (ip->lock.refs == 1) {
1384                         /*
1385                          * Determine whether on-disk action is needed for
1386                          * the inode's final disposition.
1387                          */
1388                         KKASSERT(ip->vp == NULL);
1389                         hammer_inode_unloadable_check(ip, 0);
1390                         if (ip->flags & HAMMER_INODE_MODMASK) {
1391                                 hammer_flush_inode(ip, 0);
1392                         } else if (ip->lock.refs == 1) {
1393                                 hammer_unload_inode(ip);
1394                                 break;
1395                         }
1396                 } else {
1397                         if (flush)
1398                                 hammer_flush_inode(ip, 0);
1399
1400                         /*
1401                          * The inode still has multiple refs, try to drop
1402                          * one ref.
1403                          */
1404                         KKASSERT(ip->lock.refs >= 1);
1405                         if (ip->lock.refs > 1) {
1406                                 hammer_unref(&ip->lock);
1407                                 break;
1408                         }
1409                 }
1410         }
1411 }
1412
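/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * caller that temporarily pins an in-memory inode pairs a hammer_ref()
 * on ip->lock with a later hammer_rel_inode(), which queues the final
 * disposition once the last reference is dropped:
 *
 *	hammer_ref(&ip->lock);
 *	... operate on the in-memory inode ...
 *	hammer_rel_inode(ip, 0);
 */
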
1413 /*
1414  * Unload and destroy the specified inode.  Must be called with one remaining
1415  * reference.  The reference is disposed of.
1416  *
1417  * The inode must be completely clean.
1418  */
1419 static int
1420 hammer_unload_inode(struct hammer_inode *ip)
1421 {
1422         hammer_mount_t hmp = ip->hmp;
1423
1424         KASSERT(ip->lock.refs == 1,
1425                 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
1426         KKASSERT(ip->vp == NULL);
1427         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
1428         KKASSERT(ip->cursor_ip_refs == 0);
1429         KKASSERT(hammer_notlocked(&ip->lock));
1430         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
1431
1432         KKASSERT(RB_EMPTY(&ip->rec_tree));
1433         KKASSERT(TAILQ_EMPTY(&ip->target_list));
1434
1435         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
1436
1437         hammer_free_inode(ip);
1438         return(0);
1439 }
1440
1441 /*
1442  * Called during unmounting if a critical error occurred.  The in-memory
1443  * inode and all related structures are destroyed.
1444  *
1445  * If a critical error did not occur the unmount code calls the standard
1446  * release and asserts that the inode is gone.
1447  */
1448 int
1449 hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused)
1450 {
1451         hammer_record_t rec;
1452
1453         /*
1454          * Get rid of the inode's in-memory records, regardless of their
1455          * state, and clear the mod-mask.
1456          */
1457         while ((rec = TAILQ_FIRST(&ip->target_list)) != NULL) {
1458                 TAILQ_REMOVE(&ip->target_list, rec, target_entry);
1459                 rec->target_ip = NULL;
1460                 if (rec->flush_state == HAMMER_FST_SETUP)
1461                         rec->flush_state = HAMMER_FST_IDLE;
1462         }
1463         while ((rec = RB_ROOT(&ip->rec_tree)) != NULL) {
1464                 if (rec->flush_state == HAMMER_FST_FLUSH)
1465                         --rec->flush_group->refs;
1466                 else
1467                         hammer_ref(&rec->lock);
1468                 KKASSERT(rec->lock.refs == 1);
1469                 rec->flush_state = HAMMER_FST_IDLE;
1470                 rec->flush_group = NULL;
1471                 rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */
1472                 rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */
1473                 ++ip->rec_generation;
1474                 hammer_rel_mem_record(rec);
1475         }
1476         ip->flags &= ~HAMMER_INODE_MODMASK;
1477         ip->sync_flags &= ~HAMMER_INODE_MODMASK;
1478         KKASSERT(ip->vp == NULL);
1479
1480         /*
1481          * Remove the inode from any flush group, force it idle.  FLUSH
1482          * and SETUP states have an inode ref.
1483          */
1484         switch(ip->flush_state) {
1485         case HAMMER_FST_FLUSH:
1486                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
1487                 --ip->flush_group->refs;
1488                 ip->flush_group = NULL;
1489                 /* fall through */
1490         case HAMMER_FST_SETUP:
1491                 hammer_unref(&ip->lock);
1492                 ip->flush_state = HAMMER_FST_IDLE;
1493                 /* fall through */
1494         case HAMMER_FST_IDLE:
1495                 break;
1496         }
1497
1498         /*
1499          * There shouldn't be any associated vnode.  The unload needs at
1500          * least one ref; if we do have a vp, steal its ip ref.
1501          */
1502         if (ip->vp) {
1503                 kprintf("hammer_destroy_inode_callback: Unexpected "
1504                         "vnode association ip %p vp %p\n", ip, ip->vp);
1505                 ip->vp->v_data = NULL;
1506                 ip->vp = NULL;
1507         } else {
1508                 hammer_ref(&ip->lock);
1509         }
1510         hammer_unload_inode(ip);
1511         return(0);
1512 }
1513
1514 /*
1515  * Called on mount -u when switching from RW to RO or vice versa.  Adjust
1516  * the read-only flag for cached inodes.
1517  *
1518  * This routine is called from a RB_SCAN().
1519  */
1520 int
1521 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
1522 {
1523         hammer_mount_t hmp = ip->hmp;
1524
1525         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
1526                 ip->flags |= HAMMER_INODE_RO;
1527         else
1528                 ip->flags &= ~HAMMER_INODE_RO;
1529         return(0);
1530 }
1531
1532 /*
1533  * A transaction has modified an inode, requiring updates as specified by
1534  * the passed flags.
1535  *
1536  * HAMMER_INODE_DDIRTY: Inode data has been updated
1537  * HAMMER_INODE_XDIRTY: Dirty in-memory records
1538  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
1539  * HAMMER_INODE_DELETED: Inode record/data must be deleted
1540  * HAMMER_INODE_ATIME/MTIME: mtime/atime has been updated
1541  */
1542 void
1543 hammer_modify_inode(hammer_inode_t ip, int flags)
1544 {
1545         /*
1546          * A ronly value of 0 or 2 does not trigger the assertion;
1547          * 2 is a special error state.
1548          */
1549         KKASSERT(ip->hmp->ronly != 1 ||
1550                   (flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY | 
1551                             HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
1552                             HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
1553         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
1554                 ip->flags |= HAMMER_INODE_RSV_INODES;
1555                 ++ip->hmp->rsv_inodes;
1556         }
1557
1558         ip->flags |= flags;
1559 }
1560
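/*
 * Illustrative sketch, not part of the original source: a typical
 * frontend path modifies the in-memory copy of the inode and then
 * records which parts are dirty, e.g. (hypothetical field change):
 *
 *	ip->ino_data.mode = new_mode;
 *	hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
 *
 * The flusher later snapshots ino_data into sync_ino_data and writes
 * the dirty state to media.
 */
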
1561 /*
1562  * Request that an inode be flushed.  This whole mess cannot block and may
1563  * recurse (if not synchronous).  Once requested HAMMER will attempt to
1564  * actively flush the inode until the flush can be done.
1565  *
1566  * The inode may already be flushing, or may be in a setup state.  We can
1567  * place the inode in a flushing state if it is currently idle and flag it
1568  * to reflush if it is currently flushing.
1569  *
1570  * Upon return, if the inode could not be flushed due to a setup
1571  * dependency it will be automatically flushed when the dependency
1572  * is satisfied.
1573  */
1574 void
1575 hammer_flush_inode(hammer_inode_t ip, int flags)
1576 {
1577         hammer_mount_t hmp;
1578         hammer_flush_group_t flg;
1579         int good;
1580
1581         /*
1582          * next_flush_group is the first flush group we can place the inode
1583          * in.  It may be NULL.  If it becomes full we append a new flush
1584          * group and make that the next_flush_group.
1585          */
1586         hmp = ip->hmp;
1587         while ((flg = hmp->next_flush_group) != NULL) {
1588                 KKASSERT(flg->running == 0);
1589                 if (flg->total_count + flg->refs <= ip->hmp->undo_rec_limit)
1590                         break;
1591                 hmp->next_flush_group = TAILQ_NEXT(flg, flush_entry);
1592                 hammer_flusher_async(ip->hmp, flg);
1593         }
1594         if (flg == NULL) {
1595                 flg = kmalloc(sizeof(*flg), hmp->m_misc, M_WAITOK|M_ZERO);
1596                 hmp->next_flush_group = flg;
1597                 RB_INIT(&flg->flush_tree);
1598                 TAILQ_INSERT_TAIL(&hmp->flush_group_list, flg, flush_entry);
1599         }
1600
1601         /*
1602          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
1603          * state we have to put it back into an IDLE state so we can
1604          * drop the extra ref.
1605          *
1606          * If we have a parent dependency we must still fall through
1607          * so we can run it.
1608          */
1609         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
1610                 if (ip->flush_state == HAMMER_FST_SETUP &&
1611                     TAILQ_EMPTY(&ip->target_list)) {
1612                         ip->flush_state = HAMMER_FST_IDLE;
1613                         hammer_rel_inode(ip, 0);
1614                 }
1615                 if (ip->flush_state == HAMMER_FST_IDLE)
1616                         return;
1617         }
1618
1619         /*
1620          * Our flush action will depend on the current state.
1621          */
1622         switch(ip->flush_state) {
1623         case HAMMER_FST_IDLE:
1624                 /*
1625                  * We have no dependencies and can flush immediately.  Some
1626                  * of our children may not be flushable, so we have to re-test
1627                  * with that additional knowledge.
1628                  */
1629                 hammer_flush_inode_core(ip, flg, flags);
1630                 break;
1631         case HAMMER_FST_SETUP:
1632                 /*
1633                  * Recurse upwards through dependencies via target_list
1634                  * and start their flusher actions going if possible.
1635                  *
1636                  * 'good' is our connectivity.  -1 means we have none and
1637                  * can't flush, 0 means there weren't any dependencies, and
1638                  * 1 means we have good connectivity.
1639                  */
1640                 good = hammer_setup_parent_inodes(ip, 0, flg);
1641
1642                 if (good >= 0) {
1643                         /*
1644                          * We can continue if good >= 0.  Determine how 
1645                          * many records under our inode can be flushed (and
1646                          * mark them).
1647                          */
1648                         hammer_flush_inode_core(ip, flg, flags);
1649                 } else {
1650                         /*
1651                          * Parent has no connectivity, tell it to flush
1652                          * us as soon as it does.
1653                          *
1654                          * The REFLUSH flag is also needed to trigger
1655                          * dependency wakeups.
1656                          */
1657                         ip->flags |= HAMMER_INODE_CONN_DOWN |
1658                                      HAMMER_INODE_REFLUSH;
1659                         if (flags & HAMMER_FLUSH_SIGNAL) {
1660                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1661                                 hammer_flusher_async(ip->hmp, flg);
1662                         }
1663                 }
1664                 break;
1665         case HAMMER_FST_FLUSH:
1666                 /*
1667                  * We are already flushing, flag the inode to reflush
1668                  * if needed after it completes its current flush.
1669                  *
1670                  * The REFLUSH flag is also needed to trigger
1671                  * dependency wakeups.
1672                  */
1673                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
1674                         ip->flags |= HAMMER_INODE_REFLUSH;
1675                 if (flags & HAMMER_FLUSH_SIGNAL) {
1676                         ip->flags |= HAMMER_INODE_RESIGNAL;
1677                         hammer_flusher_async(ip->hmp, flg);
1678                 }
1679                 break;
1680         }
1681 }
1682
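/*
 * Summary of the state transitions driven by hammer_flush_inode() above,
 * added for clarity and not part of the original source:
 *
 *	SETUP -> IDLE	nothing to flush and no parent dependencies
 *	IDLE  -> FLUSH	via hammer_flush_inode_core()
 *	SETUP -> FLUSH	parent connectivity resolved (good >= 0)
 *	SETUP -> SETUP	no connectivity; CONN_DOWN and REFLUSH flagged
 *	FLUSH -> FLUSH	already flushing; REFLUSH (and RESIGNAL) flagged
 */
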
1683 /*
1684  * Scan ip->target_list, which is a list of records owned by PARENT inodes
1685  * of our ip which reference our ip.
1686  *
1687  * XXX This is a huge mess of recursive code, but not one bit of it blocks
1688  *     so for now do not ref/deref the structures.  Note that if we use the
1689  *     ref/rel code later, the rel CAN block.
1690  */
1691 static int
1692 hammer_setup_parent_inodes(hammer_inode_t ip, int depth,
1693                            hammer_flush_group_t flg)
1694 {
1695         hammer_record_t depend;
1696         int good;
1697         int r;
1698
1699         /*
1700          * If we hit our recursion limit and we have parent dependencies
1701          * we cannot continue.  Returning < 0 will cause us to be flagged
1702          * for reflush.  Returning -2 cuts off additional dependency checks
1703          * because they are likely to also hit the depth limit.
1704          *
1705          * We cannot return < 0 if there are no dependencies or there might
1706          * not be anything to wakeup (ip).
1707          */
1708         if (depth == 20 && TAILQ_FIRST(&ip->target_list)) {
1709                 kprintf("HAMMER Warning: depth limit reached on "
1710                         "setup recursion, inode %p %016llx\n",
1711                         ip, (long long)ip->obj_id);
1712                 return(-2);
1713         }
1714
1715         /*
1716          * Scan dependencies
1717          */
1718         good = 0;
1719         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
1720                 r = hammer_setup_parent_inodes_helper(depend, depth, flg);
1721                 KKASSERT(depend->target_ip == ip);
1722                 if (r < 0 && good == 0)
1723                         good = -1;
1724                 if (r > 0)
1725                         good = 1;
1726
1727                 /*
1728                  * If we failed due to the recursion depth limit then stop
1729                  * now.
1730                  */
1731                 if (r == -2)
1732                         break;
1733         }
1734         return(good);
1735 }
1736
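/*
 * Note added for clarity (not part of the original source): the net
 * result of the scan above is 1 if any parent record provides
 * connectivity (r > 0), otherwise -1 if any dependency could not be
 * resolved (r < 0), otherwise 0 when there were no relevant
 * dependencies.  An r of -2 additionally stops the scan because the
 * depth limit was hit further up the parent chain.
 */
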
1737 /*
1738  * This helper function takes a record representing the dependency between
1739  * the parent inode and child inode.
1740  *
1741  * record->ip           = parent inode
1742  * record->target_ip    = child inode
1743  * 
1744  * We are asked to recurse upwards and convert the record from SETUP
1745  * to FLUSH if possible.
1746  *
1747  * Return 1 if the record gives us connectivity
1748  *
1749  * Return 0 if the record is not relevant 
1750  *
1751  * Return -1 if we can't resolve the dependency and there is no connectivity.
1752  */
1753 static int
1754 hammer_setup_parent_inodes_helper(hammer_record_t record, int depth,
1755                                   hammer_flush_group_t flg)
1756 {
1757         hammer_mount_t hmp;
1758         hammer_inode_t pip;
1759         int good;
1760
1761         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
1762         pip = record->ip;
1763         hmp = pip->hmp;
1764
1765         /*
1766          * If the record is already flushing, is it in our flush group?
1767          *
1768          * If it is in our flush group but it is a general record or a 
1769          * delete-on-disk, it does not improve our connectivity (return 0),
1770          * and if the target inode is not trying to destroy itself we can't
1771          * allow the operation yet anyway (the second return -1).
1772          */
1773         if (record->flush_state == HAMMER_FST_FLUSH) {
1774                 /*
1775                  * If not in our flush group ask the parent to reflush
1776                  * us as soon as possible.
1777                  */
1778                 if (record->flush_group != flg) {
1779                         pip->flags |= HAMMER_INODE_REFLUSH;
1780                         record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1781                         return(-1);
1782                 }
1783
1784                 /*
1785                  * If in our flush group everything is already set up,
1786                  * just return whether the record will improve our
1787                  * visibility or not.
1788                  */
1789                 if (record->type == HAMMER_MEM_RECORD_ADD)
1790                         return(1);
1791                 return(0);
1792         }
1793
1794         /*
1795          * It must be a setup record.  Try to resolve the setup dependencies
1796          * by recursing upwards so we can place ip on the flush list.
1797          *
1798          * Limit ourselves to 20 levels of recursion to avoid blowing out
1799          * the kernel stack.  If we hit the recursion limit we can't flush
1800          * until the parent flushes.  The parent will flush independently
1801          * on its own and ultimately a deep recursion will be resolved.
1802          */
1803         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1804
1805         good = hammer_setup_parent_inodes(pip, depth + 1, flg);
1806
1807         /*
1808          * If good < 0 the parent has no connectivity and we cannot safely
1809          * flush the directory entry, which also means we can't flush our
1810          * ip.  Flag us for downward recursion once the parent's
1811          * connectivity is resolved.  Flag the parent for [re]flush or it
1812          * may not check for downward recursions.
1813          */
1814         if (good < 0) {
1815                 pip->flags |= HAMMER_INODE_REFLUSH;
1816                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1817                 return(good);
1818         }
1819
1820         /*
1821          * We are go, place the parent inode in a flushing state so we can
1822          * place its record in a flushing state.  Note that the parent
1823          * may already be flushing.  The record must be in the same flush
1824          * group as the parent.
1825          */
1826         if (pip->flush_state != HAMMER_FST_FLUSH)
1827                 hammer_flush_inode_core(pip, flg, HAMMER_FLUSH_RECURSION);
1828         KKASSERT(pip->flush_state == HAMMER_FST_FLUSH);
1829         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
1830
1831 #if 0
1832         if (record->type == HAMMER_MEM_RECORD_DEL &&
1833             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
1834                 /*
1835                  * Regardless of flushing state we cannot sync this path if the
1836                  * record represents a delete-on-disk but the target inode
1837                  * is not ready to sync its own deletion.
1838                  *
1839                  * XXX need to count effective nlinks to determine whether
1840                  * the flush is ok, otherwise removing a hardlink will
1841                  * just leave the DEL record to rot.
1842                  */
1843                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
1844                 return(-1);
1845         } else
1846 #endif
1847         if (pip->flush_group == flg) {
1848                 /*
1849                  * Because we have not calculated nlinks yet we can just
1850                  * set records to the flush state if the parent is in
1851                  * the same flush group as we are.
1852                  */
1853                 record->flush_state = HAMMER_FST_FLUSH;
1854                 record->flush_group = flg;
1855                 ++record->flush_group->refs;
1856                 hammer_ref(&record->lock);
1857
1858                 /*
1859                  * A general directory-add contributes to our visibility.
1860                  *
1861                  * Otherwise it is probably a directory-delete or 
1862                  * delete-on-disk record and does not contribute to our
1863                  * visibility (but we can still flush it).
1864                  */
1865                 if (record->type == HAMMER_MEM_RECORD_ADD)
1866                         return(1);
1867                 return(0);
1868         } else {
1869                 /*
1870                  * If the parent is not in our flush group we cannot
1871                  * flush this record yet, there is no visibility.
1872                  * We tell the parent to reflush and mark ourselves
1873                  * so the parent knows it should flush us too.
1874                  */
1875                 pip->flags |= HAMMER_INODE_REFLUSH;
1876                 record->target_ip->flags |= HAMMER_INODE_CONN_DOWN;
1877                 return(-1);
1878         }
1879 }
1880
1881 /*
1882  * This is the core routine placing an inode into the FST_FLUSH state.
1883  */
1884 static void
1885 hammer_flush_inode_core(hammer_inode_t ip, hammer_flush_group_t flg, int flags)
1886 {
1887         int go_count;
1888
1889         /*
1890          * Set flush state and prevent the flusher from cycling into
1891          * the next flush group.  Do not place the ip on the list yet.
1892          * Inodes not in the idle state get an extra reference.
1893          */
1894         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1895         if (ip->flush_state == HAMMER_FST_IDLE)
1896                 hammer_ref(&ip->lock);
1897         ip->flush_state = HAMMER_FST_FLUSH;
1898         ip->flush_group = flg;
1899         ++ip->hmp->flusher.group_lock;
1900         ++ip->hmp->count_iqueued;
1901         ++hammer_count_iqueued;
1902         ++flg->total_count;
1903
1904         /*
1905          * If the flush group reaches the autoflush limit we want to signal
1906          * the flusher.  This is particularly important for remove()s.
1907          */
1908         if (flg->total_count == hammer_autoflush)
1909                 flags |= HAMMER_FLUSH_SIGNAL;
1910
1911         /*
1912          * We need to be able to vfsync/truncate from the backend.
1913          */
1914         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
1915         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1916                 ip->flags |= HAMMER_INODE_VHELD;
1917                 vref(ip->vp);
1918         }
1919
1920         /*
1921          * Figure out how many in-memory records we can actually flush
1922          * (not including inode meta-data, buffers, etc).
1923          */
1924         KKASSERT((ip->flags & HAMMER_INODE_WOULDBLOCK) == 0);
1925         if (flags & HAMMER_FLUSH_RECURSION) {
1926                 /*
1927                  * If this is an upwards recursion we do not want to
1928                  * recurse down again!
1929                  */
1930                 go_count = 1;
1931 #if 0
1932         } else if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
1933                 /*
1934                  * No new records are added if we must complete a flush
1935                  * from a previous cycle, but we do have to move the records
1936                  * from the previous cycle to the current one.
1937                  */
1938 #if 0
1939                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1940                                    hammer_syncgrp_child_callback, NULL);
1941 #endif
1942                 go_count = 1;
1943 #endif
1944         } else {
1945                 /*
1946                  * Normal flush, scan records and bring them into the flush.
1947                  * Directory adds and deletes are usually skipped (they are
1948                  * grouped with the related inode rather than with the
1949                  * directory).
1950                  *
1951                  * go_count can be negative, which means the scan aborted
1952                  * due to the flush group being over-full and we should
1953                  * flush what we have.
1954                  */
1955                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1956                                    hammer_setup_child_callback, NULL);
1957         }
1958
1959         /*
1960          * This is a more involved test that includes go_count.  If we
1961          * can't flush, flag the inode and return.  If go_count is 0 we
1962          * are unable to flush any records in our rec_tree and
1963          * must ignore the XDIRTY flag.
1964          */
1965         if (go_count == 0) {
1966                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1967                         --ip->hmp->count_iqueued;
1968                         --hammer_count_iqueued;
1969
1970                         --flg->total_count;
1971                         ip->flush_state = HAMMER_FST_SETUP;
1972                         ip->flush_group = NULL;
1973                         if (ip->flags & HAMMER_INODE_VHELD) {
1974                                 ip->flags &= ~HAMMER_INODE_VHELD;
1975                                 vrele(ip->vp);
1976                         }
1977
1978                         /*
1979                          * REFLUSH is needed to trigger dependency wakeups
1980                          * when an inode is in SETUP.
1981                          */
1982                         ip->flags |= HAMMER_INODE_REFLUSH;
1983                         if (flags & HAMMER_FLUSH_SIGNAL) {
1984                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1985                                 hammer_flusher_async(ip->hmp, flg);
1986                         }
1987                         if (--ip->hmp->flusher.group_lock == 0)
1988                                 wakeup(&ip->hmp->flusher.group_lock);
1989                         return;
1990                 }
1991         }
1992
1993         /*
1994          * Snapshot the state of the inode for the backend flusher.
1995          *
1996          * We continue to retain save_trunc_off even when all truncations
1997          * have been resolved as an optimization to determine if we can
1998          * skip the B-Tree lookup for overwrite deletions.
1999          *
2000          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
2001          * and stays in ip->flags.  Once set, it stays set until the
2002          * inode is destroyed.
2003          */
2004         if (ip->flags & HAMMER_INODE_TRUNCATED) {
2005                 KKASSERT((ip->sync_flags & HAMMER_INODE_TRUNCATED) == 0);
2006                 ip->sync_trunc_off = ip->trunc_off;
2007                 ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
2008                 ip->flags &= ~HAMMER_INODE_TRUNCATED;
2009                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
2010
2011                 /*
2012                  * The save_trunc_off used to cache whether the B-Tree
2013                  * holds any records past that point is not used until
2014                  * after the truncation has succeeded, so we can safely
2015                  * set it now.
2016                  */
2017                 if (ip->save_trunc_off > ip->sync_trunc_off)
2018                         ip->save_trunc_off = ip->sync_trunc_off;
2019         }
2020         ip->sync_flags |= (ip->flags & HAMMER_INODE_MODMASK &
2021                            ~HAMMER_INODE_TRUNCATED);
2022         ip->sync_ino_leaf = ip->ino_leaf;
2023         ip->sync_ino_data = ip->ino_data;
2024         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
2025 #ifdef DEBUG_TRUNCATE
2026         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
2027                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
2028 #endif
2029
2030         /*
2031          * The flusher list inherits our inode and reference.
2032          */
2033         KKASSERT(flg->running == 0);
2034         RB_INSERT(hammer_fls_rb_tree, &flg->flush_tree, ip);
2035         if (--ip->hmp->flusher.group_lock == 0)
2036                 wakeup(&ip->hmp->flusher.group_lock);
2037
2038         if (flags & HAMMER_FLUSH_SIGNAL) {
2039                 hammer_flusher_async(ip->hmp, flg);
2040         }
2041 }
2042
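/*
 * Note added for clarity (not part of the original source), based on the
 * checks above: a negative go_count means the child scan aborted because
 * the flush group is over-full (flush what we have), zero means no
 * in-memory records could be brought into the flush (and if nothing other
 * than XDIRTY is dirty the inode is backed out to SETUP above), and a
 * positive value means at least one record accompanies the inode into the
 * flush group.
 */
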
2043 /*
2044  * Callback for scan of ip->rec_tree.  Try to include each record in our
2045  * flush.  ip->flush_group has been set but the inode has not yet been
2046  * moved into a flushing state.
2047  *
2048  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
2049  * both inodes.
2050  *
2051  * We return 1 for any record placed or found in FST_FLUSH, which prevents
2052  * the caller from shortcutting the flush.
2053  */
2054 static int
2055 hammer_setup_child_callback(hammer_record_t rec, void *data)
2056 {
2057         hammer_flush_group_t flg;
2058         hammer_inode_t target_ip;
2059         hammer_inode_t ip;
2060         int r;
2061
2062         /*
2063          * Records deleted or committed by the backend are ignored.
2064          * Note that the flush detects deleted frontend records at
2065          * multiple points to deal with races.  This is just the first
2066          * line of defense.  The only time HAMMER_RECF_DELETED_FE cannot
2067          * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it
2068          * messes up link-count calculations.
2069          *
2070          * NOTE: Don't get confused between record deletion and, say,
2071          * directory entry deletion.  The deletion of a directory entry
2072          * which is on-media has nothing to do with the record deletion
2073          * flags.
2074          */
2075         if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE |
2076                           HAMMER_RECF_COMMITTED)) {
2077                 if (rec->flush_state == HAMMER_FST_FLUSH) {
2078                         KKASSERT(rec->flush_group == rec->ip->flush_group);
2079                         r = 1;
2080                 } else {
2081                         r = 0;
2082                 }
2083                 return(r);
2084         }
2085
2086         /*
2087          * If the record is in an idle state it has no dependencies and
2088          * can be flushed.
2089          */
2090         ip = rec->ip;
2091         flg = ip->flush_group;
2092         r = 0;
2093
2094         switch(rec->flush_state) {
2095         case HAMMER_FST_IDLE:
2096                 /*
2097                  * The record has no setup dependency, we can flush it.
2098                  */
2099                 KKASSERT(rec->target_ip == NULL);
2100                 rec->flush_state = HAMMER_FST_FLUSH;
2101                 rec->flush_group = flg;
2102                 ++flg->refs;
2103                 hammer_ref(&rec->lock);
2104                 r = 1;
2105                 break;
2106         case HAMMER_FST_SETUP:
2107                 /*
2108                  * The record has a setup dependency.  These are typically
2109                  * directory entry adds and deletes.  Such entries will be
2110                  * flushed when their inodes are flushed so we do not
2111                  * usually have to add them to the flush here.  However,
2112                  * if the target_ip has set HAMMER_INODE_CONN_DOWN then
2113                  * it is asking us to flush this record (and it).
2114                  */
2115                 target_ip = rec->target_ip;
2116                 KKASSERT(target_ip != NULL);
2117                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
2118
2119                 /*
2120                  * If the target IP is already flushing in our group
2121                  * we could associate the record, but target_ip has
2122                  * already synced ino_data to sync_ino_data and we
2123                  * would also have to adjust nlinks.   Plus there are
2124                  * ordering issues for adds and deletes.
2125                  *
2126                  * Reflush downward if this is an ADD, and upward if
2127                  * this is a DEL.
2128                  */
2129                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
2130                         if (rec->type == HAMMER_MEM_RECORD_ADD)
2131                                 ip->flags |= HAMMER_INODE_REFLUSH;
2132                         else
2133                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
2134                         break;
2135                 } 
2136
2137                 /*
2138                  * Target IP is not yet flushing.  This can get complex
2139                  * because we have to be careful about the recursion.
2140                  *
2141                  * Directories create an issue for us in that if a flush
2142                  * of a directory is requested the expectation is to flush
2143                  * any pending directory entries, but this will cause the
2144                  * related inodes to recursively flush as well.  We can't
2145                  * really defer the operation, so just get as many as we
2146                  * can.
2147                  */
2148 #if 0
2149                 if ((target_ip->flags & HAMMER_INODE_RECLAIM) == 0 &&
2150                     (target_ip->flags & HAMMER_INODE_CONN_DOWN) == 0) {
2151                         /*
2152                          * We aren't reclaiming and the target ip was not
2153                          * previously prevented from flushing due to this
2154                          * record dependency.  Do not flush this record.
2155                          */
2156                         /*r = 0;*/
2157                 } else
2158 #endif
2159                 if (flg->total_count + flg->refs >
2160                            ip->hmp->undo_rec_limit) {
2161                         /*
2162                          * Our flush group is over-full and we risk blowing
2163                          * out the UNDO FIFO.  Stop the scan, flush what we
2164                          * have, then reflush the directory.
2165                          *
2166                          * The directory may be forced through multiple
2167                          * flush groups before it can be completely
2168                          * flushed.
2169                          */
2170                         ip->flags |= HAMMER_INODE_RESIGNAL |
2171                                      HAMMER_INODE_REFLUSH;
2172                         r = -1;
2173                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
2174                         /*
2175                          * If the target IP is not flushing we can force
2176                          * it to flush, even if it is unable to write out
2177                          * any of its own records we have at least one in
2178                          * hand that we CAN deal with.
2179                          */
2180                         rec->flush_state = HAMMER_FST_FLUSH;
2181                         rec->flush_group = flg;
2182                         ++flg->refs;
2183                         hammer_ref(&rec->lock);
2184                         hammer_flush_inode_core(target_ip, flg,
2185                                                 HAMMER_FLUSH_RECURSION);
2186                         r = 1;
2187                 } else {
2188                         /*
2189                          * General or delete-on-disk record.
2190                          *
2191                          * XXX this needs help.  If a delete-on-disk we could
2192                          * disconnect the target.  If the target has its own
2193                          * dependencies they really need to be flushed.
2194                          *
2195                          * XXX
2196                          */
2197                         rec->flush_state = HAMMER_FST_FLUSH;
2198                         rec->flush_group = flg;
2199                         ++flg->refs;
2200                         hammer_ref(&rec->lock);
2201                         hammer_flush_inode_core(target_ip, flg,
2202                                                 HAMMER_FLUSH_RECURSION);
2203                         r = 1;
2204                 }
2205                 break;
2206         case HAMMER_FST_FLUSH:
2207                 /* 
2208                  * The flush_group should already match.
2209                  */
2210                 KKASSERT(rec->flush_group == flg);
2211                 r = 1;
2212                 break;
2213         }
2214         return(r);
2215 }
2216
2217 #if 0
2218 /*
2219  * This version just moves records already in a flush state to the new
2220  * flush group and that is it.
2221  */
2222 static int
2223 hammer_syncgrp_child_callback(hammer_record_t rec, void *data)
2224 {
2225         hammer_inode_t ip = rec->ip;
2226
2227         switch(rec->flush_state) {
2228         case HAMMER_FST_FLUSH:
2229                 KKASSERT(rec->flush_group == ip->flush_group);
2230                 break;
2231         default:
2232                 break;
2233         }
2234         return(0);
2235 }
2236 #endif
2237
2238 /*
2239  * Wait for a previously queued flush to complete.
2240  *
2241  * If a critical error occurred we don't try to wait.
2242  */
2243 void
2244 hammer_wait_inode(hammer_inode_t ip)
2245 {
2246         hammer_flush_group_t flg;
2247
2248         flg = NULL;
2249         if ((ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2250                 while (ip->flush_state != HAMMER_FST_IDLE &&
2251                        (ip->hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR) == 0) {
2252                         if (ip->flush_state == HAMMER_FST_SETUP)
2253                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2254                         if (ip->flush_state != HAMMER_FST_IDLE) {
2255                                 ip->flags |= HAMMER_INODE_FLUSHW;
2256                                 tsleep(&ip->flags, 0, "hmrwin", 0);
2257                         }
2258                 }
2259         }
2260 }
2261
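/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * fsync-style caller queues a signalled flush and then waits for the
 * inode to return to the IDLE state:
 *
 *	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
 *	hammer_wait_inode(ip);
 */
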
2262 /*
2263  * Called by the backend code when a flush has been completed.
2264  * The inode has already been removed from the flush list.
2265  *
2266  * A pipelined flush can occur, in which case we must re-enter the
2267  * inode on the list and re-copy its fields.
2268  */
2269 void
2270 hammer_flush_inode_done(hammer_inode_t ip, int error)
2271 {
2272         hammer_mount_t hmp;
2273         int dorel;
2274
2275         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
2276
2277         hmp = ip->hmp;
2278
2279         /*
2280          * Auto-reflush if the backend could not completely flush
2281          * the inode.  This fixes a case where a deferred buffer flush
2282          * could cause fsync to return early.
2283          */
2284         if (ip->sync_flags & HAMMER_INODE_MODMASK)
2285                 ip->flags |= HAMMER_INODE_REFLUSH;
2286
2287         /*
2288          * Merge left-over flags back into the frontend and fix the state.
2289          * Incomplete truncations are retained by the backend.
2290          */
2291         ip->error = error;
2292         ip->flags |= ip->sync_flags & ~HAMMER_INODE_TRUNCATED;
2293         ip->sync_flags &= HAMMER_INODE_TRUNCATED;
2294
2295         /*
2296          * The backend may have adjusted nlinks, so if the adjusted nlinks
2297  * does not match the frontend, set the frontend's DDIRTY flag again.
2298          */
2299         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
2300                 ip->flags |= HAMMER_INODE_DDIRTY;
2301
2302         /*
2303          * Fix up the dirty buffer status.
2304          */
2305         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
2306                 ip->flags |= HAMMER_INODE_BUFS;
2307         }
2308
2309         /*
2310          * Re-set the XDIRTY flag if some of the inode's in-memory records
2311          * could not be flushed.
2312          */
2313         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
2314                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
2315                  (!RB_EMPTY(&ip->rec_tree) &&
2316                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
2317
2318         /*
2319          * Do not lose track of inodes which no longer have vnode
2320  * associations, otherwise they may never get flushed again.
2321          *
2322          * The reflush flag can be set superfluously, causing extra pain
2323          * for no reason.  If the inode is no longer modified it no longer
2324          * needs to be flushed.
2325          */
2326         if (ip->flags & HAMMER_INODE_MODMASK) {
2327                 if (ip->vp == NULL)
2328                         ip->flags |= HAMMER_INODE_REFLUSH;
2329         } else {
2330                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2331         }
2332
2333         /*
2334          * Adjust the flush state.
2335          */
2336         if (ip->flags & HAMMER_INODE_WOULDBLOCK) {
2337                 /*
2338                  * We were unable to flush out all our records, leave the
2339                  * inode in a flush state and in the current flush group.
2340                  * The flush group will be re-run.
2341                  *
2342                  * This occurs if the UNDO block gets too full or there is
2343                  * too much dirty meta-data and allows the flusher to
2344                  * finalize the UNDO block and then re-flush.
2345                  */
2346                 ip->flags &= ~HAMMER_INODE_WOULDBLOCK;
2347                 dorel = 0;
2348         } else {
2349                 /*
2350                  * Remove from the flush_group
2351                  */
2352                 RB_REMOVE(hammer_fls_rb_tree, &ip->flush_group->flush_tree, ip);
2353                 ip->flush_group = NULL;
2354
2355                 /*
2356                  * Clean up the vnode ref and tracking counts.
2357                  */
2358                 if (ip->flags & HAMMER_INODE_VHELD) {
2359                         ip->flags &= ~HAMMER_INODE_VHELD;
2360                         vrele(ip->vp);
2361                 }
2362                 --hmp->count_iqueued;
2363                 --hammer_count_iqueued;
2364
2365                 /*
2366                  * And adjust the state.
2367                  */
2368                 if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
2369                         ip->flush_state = HAMMER_FST_IDLE;
2370                         dorel = 1;
2371                 } else {
2372                         ip->flush_state = HAMMER_FST_SETUP;
2373                         dorel = 0;
2374                 }
2375
2376                 /*
2377                  * If the frontend is waiting for a flush to complete,
2378                  * wake it up.
2379                  */
2380                 if (ip->flags & HAMMER_INODE_FLUSHW) {
2381                         ip->flags &= ~HAMMER_INODE_FLUSHW;
2382                         wakeup(&ip->flags);
2383                 }
2384
2385                 /*
2386                  * If the frontend made more changes and requested another
2387                  * flush, then try to get it running.
2388                  *
2389                  * Reflushes are aborted when the inode is errored out.
2390                  */
2391                 if (ip->flags & HAMMER_INODE_REFLUSH) {
2392                         ip->flags &= ~HAMMER_INODE_REFLUSH;
2393                         if (ip->flags & HAMMER_INODE_RESIGNAL) {
2394                                 ip->flags &= ~HAMMER_INODE_RESIGNAL;
2395                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2396                         } else {
2397                                 hammer_flush_inode(ip, 0);
2398                         }
2399                 }
2400         }
2401
2402         /*
2403          * If we have no parent dependencies we can clear CONN_DOWN
2404          */
2405         if (TAILQ_EMPTY(&ip->target_list))
2406                 ip->flags &= ~HAMMER_INODE_CONN_DOWN;
2407
2408         /*
2409          * If the inode is now clean drop the space reservation.
2410          */
2411         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2412             (ip->flags & HAMMER_INODE_RSV_INODES)) {
2413                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
2414                 --hmp->rsv_inodes;
2415         }
2416
2417         if (dorel)
2418                 hammer_rel_inode(ip, 0);
2419 }
2420
2421 /*
2422  * Called from hammer_sync_inode() to synchronize in-memory records
2423  * to the media.
2424  */
2425 static int
2426 hammer_sync_record_callback(hammer_record_t record, void *data)
2427 {
2428         hammer_cursor_t cursor = data;
2429         hammer_transaction_t trans = cursor->trans;
2430         hammer_mount_t hmp = trans->hmp;
2431         int error;
2432
2433         /*
2434          * Skip records that do not belong to the current flush.
2435          */
2436         ++hammer_stats_record_iterations;
2437         if (record->flush_state != HAMMER_FST_FLUSH)
2438                 return(0);
2439
2440 #if 1
2441         if (record->flush_group != record->ip->flush_group) {
2442                 kprintf("sync_record %p ip %p bad flush group %p %p\n", record, record->ip, record->flush_group, record->ip->flush_group);
2443                 if (hammer_debug_critical)
2444                         Debugger("blah2");
2445                 return(0);
2446         }
2447 #endif
2448         KKASSERT(record->flush_group == record->ip->flush_group);
2449
2450         /*
2451          * Interlock the record using the BE flag.  Once BE is set the
2452          * frontend cannot change the state of FE.
2453          *
2454          * NOTE: If FE is set prior to us setting BE we still sync the
2455          * record out, but the flush completion code converts it to 
2456          * a delete-on-disk record instead of destroying it.
2457          */
2458         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
2459         record->flags |= HAMMER_RECF_INTERLOCK_BE;
2460
2461         /*
2462          * The backend has already disposed of the record.
2463          */
2464         if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) {
2465                 error = 0;
2466                 goto done;
2467         }
2468
2469         /*
2470          * If the whole inode is being deleted, all on-disk records will
2471          * be deleted very soon; we can't sync any new records to disk
2472          * because they will be deleted in the same transaction they were
2473          * created in (delete_tid == create_tid), which will assert.
2474          *
2475          * XXX There may be a case with RECORD_ADD with DELETED_FE set
2476          * that we currently panic on.
2477          */
2478         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
2479                 switch(record->type) {
2480                 case HAMMER_MEM_RECORD_DATA:
2481                         /*
2482                          * We don't have to do anything, if the record was
2483                          * committed the space will have been accounted for
2484                          * in the blockmap.
2485                          */
2486                         /* fall through */
2487                 case HAMMER_MEM_RECORD_GENERAL:
2488                         /*
2489                          * Set deleted-by-backend flag.  Do not set the
2490                          * backend committed flag, because we are throwing
2491                          * the record away.
2492                          */
2493                         record->flags |= HAMMER_RECF_DELETED_BE;
2494                         ++record->ip->rec_generation;
2495                         error = 0;
2496                         goto done;
2497                 case HAMMER_MEM_RECORD_ADD:
2498                         panic("hammer_sync_record_callback: illegal add "
2499                               "during inode deletion record %p", record);
2500                         break; /* NOT REACHED */
2501                 case HAMMER_MEM_RECORD_INODE:
2502                         panic("hammer_sync_record_callback: attempt to "
2503                               "sync inode record %p?", record);
2504                         break; /* NOT REACHED */
2505                 case HAMMER_MEM_RECORD_DEL:
2506                         /* 
2507                          * Follow through and issue the on-disk deletion
2508                          */
2509                         break;
2510                 }
2511         }
2512
2513         /*
2514          * If DELETED_FE is set special handling is needed for directory
2515          * entries.  Dependent pieces related to the directory entry may
2516          * have already been synced to disk.  If this occurs we have to
2517          * sync the directory entry and then change the in-memory record
2518          * from an ADD to a DELETE to cover the fact that it's been
2519          * deleted by the frontend.
2520          *
2521          * A directory delete covering record (MEM_RECORD_DEL) can never
2522          * be deleted by the frontend.
2523          *
2524          * Any other record type (aka DATA) can be deleted by the frontend.
2525          * XXX At the moment the flusher must skip it because there may
2526          * be another data record in the flush group for the same block,
2527          * meaning that some frontend data changes can leak into the backend's
2528          * synchronization point.
2529          */
2530         if (record->flags & HAMMER_RECF_DELETED_FE) {
2531                 if (record->type == HAMMER_MEM_RECORD_ADD) {
2532                         /*
2533                          * Convert a front-end deleted directory-add to
2534                          * a directory-delete entry later.
2535                          */
2536                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
2537                 } else {
2538                         /*
2539                          * Dispose of the record (race case).  Mark as
2540                          * deleted by backend (and not committed).
2541                          */
2542                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
2543                         record->flags |= HAMMER_RECF_DELETED_BE;
2544                         ++record->ip->rec_generation;
2545                         error = 0;
2546                         goto done;
2547                 }
2548         }
2549
2550         /*
2551          * Assign the create_tid for new records.  Deletions already
2552          * have the record's entire key properly set up.
2553          */
2554         if (record->type != HAMMER_MEM_RECORD_DEL) {
2555                 record->leaf.base.create_tid = trans->tid;
2556                 record->leaf.create_ts = trans->time32;
2557         }
2558         for (;;) {
2559                 error = hammer_ip_sync_record_cursor(cursor, record);
2560                 if (error != EDEADLK)
2561                         break;
2562                 hammer_done_cursor(cursor);
2563                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
2564                                            record->ip);
2565                 if (error)
2566                         break;
2567         }
2568         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
2569
2570         if (error)
2571                 error = -error;
2572 done:
2573         hammer_flush_record_done(record, error);
2574
2575         /*
2576          * Do partial finalization if we have built up too many dirty
2577          * buffers.  Otherwise a buffer cache deadlock can occur when
2578          * doing things like creating tens of thousands of tiny files.
2579          *
2580          * We must release our cursor lock to avoid a 3-way deadlock
2581          * due to the exclusive sync lock the finalizer must get.
2582          *
2583          * WARNING: See warnings in hammer_unlock_cursor() function.
2584          */
2585         if (hammer_flusher_meta_limit(hmp)) {
2586                 hammer_unlock_cursor(cursor);
2587                 hammer_flusher_finalize(trans, 0);
2588                 hammer_lock_cursor(cursor);
2589         }
2590
2591         return(error);
2592 }
2593
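/*
 * Sketch of the frontend/backend record interlock described above, added
 * for clarity and not part of the original source:
 *
 *	backend:   record->flags |= HAMMER_RECF_INTERLOCK_BE;
 *	           ... sync the record to media ...
 *	frontend:  a HAMMER_RECF_DELETED_FE set before the interlock is
 *	           honored at flush completion by converting the record to
 *	           a delete-on-disk entry rather than destroying it.
 */
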
2594 /*
2595  * Backend function called by the flusher to sync an inode to media.
2596  */
2597 int
2598 hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip)
2599 {
2600         struct hammer_cursor cursor;
2601         hammer_node_t tmp_node;
2602         hammer_record_t depend;
2603         hammer_record_t next;
2604         int error, tmp_error;
2605         u_int64_t nlinks;
2606
2607         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
2608                 return(0);
2609
2610         error = hammer_init_cursor(trans, &cursor, &ip->cache[1], ip);
2611         if (error)
2612                 goto done;
2613
2614         /*
2615          * Any directory records referencing this inode which are not in
2616          * our current flush group must adjust our nlink count for the
2617          * purposes of synchronization to disk.
2618          *
2619          * Records which are in our flush group can be unlinked from our
2620          * inode now, potentially allowing the inode to be physically
2621          * deleted.
2622          *
2623          * This cannot block.
2624          */
2625         nlinks = ip->ino_data.nlinks;
2626         next = TAILQ_FIRST(&ip->target_list);
2627         while ((depend = next) != NULL) {
2628                 next = TAILQ_NEXT(depend, target_entry);
2629                 if (depend->flush_state == HAMMER_FST_FLUSH &&
2630                     depend->flush_group == ip->flush_group) {
2631                         /*
2632                          * If this is an ADD that was deleted by the frontend
2633                          * the frontend nlinks count will have already been
2634                          * decremented, but the backend is going to sync its
2635                          * directory entry and must account for it.  The
2636                          * record will be converted to a delete-on-disk when
2637                          * it gets synced.
2638                          *
2639                          * If the ADD was not deleted by the frontend we
2640                          * can remove the dependency from our target_list.
2641                          */
2642                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
2643                                 ++nlinks;
2644                         } else {
2645                                 TAILQ_REMOVE(&ip->target_list, depend,
2646                                              target_entry);
2647                                 depend->target_ip = NULL;
2648                         }
2649                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
2650                         /*
2651                          * Not part of our flush group and not deleted by
2652                          * the front-end, adjust the link count synced to
2653                          * the media (undo what the frontend did when it
2654                          * queued the record).
2655                          */
2656                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
2657                         switch(depend->type) {
2658                         case HAMMER_MEM_RECORD_ADD:
2659                                 --nlinks;
2660                                 break;
2661                         case HAMMER_MEM_RECORD_DEL:
2662                                 ++nlinks;
2663                                 break;
2664                         default:
2665                                 break;
2666                         }
2667                 }
2668         }
2669
2670         /*
2671          * Set dirty if we had to modify the link count.
2672          */
2673         if (ip->sync_ino_data.nlinks != nlinks) {
2674                 KKASSERT((int64_t)nlinks >= 0);
2675                 ip->sync_ino_data.nlinks = nlinks;
2676                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2677         }
2678
2679         /*
2680          * If there is a truncation queued, destroy any data past the (aligned)
2681          * truncation point.  Userland will have dealt with the buffer
2682          * containing the truncation point for us.
2683          *
2684          * We don't flush pending frontend data buffers until after we've
2685          * dealt with the truncation.
2686          */
2687         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2688                 /*
2689                  * Interlock trunc_off.  The VOP front-end may continue to
2690                  * make adjustments to it while we are blocked.
2691                  */
2692                 off_t trunc_off;
2693                 off_t aligned_trunc_off;
2694                 int blkmask;
2695
2696                 trunc_off = ip->sync_trunc_off;
2697                 blkmask = hammer_blocksize(trunc_off) - 1;
2698                 aligned_trunc_off = (trunc_off + blkmask) & ~(int64_t)blkmask;
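                     /*
                      * Alignment sketch (the block size is illustrative
                      * only): with a 16KiB block, blkmask == 0x3fff, so a
                      * trunc_off of 0x4100 yields
                      *   (0x4100 + 0x3fff) & ~0x3fff == 0x8000,
                      * i.e. only whole blocks at or beyond 0x8000 are
                      * deleted below; the partially truncated block was
                      * already handled by the frontend.
                      */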
2699
2700                 /*
2701                  * Delete any whole blocks on-media.  The front-end has
2702                  * already cleaned out any partial block and made it
2703                  * pending.  The front-end may have updated trunc_off
2704                  * while we were blocked so we only use sync_trunc_off.
2705                  *
2706                  * This operation can blow out the buffer cache; EWOULDBLOCK
2707                  * means we were unable to complete the deletion.  The
2708                  * deletion will update sync_trunc_off in that case.
2709                  */
2710                 error = hammer_ip_delete_range(&cursor, ip,
2711                                                 aligned_trunc_off,
2712                                                 0x7FFFFFFFFFFFFFFFLL, 2);
2713                 if (error == EWOULDBLOCK) {
2714                         ip->flags |= HAMMER_INODE_WOULDBLOCK;
2715                         error = 0;
2716                         goto defer_buffer_flush;
2717                 }
2718
2719                 if (error)
2720                         goto done;
2721
2722                 /*
2723                  * Clear the truncation flag on the backend after we have
2724                  * completed the deletions.  Backend data is now good again
2725                  * (including new records we are about to sync, below).
2726                  *
2727                  * Leave sync_trunc_off intact.  As we write additional
2728                  * records the backend will update sync_trunc_off.  This
2729                  * tells the backend whether it can skip the overwrite
2730                  * test.  This should work properly even when the backend
2731                  * writes full blocks where the truncation point straddles
2732                  * the block because the comparison is against the base
2733                  * offset of the record.
2734                  */
2735                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2736                 /* ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL; */
2737         } else {
2738                 error = 0;
2739         }
2740
2741         /*
2742          * Now sync related records.  These will typically be directory
2743          * entries, records tracking direct-writes, or delete-on-disk records.
2744          */
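             /*
              * hammer_sync_record_callback() aborts the RB_SCAN by
              * returning a negated errno, so a negative scan result is
              * flipped back to a positive error code here.
              */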
2745         if (error == 0) {
2746                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
2747                                     hammer_sync_record_callback, &cursor);
2748                 if (tmp_error < 0)
2749                         tmp_error = -tmp_error;
2750                 if (tmp_error)
2751                         error = tmp_error;
2752         }
2753         hammer_cache_node(&ip->cache[1], cursor.node);
2754
2755         /*
2756          * Re-seek for inode update, assuming our cache hasn't been ripped
2757          * out from under us.
2758          */
2759         if (error == 0) {
2760                 tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
2761                 if (tmp_node) {
2762                         hammer_cursor_downgrade(&cursor);
2763                         hammer_lock_sh(&tmp_node->lock);
2764                         if ((tmp_node->flags & HAMMER_NODE_DELETED) == 0)
2765                                 hammer_cursor_seek(&cursor, tmp_node, 0);
2766                         hammer_unlock(&tmp_node->lock);
2767                         hammer_rel_node(tmp_node);
2768                 }
2769                 error = 0;
2770         }
2771
2772         /*
2773          * If we are deleting the inode the frontend had better not have
2774          * any active references on elements making up the inode.
2775          *
2776          * The call to hammer_ip_delete_clean() cleans up auxiliary records
2777          * but not DB or DATA records.  Those must have already been deleted
2778          * by the normal truncation mechanic.
2779          */
2780         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
2781             RB_EMPTY(&ip->rec_tree) &&
2782             (ip->sync_flags & HAMMER_INODE_DELETING) &&
2783             (ip->flags & HAMMER_INODE_DELETED) == 0) {
2784                 int count1 = 0;
2785
2786                 error = hammer_ip_delete_clean(&cursor, ip, &count1);
2787                 if (error == 0) {
2788                         ip->flags |= HAMMER_INODE_DELETED;
2789                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
2790                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
2791                         KKASSERT(RB_EMPTY(&ip->rec_tree));
2792
2793                         /*
2794                          * Set delete_tid in both the frontend and backend
2795                          * copy of the inode record.  The DELETED flag handles
2796                          * this, do not set RDIRTY.
2797                          */
2798                         ip->ino_leaf.base.delete_tid = trans->tid;
2799                         ip->sync_ino_leaf.base.delete_tid = trans->tid;
2800                         ip->ino_leaf.delete_ts = trans->time32;
2801                         ip->sync_ino_leaf.delete_ts = trans->time32;
2802
2803
2804                         /*
2805                          * Adjust the inode count in the volume header
2806                          */
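                             /*
                              * The count is only adjusted for inodes that
                              * made it to media (ONDISK); an inode created
                              * and deleted entirely in memory never
                              * contributed to vol0_stat_inodes in the
                              * first place.
                              */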
2807                         hammer_sync_lock_sh(trans);
2808                         if (ip->flags & HAMMER_INODE_ONDISK) {
2809                                 hammer_modify_volume_field(trans,
2810                                                            trans->rootvol,
2811                                                            vol0_stat_inodes);
2812                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
2813                                 hammer_modify_volume_done(trans->rootvol);
2814                         }
2815                         hammer_sync_unlock(trans);
2816                 }
2817         }
2818
2819         if (error)
2820                 goto done;
2821         ip->sync_flags &= ~HAMMER_INODE_BUFS;
2822
2823 defer_buffer_flush:
2824         /*
2825          * Now update the inode's on-disk inode-data and/or on-disk record.
2826          * DELETED and ONDISK are managed only in ip->flags.
2827          *
2828          * In the case of a deferred buffer flush we still update the on-disk
2829          * inode to satisfy visibility requirements if there happen to be
2830          * directory dependencies.
2831          */
2832         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
2833         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
2834                 /*
2835                  * If deleted and on-disk, don't set any additional flags.
2836                  * The delete flag takes care of things.
2837                  *
2838                  * Clear flags which may have been set by the frontend.
2839                  */
2840                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2841                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2842                                     HAMMER_INODE_DELETING);
2843                 break;
2844         case HAMMER_INODE_DELETED:
2845                 /*
2846                  * Take care of the case where a deleted inode was never
2847                  * flushed to the disk in the first place.
2848                  *
2849                  * Clear flags which may have been set by the frontend.
2850                  */
2851                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
2852                                     HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
2853                                     HAMMER_INODE_DELETING);
2854                 while (RB_ROOT(&ip->rec_tree)) {
2855                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
2856                         hammer_ref(&record->lock);
2857                         KKASSERT(record->lock.refs == 1);
2858                         record->flags |= HAMMER_RECF_DELETED_BE;
2859                         ++record->ip->rec_generation;
2860                         hammer_rel_mem_record(record);
2861                 }
2862                 break;
2863         case HAMMER_INODE_ONDISK:
2864                 /*
2865                  * If already on-disk, do not set any additional flags.
2866                  */
2867                 break;
2868         default:
2869                 /*
2870                  * If not on-disk and not deleted, set DDIRTY to force
2871                  * an initial record to be written.
2872                  *
2873                  * Also set the create_tid in both the frontend and backend
2874                  * copy of the inode record.
2875                  */
2876                 ip->ino_leaf.base.create_tid = trans->tid;
2877                 ip->ino_leaf.create_ts = trans->time32;
2878                 ip->sync_ino_leaf.base.create_tid = trans->tid;
2879                 ip->sync_ino_leaf.create_ts = trans->time32;
2880                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
2881                 break;
2882         }
2883
2884         /*
2885          * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
2886          * is already on-disk the old record is marked as deleted.
2887          *
2888          * If DELETED is set hammer_update_inode() will delete the existing
2889          * record without writing out a new one.
2890          *
2891          * If *ONLY* the ITIMES flag is set we can update the record in-place.
2892          */
2893         if (ip->flags & HAMMER_INODE_DELETED) {
2894                 error = hammer_update_inode(&cursor, ip);
2895         } else 
2896         if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
2897             (ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
2898                 error = hammer_update_itimes(&cursor, ip);
2899         } else
2900         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
2901                 error = hammer_update_inode(&cursor, ip);
2902         }
2903 done:
2904         if (error) {
2905                 hammer_critical_error(ip->hmp, ip, error,
2906                                       "while syncing inode");
2907         }
2908         hammer_done_cursor(&cursor);
2909         return(error);
2910 }
2911
2912 /*
2913  * This routine is called when the OS is no longer actively referencing
2914  * the inode (but might still be keeping it cached), or when releasing
2915  * the last reference to an inode.
2916  *
2917  * At this point if the inode's nlinks count is zero we want to destroy
2918  * it, which may mean destroying it on-media too.
2919  */
2920 void
2921 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
2922 {
2923         struct vnode *vp;
2924
2925         /*
2926          * Set the DELETING flag when the link count drops to 0 and the
2927          * OS no longer has any opens on the inode.
2928          *
2929          * The backend will clear DELETING (a mod flag) and set DELETED
2930          * (a state flag) when it is actually able to perform the
2931          * operation.
2932          *
2933          * Don't reflag the deletion if the flusher is currently syncing
2934          * one that was already flagged.  A previously set DELETING flag
2935          * may move back and forth between ip->flags and ip->sync_flags
2936          * until the operation is completely done.
2937          */
2938         if (ip->ino_data.nlinks == 0 &&
2939             ((ip->flags | ip->sync_flags) & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
2940                 ip->flags |= HAMMER_INODE_DELETING;
2941                 ip->flags |= HAMMER_INODE_TRUNCATED;
2942                 ip->trunc_off = 0;
2943                 vp = NULL;
2944                 if (getvp) {
2945                         if (hammer_get_vnode(ip, &vp) != 0)
2946                                 return;
2947                 }
2948
2949                 /*
2950                  * Final cleanup
2951                  */
2952                 if (ip->vp) {
2953                         vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
2954                         vnode_pager_setsize(ip->vp, 0);
2955                 }
2956                 if (getvp) {
2957                         vput(vp);
2958                 }
2959         }
2960 }
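     /*
      * Rough usage sketch for hammer_inode_unloadable_check() (hypothetical
      * caller, not part of this file): the VOP inactive path would do
      * something along the lines of
      *
      *      hammer_inode_unloadable_check(ip, 0);
      *      if (ip->flags & HAMMER_INODE_MODMASK)
      *              hammer_flush_inode(ip, 0);
      *
      * i.e. flag the deletion/truncation first, then kick off a flush so
      * the backend can actually destroy the media records.
      */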
2961
2962 /*
2963  * After potentially resolving a dependency the inode is tested
2964  * to determine whether it needs to be reflushed.
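      * If the RESIGNAL flag is also set the reflush is issued with
      * HAMMER_FLUSH_SIGNAL so the flusher is woken immediately instead of
      * waiting for the next flush cycle.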
2965  */
2966 void
2967 hammer_test_inode(hammer_inode_t ip)
2968 {
2969         if (ip->flags & HAMMER_INODE_REFLUSH) {
2970                 ip->flags &= ~HAMMER_INODE_REFLUSH;
2971                 hammer_ref(&ip->lock);
2972                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
2973                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
2974                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2975                 } else {
2976                         hammer_flush_inode(ip, 0);
2977                 }
2978                 hammer_rel_inode(ip, 0);
2979         }
2980 }
2981
2982 /*
2983  * Clear the RECLAIM flag on an inode.  This occurs when the inode is
2984  * reassociated with a vp or just before it gets freed.
2985  *
2986  * Pipeline wakeups to threads blocked due to an excessive number of
2987  * detached inodes.  This typically occurs when atime updates accumulate
2988  * while scanning a directory tree.
2989  */
2990 static void
2991 hammer_inode_wakereclaims(hammer_inode_t ip)
2992 {
2993         struct hammer_reclaim *reclaim;
2994         hammer_mount_t hmp = ip->hmp;
2995
2996         if ((ip->flags & HAMMER_INODE_RECLAIM) == 0)
2997                 return;
2998
2999         --hammer_count_reclaiming;
3000         --hmp->inode_reclaims;
3001         ip->flags &= ~HAMMER_INODE_RECLAIM;
3002
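             /*
              * Wake pipelined waiters: while the backlog is still above
              * half the limit each reclaimed inode satisfies at most one
              * waiter; once it drops below that point the remaining
              * waiters are drained as well.
              */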
3003         while ((reclaim = TAILQ_FIRST(&hmp->reclaim_list)) != NULL) {
3004                 if (reclaim->count > 0 && --reclaim->count == 0) {
3005                         TAILQ_REMOVE(&hmp->reclaim_list, reclaim, entry);
3006                         wakeup(reclaim);
3007                 }
3008                 if (hmp->inode_reclaims > hammer_limit_reclaim / 2)
3009                         break;
3010         }
3011 }
3012
3013 /*
3014  * Set up our reclaim pipeline.  We only let so many detached (and dirty)
3015  * inodes build up before we start blocking.  This routine is called
3016  * if a new inode is created or an inode is loaded from media.
3017  *
3018  * When we block we don't care *which* inode has finished reclaiming,
3019  * as long as one does.
3020  */
3021 void
3022 hammer_inode_waitreclaims(hammer_mount_t hmp)
3023 {
3024         struct hammer_reclaim reclaim;
3025
3026         if (hmp->inode_reclaims < hammer_limit_reclaim)
3027                 return;
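             /*
              * Enqueue a single-count waiter and sleep until an inode has
              * been reclaimed or roughly one second (hz ticks) has passed,
              * whichever comes first; on timeout we dequeue ourselves.
              */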
3028         reclaim.count = 1;
3029         TAILQ_INSERT_TAIL(&hmp->reclaim_list, &reclaim, entry);
3030         tsleep(&reclaim, 0, "hmrrcm", hz);
3031         if (reclaim.count > 0)
3032                 TAILQ_REMOVE(&hmp->reclaim_list, &reclaim, entry);
3033 }
3034
3035 #if 0
3036
3037 /*
3038  * XXX not used, doesn't work very well due to the large batching nature
3039  * of flushes.
3040  *
3041  * When a larger-than-normal backlog of inodes is sitting in the flusher,
3042  * enforce a general slowdown to let it catch up.  This routine is only
3043  * called on completion of a non-flusher-related transaction which
3044  * performed B-Tree node I/O.
3045  *
3046  * It is possible for the flusher to stall under a continuous load.
3047  * blogbench -i1000 -o seems to do a good job generating this sort of load.
3048  * If the flusher is unable to catch up, the inode count can bloat until
3049  * we run out of kvm.
3050  *
3051  * This is a bit of a hack.
3052  */
3053 void
3054 hammer_inode_waithard(hammer_mount_t hmp)
3055 {
3056         /*
3057          * Hysteresis.
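              * Enter the slowdown only when both the reclaim backlog and
              * the queued-inode count are high; leave it only once both
              * have dropped well below those thresholds so the mode does
              * not flap right at the limit.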
3058          */
3059         if (hmp->flags & HAMMER_MOUNT_FLUSH_RECOVERY) {
3060                 if (hmp->inode_reclaims < hammer_limit_reclaim / 2 &&
3061                     hmp->count_iqueued < hmp->count_inodes / 20) {
3062                         hmp->flags &= ~HAMMER_MOUNT_FLUSH_RECOVERY;
3063                         return;
3064                 }
3065         } else {
3066                 if (hmp->inode_reclaims < hammer_limit_reclaim ||
3067                     hmp->count_iqueued < hmp->count_inodes / 10) {
3068                         return;
3069                 }
3070                 hmp->flags |= HAMMER_MOUNT_FLUSH_RECOVERY;
3071         }
3072
3073         /*
3074          * Block for one flush cycle.
3075          */
3076         hammer_flusher_wait_next(hmp);
3077 }
3078
3079 #endif