HAMMER 53C/Many: Stabilization
[dragonfly.git] / sys/vfs/hammer/hammer_inode.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.67 2008/06/09 04:19:10 dillon Exp $
35  */
36
37 #include "hammer.h"
38 #include <vm/vm_extern.h>
39 #include <sys/buf.h>
40 #include <sys/buf2.h>
41
42 static int hammer_unload_inode(struct hammer_inode *ip);
43 static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
44 static int hammer_setup_child_callback(hammer_record_t rec, void *data);
45 static int hammer_setup_parent_inodes(hammer_record_t record);
46
47 #ifdef DEBUG_TRUNCATE
48 extern struct hammer_inode *HammerTruncIp;
49 #endif
50
51 /*
52  * The kernel is not actively referencing this vnode but is still holding
53  * it cached.
54  *
55  * This is called from the frontend.
56  */
57 int
58 hammer_vop_inactive(struct vop_inactive_args *ap)
59 {
60         struct hammer_inode *ip = VTOI(ap->a_vp);
61
62         /*
63          * Degenerate case
64          */
65         if (ip == NULL) {
66                 vrecycle(ap->a_vp);
67                 return(0);
68         }
69
70         /*
71          * If the inode no longer has visibility in the filesystem and is
72          * fairly clean, try to recycle it immediately.  This can deadlock
73          * in vfsync() if we aren't careful.
74          * 
75          * Do not queue the inode to the flusher if we still have visibility,
76          * otherwise namespace calls such as chmod will unnecessarily generate
77          * multiple inode updates.
78          */
79         hammer_inode_unloadable_check(ip, 0);
80         if (ip->ino_data.nlinks == 0) {
81                 if (ip->flags & HAMMER_INODE_MODMASK)
82                         hammer_flush_inode(ip, 0);
83                 else
84                         vrecycle(ap->a_vp);
85         }
86         return(0);
87 }
88
89 /*
90  * Release the vnode association.  This is typically (but not always)
91  * the last reference on the inode.
92  *
93  * Once the association is lost we are on our own with regards to
94  * flushing the inode.
95  */
96 int
97 hammer_vop_reclaim(struct vop_reclaim_args *ap)
98 {
99         struct hammer_inode *ip;
100         struct vnode *vp;
101
102         vp = ap->a_vp;
103
104         if ((ip = vp->v_data) != NULL) {
105                 vp->v_data = NULL;
106                 ip->vp = NULL;
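                    /*
                     * Account for the inode in the reclaim counters.  They
                     * are decremented again when a new vnode is attached
                     * (hammer_get_vnode()) or when the inode is finally
                     * unloaded (hammer_unload_inode()).
                     */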
107                 if ((ip->flags & HAMMER_INODE_RECLAIM) == 0) {
108                         ++hammer_count_reclaiming;
109                         ++ip->hmp->inode_reclaims;
110                         ip->flags |= HAMMER_INODE_RECLAIM;
111                 }
112                 hammer_rel_inode(ip, 1);
113         }
114         return(0);
115 }
116
117 /*
118  * Return a locked vnode for the specified inode.  The inode must be
119  * referenced but NOT LOCKED on entry and will remain referenced on
120  * return.
121  *
122  * Called from the frontend.
123  */
124 int
125 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
126 {
127         hammer_mount_t hmp;
128         struct vnode *vp;
129         int error = 0;
130
131         hmp = ip->hmp;
132
133         for (;;) {
134                 if ((vp = ip->vp) == NULL) {
135                         error = getnewvnode(VT_HAMMER, hmp->mp, vpp, 0, 0);
136                         if (error)
137                                 break;
138                         hammer_lock_ex(&ip->lock);
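                            /*
                             * We may have raced another thread attaching a
                             * vnode to the inode while blocked in
                             * getnewvnode().  If so, discard the new vnode
                             * and retry.
                             */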
139                         if (ip->vp != NULL) {
140                                 hammer_unlock(&ip->lock);
141                                 vp->v_type = VBAD;
142                                 vx_put(vp);
143                                 continue;
144                         }
145                         hammer_ref(&ip->lock);
146                         vp = *vpp;
147                         ip->vp = vp;
148                         vp->v_type =
149                                 hammer_get_vnode_type(ip->ino_data.obj_type);
150
151                         if (ip->flags & HAMMER_INODE_RECLAIM) {
152                                 --hammer_count_reclaiming;
153                                 --hmp->inode_reclaims;
154                                 ip->flags &= ~HAMMER_INODE_RECLAIM;
155                                 if (hmp->flags & HAMMER_MOUNT_WAITIMAX)
156                                         hammer_inode_wakereclaims(hmp);
157                         }
158
159                         switch(ip->ino_data.obj_type) {
160                         case HAMMER_OBJTYPE_CDEV:
161                         case HAMMER_OBJTYPE_BDEV:
162                                 vp->v_ops = &hmp->mp->mnt_vn_spec_ops;
163                                 addaliasu(vp, ip->ino_data.rmajor,
164                                           ip->ino_data.rminor);
165                                 break;
166                         case HAMMER_OBJTYPE_FIFO:
167                                 vp->v_ops = &hmp->mp->mnt_vn_fifo_ops;
168                                 break;
169                         default:
170                                 break;
171                         }
172
173                         /*
174                          * Only mark as the root vnode if the ip is not
175                          * historical, otherwise the VFS cache will get
176                          * confused.  The other half of the special handling
177                          * is in hammer_vop_nlookupdotdot().
178                          */
179                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
180                             ip->obj_asof == hmp->asof) {
181                                 vp->v_flag |= VROOT;
182                         }
183
184                         vp->v_data = (void *)ip;
185                         /* vnode locked by getnewvnode() */
186                         /* make related vnode dirty if inode dirty? */
187                         hammer_unlock(&ip->lock);
188                         if (vp->v_type == VREG)
189                                 vinitvmio(vp, ip->ino_data.size);
190                         break;
191                 }
192
193                 /*
194                  * loop if the vget fails (aka races), or if the vp
195                  * no longer matches ip->vp.
196                  */
197                 if (vget(vp, LK_EXCLUSIVE) == 0) {
198                         if (vp == ip->vp)
199                                 break;
200                         vput(vp);
201                 }
202         }
203         *vpp = vp;
204         return(error);
205 }
206
207 /*
208  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
209  * do not attach or detach the related vnode (use hammer_get_vnode() for
210  * that).
211  *
212  * The flags argument is only applied for newly created inodes, and only
213  * certain flags are inherited.
214  *
215  * Called from the frontend.
216  */
217 struct hammer_inode *
218 hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache,
219                  u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
220 {
221         hammer_mount_t hmp = trans->hmp;
222         struct hammer_inode_info iinfo;
223         struct hammer_cursor cursor;
224         struct hammer_inode *ip;
225
226         /*
227          * Determine if we already have an inode cached.  If we do then
228          * we are golden.
229          */
230         iinfo.obj_id = obj_id;
231         iinfo.obj_asof = asof;
232 loop:
233         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
234         if (ip) {
235                 hammer_ref(&ip->lock);
236                 *errorp = 0;
237                 return(ip);
238         }
239
240         ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
241         ++hammer_count_inodes;
242         ++hmp->count_inodes;
243         ip->obj_id = obj_id;
244         ip->obj_asof = iinfo.obj_asof;
245         ip->hmp = hmp;
246         ip->flags = flags & HAMMER_INODE_RO;
247         if (hmp->ronly)
248                 ip->flags |= HAMMER_INODE_RO;
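            /*
             * trunc_off at the maximum offset means no truncation is
             * pending (see HAMMER_INODE_TRUNCATED).
             */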
249         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
250         RB_INIT(&ip->rec_tree);
251         TAILQ_INIT(&ip->target_list);
252
253         /*
254          * Locate the on-disk inode.
255          */
256 retry:
257         hammer_init_cursor(trans, &cursor, cache, NULL);
258         cursor.key_beg.localization = HAMMER_LOCALIZE_INODE;
259         cursor.key_beg.obj_id = ip->obj_id;
260         cursor.key_beg.key = 0;
261         cursor.key_beg.create_tid = 0;
262         cursor.key_beg.delete_tid = 0;
263         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
264         cursor.key_beg.obj_type = 0;
265         cursor.asof = iinfo.obj_asof;
266         cursor.flags = HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_GET_DATA |
267                        HAMMER_CURSOR_ASOF;
268
269         *errorp = hammer_btree_lookup(&cursor);
270         if (*errorp == EDEADLK) {
271                 hammer_done_cursor(&cursor);
272                 goto retry;
273         }
274
275         /*
276          * On success the B-Tree lookup will hold the appropriate
277          * buffer cache buffers and provide a pointer to the requested
278          * information.  Copy the information to the in-memory inode
279          * and cache the B-Tree node to improve future operations.
280          */
281         if (*errorp == 0) {
282                 ip->ino_leaf = cursor.node->ondisk->elms[cursor.index].leaf;
283                 ip->ino_data = cursor.data->inode;
284                 hammer_cache_node(cursor.node, &ip->cache[0]);
285                 if (cache)
286                         hammer_cache_node(cursor.node, cache);
287         }
288
289         /*
290          * On success load the inode's record and data and insert the
291          * inode into the B-Tree.  It is possible to race another lookup
292          * insertion of the same inode so deal with that condition too.
293          *
294          * The cursor's locked node interlocks against others creating and
295          * destroying ip while we were blocked.
296          */
297         if (*errorp == 0) {
298                 hammer_ref(&ip->lock);
299                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
300                         hammer_uncache_node(&ip->cache[0]);
301                         hammer_uncache_node(&ip->cache[1]);
302                         KKASSERT(ip->lock.refs == 1);
303                         --hammer_count_inodes;
304                         --hmp->count_inodes;
305                         kfree(ip, M_HAMMER);
306                         hammer_done_cursor(&cursor);
307                         goto loop;
308                 }
309                 ip->flags |= HAMMER_INODE_ONDISK;
310         } else {
311                 /*
312                  * Do not panic on read-only accesses which fail, particularly
313                  * historical accesses where the snapshot might not have
314                  * complete connectivity.
315                  */
316                 if ((flags & HAMMER_INODE_RO) == 0) {
317                         kprintf("hammer_get_inode: failed ip %p obj_id %016llx cursor %p error %d\n",
318                                 ip, ip->obj_id, &cursor, *errorp);
319                         Debugger("x");
320                 }
321                 if (ip->flags & HAMMER_INODE_RSV_INODES) {
322                         ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
323                         --hmp->rsv_inodes;
324                 }
325                 hmp->rsv_databufs -= ip->rsv_databufs;
326                 ip->rsv_databufs = 0;                          /* sanity */
327
328                 --hammer_count_inodes;
329                 --hmp->count_inodes;
330                 kfree(ip, M_HAMMER);
331                 ip = NULL;
332         }
333         hammer_done_cursor(&cursor);
334         return (ip);
335 }
336
337 /*
338  * Create a new filesystem object, returning the inode in *ipp.  The
339  * returned inode will be referenced.
340  *
341  * The inode is created in-memory.
342  */
343 int
344 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
345                     struct ucred *cred, hammer_inode_t dip,
346                     struct hammer_inode **ipp)
347 {
348         hammer_mount_t hmp;
349         hammer_inode_t ip;
350         uid_t xuid;
351
352         hmp = trans->hmp;
353         ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
354         ++hammer_count_inodes;
355         ++hmp->count_inodes;
356         ip->obj_id = hammer_alloc_objid(trans, dip);
357         KKASSERT(ip->obj_id != 0);
358         ip->obj_asof = hmp->asof;
359         ip->hmp = hmp;
360         ip->flush_state = HAMMER_FST_IDLE;
361         ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES;
362
363         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
364         RB_INIT(&ip->rec_tree);
365         TAILQ_INIT(&ip->target_list);
366
367         ip->ino_leaf.atime = trans->time;
368         ip->ino_data.mtime = trans->time;
369         ip->ino_data.size = 0;
370         ip->ino_data.nlinks = 0;
371
372         /*
373          * A nohistory designator on the parent directory is inherited by
374          * the child.
375          */
376         ip->ino_data.uflags = dip->ino_data.uflags &
377                               (SF_NOHISTORY|UF_NOHISTORY|UF_NODUMP);
378
379         ip->ino_leaf.base.btype = HAMMER_BTREE_TYPE_RECORD;
380         ip->ino_leaf.base.localization = HAMMER_LOCALIZE_INODE;
381         ip->ino_leaf.base.obj_id = ip->obj_id;
382         ip->ino_leaf.base.key = 0;
383         ip->ino_leaf.base.create_tid = 0;
384         ip->ino_leaf.base.delete_tid = 0;
385         ip->ino_leaf.base.rec_type = HAMMER_RECTYPE_INODE;
386         ip->ino_leaf.base.obj_type = hammer_get_obj_type(vap->va_type);
387
388         ip->ino_data.obj_type = ip->ino_leaf.base.obj_type;
389         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
390         ip->ino_data.mode = vap->va_mode;
391         ip->ino_data.ctime = trans->time;
392         ip->ino_data.parent_obj_id = (dip) ? dip->ino_leaf.base.obj_id : 0;
393
394         switch(ip->ino_leaf.base.obj_type) {
395         case HAMMER_OBJTYPE_CDEV:
396         case HAMMER_OBJTYPE_BDEV:
397                 ip->ino_data.rmajor = vap->va_rmajor;
398                 ip->ino_data.rminor = vap->va_rminor;
399                 break;
400         default:
401                 break;
402         }
403
404         /*
405          * Calculate default uid/gid and overwrite with information from
406          * the vap.
407          */
408         xuid = hammer_to_unix_xid(&dip->ino_data.uid);
409         xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
410                                      &vap->va_mode);
411         ip->ino_data.mode = vap->va_mode;
412
413         if (vap->va_vaflags & VA_UID_UUID_VALID)
414                 ip->ino_data.uid = vap->va_uid_uuid;
415         else if (vap->va_uid != (uid_t)VNOVAL)
416                 hammer_guid_to_uuid(&ip->ino_data.uid, vap->va_uid);
417         else
418                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
419
420         if (vap->va_vaflags & VA_GID_UUID_VALID)
421                 ip->ino_data.gid = vap->va_gid_uuid;
422         else if (vap->va_gid != (gid_t)VNOVAL)
423                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
424         else
425                 ip->ino_data.gid = dip->ino_data.gid;
426
427         hammer_ref(&ip->lock);
428         if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
429                 hammer_unref(&ip->lock);
430                 panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
431         }
432         *ipp = ip;
433         return(0);
434 }
435
436 /*
437  * Called by hammer_sync_inode().
438  */
439 static int
440 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
441 {
442         hammer_transaction_t trans = cursor->trans;
443         hammer_record_t record;
444         int error;
445
446 retry:
447         error = 0;
448
449         /*
450          * If the inode has a presence on-disk then locate it and mark
451          * it deleted, setting DELONDISK.
452          *
453          * The record may or may not be physically deleted, depending on
454          * the retention policy.
455          */
456         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
457             HAMMER_INODE_ONDISK) {
458                 hammer_normalize_cursor(cursor);
459                 cursor->key_beg.localization = HAMMER_LOCALIZE_INODE;
460                 cursor->key_beg.obj_id = ip->obj_id;
461                 cursor->key_beg.key = 0;
462                 cursor->key_beg.create_tid = 0;
463                 cursor->key_beg.delete_tid = 0;
464                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
465                 cursor->key_beg.obj_type = 0;
466                 cursor->asof = ip->obj_asof;
467                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
468                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
469                 cursor->flags |= HAMMER_CURSOR_BACKEND;
470
471                 error = hammer_btree_lookup(cursor);
472                 if (hammer_debug_inode)
473                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
474                 if (error) {
475                         kprintf("error %d\n", error);
476                         Debugger("hammer_update_inode");
477                 }
478
479                 if (error == 0) {
480                         error = hammer_ip_delete_record(cursor, ip, trans->tid);
481                         if (hammer_debug_inode)
482                                 kprintf(" error %d\n", error);
483                         if (error && error != EDEADLK) {
484                                 kprintf("error %d\n", error);
485                                 Debugger("hammer_update_inode2");
486                         }
487                         if (error == 0) {
488                                 ip->flags |= HAMMER_INODE_DELONDISK;
489                         }
490                         if (cursor->node)
491                                 hammer_cache_node(cursor->node, &ip->cache[0]);
492                 }
493                 if (error == EDEADLK) {
494                         hammer_done_cursor(cursor);
495                         error = hammer_init_cursor(trans, cursor,
496                                                    &ip->cache[0], ip);
497                         if (hammer_debug_inode)
498                                 kprintf("IPDED %p %d\n", ip, error);
499                         if (error == 0)
500                                 goto retry;
501                 }
502         }
503
504         /*
505          * Ok, write out the initial record or a new record (after deleting
506          * the old one), unless the DELETED flag is set.  This routine will
507          * clear DELONDISK if it writes out a record.
508          *
509          * Update our inode statistics if this is the first application of
510          * the inode on-disk.
511          */
512         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
513                 /*
514                  * Generate a record and write it to the media
515                  */
516                 record = hammer_alloc_mem_record(ip, 0);
517                 record->type = HAMMER_MEM_RECORD_INODE;
518                 record->flush_state = HAMMER_FST_FLUSH;
519                 record->leaf = ip->sync_ino_leaf;
520                 record->leaf.base.create_tid = trans->tid;
521                 record->leaf.data_len = sizeof(ip->sync_ino_data);
522                 record->data = (void *)&ip->sync_ino_data;
523                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
524                 for (;;) {
525                         error = hammer_ip_sync_record_cursor(cursor, record);
526                         if (hammer_debug_inode)
527                                 kprintf("GENREC %p rec %08x %d\n",      
528                                         ip, record->flags, error);
529                         if (error != EDEADLK)
530                                 break;
531                         hammer_done_cursor(cursor);
532                         error = hammer_init_cursor(trans, cursor,
533                                                    &ip->cache[0], ip);
534                         if (hammer_debug_inode)
535                                 kprintf("GENREC reinit %d\n", error);
536                         if (error)
537                                 break;
538                 }
539                 if (error) {
540                         kprintf("error %d\n", error);
541                         Debugger("hammer_update_inode3");
542                 }
543
544                 /*
545                  * The record isn't managed by the inode's record tree,
546                  * destroy it whether we succeed or fail.
547                  */
548                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
549                 record->flags |= HAMMER_RECF_DELETED_FE;
550                 record->flush_state = HAMMER_FST_IDLE;
551                 hammer_rel_mem_record(record);
552
553                 /*
554                  * Finish up.
555                  */
556                 if (error == 0) {
557                         if (hammer_debug_inode)
558                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
559                         ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
560                                             HAMMER_INODE_ITIMES);
561                         ip->flags &= ~HAMMER_INODE_DELONDISK;
562
563                         /*
564                          * Root volume count of inodes
565                          */
566                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
567                                 hammer_modify_volume_field(trans,
568                                                            trans->rootvol,
569                                                            vol0_stat_inodes);
570                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
571                                 hammer_modify_volume_done(trans->rootvol);
572                                 ip->flags |= HAMMER_INODE_ONDISK;
573                                 if (hammer_debug_inode)
574                                         kprintf("NOWONDISK %p\n", ip);
575                         }
576                 }
577         }
578
579         /*
580          * If the inode has been destroyed, clean out any left-over flags
581          * that may have been set by the frontend.
582          */
583         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
584                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
585                                     HAMMER_INODE_ITIMES);
586         }
587         return(error);
588 }
589
590 /*
591  * Update only the itimes fields.  This is done non-historically.  The
592  * record is updated in-place on the disk.
593  */
594 static int
595 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
596 {
597         hammer_transaction_t trans = cursor->trans;
598         struct hammer_btree_leaf_elm *leaf;
599         int error;
600
601 retry:
602         error = 0;
603         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
604             HAMMER_INODE_ONDISK) {
605                 hammer_normalize_cursor(cursor);
606                 cursor->key_beg.localization = HAMMER_LOCALIZE_INODE;
607                 cursor->key_beg.obj_id = ip->obj_id;
608                 cursor->key_beg.key = 0;
609                 cursor->key_beg.create_tid = 0;
610                 cursor->key_beg.delete_tid = 0;
611                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
612                 cursor->key_beg.obj_type = 0;
613                 cursor->asof = ip->obj_asof;
614                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
615                 cursor->flags |= HAMMER_CURSOR_GET_LEAF | HAMMER_CURSOR_ASOF;
616                 cursor->flags |= HAMMER_CURSOR_BACKEND;
617
618                 error = hammer_btree_lookup(cursor);
619                 if (error) {
620                         kprintf("error %d\n", error);
621                         Debugger("hammer_update_itimes1");
622                 }
623                 if (error == 0) {
624                         /*
625                          * Do not generate UNDO records for atime updates.
626                          */
627                         leaf = cursor->leaf;
628                         hammer_modify_node(trans, cursor->node, 
629                                            &leaf->atime, sizeof(leaf->atime));
630                         leaf->atime = ip->sync_ino_leaf.atime;
631                         hammer_modify_node_done(cursor->node);
632                         /*rec->ino_mtime = ip->sync_ino_rec.ino_mtime;*/
633                         ip->sync_flags &= ~HAMMER_INODE_ITIMES;
634                         /* XXX recalculate crc */
635                         hammer_cache_node(cursor->node, &ip->cache[0]);
636                 }
637                 if (error == EDEADLK) {
638                         hammer_done_cursor(cursor);
639                         error = hammer_init_cursor(trans, cursor,
640                                                    &ip->cache[0], ip);
641                         if (error == 0)
642                                 goto retry;
643                 }
644         }
645         return(error);
646 }
647
648 /*
649  * Release a reference on an inode, flush as requested.
650  *
651  * On the last reference we queue the inode to the flusher for its final
652  * disposition.
653  */
654 void
655 hammer_rel_inode(struct hammer_inode *ip, int flush)
656 {
657         hammer_mount_t hmp = ip->hmp;
658
659         /*
660          * Handle disposition when dropping the last ref.
661          */
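            /*
             * Note that flushing can itself add a reference to the inode
             * (the flush list holds one), so we loop and re-test the
             * reference count after each action.
             */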
662         for (;;) {
663                 if (ip->lock.refs == 1) {
664                         /*
665                          * Determine whether on-disk action is needed for
666                          * the inode's final disposition.
667                          */
668                         KKASSERT(ip->vp == NULL);
669                         hammer_inode_unloadable_check(ip, 0);
670                         if (ip->flags & HAMMER_INODE_MODMASK) {
671                                 if (hmp->rsv_inodes > desiredvnodes) {
672                                         hammer_flush_inode(ip,
673                                                            HAMMER_FLUSH_SIGNAL);
674                                 } else {
675                                         hammer_flush_inode(ip, 0);
676                                 }
677                         } else if (ip->lock.refs == 1) {
678                                 hammer_unload_inode(ip);
679                                 break;
680                         }
681                 } else {
682                         if (flush)
683                                 hammer_flush_inode(ip, 0);
684
685                         /*
686                          * The inode still has multiple refs, try to drop
687                          * one ref.
688                          */
689                         KKASSERT(ip->lock.refs >= 1);
690                         if (ip->lock.refs > 1) {
691                                 hammer_unref(&ip->lock);
692                                 break;
693                         }
694                 }
695         }
696 }
697
698 /*
699  * Unload and destroy the specified inode.  Must be called with one remaining
700  * reference.  The reference is disposed of.
701  *
702  * This can only be called in the context of the flusher.
703  */
704 static int
705 hammer_unload_inode(struct hammer_inode *ip)
706 {
707         hammer_mount_t hmp = ip->hmp;
708
709         KASSERT(ip->lock.refs == 1,
710                 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
711         KKASSERT(ip->vp == NULL);
712         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
713         KKASSERT(ip->cursor_ip_refs == 0);
714         KKASSERT(ip->lock.lockcount == 0);
715         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
716
717         KKASSERT(RB_EMPTY(&ip->rec_tree));
718         KKASSERT(TAILQ_EMPTY(&ip->target_list));
719
720         RB_REMOVE(hammer_ino_rb_tree, &hmp->rb_inos_root, ip);
721
722         hammer_uncache_node(&ip->cache[0]);
723         hammer_uncache_node(&ip->cache[1]);
724         if (ip->objid_cache)
725                 hammer_clear_objid(ip);
726         --hammer_count_inodes;
727         --hmp->count_inodes;
728         if (hmp->flags & HAMMER_MOUNT_WAITIMAX)
729                 hammer_inode_wakereclaims(hmp);
730
731         if (ip->flags & HAMMER_INODE_RECLAIM) {
732                 --hammer_count_reclaiming;
733                 --hmp->inode_reclaims;
734                 ip->flags &= ~HAMMER_INODE_RECLAIM;
735         }
736         kfree(ip, M_HAMMER);
737
738         return(0);
739 }
740
741 /*
742  * Called on mount -u when switching from RW to RO or vice-versa.  Adjust
743  * the read-only flag for cached inodes.
744  *
745  * This routine is called from a RB_SCAN().
746  */
747 int
748 hammer_reload_inode(hammer_inode_t ip, void *arg __unused)
749 {
750         hammer_mount_t hmp = ip->hmp;
751
752         if (hmp->ronly || hmp->asof != HAMMER_MAX_TID)
753                 ip->flags |= HAMMER_INODE_RO;
754         else
755                 ip->flags &= ~HAMMER_INODE_RO;
756         return(0);
757 }
758
759 /*
760  * A transaction has modified an inode, requiring updates as specified by
761  * the passed flags.
762  *
763  * HAMMER_INODE_DDIRTY: Inode data has been updated
764  * HAMMER_INODE_XDIRTY: Dirty in-memory records
765  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
766  * HAMMER_INODE_DELETED: Inode record/data must be deleted
767  * HAMMER_INODE_ITIMES: mtime/atime has been updated
768  */
769 void
770 hammer_modify_inode(hammer_inode_t ip, int flags)
771 {
772         KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 ||
773                   (flags & (HAMMER_INODE_DDIRTY |
774                             HAMMER_INODE_XDIRTY | HAMMER_INODE_BUFS |
775                             HAMMER_INODE_DELETED | HAMMER_INODE_ITIMES)) == 0);
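            /*
             * The first dirtying of an inode is counted in hmp->rsv_inodes.
             * hammer_rel_inode() uses this count to decide when to signal
             * the flusher; the count is dropped again once the inode is
             * clean (see hammer_flush_inode_done()).
             */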
776         if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
777                 ip->flags |= HAMMER_INODE_RSV_INODES;
778                 ++ip->hmp->rsv_inodes;
779         }
780
781         ip->flags |= flags;
782 }
783
784 /*
785  * Request that an inode be flushed.  This whole mess cannot block and may
786  * recurse.  Once requested HAMMER will attempt to actively flush it until
787  * the flush can be done.
788  *
789  * The inode may already be flushing, or may be in a setup state.  We can
790  * place the inode in a flushing state if it is currently idle and flag it
791  * to reflush if it is currently flushing.
792  */
793 void
794 hammer_flush_inode(hammer_inode_t ip, int flags)
795 {
796         hammer_record_t depend;
797         int r, good;
798
799         /*
800          * Trivial 'nothing to flush' case.  If the inode is in a SETUP
801          * state we have to put it back into an IDLE state so we can
802          * drop the extra ref.
803          */
804         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
805                 if (ip->flush_state == HAMMER_FST_SETUP) {
806                         ip->flush_state = HAMMER_FST_IDLE;
807                         hammer_rel_inode(ip, 0);
808                 }
809                 return;
810         }
811
812         /*
813          * Our flush action will depend on the current state.
814          */
815         switch(ip->flush_state) {
816         case HAMMER_FST_IDLE:
817                 /*
818                  * We have no dependencies and can flush immediately.  Some of
819                  * our children may not be flushable, so we have to re-test
820                  * with that additional knowledge.
821                  */
822                 hammer_flush_inode_core(ip, flags);
823                 break;
824         case HAMMER_FST_SETUP:
825                 /*
826                  * Recurse upwards through dependencies via target_list
827                  * and start their flusher actions going if possible.
828                  *
829                  * 'good' is our connectivity.  -1 means we have none and
830                  * can't flush, 0 means there weren't any dependencies, and
831                  * 1 means we have good connectivity.
832                  */
833                 good = 0;
834                 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
835                         r = hammer_setup_parent_inodes(depend);
836                         if (r < 0 && good == 0)
837                                 good = -1;
838                         if (r > 0)
839                                 good = 1;
840                 }
841
842                 /*
843                  * We can continue if good >= 0.  Determine how many records
844                  * under our inode can be flushed (and mark them).
845                  */
846                 if (good >= 0) {
847                         hammer_flush_inode_core(ip, flags);
848                 } else {
849                         ip->flags |= HAMMER_INODE_REFLUSH;
850                         if (flags & HAMMER_FLUSH_SIGNAL) {
851                                 ip->flags |= HAMMER_INODE_RESIGNAL;
852                                 hammer_flusher_async(ip->hmp);
853                         }
854                 }
855                 break;
856         default:
857                 /*
858                  * We are already flushing, flag the inode to reflush
859                  * if needed after it completes its current flush.
860                  */
861                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
862                         ip->flags |= HAMMER_INODE_REFLUSH;
863                 if (flags & HAMMER_FLUSH_SIGNAL) {
864                         ip->flags |= HAMMER_INODE_RESIGNAL;
865                         hammer_flusher_async(ip->hmp);
866                 }
867                 break;
868         }
869 }
870
871 /*
872  * We are asked to recurse upwards and convert the record from SETUP
873  * to FLUSH if possible.  record->ip is a parent of the caller's inode,
874  * and record->target_ip is the caller's inode.
875  *
876  * Return 1 if the record gives us connectivity
877  *
878  * Return 0 if the record is not relevant 
879  *
880  * Return -1 if we can't resolve the dependancy and there is no connectivity.
881  */
882 static int
883 hammer_setup_parent_inodes(hammer_record_t record)
884 {
885         hammer_mount_t hmp = record->ip->hmp;
886         hammer_record_t depend;
887         hammer_inode_t ip;
888         int r, good;
889
890         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
891         ip = record->ip;
892
893         /*
894          * If the record is already flushing, is it in our flush group?
895          *
896          * If it is in our flush group but it is a general record or a 
897          * delete-on-disk, it does not improve our connectivity (return 0),
898          * and if the target inode is not trying to destroy itself we can't
899          * allow the operation yet anyway (the second return -1).
900          */
901         if (record->flush_state == HAMMER_FST_FLUSH) {
902                 if (record->flush_group != hmp->flusher_next) {
903                         ip->flags |= HAMMER_INODE_REFLUSH;
904                         return(-1);
905                 }
906                 if (record->type == HAMMER_MEM_RECORD_ADD)
907                         return(1);
908                 /* GENERAL or DEL */
909                 return(0);
910         }
911
912         /*
913          * It must be a setup record.  Try to resolve the setup dependencies
914          * by recursing upwards so we can place ip on the flush list.
915          */
916         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
917
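            /*
             * 'good' follows the same convention as in hammer_flush_inode():
             * -1 means no connectivity, 0 means no dependencies were found,
             * and 1 means good connectivity.
             */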
918         good = 0;
919         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
920                 r = hammer_setup_parent_inodes(depend);
921                 if (r < 0 && good == 0)
922                         good = -1;
923                 if (r > 0)
924                         good = 1;
925         }
926
927         /*
928          * We can't flush ip because it has no connectivity (XXX also check
929          * nlinks for pre-existing connectivity!).  Flag it so any resolution
930          * recurses back down.
931          */
932         if (good < 0) {
933                 ip->flags |= HAMMER_INODE_REFLUSH;
934                 return(good);
935         }
936
937         /*
938          * We are go, place the parent inode in a flushing state so we can
939          * place its record in a flushing state.  Note that the parent
940          * may already be flushing.  The record must be in the same flush
941          * group as the parent.
942          */
943         if (ip->flush_state != HAMMER_FST_FLUSH)
944                 hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION);
945         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
946         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
947
948 #if 0
949         if (record->type == HAMMER_MEM_RECORD_DEL &&
950             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
951                 /*
952                  * Regardless of flushing state we cannot sync this path if the
953                  * record represents a delete-on-disk but the target inode
954                  * is not ready to sync its own deletion.
955                  *
956                  * XXX need to count effective nlinks to determine whether
957                  * the flush is ok, otherwise removing a hardlink will
958                  * just leave the DEL record to rot.
959                  */
960                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
961                 return(-1);
962         } else
963 #endif
964         if (ip->flush_group == ip->hmp->flusher_next) {
965                 /*
966                  * This is the record we wanted to synchronize.
967                  */
968                 record->flush_state = HAMMER_FST_FLUSH;
969                 record->flush_group = ip->flush_group;
970                 hammer_ref(&record->lock);
971                 if (record->type == HAMMER_MEM_RECORD_ADD)
972                         return(1);
973
974                 /*
975                  * A general or delete-on-disk record does not contribute
976                  * to our visibility.  We can still flush it, however.
977                  */
978                 return(0);
979         } else {
980                 /*
981                  * We couldn't resolve the dependencies, request that the
982                  * inode be flushed when the dependencies can be resolved.
983                  */
984                 ip->flags |= HAMMER_INODE_REFLUSH;
985                 return(-1);
986         }
987 }
988
989 /*
990  * This is the core routine placing an inode into the FST_FLUSH state.
991  */
992 static void
993 hammer_flush_inode_core(hammer_inode_t ip, int flags)
994 {
995         int go_count;
996
997         /*
998          * Set flush state and prevent the flusher from cycling into
999          * the next flush group.  Do not place the ip on the list yet.
1000          * Inodes not in the idle state get an extra reference.
1001          */
1002         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
1003         if (ip->flush_state == HAMMER_FST_IDLE)
1004                 hammer_ref(&ip->lock);
1005         ip->flush_state = HAMMER_FST_FLUSH;
1006         ip->flush_group = ip->hmp->flusher_next;
1007         ++ip->hmp->flusher_lock;
1008
1009         /*
1010          * We need to be able to vfsync/truncate from the backend.
1011          */
1012         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
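             /*
              * The extra vnode reference taken here (VHELD) is dropped again
              * in hammer_flush_inode_done(), or below if the flush cannot
              * proceed.
              */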
1013         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
1014                 ip->flags |= HAMMER_INODE_VHELD;
1015                 vref(ip->vp);
1016         }
1017
1018         /*
1019          * Figure out how many in-memory records we can actually flush
1020          * (not including inode meta-data, buffers, etc).
1021          */
1022         if (flags & HAMMER_FLUSH_RECURSION) {
1023                 go_count = 1;
1024         } else {
1025                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1026                                    hammer_setup_child_callback, NULL);
1027         }
1028
1029         /*
1030          * This is a more involved test that includes go_count.  If we
1031          * can't flush, flag the inode and return.  If go_count is 0 we
1032          * are unable to flush any records in our rec_tree and
1033          * must ignore the XDIRTY flag.
1034          */
1035         if (go_count == 0) {
1036                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
1037                         ip->flags |= HAMMER_INODE_REFLUSH;
1038                         ip->flush_state = HAMMER_FST_SETUP;
1039                         if (ip->flags & HAMMER_INODE_VHELD) {
1040                                 ip->flags &= ~HAMMER_INODE_VHELD;
1041                                 vrele(ip->vp);
1042                         }
1043                         if (flags & HAMMER_FLUSH_SIGNAL) {
1044                                 ip->flags |= HAMMER_INODE_RESIGNAL;
1045                                 hammer_flusher_async(ip->hmp);
1046                         }
1047                         if (--ip->hmp->flusher_lock == 0)
1048                                 wakeup(&ip->hmp->flusher_lock);
1049                         return;
1050                 }
1051         }
1052
1053         /*
1054          * Snapshot the state of the inode for the backend flusher.
1055          *
1056          * The truncation must be retained in the frontend until after
1057          * we've actually performed the record deletion.
1058          *
1059          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
1060          * and stays in ip->flags.  Once set, it stays set until the
1061          * inode is destroyed.
1062          */
1063         ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
1064         ip->sync_trunc_off = ip->trunc_off;
1065         ip->sync_ino_leaf = ip->ino_leaf;
1066         ip->sync_ino_data = ip->ino_data;
1067         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1068         ip->flags &= ~HAMMER_INODE_MODMASK;
1069 #ifdef DEBUG_TRUNCATE
1070         if ((ip->sync_flags & HAMMER_INODE_TRUNCATED) && ip == HammerTruncIp)
1071                 kprintf("truncateS %016llx\n", ip->sync_trunc_off);
1072 #endif
1073
1074         /*
1075          * The flusher list inherits our inode and reference.
1076          */
1077         TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
1078         if (--ip->hmp->flusher_lock == 0)
1079                 wakeup(&ip->hmp->flusher_lock);
1080
1081         if (flags & HAMMER_FLUSH_SIGNAL) {
1082                 hammer_flusher_async(ip->hmp);
1083         }
1084 }
1085
1086 /*
1087  * Callback for scan of ip->rec_tree.  Try to include each record in our
1088  * flush.  ip->flush_group has been set but the inode has not yet been
1089  * moved into a flushing state.
1090  *
1091  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
1092  * both inodes.
1093  *
1094  * We return 1 for any record placed or found in FST_FLUSH, which prevents
1095  * the caller from shortcutting the flush.
1096  */
1097 static int
1098 hammer_setup_child_callback(hammer_record_t rec, void *data)
1099 {
1100         hammer_inode_t target_ip;
1101         hammer_inode_t ip;
1102         int r;
1103
1104         /*
1105          * If the record has been deleted by the backend (it's being held
1106          * by the frontend in a race), just ignore it.
1107          */
1108         if (rec->flags & HAMMER_RECF_DELETED_BE)
1109                 return(0);
1110
1111         /*
1112          * If the record is in an idle state it has no dependencies and
1113          * can be flushed.
1114          */
1115         ip = rec->ip;
1116         r = 0;
1117
1118         switch(rec->flush_state) {
1119         case HAMMER_FST_IDLE:
1120                 /*
1121                  * Record has no setup dependency, we can flush it.
1122                  */
1123                 KKASSERT(rec->target_ip == NULL);
1124                 rec->flush_state = HAMMER_FST_FLUSH;
1125                 rec->flush_group = ip->flush_group;
1126                 hammer_ref(&rec->lock);
1127                 r = 1;
1128                 break;
1129         case HAMMER_FST_SETUP:
1130                 /*
1131                  * Record has a setup dependency.  Try to include the
1132                  * target ip in the flush. 
1133                  *
1134                  * We have to be careful here, if we do not do the right
1135                  * thing we can lose track of dirty inodes and the system
1136                  * will lockup trying to allocate buffers.
1137                  */
1138                 target_ip = rec->target_ip;
1139                 KKASSERT(target_ip != NULL);
1140                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
1141                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
1142                         /*
1143                          * If the target IP is already flushing in our group
1144                          * we are golden, otherwise make sure the target
1145                          * reflushes.
1146                          */
1147                         if (target_ip->flush_group == ip->flush_group) {
1148                                 rec->flush_state = HAMMER_FST_FLUSH;
1149                                 rec->flush_group = ip->flush_group;
1150                                 hammer_ref(&rec->lock);
1151                                 r = 1;
1152                         } else {
1153                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
1154                         }
1155                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
1156                         /*
1157                          * If the target IP is not flushing we can force
1158                          * it to flush, even if it is unable to write out
1159                          * any of its own records we have at least one in
1160                          * hand that we CAN deal with.
1161                          */
1162                         rec->flush_state = HAMMER_FST_FLUSH;
1163                         rec->flush_group = ip->flush_group;
1164                         hammer_ref(&rec->lock);
1165                         hammer_flush_inode_core(target_ip,
1166                                                 HAMMER_FLUSH_RECURSION);
1167                         r = 1;
1168                 } else {
1169                         /*
1170                          * General or delete-on-disk record.
1171                          *
1172                          * XXX this needs help.  If a delete-on-disk we could
1173                          * disconnect the target.  If the target has its own
1174                  * dependencies they really need to be flushed.
1175                          *
1176                          * XXX
1177                          */
1178                         rec->flush_state = HAMMER_FST_FLUSH;
1179                         rec->flush_group = ip->flush_group;
1180                         hammer_ref(&rec->lock);
1181                         hammer_flush_inode_core(target_ip,
1182                                                 HAMMER_FLUSH_RECURSION);
1183                         r = 1;
1184                 }
1185                 break;
1186         case HAMMER_FST_FLUSH:
1187                 /* 
1188                  * Record already associated with a flush group.  It had
1189                  * better be ours.
1190                  */
1191                 KKASSERT(rec->flush_group == ip->flush_group);
1192                 r = 1;
1193                 break;
1194         }
1195         return(r);
1196 }
1197
1198 /*
1199  * Wait for a previously queued flush to complete
1200  */
1201 void
1202 hammer_wait_inode(hammer_inode_t ip)
1203 {
1204         while (ip->flush_state != HAMMER_FST_IDLE) {
1205                 if (ip->flush_state == HAMMER_FST_SETUP) {
1206                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1207                 } else {
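                             /*
                              * Already flushing: ask hammer_flush_inode_done()
                              * to wake us up and sleep on ip->flags.
                              */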
1208                         ip->flags |= HAMMER_INODE_FLUSHW;
1209                         tsleep(&ip->flags, 0, "hmrwin", 0);
1210                 }
1211         }
1212 }
1213
1214 /*
1215  * Called by the backend code when a flush has been completed.
1216  * The inode has already been removed from the flush list.
1217  *
1218  * A pipelined flush can occur, in which case we must re-enter the
1219  * inode on the list and re-copy its fields.
1220  */
1221 void
1222 hammer_flush_inode_done(hammer_inode_t ip)
1223 {
1224         int dorel = 0;
1225
1226         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1227
1228         /*
1229          * Merge left-over flags back into the frontend and fix the state.
1230          */
1231         ip->flags |= ip->sync_flags;
1232
1233         /*
1234          * The backend may have adjusted nlinks, so if the adjusted nlinks
1235          * does not match the frontend, set the frontend's DDIRTY flag again.
1236          */
1237         if (ip->ino_data.nlinks != ip->sync_ino_data.nlinks)
1238                 ip->flags |= HAMMER_INODE_DDIRTY;
1239
1240         /*
1241          * Fix up the dirty buffer status.  IO completions will also
1242          * try to clean up rsv_databufs.
1243          */
1244         if (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree)) {
1245                 ip->flags |= HAMMER_INODE_BUFS;
1246         } else {
1247                 ip->hmp->rsv_databufs -= ip->rsv_databufs;
1248                 ip->rsv_databufs = 0;
1249         }
1250
1251         /*
1252          * Re-set the XDIRTY flag if some of the inode's in-memory records
1253          * could not be flushed.
1254          */
1255         KKASSERT((RB_EMPTY(&ip->rec_tree) &&
1256                   (ip->flags & HAMMER_INODE_XDIRTY) == 0) ||
1257                  (!RB_EMPTY(&ip->rec_tree) &&
1258                   (ip->flags & HAMMER_INODE_XDIRTY) != 0));
1259
1260         /*
1261          * Do not lose track of inodes which no longer have vnode
1262          * associations, otherwise they may never get flushed again.
1263          */
1264         if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
1265                 ip->flags |= HAMMER_INODE_REFLUSH;
1266
1267         /*
1268          * Adjust flush_state.  The target state (idle or setup) shouldn't
1269          * be terribly important since we will reflush if we really need
1270          * to do anything. XXX
1271          */
1272         if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
1273                 ip->flush_state = HAMMER_FST_IDLE;
1274                 dorel = 1;
1275         } else {
1276                 ip->flush_state = HAMMER_FST_SETUP;
1277         }
1278
1279         /*
1280          * Clean up the vnode ref
1281          */
1282         if (ip->flags & HAMMER_INODE_VHELD) {
1283                 ip->flags &= ~HAMMER_INODE_VHELD;
1284                 vrele(ip->vp);
1285         }
1286
1287         /*
1288          * If the frontend made more changes and requested another flush,
1289          * then try to get it running.
1290          */
1291         if (ip->flags & HAMMER_INODE_REFLUSH) {
1292                 ip->flags &= ~HAMMER_INODE_REFLUSH;
1293                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1294                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
1295                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1296                 } else {
1297                         hammer_flush_inode(ip, 0);
1298                 }
1299         }
1300
1301         /*
1302          * If the inode is now clean drop the space reservation.
1303          */
1304         if ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1305             (ip->flags & HAMMER_INODE_RSV_INODES)) {
1306                 ip->flags &= ~HAMMER_INODE_RSV_INODES;
1307                 --ip->hmp->rsv_inodes;
1308         }
1309
1310         /*
1311          * Finally, if the frontend is waiting for a flush to complete,
1312          * wake it up.
1313          */
1314         if (ip->flush_state != HAMMER_FST_FLUSH) {
1315                 if (ip->flags & HAMMER_INODE_FLUSHW) {
1316                         ip->flags &= ~HAMMER_INODE_FLUSHW;
1317                         wakeup(&ip->flags);
1318                 }
1319         }
1320         if (dorel)
1321                 hammer_rel_inode(ip, 0);
1322 }
1323
1324 /*
1325  * Called from hammer_sync_inode() to synchronize in-memory records
1326  * to the media.
1327  */
1328 static int
1329 hammer_sync_record_callback(hammer_record_t record, void *data)
1330 {
1331         hammer_cursor_t cursor = data;
1332         hammer_transaction_t trans = cursor->trans;
1333         int error;
1334
1335         /*
1336          * Skip records that do not belong to the current flush.
1337          */
1338         ++hammer_stats_record_iterations;
1339         if (record->flush_state != HAMMER_FST_FLUSH)
1340                 return(0);
1341
1342 #if 1
1343         if (record->flush_group != record->ip->flush_group) {
1344                 kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group, record->ip->flush_group);
1345                 Debugger("blah2");
1346                 return(0);
1347         }
1348 #endif
1349         KKASSERT(record->flush_group == record->ip->flush_group);
1350
1351         /*
1352          * Interlock the record using the BE flag.  Once BE is set the
1353          * frontend cannot change the state of FE.
1354          *
1355          * NOTE: If FE is set prior to us setting BE we still sync the
1356          * record out, but the flush completion code converts it to 
1357          * a delete-on-disk record instead of destroying it.
1358          */
1359         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1360         record->flags |= HAMMER_RECF_INTERLOCK_BE;
1361
1362         /*
1363          * The backend may have already disposed of the record.
1364          */
1365         if (record->flags & HAMMER_RECF_DELETED_BE) {
1366                 error = 0;
1367                 goto done;
1368         }
1369
1370         /*
1371          * If the whole inode is being deleted, all on-disk records will
1372          * be deleted very soon.  We can't sync any new records to disk
1373          * because they would be deleted in the same transaction they were
1374          * created in (delete_tid == create_tid), which will assert.
1375          *
1376          * XXX There may be a case with RECORD_ADD with DELETED_FE set
1377          * that we currently panic on.
1378          */
1379         if (record->ip->sync_flags & HAMMER_INODE_DELETING) {
1380                 switch(record->type) {
1381                 case HAMMER_MEM_RECORD_DATA:
1382                         /*
1383                          * We don't have to do anything; if the record was
1384                          * committed the space will have been accounted for
1385                          * in the blockmap.
1386                          */
1387                         /* fall through */
1388                 case HAMMER_MEM_RECORD_GENERAL:
1389                         record->flags |= HAMMER_RECF_DELETED_FE;
1390                         record->flags |= HAMMER_RECF_DELETED_BE;
1391                         error = 0;
1392                         goto done;
1393                 case HAMMER_MEM_RECORD_ADD:
1394                         panic("hammer_sync_record_callback: illegal add "
1395                               "during inode deletion record %p", record);
1396                         break; /* NOT REACHED */
1397                 case HAMMER_MEM_RECORD_INODE:
1398                         panic("hammer_sync_record_callback: attempt to "
1399                               "sync inode record %p?", record);
1400                         break; /* NOT REACHED */
1401                 case HAMMER_MEM_RECORD_DEL:
1402                         /* 
1403                          * Follow through and issue the on-disk deletion
1404                          */
1405                         break;
1406                 }
1407         }
1408
1409         /*
1410          * If DELETED_FE is set we may have already sent dependent pieces
1411          * to the disk and we must flush the record as if it hadn't been
1412          * deleted.  This creates a bit of a mess because we have to
1413          * have ip_sync_record convert the record to MEM_RECORD_DEL before
1414          * it inserts the B-Tree record.  Otherwise the media sync might
1415          * be visible to the frontend.
1416          */
1417         if (record->flags & HAMMER_RECF_DELETED_FE) {
1418                 if (record->type == HAMMER_MEM_RECORD_ADD) {
1419                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
1420                 } else {
1421                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
1422                         return(0);
1423                 }
1424         }
1425
1426         /*
1427          * Assign the create_tid for new records.  Deletions already
1428          * have the record's entire key properly set up.
1429          */
1430         if (record->type != HAMMER_MEM_RECORD_DEL)
1431                 record->leaf.base.create_tid = trans->tid;
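             /*
              * Sync the record to the media.  If the cursor deadlocks
              * against another B-Tree operation (EDEADLK) it is torn down,
              * re-initialized, and the operation is retried.
              */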
1432         for (;;) {
1433                 error = hammer_ip_sync_record_cursor(cursor, record);
1434                 if (error != EDEADLK)
1435                         break;
1436                 hammer_done_cursor(cursor);
1437                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
1438                                            record->ip);
1439                 if (error)
1440                         break;
1441         }
1442         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
1443
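             /*
              * Errors are returned negative so the RB_SCAN() in
              * hammer_sync_inode() aborts the scan; the failure is only
              * reported for errors other than ENOSPC.
              */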
1444         if (error) {
1445                 error = -error;
1446                 if (error != -ENOSPC) {
1447                         kprintf("hammer_sync_record_callback: sync failed rec "
1448                                 "%p, error %d\n", record, error);
1449                         Debugger("sync failed rec");
1450                 }
1451         }
1452 done:
1453         hammer_flush_record_done(record, error);
1454         return(error);
1455 }
1456
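     /*
      * Sync an inode's in-memory state and records to the media.  This is
      * the backend half of the flush, called from the flusher for each
      * inode in the flush group being run.
      */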
1457 /*
1458  * XXX error handling
1459  */
1460 int
1461 hammer_sync_inode(hammer_inode_t ip)
1462 {
1463         struct hammer_transaction trans;
1464         struct hammer_cursor cursor;
1465         hammer_record_t depend;
1466         hammer_record_t next;
1467         int error, tmp_error;
1468         u_int64_t nlinks;
1469
1470         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
1471                 return(0);
1472
1473         hammer_start_transaction_fls(&trans, ip->hmp);
1474         error = hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1475         if (error)
1476                 goto done;
1477
1478         /*
1479          * Any directory records referencing this inode which are not in
1480          * our current flush group must adjust our nlink count for the
1481          * purposes of synchronization to disk.
1482          *
1483          * Records which are in our flush group can be unlinked from our
1484          * inode now, potentially allowing the inode to be physically
1485          * deleted.
1486          */
1487         nlinks = ip->ino_data.nlinks;
1488         next = TAILQ_FIRST(&ip->target_list);
1489         while ((depend = next) != NULL) {
1490                 next = TAILQ_NEXT(depend, target_entry);
1491                 if (depend->flush_state == HAMMER_FST_FLUSH &&
1492                     depend->flush_group == ip->hmp->flusher_act) {
1493                         /*
1494                          * If this is an ADD that was deleted by the frontend
1495                          * the frontend nlinks count will have already been
1496                          * decremented, but the backend is going to sync its
1497                          * directory entry and must account for it.  The
1498                          * record will be converted to a delete-on-disk when
1499                          * it gets synced.
1500                          *
1501                          * If the ADD was not deleted by the frontend we
1502                          * can remove the dependency from our target_list.
1503                          */
1504                         if (depend->flags & HAMMER_RECF_DELETED_FE) {
1505                                 ++nlinks;
1506                         } else {
1507                                 TAILQ_REMOVE(&ip->target_list, depend,
1508                                              target_entry);
1509                                 depend->target_ip = NULL;
1510                         }
1511                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
1512                         /*
1513                          * Not part of our flush group
1514                          */
1515                         KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0);
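                             /*
                              * An unflushed directory-entry ADD means the
                              * link does not yet exist on the media, so it
                              * is subtracted; a pending DEL means the link
                              * still exists on the media, so it is added
                              * back in.
                              */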
1516                         switch(depend->type) {
1517                         case HAMMER_MEM_RECORD_ADD:
1518                                 --nlinks;
1519                                 break;
1520                         case HAMMER_MEM_RECORD_DEL:
1521                                 ++nlinks;
1522                                 break;
1523                         default:
1524                                 break;
1525                         }
1526                 }
1527         }
1528
1529         /*
1530          * Set dirty if we had to modify the link count.
1531          */
1532         if (ip->sync_ino_data.nlinks != nlinks) {
1533                 KKASSERT((int64_t)nlinks >= 0);
1534                 ip->sync_ino_data.nlinks = nlinks;
1535                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1536         }
1537
1538 #if 0
1539         /*
1540          * XXX DISABLED FOR NOW.  With the new reservation support
1541          * we cannot resync pending data without confusing the hell
1542          * out of the in-memory record tree.
1543          */
1544         /*
1545          * Queue up as many dirty buffers as we can then set a flag to
1546          * cause any further BIOs to go to the alternative queue.
1547          */
1548         if (ip->flags & HAMMER_INODE_VHELD)
1549                 error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
1550         ip->flags |= HAMMER_INODE_WRITE_ALT;
1551
1552         /*
1553          * The buffer cache may contain dirty buffers beyond the inode
1554          * state we copied from the frontend to the backend.  Because
1555          * we are syncing our buffer cache on the backend, resync
1556          * the truncation point and the file size so we don't wipe out
1557          * any data.
1558          *
1559          * Syncing the buffer cache on the frontend has serious problems
1560          * because it prevents us from passively queueing dirty inodes
1561          * to the backend (the BIOs could stall indefinitely).
1562          */
1563         if (ip->flags & HAMMER_INODE_TRUNCATED) {
1564                 ip->sync_trunc_off = ip->trunc_off;
1565                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
1566         }
1567         if (ip->sync_ino_data.size != ip->ino_data.size) {
1568                 ip->sync_ino_data.size = ip->ino_data.size;
1569                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1570         }
1571 #endif
1572
1573         /*
1574          * If there is a truncation queued, destroy any data past the (aligned)
1575          * truncation point.  Userland will have dealt with the buffer
1576          * containing the truncation point for us.
1577          *
1578          * We don't flush pending frontend data buffers until after we've
1579          * dealt with the truncation.
1580          *
1581          * Don't bother if the inode is or has been deleted.
1582          */
1583         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1584                 /*
1585                  * Interlock trunc_off.  The VOP front-end may continue to
1586                  * make adjustments to it while we are blocked.
1587                  */
1588                 off_t trunc_off;
1589                 off_t aligned_trunc_off;
1590
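                     /*
                      * Round the truncation offset up to the next
                      * HAMMER_BUFSIZE boundary.  For example, with 16KB
                      * buffers a trunc_off of 0x6000 rounds up to 0x8000;
                      * only whole buffers past that point are deleted
                      * from the media below.
                      */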
1591                 trunc_off = ip->sync_trunc_off;
1592                 aligned_trunc_off = (trunc_off + HAMMER_BUFMASK) &
1593                                     ~HAMMER_BUFMASK64;
1594
1595                 /*
1596                  * Delete any whole blocks on-media.  The front-end has
1597                  * already cleaned out any partial block and made it
1598                  * pending.  The front-end may have updated trunc_off
1599                  * while we were blocked so we only use sync_trunc_off.
1600                  */
1601                 error = hammer_ip_delete_range(&cursor, ip,
1602                                                 aligned_trunc_off,
1603                                                 0x7FFFFFFFFFFFFFFFLL, 1);
1604                 if (error)
1605                         Debugger("hammer_ip_delete_range errored");
1606
1607                 /*
1608                  * Clear the truncation flag on the backend after we have
1609                  * completed the deletions.  Backend data is now good again
1610                  * (including new records we are about to sync, below).
1611                  */
1612                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1613                 ip->sync_trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1614         } else {
1615                 error = 0;
1616         }
1617
1618         /*
1619          * Now sync related records.  These will typically be directory
1620          * entries or delete-on-disk records.
1621          *
1622          * Not all records will be flushed, but clear XDIRTY anyway.  We
1623          * will set it again in the frontend hammer_flush_inode_done() 
1624          * if records remain.
1625          */
1626         if (error == 0) {
1627                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1628                                     hammer_sync_record_callback, &cursor);
1629                 if (tmp_error < 0)
1630                         tmp_error = -tmp_error;
1631                 if (tmp_error)
1632                         error = tmp_error;
1633         }
1634
1635         /*
1636          * If we are deleting the inode the frontend had better not have
1637          * any active references on elements making up the inode.
1638          */
1639         if (error == 0 && ip->sync_ino_data.nlinks == 0 &&
1640             RB_EMPTY(&ip->rec_tree) &&
1641             (ip->sync_flags & HAMMER_INODE_DELETING) &&
1642             (ip->flags & HAMMER_INODE_DELETED) == 0) {
1643                 int count1 = 0;
1644
1645                 ip->flags |= HAMMER_INODE_DELETED;
1646                 error = hammer_ip_delete_range_all(&cursor, ip, &count1);
1647                 if (error == 0) {
1648                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
1649                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1650                         KKASSERT(RB_EMPTY(&ip->rec_tree));
1651
1652                         /*
1653                          * Set delete_tid in both the frontend and backend
1654                          * copy of the inode record.  The DELETED flag handles
1655                          * this; do not set RDIRTY.
1656                          */
1657                         ip->ino_leaf.base.delete_tid = trans.tid;
1658                         ip->sync_ino_leaf.base.delete_tid = trans.tid;
1659
1660                         /*
1661                          * Adjust the inode count in the volume header
1662                          */
1663                         if (ip->flags & HAMMER_INODE_ONDISK) {
1664                                 hammer_modify_volume_field(&trans,
1665                                                            trans.rootvol,
1666                                                            vol0_stat_inodes);
1667                                 --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1668                                 hammer_modify_volume_done(trans.rootvol);
1669                         }
1670                 } else {
1671                         ip->flags &= ~HAMMER_INODE_DELETED;
1672                         Debugger("hammer_ip_delete_range_all errored");
1673                 }
1674         }
1675
1676         ip->sync_flags &= ~HAMMER_INODE_BUFS;
1677
1678         if (error)
1679                 Debugger("RB_SCAN errored");
1680
1681         /*
1682          * Now update the inode's on-disk inode-data and/or on-disk record.
1683          * DELETED and ONDISK are managed only in ip->flags.
1684          */
1685         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
1686         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
1687                 /*
1688                  * If deleted and on-disk, don't set any additional flags.
1689                  * The delete flag takes care of things.
1690                  *
1691                  * Clear flags which may have been set by the frontend.
1692                  */
1693                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY|
1694                                     HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
1695                                     HAMMER_INODE_DELETING);
1696                 break;
1697         case HAMMER_INODE_DELETED:
1698                 /*
1699                  * Take care of the case where a deleted inode was never
1700                  * flushed to the disk in the first place.
1701                  *
1702                  * Clear flags which may have been set by the frontend.
1703                  */
1704                 ip->sync_flags &= ~(HAMMER_INODE_DDIRTY|
1705                                     HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
1706                                     HAMMER_INODE_DELETING);
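                     /*
                      * Throw away any in-memory records.  The inode never
                      * made it to the media so there is nothing to destroy
                      * on-disk; marking each record deleted on both the
                      * frontend and backend and dropping the reference
                      * frees it.
                      */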
1707                 while (RB_ROOT(&ip->rec_tree)) {
1708                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
1709                         hammer_ref(&record->lock);
1710                         KKASSERT(record->lock.refs == 1);
1711                         record->flags |= HAMMER_RECF_DELETED_FE;
1712                         record->flags |= HAMMER_RECF_DELETED_BE;
1713                         hammer_rel_mem_record(record);
1714                 }
1715                 break;
1716         case HAMMER_INODE_ONDISK:
1717                 /*
1718                  * If already on-disk, do not set any additional flags.
1719                  */
1720                 break;
1721         default:
1722                 /*
1723                  * If not on-disk and not deleted, set both dirty flags
1724                  * to force an initial record to be written.  Also set
1725                  * the create_tid for the inode.
1726                  *
1727                  * Set create_tid in both the frontend and backend
1728                  * copy of the inode record.
1729                  */
1730                 ip->ino_leaf.base.create_tid = trans.tid;
1731                 ip->sync_ino_leaf.base.create_tid = trans.tid;
1732                 ip->sync_flags |= HAMMER_INODE_DDIRTY;
1733                 break;
1734         }
1735
1736         /*
1737          * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
1738          * is already on-disk the old record is marked as deleted.
1739          *
1740          * If DELETED is set hammer_update_inode() will delete the existing
1741          * record without writing out a new one.
1742          *
1743          * If *ONLY* the ITIMES flag is set we can update the record in-place.
1744          */
1745         if (ip->flags & HAMMER_INODE_DELETED) {
1746                 error = hammer_update_inode(&cursor, ip);
1747         } else 
1748         if ((ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES)) ==
1749             HAMMER_INODE_ITIMES) {
1750                 error = hammer_update_itimes(&cursor, ip);
1751         } else
1752         if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ITIMES)) {
1753                 error = hammer_update_inode(&cursor, ip);
1754         }
1755         if (error)
1756                 Debugger("hammer_update_itimes/inode errored");
1757 done:
1758         /*
1759          * Save the TID we used to sync the inode with to make sure we
1760          * do not improperly reuse it.
1761          */
1762         hammer_done_cursor(&cursor);
1763         hammer_done_transaction(&trans);
1764         return(error);
1765 }
1766
1767 /*
1768  * This routine is called when the OS is no longer actively referencing
1769  * the inode (but might still be keeping it cached), or when releasing
1770  * the last reference to an inode.
1771  *
1772  * At this point if the inode's nlinks count is zero we want to destroy
1773  * it, which may mean destroying it on-media too.
1774  */
1775 void
1776 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
1777 {
1778         struct vnode *vp;
1779
1780         /*
1781          * Set the DELETING flag when the link count drops to 0 and the
1782          * OS no longer has any opens on the inode.
1783          *
1784          * The backend will clear DELETING (a mod flag) and set DELETED
1785          * (a state flag) when it is actually able to perform the
1786          * operation.
1787          */
1788         if (ip->ino_data.nlinks == 0 &&
1789             (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
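                     /*
                      * Queue the destruction: DELETING flags the inode for
                      * the backend and the whole-file truncation
                      * (trunc_off = 0) causes its data to be destroyed
                      * when the backend syncs it.
                      */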
1790                 ip->flags |= HAMMER_INODE_DELETING;
1791                 ip->flags |= HAMMER_INODE_TRUNCATED;
1792                 ip->trunc_off = 0;
1793                 vp = NULL;
1794                 if (getvp) {
1795                         if (hammer_get_vnode(ip, &vp) != 0)
1796                                 return;
1797                 }
1798
1799                 /*
1800                  * Final cleanup: discard the vnode's buffers and VM pages.
1801                  */
1802                 if (ip->vp) {
1803                         vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
1804                         vnode_pager_setsize(ip->vp, 0);
1805                 }
1806                 if (getvp) {
1807                         vput(vp);
1808                 }
1809         }
1810 }
1811
1812 /*
1813  * Re-test an inode when a dependency has gone away to see if we
1814  * can chain flush it.
1815  */
1816 void
1817 hammer_test_inode(hammer_inode_t ip)
1818 {
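             /*
              * REFLUSH indicates that another flush was requested while
              * the inode was still flushing; RESIGNAL additionally asks
              * that the flusher be signalled.  Hold a temporary reference
              * across the new flush so the inode cannot be reclaimed out
              * from under us.
              */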
1819         if (ip->flags & HAMMER_INODE_REFLUSH) {
1820                 ip->flags &= ~HAMMER_INODE_REFLUSH;
1821                 hammer_ref(&ip->lock);
1822                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1823                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
1824                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1825                 } else {
1826                         hammer_flush_inode(ip, 0);
1827                 }
1828                 hammer_rel_inode(ip, 0);
1829         }
1830 }
1831
1832 /*
1833  * When a HAMMER inode is reclaimed it may have to be queued to the backend
1834  * for its final sync to disk.  Programs like blogbench can cause the backlog
1835  * to grow indefinitely.  Put a cap on the number of inodes we allow to be
1836  * in this state by giving the flusher time to drain.
1837  */
1838 void
1839 hammer_inode_waitreclaims(hammer_mount_t hmp)
1840 {
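             /*
              * Block in 1/10 second intervals, kicking the flusher each
              * time, until the reclaim backlog drops back under either
              * the absolute floor (HAMMER_RECLAIM_MIN) or the proportional
              * limit (count_inodes / HAMMER_RECLAIM_FACTOR).
              */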
1841         while (hmp->inode_reclaims > HAMMER_RECLAIM_MIN &&
1842                hmp->inode_reclaims > hmp->count_inodes / HAMMER_RECLAIM_FACTOR) {
1843                 hmp->flags |= HAMMER_MOUNT_WAITIMAX;
1844                 hammer_flusher_async(hmp);
1845                 tsleep(hmp, 0, "hmimax", hz / 10);
1846         }
1847 }
1848
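     /*
      * Companion to hammer_inode_waitreclaims().  Once the reclaim backlog
      * drops back under either threshold, clear the wait flag and wake up
      * anyone blocked on the mount point.
      */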
1849 void
1850 hammer_inode_wakereclaims(hammer_mount_t hmp)
1851 {
1852         if (hmp->inode_reclaims <= HAMMER_RECLAIM_MIN ||
1853             hmp->inode_reclaims <= hmp->count_inodes / HAMMER_RECLAIM_FACTOR) {
1854                 hmp->flags &= ~HAMMER_MOUNT_WAITIMAX;
1855                 wakeup(hmp);
1856         }
1857 }
1858