HAMMER 40F/Many: UNDO cleanup & stabilization.
[dragonfly.git] / sys / vfs / hammer / hammer_inode.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_inode.c,v 1.50 2008/05/04 09:06:45 dillon Exp $
35  */
36
37 #include "hammer.h"
38 #include <vm/vm_extern.h>
39 #include <sys/buf.h>
40 #include <sys/buf2.h>
41
42 static int hammer_unload_inode(struct hammer_inode *ip);
43 static void hammer_flush_inode_core(hammer_inode_t ip, int flags);
44 static int hammer_setup_child_callback(hammer_record_t rec, void *data);
45 static int hammer_setup_parent_inodes(hammer_record_t record);
46
47 /*
48  * The kernel is not actively referencing this vnode but is still holding
49  * it cached.
50  *
51  * This is called from the frontend.
52  */
53 int
54 hammer_vop_inactive(struct vop_inactive_args *ap)
55 {
56         struct hammer_inode *ip = VTOI(ap->a_vp);
57
58         /*
59          * Degenerate case
60          */
61         if (ip == NULL) {
62                 vrecycle(ap->a_vp);
63                 return(0);
64         }
65
66         /*
67          * If the inode no longer has visibility in the filesystem and is
68          * fairly clean, try to recycle it immediately.  This can deadlock
69          * in vfsync() if we aren't careful.
70          */
71         hammer_inode_unloadable_check(ip, 0);
72         if (ip->flags & HAMMER_INODE_MODMASK)
73                 hammer_flush_inode(ip, 0);
74         else if (ip->ino_rec.ino_nlinks == 0)
75                 vrecycle(ap->a_vp);
76         return(0);
77 }
78
79 /*
80  * Release the vnode association.  This is typically (but not always)
81  * the last reference on the inode.
82  *
83  * Once the association is lost we are on our own with regards to
84  * flushing the inode.
85  */
86 int
87 hammer_vop_reclaim(struct vop_reclaim_args *ap)
88 {
89         struct hammer_inode *ip;
90         struct vnode *vp;
91
92         vp = ap->a_vp;
93
94         if ((ip = vp->v_data) != NULL) {
95                 vp->v_data = NULL;
96                 ip->vp = NULL;
97                 hammer_rel_inode(ip, 1);
98         }
99         return(0);
100 }
101
102 /*
103  * Return a locked vnode for the specified inode.  The inode must be
104  * referenced but NOT LOCKED on entry and will remain referenced on
105  * return.
106  *
107  * Called from the frontend.
108  */
109 int
110 hammer_get_vnode(struct hammer_inode *ip, struct vnode **vpp)
111 {
112         struct vnode *vp;
113         int error = 0;
114
115         for (;;) {
116                 if ((vp = ip->vp) == NULL) {
117                         error = getnewvnode(VT_HAMMER, ip->hmp->mp, vpp, 0, 0);
118                         if (error)
119                                 break;
120                         hammer_lock_ex(&ip->lock);
121                         if (ip->vp != NULL) {
122                                 hammer_unlock(&ip->lock);
123                                 vp->v_type = VBAD;
124                                 vx_put(vp);
125                                 continue;
126                         }
127                         hammer_ref(&ip->lock);
128                         vp = *vpp;
129                         ip->vp = vp;
130                         vp->v_type = hammer_get_vnode_type(
131                                             ip->ino_rec.base.base.obj_type);
132
133                         switch(ip->ino_rec.base.base.obj_type) {
134                         case HAMMER_OBJTYPE_CDEV:
135                         case HAMMER_OBJTYPE_BDEV:
136                                 vp->v_ops = &ip->hmp->mp->mnt_vn_spec_ops;
137                                 addaliasu(vp, ip->ino_data.rmajor,
138                                           ip->ino_data.rminor);
139                                 break;
140                         case HAMMER_OBJTYPE_FIFO:
141                                 vp->v_ops = &ip->hmp->mp->mnt_vn_fifo_ops;
142                                 break;
143                         default:
144                                 break;
145                         }
146
147                         /*
148                          * Only mark as the root vnode if the ip is not
149                          * historical, otherwise the VFS cache will get
150                          * confused.  The other half of the special handling
151                          * is in hammer_vop_nlookupdotdot().
152                          */
153                         if (ip->obj_id == HAMMER_OBJID_ROOT &&
154                             ip->obj_asof == ip->hmp->asof) {
155                                 vp->v_flag |= VROOT;
156                         }
157
158                         vp->v_data = (void *)ip;
159                         /* vnode locked by getnewvnode() */
160                         /* make related vnode dirty if inode dirty? */
161                         hammer_unlock(&ip->lock);
162                         if (vp->v_type == VREG)
163                                 vinitvmio(vp, ip->ino_rec.ino_size);
164                         break;
165                 }
166
167                 /*
168                  * loop if the vget fails (aka races), or if the vp
169                  * no longer matches ip->vp.
170                  */
171                 if (vget(vp, LK_EXCLUSIVE) == 0) {
172                         if (vp == ip->vp)
173                                 break;
174                         vput(vp);
175                 }
176         }
177         *vpp = vp;
178         return(error);
179 }
180
181 /*
182  * Acquire a HAMMER inode.  The returned inode is not locked.  These functions
183  * do not attach or detach the related vnode (use hammer_get_vnode() for
184  * that).
185  *
186  * The flags argument is only applied for newly created inodes, and only
187  * certain flags are inherited.
188  *
189  * Called from the frontend.
190  */
191 struct hammer_inode *
192 hammer_get_inode(hammer_transaction_t trans, struct hammer_node **cache,
193                  u_int64_t obj_id, hammer_tid_t asof, int flags, int *errorp)
194 {
195         hammer_mount_t hmp = trans->hmp;
196         struct hammer_inode_info iinfo;
197         struct hammer_cursor cursor;
198         struct hammer_inode *ip;
199
200         /*
201          * Determine if we already have an inode cached.  If we do then
202          * we are golden.
203          */
204         iinfo.obj_id = obj_id;
205         iinfo.obj_asof = asof;
206 loop:
207         ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
208         if (ip) {
209                 hammer_ref(&ip->lock);
210                 *errorp = 0;
211                 return(ip);
212         }
213
214         ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
215         ++hammer_count_inodes;
216         ip->obj_id = obj_id;
217         ip->obj_asof = iinfo.obj_asof;
218         ip->hmp = hmp;
219         ip->flags = flags & HAMMER_INODE_RO;
220         if (hmp->ronly)
221                 ip->flags |= HAMMER_INODE_RO;
222         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
223         RB_INIT(&ip->rec_tree);
224         TAILQ_INIT(&ip->bio_list);
225         TAILQ_INIT(&ip->bio_alt_list);
226         TAILQ_INIT(&ip->target_list);
227
228         /*
229          * Locate the on-disk inode.
230          */
231 retry:
232         hammer_init_cursor(trans, &cursor, cache, NULL);
233         cursor.key_beg.obj_id = ip->obj_id;
234         cursor.key_beg.key = 0;
235         cursor.key_beg.create_tid = 0;
236         cursor.key_beg.delete_tid = 0;
237         cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE;
238         cursor.key_beg.obj_type = 0;
239         cursor.asof = iinfo.obj_asof;
240         cursor.flags = HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_GET_DATA |
241                        HAMMER_CURSOR_ASOF;
242
243         *errorp = hammer_btree_lookup(&cursor);
244         if (*errorp == EDEADLK) {
245                 hammer_done_cursor(&cursor);
246                 goto retry;
247         }
248
249         /*
250          * On success the B-Tree lookup will hold the appropriate
251          * buffer cache buffers and provide a pointer to the requested
252          * information.  Copy the information to the in-memory inode
253          * and cache the B-Tree node to improve future operations.
254          */
255         if (*errorp == 0) {
256                 ip->ino_rec = cursor.record->inode;
257                 ip->ino_data = cursor.data->inode;
258                 hammer_cache_node(cursor.node, &ip->cache[0]);
259                 if (cache)
260                         hammer_cache_node(cursor.node, cache);
261         }
262
263         /*
264          * On success load the inode's record and data and insert the
265          * inode into the B-Tree.  It is possible to race another lookup
266          * insertion of the same inode so deal with that condition too.
267          *
268          * The cursor's locked node interlocks against others creating and
269          * destroying ip while we were blocked.
270          */
271         if (*errorp == 0) {
272                 hammer_ref(&ip->lock);
273                 if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
274                         hammer_uncache_node(&ip->cache[0]);
275                         hammer_uncache_node(&ip->cache[1]);
276                         KKASSERT(ip->lock.refs == 1);
277                         --hammer_count_inodes;
278                         kfree(ip, M_HAMMER);
279                         hammer_done_cursor(&cursor);
280                         goto loop;
281                 }
282                 ip->flags |= HAMMER_INODE_ONDISK;
283         } else {
284                 kprintf("hammer_get_inode: failed ip %p cursor %p error %d\n",
285                         ip, &cursor, *errorp);
286                 /*Debugger("x");*/
287                 --hammer_count_inodes;
288                 kfree(ip, M_HAMMER);
289                 ip = NULL;
290         }
291         hammer_done_cursor(&cursor);
292         return (ip);
293 }
294
295 /*
296  * Create a new filesystem object, returning the inode in *ipp.  The
297  * returned inode will be referenced.
298  *
299  * The inode is created in-memory.
300  */
301 int
302 hammer_create_inode(hammer_transaction_t trans, struct vattr *vap,
303                     struct ucred *cred, hammer_inode_t dip,
304                     struct hammer_inode **ipp)
305 {
306         hammer_mount_t hmp;
307         hammer_inode_t ip;
308         uid_t xuid;
309
310         hmp = trans->hmp;
311         ip = kmalloc(sizeof(*ip), M_HAMMER, M_WAITOK|M_ZERO);
312         ++hammer_count_inodes;
313         ip->obj_id = hammer_alloc_objid(trans, dip);
314         KKASSERT(ip->obj_id != 0);
315         ip->obj_asof = hmp->asof;
316         ip->hmp = hmp;
317         ip->flush_state = HAMMER_FST_IDLE;
318         ip->flags = HAMMER_INODE_DDIRTY | HAMMER_INODE_RDIRTY |
319                     HAMMER_INODE_ITIMES;
320
321         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
322         RB_INIT(&ip->rec_tree);
323         TAILQ_INIT(&ip->bio_list);
324         TAILQ_INIT(&ip->bio_alt_list);
325         TAILQ_INIT(&ip->target_list);
326
327         ip->ino_rec.ino_atime = trans->time;
328         ip->ino_rec.ino_mtime = trans->time;
329         ip->ino_rec.ino_size = 0;
330         ip->ino_rec.ino_nlinks = 0;
331         /* XXX */
332         ip->ino_rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
333         ip->ino_rec.base.base.obj_id = ip->obj_id;
334         ip->ino_rec.base.base.key = 0;
335         ip->ino_rec.base.base.create_tid = 0;
336         ip->ino_rec.base.base.delete_tid = 0;
337         ip->ino_rec.base.base.rec_type = HAMMER_RECTYPE_INODE;
338         ip->ino_rec.base.base.obj_type = hammer_get_obj_type(vap->va_type);
339
340         ip->ino_data.version = HAMMER_INODE_DATA_VERSION;
341         ip->ino_data.mode = vap->va_mode;
342         ip->ino_data.ctime = trans->time;
343         ip->ino_data.parent_obj_id = (dip) ? dip->ino_rec.base.base.obj_id : 0;
344
345         switch(ip->ino_rec.base.base.obj_type) {
346         case HAMMER_OBJTYPE_CDEV:
347         case HAMMER_OBJTYPE_BDEV:
348                 ip->ino_data.rmajor = vap->va_rmajor;
349                 ip->ino_data.rminor = vap->va_rminor;
350                 break;
351         default:
352                 break;
353         }
354
355         /*
356          * Calculate default uid/gid and overwrite with information from
357          * the vap.
358          */
359         xuid = hammer_to_unix_xid(&dip->ino_data.uid);
360         ip->ino_data.gid = dip->ino_data.gid;
361         xuid = vop_helper_create_uid(hmp->mp, dip->ino_data.mode, xuid, cred,
362                                      &vap->va_mode);
363         ip->ino_data.mode = vap->va_mode;
364
365         if (vap->va_vaflags & VA_UID_UUID_VALID)
366                 ip->ino_data.uid = vap->va_uid_uuid;
367         else if (vap->va_uid != (uid_t)VNOVAL)
368                 hammer_guid_to_uuid(&ip->ino_data.uid, xuid);
369         if (vap->va_vaflags & VA_GID_UUID_VALID)
370                 ip->ino_data.gid = vap->va_gid_uuid;
371         else if (vap->va_gid != (gid_t)VNOVAL)
372                 hammer_guid_to_uuid(&ip->ino_data.gid, vap->va_gid);
373
374         hammer_ref(&ip->lock);
375         if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
376                 hammer_unref(&ip->lock);
377                 panic("hammer_create_inode: duplicate obj_id %llx", ip->obj_id);
378         }
379         *ipp = ip;
380         return(0);
381 }
382
383 /*
384  * Called by hammer_sync_inode().
385  */
386 static int
387 hammer_update_inode(hammer_cursor_t cursor, hammer_inode_t ip)
388 {
389         hammer_transaction_t trans = cursor->trans;
390         hammer_record_t record;
391         int error;
392
393 retry:
394         error = 0;
395
396         /*
397          * If the inode has a presence on-disk then locate it and mark
398          * it deleted, setting DELONDISK.
399          *
400          * The record may or may not be physically deleted, depending on
401          * the retention policy.
402          */
403         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
404             HAMMER_INODE_ONDISK) {
405                 hammer_normalize_cursor(cursor);
406                 cursor->key_beg.obj_id = ip->obj_id;
407                 cursor->key_beg.key = 0;
408                 cursor->key_beg.create_tid = 0;
409                 cursor->key_beg.delete_tid = 0;
410                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
411                 cursor->key_beg.obj_type = 0;
412                 cursor->asof = ip->obj_asof;
413                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
414                 cursor->flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
415                 cursor->flags |= HAMMER_CURSOR_BACKEND;
416
417                 error = hammer_btree_lookup(cursor);
418                 if (hammer_debug_inode)
419                         kprintf("IPDEL %p %08x %d", ip, ip->flags, error);
420                 if (error) {
421                         kprintf("error %d\n", error);
422                         Debugger("hammer_update_inode");
423                 }
424
425                 if (error == 0) {
426                         error = hammer_ip_delete_record(cursor, trans->tid);
427                         if (hammer_debug_inode)
428                                 kprintf(" error %d\n", error);
429                         if (error && error != EDEADLK) {
430                                 kprintf("error %d\n", error);
431                                 Debugger("hammer_update_inode2");
432                         }
433                         if (error == 0) {
434                                 ip->flags |= HAMMER_INODE_DELONDISK;
435                         }
436                         if (cursor->node)
437                                 hammer_cache_node(cursor->node, &ip->cache[0]);
438                 }
439                 if (error == EDEADLK) {
440                         hammer_done_cursor(cursor);
441                         error = hammer_init_cursor(trans, cursor,
442                                                    &ip->cache[0], ip);
443                         if (hammer_debug_inode)
444                                 kprintf("IPDED %p %d\n", ip, error);
445                         if (error == 0)
446                                 goto retry;
447                 }
448         }
449
450         /*
451          * Ok, write out the initial record or a new record (after deleting
452          * the old one), unless the DELETED flag is set.  This routine will
453          * clear DELONDISK if it writes out a record.
454          *
455          * Update our inode statistics if this is the first application of
456          * the inode on-disk.
457          */
458         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED) == 0) {
459                 /*
460                  * Generate a record and write it to the media
461                  */
462                 record = hammer_alloc_mem_record(ip);
463                 record->type = HAMMER_MEM_RECORD_GENERAL;
464                 record->flush_state = HAMMER_FST_FLUSH;
465                 record->rec.inode = ip->sync_ino_rec;
466                 record->rec.inode.base.base.create_tid = trans->tid;
467                 record->rec.inode.base.data_len = sizeof(ip->sync_ino_data);
468                 record->data = (void *)&ip->sync_ino_data;
469                 record->flags |= HAMMER_RECF_INTERLOCK_BE;
470                 for (;;) {
471                         error = hammer_ip_sync_record_cursor(cursor, record);
472                         if (hammer_debug_inode)
473                                 kprintf("GENREC %p rec %08x %d\n",      
474                                         ip, record->flags, error);
475                         if (error != EDEADLK)
476                                 break;
477                         hammer_done_cursor(cursor);
478                         error = hammer_init_cursor(trans, cursor,
479                                                    &ip->cache[0], ip);
480                         if (hammer_debug_inode)
481                                 kprintf("GENREC reinit %d\n", error);
482                         if (error)
483                                 break;
484                 }
485                 if (error) {
486                         kprintf("error %d\n", error);
487                         Debugger("hammer_update_inode3");
488                 }
489
490                 /*
491                  * The record isn't managed by the inode's record tree,
492                  * destroy it whether we succeed or fail.
493                  */
494                 record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
495                 record->flags |= HAMMER_RECF_DELETED_FE;
496                 record->flush_state = HAMMER_FST_IDLE;
497                 hammer_rel_mem_record(record);
498
499                 /*
500                  * Finish up.
501                  */
502                 if (error == 0) {
503                         if (hammer_debug_inode)
504                                 kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
505                         ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
506                                             HAMMER_INODE_DDIRTY |
507                                             HAMMER_INODE_ITIMES);
508                         ip->flags &= ~HAMMER_INODE_DELONDISK;
509
510                         /*
511                          * Root volume count of inodes
512                          */
513                         if ((ip->flags & HAMMER_INODE_ONDISK) == 0) {
514                                 hammer_modify_volume_field(trans,
515                                                            trans->rootvol,
516                                                            vol0_stat_inodes);
517                                 ++ip->hmp->rootvol->ondisk->vol0_stat_inodes;
518                                 hammer_modify_volume_done(trans->rootvol);
519                                 ip->flags |= HAMMER_INODE_ONDISK;
520                                 if (hammer_debug_inode)
521                                         kprintf("NOWONDISK %p\n", ip);
522                         }
523                 }
524         }
525
526         /*
527          * If the inode has been destroyed, clean out any left-over flags
528          * that may have been set by the frontend.
529          */
530         if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) { 
531                 ip->sync_flags &= ~(HAMMER_INODE_RDIRTY |
532                                     HAMMER_INODE_DDIRTY |
533                                     HAMMER_INODE_ITIMES);
534         }
535         return(error);
536 }
537
538 /*
539  * Update only the itimes fields.  This is done no-historically.  The
540  * record is updated in-place on the disk.
541  */
542 static int
543 hammer_update_itimes(hammer_cursor_t cursor, hammer_inode_t ip)
544 {
545         hammer_transaction_t trans = cursor->trans;
546         struct hammer_inode_record *rec;
547         int error;
548
549 retry:
550         error = 0;
551         if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DELONDISK)) ==
552             HAMMER_INODE_ONDISK) {
553                 hammer_normalize_cursor(cursor);
554                 cursor->key_beg.obj_id = ip->obj_id;
555                 cursor->key_beg.key = 0;
556                 cursor->key_beg.create_tid = 0;
557                 cursor->key_beg.delete_tid = 0;
558                 cursor->key_beg.rec_type = HAMMER_RECTYPE_INODE;
559                 cursor->key_beg.obj_type = 0;
560                 cursor->asof = ip->obj_asof;
561                 cursor->flags &= ~HAMMER_CURSOR_INITMASK;
562                 cursor->flags |= HAMMER_CURSOR_GET_RECORD | HAMMER_CURSOR_ASOF;
563                 cursor->flags |= HAMMER_CURSOR_BACKEND;
564
565                 error = hammer_btree_lookup(cursor);
566                 if (error) {
567                         kprintf("error %d\n", error);
568                         Debugger("hammer_update_itimes1");
569                 }
570                 if (error == 0) {
571                         /*
572                          * Do not generate UNDO records for atime/mtime
573                          * updates.
574                          */
575                         rec = &cursor->record->inode;
576                         hammer_modify_buffer(trans, cursor->record_buffer,
577                                              NULL, 0);
578                         rec->ino_atime = ip->sync_ino_rec.ino_atime;
579                         rec->ino_mtime = ip->sync_ino_rec.ino_mtime;
580                         hammer_modify_buffer_done(cursor->record_buffer);
581                         ip->sync_flags &= ~HAMMER_INODE_ITIMES;
582                         /* XXX recalculate crc */
583                         hammer_cache_node(cursor->node, &ip->cache[0]);
584                 }
585                 if (error == EDEADLK) {
586                         hammer_done_cursor(cursor);
587                         error = hammer_init_cursor(trans, cursor,
588                                                    &ip->cache[0], ip);
589                         if (error == 0)
590                                 goto retry;
591                 }
592         }
593         return(error);
594 }
595
596 /*
597  * Release a reference on an inode, flush as requested.
598  *
599  * On the last reference we queue the inode to the flusher for its final
600  * disposition.
601  */
602 void
603 hammer_rel_inode(struct hammer_inode *ip, int flush)
604 {
605         hammer_mount_t hmp = ip->hmp;
606
607         /*
608          * Handle disposition when dropping the last ref.
609          */
610         for (;;) {
611                 if (ip->lock.refs == 1) {
612                         /*
613                          * Determine whether on-disk action is needed for
614                          * the inode's final disposition.
615                          */
616                         KKASSERT(ip->vp == NULL);
617                         hammer_inode_unloadable_check(ip, 0);
618                         if (ip->flags & HAMMER_INODE_MODMASK) {
619                                 hammer_flush_inode(ip, 0);
620                         } else if (ip->lock.refs == 1) {
621                                 hammer_unload_inode(ip);
622                                 break;
623                         }
624                 } else {
625                         if (flush)
626                                 hammer_flush_inode(ip, 0);
627
628                         /*
629                          * The inode still has multiple refs, try to drop
630                          * one ref.
631                          */
632                         KKASSERT(ip->lock.refs >= 1);
633                         if (ip->lock.refs > 1) {
634                                 hammer_unref(&ip->lock);
635                                 break;
636                         }
637                 }
638         }
639
640         /*
641          * XXX bad hack until I add code to track inodes in SETUP.  We
642          * can queue a lot of inodes to the syncer but if we don't wake
643          * it up the undo sets will be too large or too many unflushed
644          * records will build up and blow our malloc limit.
645          */
646         if (++hmp->reclaim_count > 256) {
647                 hmp->reclaim_count = 0;
648                 hammer_flusher_async(hmp);
649         }
650 }
651
652 /*
653  * Unload and destroy the specified inode.  Must be called with one remaining
654  * reference.  The reference is disposed of.
655  *
656  * This can only be called in the context of the flusher.
657  */
658 static int
659 hammer_unload_inode(struct hammer_inode *ip)
660 {
661         KASSERT(ip->lock.refs == 1,
662                 ("hammer_unload_inode: %d refs\n", ip->lock.refs));
663         KKASSERT(ip->vp == NULL);
664         KKASSERT(ip->flush_state == HAMMER_FST_IDLE);
665         KKASSERT(ip->cursor_ip_refs == 0);
666         KKASSERT((ip->flags & HAMMER_INODE_MODMASK) == 0);
667
668         KKASSERT(RB_EMPTY(&ip->rec_tree));
669         KKASSERT(TAILQ_EMPTY(&ip->target_list));
670         KKASSERT(TAILQ_EMPTY(&ip->bio_list));
671         KKASSERT(TAILQ_EMPTY(&ip->bio_alt_list));
672
673         RB_REMOVE(hammer_ino_rb_tree, &ip->hmp->rb_inos_root, ip);
674
675         hammer_uncache_node(&ip->cache[0]);
676         hammer_uncache_node(&ip->cache[1]);
677         if (ip->objid_cache)
678                 hammer_clear_objid(ip);
679         --hammer_count_inodes;
680         kfree(ip, M_HAMMER);
681
682         return(0);
683 }
684
685 /*
686  * A transaction has modified an inode, requiring updates as specified by
687  * the passed flags.
688  *
689  * HAMMER_INODE_RDIRTY: Inode record has been updated
690  * HAMMER_INODE_DDIRTY: Inode data has been updated
691  * HAMMER_INODE_XDIRTY: Dirty in-memory records
692  * HAMMER_INODE_BUFS:   Dirty buffer cache buffers
693  * HAMMER_INODE_DELETED: Inode record/data must be deleted
694  * HAMMER_INODE_ITIMES: mtime/atime has been updated
695  */
696 void
697 hammer_modify_inode(hammer_transaction_t trans, hammer_inode_t ip, int flags)
698 {
699         KKASSERT ((ip->flags & HAMMER_INODE_RO) == 0 ||
700                   (flags & (HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
701                    HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS|
702                    HAMMER_INODE_DELETED|HAMMER_INODE_ITIMES)) == 0);
703
704         ip->flags |= flags;
705 }
706
707 /*
708  * Request that an inode be flushed.  This whole mess cannot block and may
709  * recurse.  Once requested HAMMER will attempt to actively flush it until
710  * the flush can be done.
711  *
712  * The inode may already be flushing, or may be in a setup state.  We can
713  * place the inode in a flushing state if it is currently idle and flag it
714  * to reflush if it is currently flushing.
715  */
716 void
717 hammer_flush_inode(hammer_inode_t ip, int flags)
718 {
719         hammer_record_t depend;
720         int r, good;
721
722         /*
723          * Trivial 'nothing to flush' case.  If the inode is ina SETUP
724          * state we have to put it back into an IDLE state so we can
725          * drop the extra ref.
726          */
727         if ((ip->flags & HAMMER_INODE_MODMASK) == 0) {
728                 if (ip->flush_state == HAMMER_FST_SETUP) {
729                         ip->flush_state = HAMMER_FST_IDLE;
730                         hammer_rel_inode(ip, 0);
731                 }
732                 return;
733         }
734
735         /*
736          * Our flush action will depend on the current state.
737          */
738         switch(ip->flush_state) {
739         case HAMMER_FST_IDLE:
740                 /*
741                  * We have no dependancies and can flush immediately.  Some
742                  * our children may not be flushable so we have to re-test
743                  * with that additional knowledge.
744                  */
745                 hammer_flush_inode_core(ip, flags);
746                 break;
747         case HAMMER_FST_SETUP:
748                 /*
749                  * Recurse upwards through dependancies via target_list
750                  * and start their flusher actions going if possible.
751                  *
752                  * 'good' is our connectivity.  -1 means we have none and
753                  * can't flush, 0 means there weren't any dependancies, and
754                  * 1 means we have good connectivity.
755                  */
756                 good = 0;
757                 TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
758                         r = hammer_setup_parent_inodes(depend);
759                         if (r < 0 && good == 0)
760                                 good = -1;
761                         if (r > 0)
762                                 good = 1;
763                 }
764
765                 /*
766                  * We can continue if good >= 0.  Determine how many records
767                  * under our inode can be flushed (and mark them).
768                  */
769                 if (good >= 0) {
770                         hammer_flush_inode_core(ip, flags);
771                 } else {
772                         ip->flags |= HAMMER_INODE_REFLUSH;
773                         if (flags & HAMMER_FLUSH_SIGNAL) {
774                                 ip->flags |= HAMMER_INODE_RESIGNAL;
775                                 hammer_flusher_async(ip->hmp);
776                         }
777                 }
778                 break;
779         default:
780                 /*
781                  * We are already flushing, flag the inode to reflush
782                  * if needed after it completes its current flush.
783                  */
784                 if ((ip->flags & HAMMER_INODE_REFLUSH) == 0)
785                         ip->flags |= HAMMER_INODE_REFLUSH;
786                 if (flags & HAMMER_FLUSH_SIGNAL) {
787                         ip->flags |= HAMMER_INODE_RESIGNAL;
788                         hammer_flusher_async(ip->hmp);
789                 }
790                 break;
791         }
792 }
793
794 /*
795  * We are asked to recurse upwards and convert the record from SETUP
796  * to FLUSH if possible.  record->ip is a parent of the caller's inode,
797  * and record->target_ip is the caller's inode.
798  *
799  * Return 1 if the record gives us connectivity
800  *
801  * Return 0 if the record is not relevant 
802  *
803  * Return -1 if we can't resolve the dependancy and there is no connectivity.
804  */
805 static int
806 hammer_setup_parent_inodes(hammer_record_t record)
807 {
808         hammer_mount_t hmp = record->ip->hmp;
809         hammer_record_t depend;
810         hammer_inode_t ip;
811         int r, good;
812
813         KKASSERT(record->flush_state != HAMMER_FST_IDLE);
814         ip = record->ip;
815
816         /*
817          * If the record is already flushing, is it in our flush group?
818          *
819          * If it is in our flush group but it is a general record or a 
820          * delete-on-disk, it does not improve our connectivity (return 0),
821          * and if the target inode is not trying to destroy itself we can't
822          * allow the operation yet anyway (the second return -1).
823          */
824         if (record->flush_state == HAMMER_FST_FLUSH) {
825                 if (record->flush_group != hmp->flusher_next) {
826                         ip->flags |= HAMMER_INODE_REFLUSH;
827                         return(-1);
828                 }
829                 if (record->type == HAMMER_MEM_RECORD_ADD)
830                         return(1);
831                 /* GENERAL or DEL */
832                 return(0);
833         }
834
835         /*
836          * It must be a setup record.  Try to resolve the setup dependancies
837          * by recursing upwards so we can place ip on the flush list.
838          */
839         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
840
841         good = 0;
842         TAILQ_FOREACH(depend, &ip->target_list, target_entry) {
843                 r = hammer_setup_parent_inodes(depend);
844                 if (r < 0 && good == 0)
845                         good = -1;
846                 if (r > 0)
847                         good = 1;
848         }
849
850         /*
851          * We can't flush ip because it has no connectivity (XXX also check
852          * nlinks for pre-existing connectivity!).  Flag it so any resolution
853          * recurses back down.
854          */
855         if (good < 0) {
856                 ip->flags |= HAMMER_INODE_REFLUSH;
857                 return(good);
858         }
859
860         /*
861          * We are go, place the parent inode in a flushing state so we can
862          * place its record in a flushing state.  Note that the parent
863          * may already be flushing.  The record must be in the same flush
864          * group as the parent.
865          */
866         if (ip->flush_state != HAMMER_FST_FLUSH)
867                 hammer_flush_inode_core(ip, HAMMER_FLUSH_RECURSION);
868         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
869         KKASSERT(record->flush_state == HAMMER_FST_SETUP);
870
871 #if 0
872         if (record->type == HAMMER_MEM_RECORD_DEL &&
873             (record->target_ip->flags & (HAMMER_INODE_DELETED|HAMMER_INODE_DELONDISK)) == 0) {
874                 /*
875                  * Regardless of flushing state we cannot sync this path if the
876                  * record represents a delete-on-disk but the target inode
877                  * is not ready to sync its own deletion.
878                  *
879                  * XXX need to count effective nlinks to determine whether
880                  * the flush is ok, otherwise removing a hardlink will
881                  * just leave the DEL record to rot.
882                  */
883                 record->target_ip->flags |= HAMMER_INODE_REFLUSH;
884                 return(-1);
885         } else
886 #endif
887         if (ip->flush_group == ip->hmp->flusher_next) {
888                 /*
889                  * This is the record we wanted to synchronize.
890                  */
891                 record->flush_state = HAMMER_FST_FLUSH;
892                 record->flush_group = ip->flush_group;
893                 hammer_ref(&record->lock);
894                 if (record->type == HAMMER_MEM_RECORD_ADD)
895                         return(1);
896
897                 /*
898                  * A general or delete-on-disk record does not contribute
899                  * to our visibility.  We can still flush it, however.
900                  */
901                 return(0);
902         } else {
903                 /*
904                  * We couldn't resolve the dependancies, request that the
905                  * inode be flushed when the dependancies can be resolved.
906                  */
907                 ip->flags |= HAMMER_INODE_REFLUSH;
908                 return(-1);
909         }
910 }
911
912 /*
913  * This is the core routine placing an inode into the FST_FLUSH state.
914  */
915 static void
916 hammer_flush_inode_core(hammer_inode_t ip, int flags)
917 {
918         int go_count;
919
920         /*
921          * Set flush state and prevent the flusher from cycling into
922          * the next flush group.  Do not place the ip on the list yet.
923          * Inodes not in the idle state get an extra reference.
924          */
925         KKASSERT(ip->flush_state != HAMMER_FST_FLUSH);
926         if (ip->flush_state == HAMMER_FST_IDLE)
927                 hammer_ref(&ip->lock);
928         ip->flush_state = HAMMER_FST_FLUSH;
929         ip->flush_group = ip->hmp->flusher_next;
930         ++ip->hmp->flusher_lock;
931
932         /*
933          * We need to be able to vfsync/truncate from the backend.
934          */
935         KKASSERT((ip->flags & HAMMER_INODE_VHELD) == 0);
936         if (ip->vp && (ip->vp->v_flag & VINACTIVE) == 0) {
937                 ip->flags |= HAMMER_INODE_VHELD;
938                 vref(ip->vp);
939         }
940
941         /*
942          * Figure out how many in-memory records we can actually flush
943          * (not including inode meta-data, buffers, etc).
944          */
945         if (flags & HAMMER_FLUSH_RECURSION) {
946                 go_count = 1;
947         } else {
948                 go_count = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
949                                    hammer_setup_child_callback, NULL);
950         }
951
952         /*
953          * This is a more involved test that includes go_count.  If we
954          * can't flush, flag the inode and return.  If go_count is 0 we
955          * were are unable to flush any records in our rec_tree and
956          * must ignore the XDIRTY flag.
957          */
958         if (go_count == 0) {
959                 if ((ip->flags & HAMMER_INODE_MODMASK_NOXDIRTY) == 0) {
960                         ip->flags |= HAMMER_INODE_REFLUSH;
961                         ip->flush_state = HAMMER_FST_SETUP;
962                         if (ip->flags & HAMMER_INODE_VHELD) {
963                                 ip->flags &= ~HAMMER_INODE_VHELD;
964                                 vrele(ip->vp);
965                         }
966                         if (flags & HAMMER_FLUSH_SIGNAL) {
967                                 ip->flags |= HAMMER_INODE_RESIGNAL;
968                                 hammer_flusher_async(ip->hmp);
969                         }
970                         if (--ip->hmp->flusher_lock == 0)
971                                 wakeup(&ip->hmp->flusher_lock);
972                         return;
973                 }
974         }
975
976         /*
977          * Snapshot the state of the inode for the backend flusher.
978          *
979          * The truncation must be retained in the frontend until after
980          * we've actually performed the record deletion.
981          *
982          * NOTE: The DELETING flag is a mod flag, but it is also sticky,
983          * and stays in ip->flags.  Once set, it stays set until the
984          * inode is destroyed.
985          */
986         ip->sync_flags = (ip->flags & HAMMER_INODE_MODMASK);
987         ip->sync_trunc_off = ip->trunc_off;
988         ip->sync_ino_rec = ip->ino_rec;
989         ip->sync_ino_data = ip->ino_data;
990         ip->flags &= ~HAMMER_INODE_MODMASK | HAMMER_INODE_TRUNCATED;
991
992         /*
993          * The flusher list inherits our inode and reference.
994          */
995         TAILQ_INSERT_TAIL(&ip->hmp->flush_list, ip, flush_entry);
996         if (--ip->hmp->flusher_lock == 0)
997                 wakeup(&ip->hmp->flusher_lock);
998
999         if (flags & HAMMER_FLUSH_SIGNAL)
1000                 hammer_flusher_async(ip->hmp);
1001 }
1002
1003 /*
1004  * Callback for scan of ip->rec_tree.  Try to include each record in our
1005  * flush.  ip->flush_group has been set but the inode has not yet been
1006  * moved into a flushing state.
1007  *
1008  * If we get stuck on a record we have to set HAMMER_INODE_REFLUSH on
1009  * both inodes.
1010  *
1011  * We return 1 for any record placed or found in FST_FLUSH, which prevents
1012  * the caller from shortcutting the flush.
1013  */
1014 static int
1015 hammer_setup_child_callback(hammer_record_t rec, void *data)
1016 {
1017         hammer_inode_t target_ip;
1018         hammer_inode_t ip;
1019         int r;
1020
1021         /*
1022          * If the record has been deleted by the backend (it's being held
1023          * by the frontend in a race), just ignore it.
1024          */
1025         if (rec->flags & HAMMER_RECF_DELETED_BE)
1026                 return(0);
1027
1028         /*
1029          * If the record is in an idle state it has no dependancies and
1030          * can be flushed.
1031          */
1032         ip = rec->ip;
1033         r = 0;
1034
1035         switch(rec->flush_state) {
1036         case HAMMER_FST_IDLE:
1037                 /*
1038                  * Record has no setup dependancy, we can flush it.
1039                  */
1040                 KKASSERT(rec->target_ip == NULL);
1041                 rec->flush_state = HAMMER_FST_FLUSH;
1042                 rec->flush_group = ip->flush_group;
1043                 hammer_ref(&rec->lock);
1044                 r = 1;
1045                 break;
1046         case HAMMER_FST_SETUP:
1047                 /*
1048                  * Record has a setup dependancy.  Try to include the
1049                  * target ip in the flush. 
1050                  *
1051                  * We have to be careful here, if we do not do the right
1052                  * thing we can lose track of dirty inodes and the system
1053                  * will lockup trying to allocate buffers.
1054                  */
1055                 target_ip = rec->target_ip;
1056                 KKASSERT(target_ip != NULL);
1057                 KKASSERT(target_ip->flush_state != HAMMER_FST_IDLE);
1058                 if (target_ip->flush_state == HAMMER_FST_FLUSH) {
1059                         /*
1060                          * If the target IP is already flushing in our group
1061                          * we are golden, otherwise make sure the target
1062                          * reflushes.
1063                          */
1064                         if (target_ip->flush_group == ip->flush_group) {
1065                                 rec->flush_state = HAMMER_FST_FLUSH;
1066                                 rec->flush_group = ip->flush_group;
1067                                 hammer_ref(&rec->lock);
1068                                 r = 1;
1069                         } else {
1070                                 target_ip->flags |= HAMMER_INODE_REFLUSH;
1071                         }
1072                 } else if (rec->type == HAMMER_MEM_RECORD_ADD) {
1073                         /*
1074                          * If the target IP is not flushing we can force
1075                          * it to flush, even if it is unable to write out
1076                          * any of its own records we have at least one in
1077                          * hand that we CAN deal with.
1078                          */
1079                         rec->flush_state = HAMMER_FST_FLUSH;
1080                         rec->flush_group = ip->flush_group;
1081                         hammer_ref(&rec->lock);
1082                         hammer_flush_inode_core(target_ip,
1083                                                 HAMMER_FLUSH_RECURSION);
1084                         r = 1;
1085                 } else {
1086                         /*
1087                          * General or delete-on-disk record.
1088                          *
1089                          * XXX this needs help.  If a delete-on-disk we could
1090                          * disconnect the target.  If the target has its own
1091                          * dependancies they really need to be flushed.
1092                          *
1093                          * XXX
1094                          */
1095                         rec->flush_state = HAMMER_FST_FLUSH;
1096                         rec->flush_group = ip->flush_group;
1097                         hammer_ref(&rec->lock);
1098                         hammer_flush_inode_core(target_ip,
1099                                                 HAMMER_FLUSH_RECURSION);
1100                         r = 1;
1101                 }
1102                 break;
1103         case HAMMER_FST_FLUSH:
1104                 /* 
1105                  * Record already associated with a flush group.  It had
1106                  * better be ours.
1107                  */
1108                 KKASSERT(rec->flush_group == ip->flush_group);
1109                 r = 1;
1110                 break;
1111         }
1112         return(r);
1113 }
1114
1115 /*
1116  * Wait for a previously queued flush to complete
1117  */
1118 void
1119 hammer_wait_inode(hammer_inode_t ip)
1120 {
1121         while (ip->flush_state != HAMMER_FST_IDLE) {
1122                 ip->flags |= HAMMER_INODE_FLUSHW;
1123                 tsleep(&ip->flags, 0, "hmrwin", 0);
1124         }
1125 }
1126
1127 /*
1128  * Called by the backend code when a flush has been completed.
1129  * The inode has already been removed from the flush list.
1130  *
1131  * A pipelined flush can occur, in which case we must re-enter the
1132  * inode on the list and re-copy its fields.
1133  */
1134 void
1135 hammer_flush_inode_done(hammer_inode_t ip)
1136 {
1137         struct bio *bio;
1138         int dorel = 0;
1139
1140         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1141
1142         /*
1143          * Allow BIOs to queue to the inode's primary bioq again.
1144          */
1145         ip->flags &= ~HAMMER_INODE_WRITE_ALT;
1146
1147         /*
1148          * Merge left-over flags back into the frontend and fix the state.
1149          */
1150         ip->flags |= ip->sync_flags;
1151
1152         /*
1153          * The backend may have adjusted nlinks, so if the adjusted nlinks
1154          * does not match the fronttend set the frontend's RDIRTY flag again.
1155          */
1156         if (ip->ino_rec.ino_nlinks != ip->sync_ino_rec.ino_nlinks)
1157                 ip->flags |= HAMMER_INODE_RDIRTY;
1158
1159         /*
1160          * Reflush any BIOs that wound up in the alt list.  Our inode will
1161          * also wind up at the end of the flusher's list.
1162          */
1163         while ((bio = TAILQ_FIRST(&ip->bio_alt_list)) != NULL) {
1164                 TAILQ_REMOVE(&ip->bio_alt_list, bio, bio_act);
1165                 TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
1166         }
1167         /*
1168          * Fix up the dirty buffer status.
1169          */
1170         if (TAILQ_FIRST(&ip->bio_list) ||
1171             (ip->vp && RB_ROOT(&ip->vp->v_rbdirty_tree))) {
1172                 ip->flags |= HAMMER_INODE_BUFS;
1173         }
1174
1175         /*
1176          * Re-set the XDIRTY flag if some of the inode's in-memory records
1177          * could not be flushed.
1178          */
1179         if (RB_ROOT(&ip->rec_tree))
1180                 ip->flags |= HAMMER_INODE_XDIRTY;
1181
1182         /*
1183          * Do not lose track of inodes which no longer have vnode
1184          * assocations, otherwise they may never get flushed again.
1185          */
1186         if ((ip->flags & HAMMER_INODE_MODMASK) && ip->vp == NULL)
1187                 ip->flags |= HAMMER_INODE_REFLUSH;
1188
1189         /*
1190          * Adjust flush_state.  The target state (idle or setup) shouldn't
1191          * be terribly important since we will reflush if we really need
1192          * to do anything. XXX
1193          */
1194         if (TAILQ_EMPTY(&ip->target_list) && RB_EMPTY(&ip->rec_tree)) {
1195                 ip->flush_state = HAMMER_FST_IDLE;
1196                 dorel = 1;
1197         } else {
1198                 ip->flush_state = HAMMER_FST_SETUP;
1199         }
1200
1201         /*
1202          * Clean up the vnode ref
1203          */
1204         if (ip->flags & HAMMER_INODE_VHELD) {
1205                 ip->flags &= ~HAMMER_INODE_VHELD;
1206                 vrele(ip->vp);
1207         }
1208
1209         /*
1210          * If the frontend made more changes and requested another flush,
1211          * then try to get it running.
1212          */
1213         if (ip->flags & HAMMER_INODE_REFLUSH) {
1214                 ip->flags &= ~HAMMER_INODE_REFLUSH;
1215                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1216                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
1217                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1218                 } else {
1219                         hammer_flush_inode(ip, 0);
1220                 }
1221         }
1222
1223         /*
1224          * Finally, if the frontend is waiting for a flush to complete,
1225          * wake it up.
1226          */
1227         if (ip->flush_state != HAMMER_FST_FLUSH) {
1228                 if (ip->flags & HAMMER_INODE_FLUSHW) {
1229                         ip->flags &= ~HAMMER_INODE_FLUSHW;
1230                         wakeup(&ip->flags);
1231                 }
1232         }
1233         if (dorel)
1234                 hammer_rel_inode(ip, 0);
1235 }
1236
1237 /*
1238  * Called from hammer_sync_inode() to synchronize in-memory records
1239  * to the media.
1240  */
1241 static int
1242 hammer_sync_record_callback(hammer_record_t record, void *data)
1243 {
1244         hammer_cursor_t cursor = data;
1245         hammer_transaction_t trans = cursor->trans;
1246         int error;
1247
1248         /*
1249          * Skip records that do not belong to the current flush.
1250          */
1251         if (record->flush_state != HAMMER_FST_FLUSH)
1252                 return(0);
1253         KKASSERT((record->flags & HAMMER_RECF_DELETED_BE) == 0);
1254 #if 1
1255         if (record->flush_group != record->ip->flush_group) {
1256                 kprintf("sync_record %p ip %p bad flush group %d %d\n", record, record->ip, record->flush_group ,record->ip->flush_group);
1257                 Debugger("blah2");
1258                 return(0);
1259         }
1260 #endif
1261         KKASSERT(record->flush_group == record->ip->flush_group);
1262
1263         /*
1264          * Interlock the record using the BE flag.  Once BE is set the
1265          * frontend cannot change the state of FE.
1266          *
1267          * NOTE: If FE is set prior to us setting BE we still sync the
1268          * record out, but the flush completion code converts it to 
1269          * a delete-on-disk record instead of destroying it.
1270          */
1271         KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0);
1272         record->flags |= HAMMER_RECF_INTERLOCK_BE;
1273
1274         /*
1275          * If DELETED_FE is set we may have already sent dependant pieces
1276          * to the disk and we must flush the record as if it hadn't been
1277          * deleted.  This creates a bit of a mess because we have to
1278          * have ip_sync_record convert the record to MEM_RECORD_DEL before
1279          * it inserts the B-Tree record.  Otherwise the media sync might
1280          * be visible to the frontend.
1281          */
1282         if (record->flags & HAMMER_RECF_DELETED_FE) {
1283                 if (record->type == HAMMER_MEM_RECORD_ADD) {
1284                         record->flags |= HAMMER_RECF_CONVERT_DELETE;
1285                 } else {
1286                         KKASSERT(record->type != HAMMER_MEM_RECORD_DEL);
1287                         return(0);
1288                 }
1289         }
1290
1291         /*
1292          * Assign the create_tid for new records.  Deletions already
1293          * have the record's entire key properly set up.
1294          */
1295         if (record->type != HAMMER_MEM_RECORD_DEL)
1296                 record->rec.inode.base.base.create_tid = trans->tid;
1297         for (;;) {
1298                 error = hammer_ip_sync_record_cursor(cursor, record);
1299                 if (error != EDEADLK)
1300                         break;
1301                 hammer_done_cursor(cursor);
1302                 error = hammer_init_cursor(trans, cursor, &record->ip->cache[0],
1303                                            record->ip);
1304                 if (error)
1305                         break;
1306         }
1307         record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
1308
1309         if (error) {
1310                 error = -error;
1311                 if (error != -ENOSPC) {
1312                         kprintf("hammer_sync_record_callback: sync failed rec "
1313                                 "%p, error %d\n", record, error);
1314                         Debugger("sync failed rec");
1315                 }
1316         }
1317         hammer_flush_record_done(record, error);
1318         return(error);
1319 }
1320
1321 /*
1322  * XXX error handling
1323  */
1324 int
1325 hammer_sync_inode(hammer_inode_t ip)
1326 {
1327         struct hammer_transaction trans;
1328         struct hammer_cursor cursor;
1329         struct bio *bio;
1330         hammer_record_t depend;
1331         hammer_record_t next;
1332         int error, tmp_error;
1333         u_int64_t nlinks;
1334
1335         if ((ip->sync_flags & HAMMER_INODE_MODMASK) == 0)
1336                 return(0);
1337
1338         hammer_start_transaction_fls(&trans, ip->hmp);
1339         error = hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1340         if (error)
1341                 goto done;
1342
1343         /*
1344          * Any directory records referencing this inode which are not in
1345          * our current flush group must adjust our nlink count for the
1346          * purposes of synchronization to disk.
1347          *
1348          * Records which are in our flush group can be unlinked from our
1349          * inode now, allowing the inode to be physically deleted.
1350          */
1351         nlinks = ip->ino_rec.ino_nlinks;
1352         next = TAILQ_FIRST(&ip->target_list);
1353         while ((depend = next) != NULL) {
1354                 next = TAILQ_NEXT(depend, target_entry);
1355                 if (depend->flush_state == HAMMER_FST_FLUSH &&
1356                     depend->flush_group == ip->hmp->flusher_act) {
1357                         TAILQ_REMOVE(&ip->target_list, depend, target_entry);
1358                         depend->target_ip = NULL;
1359                         /* no need to signal target_ip, it is us */
1360                 } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) {
1361                         switch(depend->type) {
1362                         case HAMMER_MEM_RECORD_ADD:
1363                                 --nlinks;
1364                                 break;
1365                         case HAMMER_MEM_RECORD_DEL:
1366                                 ++nlinks;
1367                                 break;
1368                         default:
1369                                 break;
1370                         }
1371                 }
1372         }
1373
1374         /*
1375          * Set dirty if we had to modify the link count.
1376          */
1377         if (ip->sync_ino_rec.ino_nlinks != nlinks) {
1378                 KKASSERT((int64_t)nlinks >= 0);
1379                 ip->sync_ino_rec.ino_nlinks = nlinks;
1380                 ip->sync_flags |= HAMMER_INODE_RDIRTY;
1381         }
1382
1383         /*
1384          * Queue up as many dirty buffers as we can then set a flag to
1385          * cause any further BIOs to go to the alternative queue.
1386          */
1387         if (ip->flags & HAMMER_INODE_VHELD)
1388                 error = vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
1389         ip->flags |= HAMMER_INODE_WRITE_ALT;
1390
1391         /*
1392          * The buffer cache may contain dirty buffers beyond the inode
1393          * state we copied from the frontend to the backend.  Because
1394          * we are syncing our buffer cache on the backend, resync
1395          * the truncation point and the file size so we don't wipe out
1396          * any data.
1397          *
1398          * Syncing the buffer cache on the frontend has serious problems
1399          * because it prevents us from passively queueing dirty inodes
1400          * to the backend (the BIO's could stall indefinitely).
1401          */
1402         if (ip->flags & HAMMER_INODE_TRUNCATED) {
1403                 ip->sync_trunc_off = ip->trunc_off;
1404                 ip->sync_flags |= HAMMER_INODE_TRUNCATED;
1405         }
1406         if (ip->sync_ino_rec.ino_size != ip->ino_rec.ino_size) {
1407                 ip->sync_ino_rec.ino_size = ip->ino_rec.ino_size;
1408                 ip->sync_flags |= HAMMER_INODE_RDIRTY;
1409         }
1410
1411         /*
1412          * If there is a truncation queued destroy any data past the (aligned)
1413          * truncation point.  Userland will have dealt with the buffer
1414          * containing the truncation point for us.
1415          *
1416          * We don't flush pending frontend data buffers until after we've
1417          * dealt with the truncation.
1418          *
1419          * Don't bother if the inode is or has been deleted.
1420          */
1421         if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1422                 /*
1423                  * Interlock trunc_off.  The VOP front-end may continue to
1424                  * make adjustments to it while we are blocked.
1425                  */
1426                 off_t trunc_off;
1427                 off_t aligned_trunc_off;
1428
1429                 trunc_off = ip->sync_trunc_off;
1430                 aligned_trunc_off = (trunc_off + HAMMER_BUFMASK) &
1431                                     ~HAMMER_BUFMASK64;
1432
1433                 /*
1434                  * Delete any whole blocks on-media.  The front-end has
1435                  * already cleaned out any partial block and made it
1436                  * pending.  The front-end may have updated trunc_off
1437                  * while we were blocked so do not just unconditionally
1438                  * set it to the maximum offset.
1439                  */
1440                 error = hammer_ip_delete_range(&cursor, ip,
1441                                                 aligned_trunc_off,
1442                                                 0x7FFFFFFFFFFFFFFFLL);
1443                 if (error)
1444                         Debugger("hammer_ip_delete_range errored");
1445                 ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1446                 if (ip->trunc_off >= trunc_off) {
1447                         ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
1448                         ip->flags &= ~HAMMER_INODE_TRUNCATED;
1449                 }
1450         } else {
1451                 error = 0;
1452         }
1453
1454         /*
1455          * Now sync related records.  These will typically be directory
1456          * entries or delete-on-disk records.
1457          *
1458          * Not all records will be flushed, but clear XDIRTY anyway.  We
1459          * will set it again in the frontend hammer_flush_inode_done() 
1460          * if records remain.
1461          */
1462         if (error == 0) {
1463                 tmp_error = RB_SCAN(hammer_rec_rb_tree, &ip->rec_tree, NULL,
1464                                     hammer_sync_record_callback, &cursor);
1465                 if (tmp_error < 0)
1466                         tmp_error = -error;
1467                 if (tmp_error)
1468                         error = tmp_error;
1469                 if (error == 0)
1470                         ip->sync_flags &= ~HAMMER_INODE_XDIRTY;
1471         }
1472
1473         /*
1474          * If we are deleting the inode the frontend had better not have
1475          * any active references on elements making up the inode.
1476          */
1477         if (error == 0 && ip->sync_ino_rec.ino_nlinks == 0 &&
1478                 RB_EMPTY(&ip->rec_tree)  &&
1479             (ip->sync_flags & HAMMER_INODE_DELETING) &&
1480             (ip->flags & HAMMER_INODE_DELETED) == 0) {
1481                 int count1 = 0;
1482
1483                 kprintf("Y");
1484                 ip->flags |= HAMMER_INODE_DELETED;
1485                 error = hammer_ip_delete_range_all(&cursor, ip, &count1);
1486                 if (error == 0) {
1487                         ip->sync_flags &= ~HAMMER_INODE_DELETING;
1488                         ip->sync_flags &= ~HAMMER_INODE_TRUNCATED;
1489                         KKASSERT(RB_EMPTY(&ip->rec_tree));
1490
1491                         /*
1492                          * Set delete_tid in both the frontend and backend
1493                          * copy of the inode record.  The DELETED flag handles
1494                          * this, do not set RDIRTY.
1495                          */
1496                         ip->ino_rec.base.base.delete_tid = trans.tid;
1497                         ip->sync_ino_rec.base.base.delete_tid = trans.tid;
1498
1499                         /*
1500                          * Adjust the inode count in the volume header
1501                          */
1502                         hammer_modify_volume_field(&trans, trans.rootvol,
1503                                                    vol0_stat_inodes);
1504                         --ip->hmp->rootvol->ondisk->vol0_stat_inodes;
1505                         hammer_modify_volume_done(trans.rootvol);
1506                 } else {
1507                         ip->flags &= ~HAMMER_INODE_DELETED;
1508                         Debugger("hammer_ip_delete_range_all errored");
1509                 }
1510         }
1511
1512         /*
1513          * Flush any queued BIOs.  These will just biodone() the IO's if
1514          * the inode has been deleted.
1515          */
1516         while ((bio = TAILQ_FIRST(&ip->bio_list)) != NULL) {
1517                 TAILQ_REMOVE(&ip->bio_list, bio, bio_act);
1518                 tmp_error = hammer_dowrite(&cursor, ip, bio);
1519                 if (tmp_error)
1520                         error = tmp_error;
1521         }
1522         ip->sync_flags &= ~HAMMER_INODE_BUFS;
1523
1524         if (error)
1525                 Debugger("RB_SCAN errored");
1526
1527         /*
1528          * Now update the inode's on-disk inode-data and/or on-disk record.
1529          * DELETED and ONDISK are managed only in ip->flags.
1530          */
1531         switch(ip->flags & (HAMMER_INODE_DELETED | HAMMER_INODE_ONDISK)) {
1532         case HAMMER_INODE_DELETED|HAMMER_INODE_ONDISK:
1533                 /*
1534                  * If deleted and on-disk, don't set any additional flags.
1535                  * the delete flag takes care of things.
1536                  *
1537                  * Clear flags which may have been set by the frontend.
1538                  */
1539                 ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
1540                                     HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
1541                                     HAMMER_INODE_DELETING);
1542                 break;
1543         case HAMMER_INODE_DELETED:
1544                 /*
1545                  * Take care of the case where a deleted inode was never
1546                  * flushed to the disk in the first place.
1547                  *
1548                  * Clear flags which may have been set by the frontend.
1549                  */
1550                 ip->sync_flags &= ~(HAMMER_INODE_RDIRTY|HAMMER_INODE_DDIRTY|
1551                                     HAMMER_INODE_XDIRTY|HAMMER_INODE_ITIMES|
1552                                     HAMMER_INODE_DELETING);
1553                 while (RB_ROOT(&ip->rec_tree)) {
1554                         hammer_record_t record = RB_ROOT(&ip->rec_tree);
1555                         hammer_ref(&record->lock);
1556                         KKASSERT(record->lock.refs == 1);
1557                         record->flags |= HAMMER_RECF_DELETED_FE;
1558                         record->flags |= HAMMER_RECF_DELETED_BE;
1559                         hammer_rel_mem_record(record);
1560                 }
1561                 break;
1562         case HAMMER_INODE_ONDISK:
1563                 /*
1564                  * If already on-disk, do not set any additional flags.
1565                  */
1566                 break;
1567         default:
1568                 /*
1569                  * If not on-disk and not deleted, set both dirty flags
1570                  * to force an initial record to be written.  Also set
1571                  * the create_tid for the inode.
1572                  *
1573                  * Set create_tid in both the frontend and backend
1574                  * copy of the inode record.
1575                  */
1576                 ip->ino_rec.base.base.create_tid = trans.tid;
1577                 ip->sync_ino_rec.base.base.create_tid = trans.tid;
1578                 ip->sync_flags |= HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY;
1579                 break;
1580         }
1581
1582         /*
1583          * If RDIRTY or DDIRTY is set, write out a new record.  If the inode
1584          * is already on-disk the old record is marked as deleted.
1585          *
1586          * If DELETED is set hammer_update_inode() will delete the existing
1587          * record without writing out a new one.
1588          *
1589          * If *ONLY* the ITIMES flag is set we can update the record in-place.
1590          */
1591         if (ip->flags & HAMMER_INODE_DELETED) {
1592                 error = hammer_update_inode(&cursor, ip);
1593         } else 
1594         if ((ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
1595                                HAMMER_INODE_ITIMES)) == HAMMER_INODE_ITIMES) {
1596                 error = hammer_update_itimes(&cursor, ip);
1597         } else
1598         if (ip->sync_flags & (HAMMER_INODE_RDIRTY | HAMMER_INODE_DDIRTY |
1599                               HAMMER_INODE_ITIMES)) {
1600                 error = hammer_update_inode(&cursor, ip);
1601         }
1602         if (error)
1603                 Debugger("hammer_update_itimes/inode errored");
1604 done:
1605         /*
1606          * Save the TID we used to sync the inode with to make sure we
1607          * do not improperly reuse it.
1608          */
1609         hammer_done_cursor(&cursor);
1610         hammer_done_transaction(&trans);
1611         return(error);
1612 }
1613
1614 /*
1615  * This routine is called when the OS is no longer actively referencing
1616  * the inode (but might still be keeping it cached), or when releasing
1617  * the last reference to an inode.
1618  *
1619  * At this point if the inode's nlinks count is zero we want to destroy
1620  * it, which may mean destroying it on-media too.
1621  */
1622 void
1623 hammer_inode_unloadable_check(hammer_inode_t ip, int getvp)
1624 {
1625         struct vnode *vp;
1626
1627         /*
1628          * If the inode is on-media and the link count is 0 we MUST delete
1629          * it on-media.  DELETING is a mod flag, DELETED is a state flag.
1630          */
1631         if (ip->ino_rec.ino_nlinks == 0 &&
1632             (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) == 0) {
1633                 ip->flags |= HAMMER_INODE_DELETING;
1634                 ip->flags |= HAMMER_INODE_TRUNCATED;
1635                 ip->trunc_off = 0;
1636                 vp = NULL;
1637                 if (getvp) {
1638                         if (hammer_get_vnode(ip, &vp) != 0)
1639                                 return;
1640                 }
1641                 if (ip->vp) {
1642                         vtruncbuf(ip->vp, 0, HAMMER_BUFSIZE);
1643                         vnode_pager_setsize(ip->vp, 0);
1644                 }
1645                 if (getvp) {
1646                         vput(vp);
1647                 }
1648         }
1649 }
1650
1651 /*
1652  * Re-test an inode when a dependancy had gone away to see if we
1653  * can chain flush it.
1654  */
1655 void
1656 hammer_test_inode(hammer_inode_t ip)
1657 {
1658         if (ip->flags & HAMMER_INODE_REFLUSH) {
1659                 ip->flags &= ~HAMMER_INODE_REFLUSH;
1660                 hammer_ref(&ip->lock);
1661                 if (ip->flags & HAMMER_INODE_RESIGNAL) {
1662                         ip->flags &= ~HAMMER_INODE_RESIGNAL;
1663                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1664                 } else {
1665                         hammer_flush_inode(ip, 0);
1666                 }
1667                 hammer_rel_inode(ip, 0);
1668         }
1669 }
1670