hammer2 - Rewrite internal chain algorithms
sys/vfs/hammer2/hammer2_flush.c
/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
        int             pass;
        int             cache_index;
        struct h2_flush_deferral_list flush_list;
        hammer2_tid_t   sync_tid;       /* flush synchronization point */
        hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
                                hammer2_chain_t *child, int how);

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
                    int how)
{
        hammer2_key_t bytes;

        if (bref->type != 0) {
                bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
                if (bref->type == HAMMER2_BREF_TYPE_INODE)
                        info->inode_count += how;
                if (how < 0)
                        info->data_count -= bytes;
                else
                        info->data_count += bytes;
        }
}
#endif

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  We
 * don't bother marking the volume header MODIFIED.  Instead, the volume
 * will be synchronized at a later time as part of a larger flush sequence.
 *
 * Non-flush transactions can typically run concurrently.  However, if
 * there are non-flush transactions both before AND after a flush
 * transaction, the transactions after it stall until the ones before
 * it finish.
 *
 * Non-flush transactions occurring after a flush point can run concurrently
 * with that flush.  They only have to wait for transactions prior to the
 * flush transaction to complete before they unstall.
 *
 * WARNING! Transaction ids are only allocated when the transaction becomes
 *          active, which allows other transactions to insert ahead of us
 *          if we are forced to block (only bioq transactions do that).
 *
 * WARNING! Modifications to the root volume cannot dup the root volume
 *          header to handle synchronization points, so alloc_tid can
 *          wind up (harmlessly) more advanced on flush.
 *
 * WARNING! Operations which might call inode_duplicate()/chain_duplicate()
 *          depend heavily on having a unique sync_tid to avoid duplication
 *          collisions (which key off of delete_tid).
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
        hammer2_cluster_t *cluster;
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        cluster = pmp->cluster;
        hmp = cluster->hmp;

        hammer2_voldata_lock(hmp);
        trans->flags = flags;
        trans->td = curthread;
        /*trans->delete_gen = 0;*/      /* multiple deletions within trans */

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn, then set curflush to
                 * indicate that a flush is now pending (but not
                 * necessarily active yet).
                 *
                 * NOTE: Do not set trans->blocked here.
                 */
                ++hmp->flushcnt;
                while (hmp->curflush != NULL) {
                        lksleep(&hmp->curflush, &hmp->voldatalk,
                                0, "h2multf", hz);
                }
                hmp->curflush = trans;
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);

                /*
                 * If we are a flush we have to wait for all transactions
                 * prior to our flush synchronization point to complete
                 * before we can start our flush.
                 *
                 * Most importantly, this includes bioq flushes.
                 *
                 * NOTE: Do not set trans->blocked here.
                 */
                while (TAILQ_FIRST(&hmp->transq) != trans) {
                        lksleep(&trans->sync_tid, &hmp->voldatalk,
                                0, "h2syncw", hz);
                }

                /*
                 * Don't assign sync_tid until we become the running
                 * flush.  topo_flush_tid is used to control when
                 * chain modifications in concurrent transactions are
                 * required to delete-duplicate (so as not to disturb
                 * the state of what is being currently flushed).
                 */
                trans->sync_tid = hmp->voldata.alloc_tid++;
                hmp->topo_flush_tid = trans->sync_tid;

                /*
                 * Once we become the running flush we can wakeup anyone
                 * who blocked on us, up to the next flush.  That is,
                 * our flush can run concurrent with frontend operations.
                 */
                scan = trans;
                while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
                        if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                        if (scan->blocked == 0)
                                break;
                        scan->blocked = 0;
                        wakeup(&scan->blocked);
                }
        } else if ((flags & HAMMER2_TRANS_BUFCACHE) && hmp->curflush) {
                /*
                 * We cannot block if we are the bioq thread.  When a
                 * flush is not pending we can operate normally, but
                 * if a flush IS pending the bioq thread's transaction
                 * must be placed either before or after curflush.
                 *
                 * If the current flush is waiting, the bioq thread's
                 * transaction is placed before it.  If it is running,
                 * the bioq thread's transaction is placed after it.
                 */
                scan = TAILQ_FIRST(&hmp->transq);
                if (scan != hmp->curflush) {
                        TAILQ_INSERT_BEFORE(hmp->curflush, trans, entry);
                } else {
                        TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                }
                trans->sync_tid = hmp->voldata.alloc_tid++;
        } else {
                /*
                 * If this is a normal transaction and not a flush, or
                 * if this is a bioq transaction and no flush is pending,
                 * we can queue normally.
                 *
                 * Normal transactions must block while a pending flush is
                 * waiting for prior transactions to complete.  Once the
                 * pending flush becomes active we can run concurrently
                 * with it.
                 */
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                scan = TAILQ_FIRST(&hmp->transq);
                if (hmp->curflush && hmp->curflush != scan) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->blocked, &hmp->voldatalk,
                                        0, "h2trans", hz);
                        }
                }
                trans->sync_tid = hmp->voldata.alloc_tid++;
        }
        hammer2_voldata_unlock(hmp, 0);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_cluster_t *cluster;
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        cluster = trans->pmp->cluster;
        hmp = cluster->hmp;

        hammer2_voldata_lock(hmp);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --hmp->flushcnt;
                if (hmp->flushcnt) {
                        /*
                         * If we were a flush then wakeup anyone waiting on
                         * curflush (i.e. other flushes that want to run).
                         * Leave topo_flush_tid set (I think we could probably
                         * clear it to zero here).
                         */
                        hmp->curflush = NULL;
                        wakeup(&hmp->curflush);
                } else {
                        /*
                         * Theoretically we don't have to clear topo_flush_tid
                         * here since the flush will have synchronized
                         * all operations <= topo_flush_tid already.  But for
                         * now, zero it.
                         */
                        hmp->curflush = NULL;
                        hmp->topo_flush_tid = 0;
                }
        } else {
                /*
                 * If we are not a flush but a flush is now at the head
                 * of the queue and we were previously blocking it,
                 * we can now unblock it.
                 */
                if (hmp->flushcnt &&
                    (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
                    trans->sync_tid < scan->sync_tid &&
                    (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
                        wakeup(&scan->sync_tid);
                }
        }
        hammer2_voldata_unlock(hmp, 0);
}
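
/*
 * Example usage (illustrative only, not part of the original source):
 * a modifying operation brackets its chain work with a transaction.
 * example_modifying_op() and its body are hypothetical; only
 * hammer2_trans_init() and hammer2_trans_done() are the real entry
 * points defined above.
 */
#if 0
static void
example_modifying_op(hammer2_pfsmount_t *pmp)
{
        hammer2_trans_t trans;

        /*
         * Allocates trans.sync_tid; may block behind a pending flush
         * that is still waiting for older transactions to drain.
         */
        hammer2_trans_init(&trans, pmp, 0);

        /* ... perform chain modifications keyed to trans.sync_tid ... */

        /*
         * Removes us from hmp->transq and wakes up a flush that was
         * blocked behind this transaction, if any.
         */
        hammer2_trans_done(&trans);
}
#endif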

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set, indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_chain_core_t *core;
        hammer2_flush_info_t info;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
        info.trans = trans;
        info.sync_tid = trans->sync_tid;
        info.mirror_tid = 0;
        info.cache_index = -1;

        core = chain->core;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave MOVED set for these chains, which will be
                 * handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flush_list, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * Flush pass1 on root.
                 */
                info.diddeferral = 0;
                hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
                kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
                        chain, chain->bref.type, chain->flags);
#endif

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flush_list))
                        break;
        }
}
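
/*
 * Example (illustrative only, not part of the original source): how a
 * flush caller might drive hammer2_chain_flush() on a referenced chain,
 * e.g. from a sync or reclaim path.  example_flush_one() and the exact
 * lock mode shown are hypothetical.
 */
#if 0
static void
example_flush_one(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        /* trans is expected to be a HAMMER2_TRANS_ISFLUSH transaction */
        hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
        hammer2_chain_flush(trans, chain);
        /*
         * If the flush did any work the chain is now flagged MOVED and
         * the caller must propagate the new bref into the parent.
         */
        hammer2_chain_unlock(chain);
}
#endif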

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work the chain will be flagged MOVED,
 * indicating that a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *       only when a chain is specifically modified, and not updated
 *       for copy-on-write propagations.  MODIFIED is set on any modification
 *       including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        hammer2_off_t pmask;
        hammer2_tid_t saved_sync;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_core_t *core;
        size_t psize;
        size_t boff;
        char *bdata;
        struct buf *bp;
        int error;
        int wasmodified;
        int diddeferral = 0;

        hmp = chain->hmp;

#if FLUSH_DEBUG
        if (info->parent)
                kprintf("flush_core %p->%p.%d %08x (%s)\n",
                        info->parent, chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
        else
                kprintf("flush_core NULL->%p.%d %08x (%s)\n",
                        chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
#endif
        /*
         * Ignore chains modified beyond the current flush point.  These
         * will be treated as if they did not exist.
         */
        if (chain->modify_tid > info->sync_tid)
                return;

        /*
         * Deleted chains which have not been destroyed must be retained,
         * and we probably have to recurse to clean up any sub-trees.
         * However, restricted flushes can stop processing here because
         * the chain cleanup will be handled by a later normal flush.
         *
         * The MODIFIED bit can likely be cleared in this situation and we
         * will do so later on in this procedure.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (trans->flags & HAMMER2_TRANS_RESTRICTED)
                        return;
        }

        saved_sync = info->sync_tid;
        core = chain->core;

        /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
         * blockrefs accordingly.
         *
         * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
         *       finishing in the face of filesystem activity.
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *saved_parent;
                hammer2_tid_t saved_mirror;
                hammer2_chain_layer_t *layer;

                /*
                 * Clear SUBMODIFIED to catch races.  Note that any child
                 * with MODIFIED, DELETED, or MOVED set during scan2, or
                 * which tries to lastdrop but can't free its structures,
                 * or which gets deferred, will cause SUBMODIFIED to be set
                 * again.
                 *
                 * We don't want to set our chain to MODIFIED gratuitously.
                 *
                 * We need an extra ref on chain because we are going to
                 * release its lock temporarily in our child loop.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
                hammer2_chain_ref(chain);

                /*
                 * Run two passes.  The first pass handles MODIFIED and
                 * SUBMODIFIED chains and recurses while the second pass
                 * handles MOVED chains on the way back up.
                 *
                 * If the stack gets too deep we defer scan1, but must
                 * be sure to still run scan2 if on the next loop the
                 * deferred chain has been flushed and now needs MOVED
                 * handling on the way back up.
                 *
                 * Scan1 is recursive.
                 *
                 * NOTE: The act of handling a modified/submodified chain can
                 *       cause the MOVED flag to be set.  It can also be set
                 *       via hammer2_chain_delete() and in other situations.
                 *
                 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
                 *       because children can be physically removed during
                 *       the scan.
                 */
                saved_parent = info->parent;
                saved_mirror = info->mirror_tid;
                info->parent = chain;
                info->mirror_tid = chain->bref.mirror_tid;

                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
                                hammer2_chain_ref(chain);
                                TAILQ_INSERT_TAIL(&info->flush_list,
                                                  chain, flush_node);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_DEFERRED);
                        }
                        diddeferral = 1;
                } else {
                        info->diddeferral = 0;
                        spin_lock(&core->cst.spin);
                        TAILQ_FOREACH_REVERSE(layer, &core->layerq,
                                              h2_layer_list, entry) {
                                ++layer->refs;
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan1, info);
                                --layer->refs;
                                diddeferral += info->diddeferral;
                        }
                        spin_unlock(&core->cst.spin);
                }

                /*
                 * Handle successfully flushed children who are in the MOVED
                 * state on the way back up the recursion.  This can have
                 * the side-effect of clearing MOVED.
                 *
                 * Scan2 is non-recursive.
                 */
                if (diddeferral) {
                        atomic_set_int(&chain->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                        spin_lock(&core->cst.spin);
                } else {
                        spin_lock(&core->cst.spin);
                        TAILQ_FOREACH_REVERSE(layer, &core->layerq,
                                              h2_layer_list, entry) {
                                info->pass = 1;
                                ++layer->refs;
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan2, info);
                                info->pass = 2;
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan2, info);
                                /*diddeferral += info->diddeferral; n/a*/
                                --layer->refs;
                        }
                }
                hammer2_chain_layer_check_locked(chain->hmp, core);
                spin_unlock(&core->cst.spin);

                chain->bref.mirror_tid = info->mirror_tid;
                info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
                hammer2_chain_drop(chain);
        }

        /*
         * Restore sync_tid in case it was restricted by a delete/duplicate.
         */
        info->sync_tid = saved_sync;

        /*
         * Rollup diddeferral for caller.  Note direct assignment, not +=.
         */
        info->diddeferral = diddeferral;

        /*
         * Do not flush chain if there were any deferrals.  It will be
         * retried later after the deferrals are independently handled.
         */
        if (diddeferral) {
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (deferred)\n",
                                info->depth, info->depth, "",
                                chain, chain->refs, chain->flags);
                }
                return;
        }

        /*
         * If we encounter a deleted chain within our flush we can clear
         * the MODIFIED bit and avoid flushing it whether it has been
         * destroyed or not.  We must make sure that the chain is flagged
         * MOVED in this situation so the parent picks up the deletion.
         *
         * Note that scan2 has already executed above so statistics have
         * already been rolled up.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                                hammer2_chain_ref(chain);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_MOVED);
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#if 0
        if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
            (chain->flags & HAMMER2_CHAIN_DELETED) &&
            (trans->flags & HAMMER2_TRANS_RESTRICTED) == 0) {
                /*
                 * Throw-away the MODIFIED flag
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#endif

        /*
         * A degenerate flush might not have flushed anything and thus not
         * processed modified blocks on the way back up.  Detect the case.
         *
         * Note that MOVED can be set without MODIFIED being set due to
         * a deletion, in which case it is handled by Scan2 later on.
         *
         * Both bits can be set along with DELETED due to a deletion if
         * data was modified within the synchronization zone and the chain
         * was then deleted beyond the zone, in which case we still have
         * to flush for synchronization point consistency.  Otherwise though
         * DELETED and MODIFIED are treated as separate flags.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                return;

        /*
         * Issue flush.
         *
         * A DESTROYED node that reaches this point must be flushed for
         * synchronization point consistency.
         */

        /*
         * Update mirror_tid, clear MODIFIED, and set MOVED.
         *
         * The caller will update the parent's reference to this chain
         * by testing MOVED as long as the modification was in-bounds.
         *
         * MOVED is never set on the volume root as there is no parent
         * to adjust.
         */
        if (chain->bref.mirror_tid < info->sync_tid)
                chain->bref.mirror_tid = info->sync_tid;
        wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
        if (chain == &hmp->vchain)
                kprintf("(FLUSHED VOLUME HEADER)\n");
        if (chain == &hmp->fchain)
                kprintf("(FLUSHED FREEMAP HEADER)\n");

        if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
            chain == &hmp->vchain ||
            chain == &hmp->fchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
                 */
                if (wasmodified)
                        hammer2_chain_drop(chain);
        } else {
                /*
                 * If we were MODIFIED we inherit the ref from clearing
                 * that bit, otherwise we need another ref.
                 */
                if (wasmodified == 0)
                        hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }

        /*
         * If this is part of a recursive flush we can go ahead and write
         * out the buffer cache buffer and pass a new bref back up the chain
         * via the MOVED bit.
         *
         * Volume headers are NOT flushed here as they require special
         * processing.
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
                hammer2_modify_volume(hmp);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
                 * We should flush the free block table before we calculate
                 * CRCs and copy voldata -> volsync.
                 *
                 * To prevent SMP races, fchain must remain locked until
                 * voldata is copied to volsync.
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
                        /* this will modify vchain as a side effect */
                        hammer2_chain_flush(info->trans, &hmp->fchain);
                }

                /*
                 * The volume header is flushed manually by the syncer, not
                 * here.  All we do is adjust the crc's.
                 */
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                kprintf("volume header mirror_tid %jd\n",
                        hmp->voldata.mirror_tid);

                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC1_OFF,
                                HAMMER2_VOLUME_ICRC1_SIZE);
                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC0_OFF,
                                HAMMER2_VOLUME_ICRC0_SIZE);
                hmp->voldata.icrc_volheader =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRCVH_OFF,
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                hammer2_chain_unlock(&hmp->fchain);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
                 * Data elements have already been flushed via the logical
                 * file buffer cache.  Their hash was set in the bref by
                 * the vop_write code.
                 *
                 * Make sure any device buffer(s) have been flushed out here
                 * (there aren't usually any to flush).
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = chain->bref.data_off & ~pmask;
                boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
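
                /*
                 * Worked example (illustrative, assuming a 64KiB device
                 * buffer): psize = 0x10000 and pmask = 0xffff.  A 4KiB
                 * element at byte offset 0x12345000 is encoded as
                 * data_off = 0x1234500c (the low 6 bits hold the size
                 * radix, 12).  Then pbase = 0x12340000 selects the
                 * device buffer and boff = 0x5000 is the element's
                 * offset within it; HAMMER2_OFF_MASK strips the radix
                 * bits before the offset is extracted.
                 */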

                bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0);
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
                                cluster_awrite(bp);
                        } else {
                                bp->b_flags |= B_RELBUF;
                                brelse(bp);
                        }
                }
                break;
#if 0
        case HAMMER2_BREF_TYPE_INDIRECT:
                /*
                 * Indirect blocks may be in an INITIAL state.  Use the
                 * chain_lock() call to ensure that the buffer has been
                 * instantiated (even though it is already locked the buffer
                 * might not have been instantiated).
                 *
                 * Only write the buffer out if it is dirty; it is possible
                 * the operating system had already written out the buffer.
                 */
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(chain->bp != NULL);

                bp = chain->bp;
                if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
                    (bp->b_flags & B_DIRTY)) {
                        bdwrite(chain->bp);
                } else {
                        brelse(chain->bp);
                }
                chain->bp = NULL;
                chain->data = NULL;
                hammer2_chain_unlock(chain);
                break;
#endif
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                /*
                 * Device-backed.  Buffer will be flushed by the sync
                 * code XXX.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        default:
                /*
                 * Embedded elements have to be flushed out.
                 * (Basically just BREF_TYPE_INODE).
                 */
                KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                bref = &chain->bref;

                KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
                KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_ISCSI32 ||
                         HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_FREEMAP);

                /*
                 * The data is embedded, we have to acquire the
                 * buffer cache buffer and copy the data into it.
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = bref->data_off & ~pmask;
                boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);

                /*
                 * The getblk() optimization can only be used if the
                 * physical block size matches the request.
                 */
                error = bread(hmp->devvp, pbase, psize, &bp);
                KKASSERT(error == 0);

                bdata = (char *)bp->b_data + boff;

                /*
                 * Copy the data to the buffer, mark the buffer
                 * dirty, and convert the chain to unmodified.
                 */
                bcopy(chain->data, bdata, chain->bytes);
                bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
                bp = NULL;

                switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
                case HAMMER2_CHECK_FREEMAP:
                        chain->bref.check.freemap.icrc32 =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                case HAMMER2_CHECK_ISCSI32:
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                default:
                        panic("hammer2_flush_core: bad crc type");
                        break; /* NOT REACHED */
                }
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
                        ++hammer2_iod_meta_write;
                else
                        ++hammer2_iod_indr_write;
        }
}

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_t *parent = info->parent;
        int diddeferral;

        /*
         * We should only need to recurse if SUBMODIFIED is set, but as
         * a safety also recurse if MODIFIED is also set.
         *
         * Return early if neither bit is set.  We must re-assert the
         * SUBMODIFIED flag in the parent if any child covered by the
         * parent (via delete_tid) is skipped.
         */
        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);

        /*
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.  Re-check the flags before
         * continuing.
         */
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        /*
         * The DESTROYED flag can only be initially set on an unreferenced
         * deleted inode and will propagate downward via the mechanism below.
         * Such inode chains have been deleted for good and should no longer
         * be subject to delete/duplication.
         *
         * This optimization allows the inode reclaim (destroy unlinked file
         * on vnode reclamation after last close) to be flagged by just
         * setting HAMMER2_CHAIN_DESTROYED at the top level, which then
         * causes the chains to be terminated and related buffers to be
         * invalidated and not flushed out.
         *
         * We have to be careful not to propagate the DESTROYED flag if
         * the destruction occurred after our flush sync_tid.
         */
        if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
            (child->flags & HAMMER2_CHAIN_DELETED) &&
            (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
                atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }

        /*
         * Recurse and collect deferral data.
         */
        diddeferral = info->diddeferral;
        ++info->depth;
        hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
        kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
                parent, parent->flags, child, child->bref.type, child->flags);
#endif
        --info->depth;
        info->diddeferral += diddeferral;

        if (child->flags & HAMMER2_CHAIN_SUBMODIFIED)
                atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);

        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);

        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

        spin_lock(&parent->core->cst.spin);

        return (0);
}
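
/*
 * For reference (illustrative only): the lock juggling scan1 performs
 * for each child, distilled.  The real sequence is inline in the
 * function above; this sketch only summarizes it.
 */
#if 0
        hammer2_chain_ref(child);               /* keep child alive */
        spin_unlock(&parent->core->cst.spin);   /* leave the RB-scan lock */
        hammer2_chain_unlock(parent);           /* let frontend ops proceed */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
        /* re-check child->flags and modify_tid, they may have changed */
        /* ... recurse via hammer2_chain_flush_core(info, child) ... */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        spin_lock(&parent->core->cst.spin);     /* resume the RB scan */
#endif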

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
 * We have to do this so base[] elements can be deleted in pass 1 to make
 * room for adding new elements in pass 2.
 *
 * This function also rolls up storage statistics.
 *
 * NOTE!  We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *        due to the above conditions it is possible to do this and still
 *        have some children flagged MOVED depending on the synchronization.
 *
 * NOTE!  A deletion is a visibility issue; there can still be references to
 *        deleted elements (for example, to an unlinked file which is still
 *        open), and there can also be multiple chains pointing to the same
 *        bref where some are deleted and some are not (for example due to
 *        a rename).   So a chain marked for deletion is basically considered
 *        to be live until it is explicitly destroyed or until its ref-count
 *        reaches zero (also implying that MOVED and MODIFIED are clear).
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
        hammer2_chain_core_t *above = child->above;
        hammer2_mount_t *hmp = child->hmp;
        hammer2_trans_t *trans = info->trans;
        hammer2_blockref_t *base;
        int count;
        int ok;

        /*
         * Inodes with stale children that have been converted to DIRECTDATA
         * mode (file extension or hardlink conversion typically) need to be
         * skipped right now before we start messing with a non-existent
         * block table.
         */
#if 0
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
            (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
                goto finalize;
        }
#endif

        /*
         * Ignore children created after our flush point, treating them as
         * if they did not exist.  These children will not cause the parent
         * to be updated.
         *
         * When we encounter such children and the parent chain has not been
         * deleted, delete/duplicated, or delete/duplicated-for-move, then
         * the parent may be used to funnel through several flush points.
         * We must re-set the SUBMODIFIED flag in the parent to ensure that
         * those flushes have visibility.  A simple test of delete_tid
         * suffices to determine if the parent spans beyond our current flush.
         */
        if (child->modify_tid > trans->sync_tid) {
                goto finalize;
        }

        /*
         * Ignore children which have not changed.  The parent's block table
         * is already correct.
         *
         * XXX The MOVED bit is only cleared when all multi-homed parents
         *     have flushed, creating a situation where a re-flush can occur
         *     via a parent which has already flushed.  The hammer2_base_*()
         *     functions currently have a hack to deal with this case but
         *     we need something better.
         */
        if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
                goto finalize;
        }

        /*
         * Make sure the child is referenced before we unlock.
         */
        hammer2_chain_ref(child);
        spin_unlock(&above->cst.spin);

        /*
         * A parent reflushed after the child has passed it by should skip
         * the child due to the modify_tid test. XXX
         */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
        KKASSERT(child->above == above);
        KKASSERT(parent->core == above);

        /*
         * The parent's blockref to the child must be deleted or updated.
         *
         * This point is not reached on successful DESTROYED optimizations
         * but can be reached on recursive deletions and restricted flushes.
         *
         * Because flushes are ordered we do not have to make a
         * modify/duplicate of indirect blocks.  That is, the flush
         * code does not have to kmalloc or duplicate anything.  We
         * can adjust the indirect block table in-place and reuse the
         * chain.  It IS possible that the chain has already been duplicated
         * or may wind up being duplicated on-the-fly by modifying code
         * on the frontend.  We simply use the original and ignore such
         * chains.  However, it does mean we can't clear the MOVED bit.
         *
         * XXX recursive deletions not optimized.
         */
        hammer2_chain_modify(trans, &parent,
                             HAMMER2_MODIFY_NO_MODIFY_TID |
                             HAMMER2_MODIFY_ASSERTNOCOPY);

        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                /*
                 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
                 * properly duplicate the inode headers and do proper flush
                 * range checks (all the children should be beyond the flush
                 * point).  For now just don't sync the non-applicable
                 * children.
                 *
                 * XXX Can also occur due to hardlink consolidation.  We
                 * set OPFLAG_DIRECTDATA to prevent the indirect and data
                 * blocks from syncing to the hardlink pointer.
                 */
#if 0
                KKASSERT((parent->data->ipdata.op_flags &
                          HAMMER2_OPFLAG_DIRECTDATA) == 0);
#endif
#if 0
                if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                        base = NULL;
                } else
#endif
                {
                        base = &parent->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                if (parent->data) {
                        base = &parent->data->npdata[0];
                } else {
                        base = NULL;
                        KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP:
                base = &parent->data->npdata[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                base = NULL;
                count = 0;
                panic("hammer2_chain_flush_scan2: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }

        /*
         * Don't bother updating a deleted parent's blockrefs (the caller
         * will optimize-out the disk write).  Note that skipping this is
         * not optional: a deleted parent's blockref array might not be
         * synchronized at all, so calling hammer2_base*() functions could
         * result in a panic.
         *
         * Otherwise, we need to be COUNTEDBREFS synchronized for the
         * hammer2_base_*() functions.
         */
        if (parent->delete_tid <= trans->sync_tid)
                base = NULL;
        else if ((above->flags & HAMMER2_CORE_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(above, base, count);

        /*
         * Update the parent's blockref table and propagate mirror_tid.
         *
         * NOTE! Children with modify_tid's beyond our flush point are
         *       considered to not exist for the purposes of updating the
         *       parent's blockref array.
         *
         * NOTE! Updates to a parent's blockref table do not adjust the
         *       parent's bref.modify_tid, only its bref.mirror_tid.
         */
        if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
                /*
                 * Deleting.  Only adjust the block array if it contains
                 * the child's entry (the child's REPLACE flag is set).
                 * Clear the child's REPLACE flag only once all possible
                 * parents have been updated.
                 */
                ok = 1;
                if (base && (child->flags & HAMMER2_CHAIN_REPLACE)) {
                        hammer2_rollup_stats(parent, child, -1);
                        spin_lock(&above->cst.spin);
                        hammer2_base_delete(base, count, above,
                                            &info->cache_index, &child->bref);
                        if (TAILQ_NEXT(parent, core_entry) == NULL) {
                                atomic_clear_int(&child->flags,
                                                 HAMMER2_CHAIN_REPLACE);
                        }
                        spin_unlock(&above->cst.spin);
                }
                if (info->mirror_tid < child->delete_tid)
                        info->mirror_tid = child->delete_tid;
        } else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
                /*
                 * Inserting.  Only set the child's REPLACE flag, indicating
                 * that the parent's blockref array entry is valid, once all
                 * possible parents have been updated.
                 */
                ok = 1;
                if (base) {
                        if (child->flags & HAMMER2_CHAIN_REPLACE)
                                hammer2_rollup_stats(parent, child, 0);
                        else
                                hammer2_rollup_stats(parent, child, 1);
                        spin_lock(&above->cst.spin);
                        hammer2_base_insert(base, count, above,
                                            &info->cache_index, &child->bref,
                                            child->flags);
                        if (TAILQ_NEXT(parent, core_entry) == NULL) {
                                atomic_set_int(&child->flags,
                                               HAMMER2_CHAIN_REPLACE);
                        }
                        spin_unlock(&above->cst.spin);
                }
                if (info->mirror_tid < child->modify_tid)
                        info->mirror_tid = child->modify_tid;
        } else {
                ok = 0;
        }

        if (info->mirror_tid < child->bref.mirror_tid) {
                info->mirror_tid = child->bref.mirror_tid;
        }
        if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
             parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
            hmp->voldata.mirror_tid < child->bref.mirror_tid) {
                hmp->voldata.mirror_tid = child->bref.mirror_tid;
        }

        /*
         * Only clear MOVED once all possible parents have been flushed.
         *
         * When can we safely clear the MOVED flag?  Flushes down duplicate
         * paths can occur out of order, for example if an inode is moved
         * as part of a hardlink consolidation or if an inode is moved into
         * an indirect block indexed before the inode.
         */
        if (ok && (child->flags & HAMMER2_CHAIN_MOVED)) {
                hammer2_chain_t *scan;
                int ok = 1;

                spin_lock(&above->cst.spin);
                TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
                        /*
                         * XXX weird code also checked at the top of scan2,
                         *     I would like to fix this by detaching the core
                         *     on initial hardlink consolidation (1->2 nlinks).
                         */
#if 0
                        if (scan->bref.type == HAMMER2_BREF_TYPE_INODE &&
                            (scan->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA)) {
                                continue;
                        }
#endif
                        if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                                ok = 0;
                                break;
                        }
                }
                spin_unlock(&above->cst.spin);
                if (ok) {
                        atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(child);      /* flag */
                }
        }

        /*
         * Unlock the child.  This can wind up dropping the child's
         * last ref, removing it from the parent's RB tree, and deallocating
         * the structure.  The RB_SCAN() our caller is doing handles the
         * situation.
         */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);
        spin_lock(&above->cst.spin);

        /*
         * The parent cleared SUBMODIFIED prior to the scan.  If the child
         * still requires a flush (possibly due to being outside the current
         * synchronization zone), we must re-set SUBMODIFIED on the way back
         * up.
         */
finalize:
        return (0);
}
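
/*
 * For reference (illustrative only), the caller in flush_core invokes
 * scan2 twice per layer so that pass-1 deletions free base[] slots
 * before pass-2 insertions need them:
 */
#if 0
        info->pass = 1;         /* remove children deleted as-of sync_tid */
        RB_SCAN(hammer2_chain_tree, &layer->rbtree, NULL,
                hammer2_chain_flush_scan2, info);
        info->pass = 2;         /* insert children still live at sync_tid */
        RB_SCAN(hammer2_chain_tree, &layer->rbtree, NULL,
                hammer2_chain_flush_scan2, info);
#endif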

static
void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
#if 0
        hammer2_chain_t *grandp;
#endif

        parent->data_count += child->data_count;
        parent->inode_count += child->inode_count;
        child->data_count = 0;
        child->inode_count = 0;
        if (how < 0) {
                parent->data_count -= child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count -= 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count -= child->data->ipdata.data_count;
                        parent->inode_count -= child->data->ipdata.inode_count;
#endif
                }
        } else if (how > 0) {
                parent->data_count += child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count += 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count += child->data->ipdata.data_count;
                        parent->inode_count += child->data->ipdata.inode_count;
#endif
                }
        }
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
                parent->data->ipdata.data_count += parent->data_count;
                parent->data->ipdata.inode_count += parent->inode_count;
#if 0
                for (grandp = parent->above->first_parent;
                     grandp;
                     grandp = grandp->next_parent) {
                        grandp->data_count += parent->data_count;
                        grandp->inode_count += parent->inode_count;
                }
#endif
                parent->data_count = 0;
                parent->inode_count = 0;
        }
}