/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;          /* current recursion depth */
        int             diddeferral;    /* set when a recursion is deferred */
        int             pass;           /* scan2 pass: 1=deletions, 2=insertions */
        int             cache_index;    /* base[] lookup hint */
        struct h2_flush_deferral_list flush_list;
        hammer2_tid_t   sync_tid;       /* flush synchronization point */
        hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
                                hammer2_chain_t *child, int how);

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
                    int how)
{
        hammer2_key_t bytes;

        if (bref->type != 0) {
                bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
                if (bref->type == HAMMER2_BREF_TYPE_INODE)
                        info->inode_count += how;
                if (how < 0)
                        info->data_count -= bytes;
                else
                        info->data_count += bytes;
        }
}
#endif

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  We
 * don't bother marking the volume header MODIFIED.  Instead, the volume
 * will be synchronized at a later time as part of a larger flush sequence.
 *
 * Non-flush transactions can typically run concurrently.  However, if
 * there are non-flush transactions both before AND after a flush trans,
 * the transactions after the flush stall until the ones before it finish.
 *
 * Non-flush transactions occurring after a flush pointer can run
 * concurrently with that flush.  They only have to wait for transactions
 * prior to the flush trans to complete before they unstall.
 *
 * WARNING! Transaction ids are only allocated when the transaction becomes
 *          active, which allows other transactions to insert ahead of us
 *          if we are forced to block (only bioq transactions do that).
 *
 * WARNING! Modifications to the root volume cannot dup the root volume
 *          header to handle synchronization points, so alloc_tid can
 *          wind up (harmlessly) more advanced on flush.
 *
 * WARNING! Operations which might call inode_duplicate()/chain_duplicate()
 *          depend heavily on having a unique sync_tid to avoid duplication
 *          collisions (which key off of delete_tid).
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        hmp = pmp->cluster.chains[0]->hmp;      /* XXX */

        hammer2_voldata_lock(hmp);
        trans->flags = flags;
        trans->td = curthread;
        /*trans->delete_gen = 0;*/      /* multiple deletions within trans */

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn, then set curflush to
                 * indicate that a flush is now pending (but not
                 * necessarily active yet).
                 *
                 * NOTE: Do not set trans->blocked here.
                 */
                ++hmp->flushcnt;
                while (hmp->curflush != NULL) {
                        lksleep(&hmp->curflush, &hmp->voldatalk,
                                0, "h2multf", hz);
                }
                hmp->curflush = trans;
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);

                /*
                 * If we are a flush we have to wait for all transactions
                 * prior to our flush synchronization point to complete
                 * before we can start our flush.
                 *
                 * Most importantly, this includes bioq flushes.
                 *
                 * NOTE: Do not set trans->blocked here.
                 */
                while (TAILQ_FIRST(&hmp->transq) != trans) {
                        lksleep(&trans->sync_tid, &hmp->voldatalk,
                                0, "h2syncw", hz);
                }

                /*
                 * don't assign sync_tid until we become the running
                 * flush.  topo_flush_tid is used to control when
                 * chain modifications in concurrent transactions are
                 * required to delete-duplicate (so as not to disturb
                 * the state of what is being currently flushed).
                 */
                trans->sync_tid = hmp->voldata.alloc_tid++;
                hmp->topo_flush_tid = trans->sync_tid;

                /*
                 * Once we become the running flush we can wakeup anyone
                 * who blocked on us, up to the next flush.  That is,
                 * our flush can run concurrent with frontend operations.
                 */
                scan = trans;
                while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
                        if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                        if (scan->blocked == 0)
                                break;
                        scan->blocked = 0;
                        wakeup(&scan->blocked);
                }
        } else if ((flags & HAMMER2_TRANS_BUFCACHE) && hmp->curflush) {
                /*
                 * We cannot block if we are the bioq thread.  When a
                 * flush is not pending we can operate normally but
                 * if a flush IS pending the bioq thread's transaction
                 * must be placed either before or after curflush.
                 *
                 * If the current flush is waiting the bioq thread's
                 * transaction is placed before.  If it is running the
                 * bioq thread's transaction is placed after.
                 */
                scan = TAILQ_FIRST(&hmp->transq);
                if (scan != hmp->curflush) {
                        TAILQ_INSERT_BEFORE(hmp->curflush, trans, entry);
                } else {
                        TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                }
                trans->sync_tid = hmp->voldata.alloc_tid++;
        } else {
                /*
                 * If this is a normal transaction and not a flush, or
                 * if this is a bioq transaction and no flush is pending,
                 * we can queue normally.
                 *
                 * Normal transactions must block while a pending flush is
                 * waiting for prior transactions to complete.  Once the
                 * pending flush becomes active we can run concurrently
                 * with it.
                 */
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                scan = TAILQ_FIRST(&hmp->transq);
                if (hmp->curflush && hmp->curflush != scan) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->blocked, &hmp->voldatalk,
                                        0, "h2trans", hz);
                        }
                }
                trans->sync_tid = hmp->voldata.alloc_tid++;
        }
        hammer2_voldata_unlock(hmp, 0);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        hmp = trans->pmp->cluster.chains[0]->hmp;

        hammer2_voldata_lock(hmp);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --hmp->flushcnt;
                if (hmp->flushcnt) {
                        /*
                         * If we were a flush then wakeup anyone waiting on
                         * curflush (i.e. other flushes that want to run).
                         * Leave topo_flush_tid set (I think we could probably
                         * clear it to zero here).
                         */
                        hmp->curflush = NULL;
                        wakeup(&hmp->curflush);
                } else {
                        /*
                         * Theoretically we don't have to clear flush_tid
                         * here since the flush will have synchronized
                         * all operations <= flush_tid already.  But for
                         * now zero it.
                         */
                        hmp->curflush = NULL;
                        hmp->topo_flush_tid = 0;
                }
        } else {
                /*
                 * If we are not a flush but a flush is now at the head
                 * of the queue and we were previously blocking it,
                 * we can now unblock it.
                 */
                if (hmp->flushcnt &&
                    (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
                    trans->sync_tid < scan->sync_tid &&
                    (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
                        wakeup(&scan->sync_tid);
                }
        }
        hammer2_voldata_unlock(hmp, 0);
}
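
/*
 * Example usage (a sketch only; the real callers live in the vnops,
 * inode, and syncer code and may pass additional flags):
 *
 *      hammer2_trans_t trans;
 *
 *      hammer2_trans_init(&trans, pmp, 0);
 *      ... issue modifying chain operations ...
 *      hammer2_trans_done(&trans);
 *
 * A flush transaction works the same way but passes
 * HAMMER2_TRANS_ISFLUSH and calls hammer2_chain_flush() in between.
 */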

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_chain_core_t *core;
        hammer2_flush_info_t info;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
        info.trans = trans;
        info.sync_tid = trans->sync_tid;
        info.mirror_tid = 0;
        info.cache_index = -1;

        core = chain->core;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave MOVED set for these chains, which will be
                 * handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flush_list, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * Flush pass1 on root.
                 */
                info.diddeferral = 0;
                hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
                kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
                        chain, chain->bref.type, chain->flags);
#endif

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flush_list))
                        break;
        }
}

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work chain will be flagged MOVED
 * indicating a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *       only when a chain is specifically modified, and not updated
 *       for copy-on-write propagations.  MODIFIED is set on any modification
 *       including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        hammer2_off_t pmask;
        hammer2_tid_t saved_sync;
#if 0
        hammer2_trans_t *trans = info->trans;
#endif
        hammer2_chain_core_t *core;
        size_t psize;
        size_t boff;
        char *bdata;
        struct buf *bp;
        int error;
        int wasmodified;
        int diddeferral = 0;

        hmp = chain->hmp;

#if FLUSH_DEBUG
        if (info->parent)
                kprintf("flush_core %p->%p.%d %08x (%s)\n",
                        info->parent, chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
        else
                kprintf("flush_core NULL->%p.%d %08x (%s)\n",
                        chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
#endif
        /*
         * Ignore chains modified beyond the current flush point.  These
         * will be treated as if they did not exist.
         */
        if (chain->modify_tid > info->sync_tid)
                return;

#if 0
        /*
         * Deleted chains which have not been destroyed must be retained,
         * and we probably have to recurse to clean-up any sub-trees.
         * However, restricted flushes can stop processing here because
         * the chain cleanup will be handled by a later normal flush.
         *
         * The MODIFIED bit can likely be cleared in this situation and we
         * will do so later on in this procedure.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (trans->flags & HAMMER2_TRANS_RESTRICTED)
                        return;
        }
#endif

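        /*
         * Save the flush synchronization point.  The recursion below may
         * temporarily restrict it; it is restored before returning.
         */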
        saved_sync = info->sync_tid;
        core = chain->core;

        /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
         * blockrefs accordingly.
         *
         * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
         *       finishing in the face of filesystem activity.
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *saved_parent;
                hammer2_tid_t saved_mirror;
                hammer2_chain_layer_t *layer;

                /*
                 * Clear SUBMODIFIED to catch races.  Note that any child
                 * with MODIFIED, DELETED, or MOVED set during scan2, or
                 * which tries to lastdrop but can't free its structures,
                 * or which gets deferred, will cause SUBMODIFIED to be set
                 * again.
                 *
                 * We don't want to set our chain to MODIFIED gratuitously.
                 *
                 * We need an extra ref on chain because we are going to
                 * release its lock temporarily in our child loop.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
                hammer2_chain_ref(chain);

                /*
                 * Run two passes.  The first pass handles MODIFIED and
                 * SUBMODIFIED chains and recurses while the second pass
                 * handles MOVED chains on the way back up.
                 *
                 * If the stack gets too deep we defer scan1, but must
                 * be sure to still run scan2 if on the next loop the
                 * deferred chain has been flushed and now needs MOVED
                 * handling on the way back up.
                 *
                 * Scan1 is recursive.
                 *
                 * NOTE: The act of handling a modified/submodified chain can
                 *       cause the MOVED flag to be set.  It can also be set
                 *       via hammer2_chain_delete() and in other situations.
                 *
                 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
                 *       because children can be physically removed during
                 *       the scan.
                 */
                saved_parent = info->parent;
                saved_mirror = info->mirror_tid;
                info->parent = chain;
                info->mirror_tid = chain->bref.mirror_tid;

                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
                                hammer2_chain_ref(chain);
                                TAILQ_INSERT_TAIL(&info->flush_list,
                                                  chain, flush_node);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_DEFERRED);
                        }
                        diddeferral = 1;
                } else {
                        info->diddeferral = 0;
                        spin_lock(&core->cst.spin);
                        KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
                        TAILQ_FOREACH_REVERSE(layer, &core->layerq,
                                              h2_layer_list, entry) {
                                ++layer->refs;
                                KKASSERT(layer->good == 0xABCD);
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan1, info);
                                --layer->refs;
                                diddeferral += info->diddeferral;
                        }
                        spin_unlock(&core->cst.spin);
                }

                /*
                 * Handle successfully flushed children who are in the MOVED
                 * state on the way back up the recursion.  This can have
                 * the side-effect of clearing MOVED.
                 *
                 * Scan2 is non-recursive.
                 */
                if (diddeferral) {
                        atomic_set_int(&chain->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                        spin_lock(&core->cst.spin);
                } else {
                        spin_lock(&core->cst.spin);
                        KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
                        TAILQ_FOREACH_REVERSE(layer, &core->layerq,
                                              h2_layer_list, entry) {
                                info->pass = 1;
                                ++layer->refs;
                                KKASSERT(layer->good == 0xABCD);
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan2, info);
                                info->pass = 2;
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan2, info);
                                /*diddeferral += info->diddeferral; n/a*/
                                --layer->refs;
                        }
                }
                hammer2_chain_layer_check_locked(chain->hmp, core);
                spin_unlock(&core->cst.spin);

                chain->bref.mirror_tid = info->mirror_tid;
                info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
                KKASSERT(chain->refs > 1);
                hammer2_chain_drop(chain);
        }

        /*
         * Restore sync_tid in case it was restricted by a delete/duplicate.
         */
        info->sync_tid = saved_sync;

        /*
         * Rollup diddeferral for caller.  Note direct assignment, not +=.
         */
        info->diddeferral = diddeferral;

        /*
         * Do not flush chain if there were any deferrals.  It will be
         * retried later after the deferrals are independently handled.
         */
        if (diddeferral) {
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (deferred)\n",
                                info->depth, info->depth, "",
                                chain, chain->refs, chain->flags);
                }
                return;
        }

        /*
         * If we encounter a deleted chain within our flush we can clear
         * the MODIFIED bit and avoid flushing it whether it has been
         * destroyed or not.  We must make sure that the chain is flagged
         * MOVED in this situation so the parent picks up the deletion.
         *
         * Note that scan2 has already executed above so statistics have
         * already been rolled up.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                                hammer2_chain_ref(chain);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_MOVED);
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#if 0
        if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
            (chain->flags & HAMMER2_CHAIN_DELETED) &&
            (trans->flags & HAMMER2_TRANS_RESTRICTED) == 0) {
                /*
                 * Throw-away the MODIFIED flag
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#endif

        /*
         * A degenerate flush might not have flushed anything and thus not
         * processed modified blocks on the way back up.  Detect the case.
         *
         * Note that MOVED can be set without MODIFIED being set due to
         * a deletion, in which case it is handled by Scan2 later on.
         *
         * Both bits can be set along with DELETED due to a deletion if
         * data was modified within the synchronization zone and the chain
         * was then deleted beyond the zone, in which case we still have
         * to flush for synchronization point consistency.  Otherwise though
         * DELETED and MODIFIED are treated as separate flags.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                return;

        /*
         * Issue flush.
         *
         * A DESTROYED node that reaches this point must be flushed for
         * synchronization point consistency.
         */

        /*
         * Update mirror_tid, clear MODIFIED, and set MOVED.
         *
         * The caller will update the parent's reference to this chain
         * by testing MOVED as long as the modification was in-bounds.
         *
         * MOVED is never set on the volume root as there is no parent
         * to adjust.
         */
        if (chain->bref.mirror_tid < info->sync_tid)
                chain->bref.mirror_tid = info->sync_tid;
        wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
        if (chain == &hmp->vchain)
                kprintf("(FLUSHED VOLUME HEADER)\n");
        if (chain == &hmp->fchain)
                kprintf("(FLUSHED FREEMAP HEADER)\n");

        if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
            chain == &hmp->vchain ||
            chain == &hmp->fchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
                 * Net is -0 or -1 ref depending.
                 */
                if (wasmodified)
                        hammer2_chain_drop(chain);
        } else {
                /*
                 * Drop the ref from the MODIFIED bit we cleared and
                 * set a ref for the MOVED bit we are setting.  Net
                 * is +0 or +1 ref depending.
                 */
                if (wasmodified == 0)
                        hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }

        /*
         * If this is part of a recursive flush we can go ahead and write
         * out the buffer cache buffer and pass a new bref back up the chain
         * via the MOVED bit.
         *
         * Volume headers are NOT flushed here as they require special
         * processing.
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
                hammer2_modify_volume(hmp);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
                 * We should flush the free block table before we calculate
                 * CRCs and copy voldata -> volsync.
                 *
                 * To prevent SMP races, fchain must remain locked until
                 * voldata is copied to volsync.
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
                        /* this will modify vchain as a side effect */
                        hammer2_chain_flush(info->trans, &hmp->fchain);
                }

                /*
                 * The volume header is flushed manually by the syncer, not
                 * here.  All we do is adjust the crc's.
                 */
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                kprintf("volume header mirror_tid %jd\n",
                        hmp->voldata.mirror_tid);

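                /*
                 * Recompute the volume header check codes.  The two
                 * section iCRCs are computed first; the full-header iCRC
                 * is computed last since its range covers the section
                 * CRC fields.
                 */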
                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC1_OFF,
                                HAMMER2_VOLUME_ICRC1_SIZE);
                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC0_OFF,
                                HAMMER2_VOLUME_ICRC0_SIZE);
                hmp->voldata.icrc_volheader =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRCVH_OFF,
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                hammer2_chain_unlock(&hmp->fchain);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
                 * Data elements have already been flushed via the logical
                 * file buffer cache.  Their hash was set in the bref by
                 * the vop_write code.
                 *
                 * Make sure any device buffer(s) have been flushed out here
                 * (there aren't usually any to flush).
                 */
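                /*
                 * Compute the device buffer underlying this chain's data:
                 * psize is the device block size, pbase the aligned device
                 * offset, and boff the chain's byte offset within that
                 * buffer.
                 */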
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = chain->bref.data_off & ~pmask;
                boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);

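                /*
                 * Write the device buffer asynchronously if it is resident
                 * and dirty; otherwise just release it.  We do not
                 * instantiate or read a buffer here merely to flush it.
                 */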
                bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0);
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
                                cluster_awrite(bp);
                        } else {
                                bp->b_flags |= B_RELBUF;
                                brelse(bp);
                        }
                }
                break;
#if 0
        case HAMMER2_BREF_TYPE_INDIRECT:
                /*
                 * Indirect blocks may be in an INITIAL state.  Use the
                 * chain_lock() call to ensure that the buffer has been
                 * instantiated (even though it is already locked the buffer
                 * might not have been instantiated).
                 *
                 * Only write the buffer out if it is dirty, it is possible
                 * the operating system had already written out the buffer.
                 */
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(chain->bp != NULL);

                bp = chain->bp;
                if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
                    (bp->b_flags & B_DIRTY)) {
                        bdwrite(chain->bp);
                } else {
                        brelse(chain->bp);
                }
                chain->bp = NULL;
                chain->data = NULL;
                hammer2_chain_unlock(chain);
                break;
#endif
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                /*
                 * Device-backed.  Buffer will be flushed by the sync
                 * code XXX.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        default:
                /*
                 * Embedded elements have to be flushed out.
                 * (Basically just BREF_TYPE_INODE).
                 */
                KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                bref = &chain->bref;

                KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
                KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_ISCSI32 ||
                         HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_FREEMAP);

                /*
                 * The data is embedded, we have to acquire the
                 * buffer cache buffer and copy the data into it.
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = bref->data_off & ~pmask;
                boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);

                /*
                 * The getblk() optimization can only be used if the
                 * physical block size matches the request.
                 */
                error = bread(hmp->devvp, pbase, psize, &bp);
                KKASSERT(error == 0);

                bdata = (char *)bp->b_data + boff;

                /*
                 * Copy the data to the buffer, mark the buffer
                 * dirty, and convert the chain to unmodified.
                 */
                bcopy(chain->data, bdata, chain->bytes);
                bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
                bp = NULL;

                switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
                case HAMMER2_CHECK_FREEMAP:
                        chain->bref.check.freemap.icrc32 =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                case HAMMER2_CHECK_ISCSI32:
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                default:
                        panic("hammer2_flush_core: bad crc type");
                        break; /* NOT REACHED */
                }
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
                        ++hammer2_iod_meta_write;
                else
                        ++hammer2_iod_indr_write;
        }
}

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_t *parent = info->parent;
        int diddeferral;

        /*
         * We should only need to recurse if SUBMODIFIED is set, but as
         * a safety we also recurse if MODIFIED is set.
         *
         * Return early if neither bit is set.  We must re-assert the
         * SUBMODIFIED flag in the parent if any child covered by the
         * parent (via delete_tid) is skipped.
         */
        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);

        /*
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.  Re-check the flags before
         * continuing.
         */
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        /*
         * The DESTROYED flag can only be initially set on an unreferenced
         * deleted inode and will propagate downward via the mechanism below.
         * Such inode chains have been deleted for good and should no longer
         * be subject to delete/duplication.
         *
         * This optimization allows the inode reclaim (destroy unlinked file
         * on vnode reclamation after last close) to be flagged by just
         * setting HAMMER2_CHAIN_DESTROYED at the top level and then will
         * cause the chains to be terminated and related buffers to be
         * invalidated and not flushed out.
         *
         * We have to be careful not to propagate the DESTROYED flag if
         * the destruction occurred after our flush sync_tid.
         */
        if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
            (child->flags & HAMMER2_CHAIN_DELETED) &&
            (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
                atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }

        /*
         * Recurse and collect deferral data.
         */
        diddeferral = info->diddeferral;
        ++info->depth;
        hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
        kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
                parent, parent->flags, child, child->bref.type, child->flags);
#endif
        --info->depth;
        info->diddeferral += diddeferral;

        if (child->flags & HAMMER2_CHAIN_SUBMODIFIED)
                atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);

        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);

        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

        spin_lock(&parent->core->cst.spin);

        return (0);
}

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
 * We have to do this so base[] elements can be deleted in pass 1 to make
 * room for adding new elements in pass 2.
 *
 * This function also rolls up storage statistics.
 *
 * NOTE!  We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *        due to the above conditions it is possible to do this and still
 *        have some children flagged MOVED depending on the synchronization.
 *
 * NOTE!  A deletion is a visibility issue, there can still be references to
 *        deleted elements (for example, to an unlinked file which is still
 *        open), and there can also be multiple chains pointing to the same
 *        bref where some are deleted and some are not (for example due to
 *        a rename).  So a chain marked for deletion is basically considered
 *        to be live until it is explicitly destroyed or until its ref-count
 *        reaches zero (also implying that MOVED and MODIFIED are clear).
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
        hammer2_chain_core_t *above = child->above;
        hammer2_mount_t *hmp = child->hmp;
        hammer2_trans_t *trans = info->trans;
        hammer2_blockref_t *base;
        int count;
        int ok;

        /*
         * Inodes with stale children that have been converted to DIRECTDATA
         * mode (file extension or hardlink conversion typically) need to be
         * skipped right now before we start messing with a non-existent
         * block table.
         */
#if 0
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
            (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
                goto finalize;
        }
#endif

        /*
         * Ignore children created after our flush point, treating them as
         * if they did not exist.  These children will not cause the parent
         * to be updated.
         *
         * When we encounter such children and the parent chain has not been
         * deleted, delete/duplicated, or delete/duplicated-for-move, then
         * the parent may be used to funnel through several flush points.
         * We must re-set the SUBMODIFIED flag in the parent to ensure that
         * those flushes have visibility.  A simple test of delete_tid
         * suffices to determine if the parent spans beyond our current flush.
         */
        if (child->modify_tid > trans->sync_tid) {
                goto finalize;
        }

        /*
         * Ignore children which have not changed.  The parent's block table
         * is already correct.
         *
         * XXX The MOVED bit is only cleared when all multi-homed parents
         *     have flushed, creating a situation where a re-flush can occur
         *     via a parent which has already flushed.  The hammer2_base_*()
         *     functions currently have a hack to deal with this case but
         *     we need something better.
         */
        if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
                goto finalize;
        }

        /*
         * Make sure child is referenced before we unlock.
         */
        hammer2_chain_ref(child);
        spin_unlock(&above->cst.spin);

        /*
         * A parent reflushed after the child has passed it by should be
         * skipped due to the modify_tid test. XXX
         */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
        KKASSERT(child->above == above);
        KKASSERT(parent->core == above);

        /*
         * The parent's blockref to the child must be deleted or updated.
         *
         * This point is not reached on successful DESTROYED optimizations
         * but can be reached on recursive deletions and restricted flushes.
         *
         * Because flushes are ordered we do not have to make a
         * modify/duplicate of indirect blocks.  That is, the flush
         * code does not have to kmalloc or duplicate anything.  We
         * can adjust the indirect block table in-place and reuse the
         * chain.  It IS possible that the chain has already been duplicated
         * or may wind up being duplicated on-the-fly by modifying code
         * on the frontend.  We simply use the original and ignore such
         * chains.  However, it does mean we can't clear the MOVED bit.
         *
         * XXX recursive deletions not optimized.
         */
        hammer2_chain_modify(trans, &parent,
                             HAMMER2_MODIFY_NO_MODIFY_TID |
                             HAMMER2_MODIFY_ASSERTNOCOPY);

        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                /*
                 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
                 * properly duplicate the inode headers and do proper flush
                 * range checks (all the children should be beyond the flush
                 * point).  For now just don't sync the non-applicable
                 * children.
                 *
                 * XXX Can also occur due to hardlink consolidation.  We
                 * set OPFLAG_DIRECTDATA to prevent the indirect and data
                 * blocks from syncing to the hardlink pointer.
                 */
#if 0
                KKASSERT((parent->data->ipdata.op_flags &
                          HAMMER2_OPFLAG_DIRECTDATA) == 0);
#endif
#if 0
                if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                        base = NULL;
                } else
#endif
                {
                        base = &parent->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                if (parent->data) {
                        base = &parent->data->npdata[0];
                } else {
                        base = NULL;
                        KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP:
                base = &parent->data->npdata[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                base = NULL;
                count = 0;
                panic("hammer2_chain_flush_scan2: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }

        /*
         * Don't bother updating a deleted parent's blockrefs (caller will
         * optimize-out the disk write).  Note that this is not optional:
         * a deleted parent's blockref array might not be synchronized at
         * all, so calling hammer2_base*() functions could result in a panic.
         *
         * Otherwise, we need to be COUNTEDBREFS synchronized for the
         * hammer2_base_*() functions.
         */
        if (parent->delete_tid <= trans->sync_tid)
                base = NULL;
        else if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);

        /*
         * Update the parent's blockref table and propagate mirror_tid.
         *
         * NOTE! Children with modify_tid's beyond our flush point are
         *       considered to not exist for the purposes of updating the
         *       parent's blockref array.
         *
         * NOTE! Updates to a parent's blockref table do not adjust the
         *       parent's bref.modify_tid, only its bref.mirror_tid.
         */
        if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
                /*
                 * Deleting.  Only adjust the block array if it contains
                 * the child's entry (child's REPLACE flag is set).  Clear
                 * the child's REPLACE flag only once all possible parents
                 * have been updated.
                 */
                ok = 1;
                if (base && (child->flags & HAMMER2_CHAIN_REPLACE)) {
                        hammer2_rollup_stats(parent, child, -1);
                        spin_lock(&above->cst.spin);
                        hammer2_base_delete(parent, base, count,
                                            &info->cache_index, &child->bref);
                        if (TAILQ_NEXT(parent, core_entry) == NULL) {
                                atomic_clear_int(&child->flags,
                                               HAMMER2_CHAIN_REPLACE);
                        }
                        spin_unlock(&above->cst.spin);
                }
                if (info->mirror_tid < child->delete_tid)
                        info->mirror_tid = child->delete_tid;
        } else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
                /*
                 * Inserting.  Only set the child's REPLACE flag indicating
                 * that the parent's blockref array entry is valid once all
                 * possible parents have been updated.
                 */
                ok = 1;
                if (base) {
                        if (child->flags & HAMMER2_CHAIN_REPLACE)
                                hammer2_rollup_stats(parent, child, 0);
                        else
                                hammer2_rollup_stats(parent, child, 1);
                        spin_lock(&above->cst.spin);
                        hammer2_base_insert(parent, base, count,
                                            &info->cache_index, &child->bref,
                                            child->flags);
                        if (TAILQ_NEXT(parent, core_entry) == NULL) {
                                atomic_set_int(&child->flags,
                                               HAMMER2_CHAIN_REPLACE);
                        }
                        spin_unlock(&above->cst.spin);
                }
                if (info->mirror_tid < child->modify_tid)
                        info->mirror_tid = child->modify_tid;
        } else {
                ok = 0;
        }

        if (info->mirror_tid < child->bref.mirror_tid) {
                info->mirror_tid = child->bref.mirror_tid;
        }
        if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
             parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
            hmp->voldata.mirror_tid < child->bref.mirror_tid) {
                hmp->voldata.mirror_tid = child->bref.mirror_tid;
        }

        /*
         * Only clear MOVED once all possible parents have been flushed.
         *
         * When can we safely clear the MOVED flag?  Flushes down duplicate
         * paths can occur out of order, for example if an inode is moved
         * as part of a hardlink consolidation or if an inode is moved into
         * an indirect block indexed before the inode.
         */
        if (ok && (child->flags & HAMMER2_CHAIN_MOVED)) {
                hammer2_chain_t *scan;
                int ok = 1;

                spin_lock(&above->cst.spin);
                TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
                        /*
                         * XXX weird code also checked at the top of scan2,
                         *     I would like to fix this by detaching the core
                         *     on initial hardlink consolidation (1->2 nlinks).
                         */
#if 0
                        if (scan->bref.type == HAMMER2_BREF_TYPE_INODE &&
                            (scan->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA)) {
                                continue;
                        }
#endif
                        if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                                ok = 0;
                                break;
                        }
                }
                spin_unlock(&above->cst.spin);
                if (ok) {
                        atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(child);      /* flag */
                }
        }

        /*
         * Unlock the child.  This can wind up dropping the child's
         * last ref, removing it from the parent's RB tree, and deallocating
         * the structure.  The RB_SCAN() our caller is doing handles the
         * situation.
         */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);
        spin_lock(&above->cst.spin);

        /*
         * The parent cleared SUBMODIFIED prior to the scan.  If the child
         * still requires a flush (possibly due to being outside the current
         * synchronization zone), we must re-set SUBMODIFIED on the way back
         * up.
         */
finalize:
        return (0);
}

static
void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
#if 0
        hammer2_chain_t *grandp;
#endif

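        /*
         * Fold the child's accumulated counts into the parent and zero
         * the child's counts so they are not applied twice.
         */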
        parent->data_count += child->data_count;
        parent->inode_count += child->inode_count;
        child->data_count = 0;
        child->inode_count = 0;
        if (how < 0) {
                parent->data_count -= child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count -= 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count -= child->data->ipdata.data_count;
                        parent->inode_count -= child->data->ipdata.inode_count;
#endif
                }
        } else if (how > 0) {
                parent->data_count += child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count += 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count += child->data->ipdata.data_count;
                        parent->inode_count += child->data->ipdata.inode_count;
#endif
                }
        }
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
                parent->data->ipdata.data_count += parent->data_count;
                parent->data->ipdata.inode_count += parent->inode_count;
#if 0
                for (grandp = parent->above->first_parent;
                     grandp;
                     grandp = grandp->next_parent) {
                        grandp->data_count += parent->data_count;
                        grandp->inode_count += parent->inode_count;
                }
#endif
                parent->data_count = 0;
                parent->inode_count = 0;
        }
}