hammer2 - Revamp hammer2_cluster structure part 1
[dragonfly.git] / sys / vfs / hammer2 / hammer2_flush.c
/*
 * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;        /* current parent in the recursion */
        hammer2_trans_t *trans;         /* governing flush transaction */
        int             depth;          /* current recursion depth */
        int             diddeferral;    /* set if children were deferred */
        int             pass;           /* scan2 pass: 1=delete, 2=insert */
        int             cache_index;    /* hint for hammer2_base_*() */
        struct h2_flush_deferral_list flush_list; /* deferred deep chains */
        hammer2_tid_t   sync_tid;       /* flush synchronization point */
        hammer2_tid_t   mirror_tid;     /* collect mirror TID updates */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_chain_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain);
static int hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data);
static int hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data);
static void hammer2_rollup_stats(hammer2_chain_t *parent,
                                hammer2_chain_t *child, int how);

#if 0
static __inline
void
hammer2_updatestats(hammer2_flush_info_t *info, hammer2_blockref_t *bref,
                    int how)
{
        hammer2_key_t bytes;

        if (bref->type != 0) {
                bytes = 1 << (bref->data_off & HAMMER2_OFF_MASK_RADIX);
                if (bref->type == HAMMER2_BREF_TYPE_INODE)
                        info->inode_count += how;
                if (how < 0)
                        info->data_count -= bytes;
                else
                        info->data_count += bytes;
        }
}
#endif

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  We
 * don't bother marking the volume header MODIFIED.  Instead, the volume
 * will be synchronized at a later time as part of a larger flush sequence.
 *
 * Non-flush transactions can typically run concurrently.  However, if
 * there are non-flush transactions both before AND after a flush trans,
 * the transactions after the flush stall until the ones before it finish.
 *
 * Non-flush transactions occurring after the flush point can run
 * concurrently with that flush.  They only have to wait for transactions
 * prior to the flush trans to complete before they unstall.
 *
 * WARNING! Transaction ids are only allocated when the transaction becomes
 *          active, which allows other transactions to insert ahead of us
 *          if we are forced to block (only bioq transactions do that).
 *
 * WARNING! Modifications to the root volume cannot dup the root volume
 *          header to handle synchronization points, so alloc_tid can
 *          wind up (harmlessly) more advanced on flush.
 *
 * WARNING! Operations which might call inode_duplicate()/chain_duplicate()
 *          depend heavily on having a unique sync_tid to avoid duplication
 *          collisions (which key off of delete_tid).
 */
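
/*
 * Illustrative ordering sketch (not from the code; assumed shapes only):
 * with hmp->transq ordered oldest to newest,
 *
 *      T1  T2  [FLUSH]  T3  T4
 *
 * T1 and T2 must complete before the flush can assign its sync_tid and
 * begin running.  T3 and T4 block while the flush is still waiting, but
 * once the flush reaches the head of transq and becomes active they are
 * woken up and run concurrently with it.
 */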
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        hmp = pmp->cluster.chains[0]->hmp;      /* XXX */

        hammer2_voldata_lock(hmp);
        trans->flags = flags;
        trans->td = curthread;
        /*trans->delete_gen = 0;*/      /* multiple deletions within trans */

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn, then set curflush to
                 * indicate that a flush is now pending (but not
                 * necessarily active yet).
                 *
                 * NOTE: Do not set trans->blocked here.
                 */
                ++hmp->flushcnt;
                while (hmp->curflush != NULL) {
                        lksleep(&hmp->curflush, &hmp->voldatalk,
                                0, "h2multf", hz);
                }
                hmp->curflush = trans;
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);

                /*
                 * If we are a flush we have to wait for all transactions
                 * prior to our flush synchronization point to complete
                 * before we can start our flush.
                 *
                 * Most importantly, this includes bioq flushes.
                 *
                 * NOTE: Do not set trans->blocked here.
                 */
                while (TAILQ_FIRST(&hmp->transq) != trans) {
                        lksleep(&trans->sync_tid, &hmp->voldatalk,
                                0, "h2syncw", hz);
                }

                /*
                 * don't assign sync_tid until we become the running
                 * flush.  topo_flush_tid is used to control when
                 * chain modifications in concurrent transactions are
                 * required to delete-duplicate (so as not to disturb
                 * the state of what is being currently flushed).
                 */
                trans->sync_tid = hmp->voldata.alloc_tid++;
                hmp->topo_flush_tid = trans->sync_tid;

                /*
                 * Once we become the running flush we can wakeup anyone
                 * who blocked on us, up to the next flush.  That is,
                 * our flush can run concurrent with frontend operations.
                 */
                scan = trans;
                while ((scan = TAILQ_NEXT(scan, entry)) != NULL) {
                        if (scan->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                        if (scan->blocked == 0)
                                break;
                        scan->blocked = 0;
                        wakeup(&scan->blocked);
                }
        } else if ((flags & HAMMER2_TRANS_BUFCACHE) && hmp->curflush) {
                /*
                 * We cannot block if we are the bioq thread.  When a
                 * flush is not pending we can operate normally but
                 * if a flush IS pending the bioq thread's transaction
                 * must be placed either before or after curflush.
                 *
                 * If the current flush is waiting the bioq thread's
                 * transaction is placed before.  If it is running the
                 * bioq thread's transaction is placed after.
                 */
                scan = TAILQ_FIRST(&hmp->transq);
                if (scan != hmp->curflush) {
                        TAILQ_INSERT_BEFORE(hmp->curflush, trans, entry);
                } else {
                        TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                }
                trans->sync_tid = hmp->voldata.alloc_tid++;
        } else {
                /*
                 * If this is a normal transaction and not a flush, or
                 * if this is a bioq transaction and no flush is pending,
                 * we can queue normally.
                 *
                 * Normal transactions must block while a pending flush is
                 * waiting for prior transactions to complete.  Once the
                 * pending flush becomes active we can run concurrently
                 * with it.
                 */
                TAILQ_INSERT_TAIL(&hmp->transq, trans, entry);
                scan = TAILQ_FIRST(&hmp->transq);
                if (hmp->curflush && hmp->curflush != scan) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->blocked, &hmp->voldatalk,
                                        0, "h2trans", hz);
                        }
                }
                trans->sync_tid = hmp->voldata.alloc_tid++;
        }
        hammer2_voldata_unlock(hmp, 0);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_mount_t *hmp;
        hammer2_trans_t *scan;

        hmp = trans->pmp->cluster.chains[0]->hmp;

        hammer2_voldata_lock(hmp);
        TAILQ_REMOVE(&hmp->transq, trans, entry);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --hmp->flushcnt;
                if (hmp->flushcnt) {
                        /*
                         * If we were a flush then wakeup anyone waiting on
                         * curflush (i.e. other flushes that want to run).
                         * Leave topo_flush_tid set (I think we could probably
                         * clear it to zero here).
                         */
                        hmp->curflush = NULL;
                        wakeup(&hmp->curflush);
                } else {
                        /*
                         * Theoretically we don't have to clear topo_flush_tid
                         * here since the flush will have synchronized
                         * all operations <= flush_tid already.  But for
                         * now zero it.
                         */
                        hmp->curflush = NULL;
                        hmp->topo_flush_tid = 0;
                }
        } else {
                /*
                 * If we are not a flush but a flush is now at the head
                 * of the queue and we were previously blocking it,
                 * we can now unblock it.
                 */
                if (hmp->flushcnt &&
                    (scan = TAILQ_FIRST(&hmp->transq)) != NULL &&
                    trans->sync_tid < scan->sync_tid &&
                    (scan->flags & HAMMER2_TRANS_ISFLUSH)) {
                        wakeup(&scan->sync_tid);
                }
        }
        hammer2_voldata_unlock(hmp, 0);
}
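
/*
 * Usage sketch (hypothetical caller, for illustration only): a normal
 * frontend operation brackets its chain modifications with a transaction,
 * while the filesystem syncer uses HAMMER2_TRANS_ISFLUSH:
 *
 *      hammer2_trans_t trans;
 *
 *      hammer2_trans_init(&trans, pmp, 0);
 *      ... modify chains ...
 *      hammer2_trans_done(&trans);
 *
 *      hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);
 *      hammer2_chain_flush(&trans, chain);
 *      hammer2_trans_done(&trans);
 */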

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point (sync_tid), propagating parent chain modifications
 * and mirror_tid updates back up as needed.  Since we are recursing downward
 * we do not have to deal with the complexities of multi-homed chains (chains
 * with multiple parents).
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose modify_tid values are less than or equal
 * to the passed sync_tid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.  However, it is not actually
 * necessary to flush reclaimed inodes to maintain HAMMER2's crash recovery
 * capability.
 *
 * chain is locked on call and will remain locked on return.  If a flush
 * occurred, the chain's MOVED bit will be set indicating that its parent
 * (which is not part of the flush) should be updated.
 */
void
hammer2_chain_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_chain_core_t *core;
        hammer2_flush_info_t info;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flush_list);
        info.trans = trans;
        info.sync_tid = trans->sync_tid;
        info.mirror_tid = 0;
        info.cache_index = -1;

        core = chain->core;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave MOVED set for these chains, which will be
                 * handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flush_list, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_chain_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * Flush pass1 on root.
                 */
                info.diddeferral = 0;
                hammer2_chain_flush_core(&info, chain);
#if FLUSH_DEBUG
                kprintf("flush_core_done parent=<base> chain=%p.%d %08x\n",
                        chain, chain->bref.type, chain->flags);
#endif

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flush_list))
                        break;
        }
}
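
/*
 * Deferral sketch (illustrative only): if the topology is deeper than
 * HAMMER2_FLUSH_DEPTH_LIMIT, flush_core stops recursing and queues the
 * chain at the limit instead:
 *
 *      root -> I1 -> I2 -> ... -> In   (In placed on info.flush_list
 *                                       and flagged DEFERRED)
 *
 * The loop above then pops In off the list and re-runs
 * hammer2_chain_flush() on it with a fresh stack, and finally re-flushes
 * the root so the deferred chain's MOVED state propagates back up.
 */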

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and remains locked on return.  This function is keyed off of
 * the SUBMODIFIED bit but must make fine-grained choices based on the
 * synchronization point we are flushing to.
 *
 * If the flush accomplished any work chain will be flagged MOVED
 * indicating a copy-on-write propagation back up is required.
 * Deep sub-nodes may also have been entered onto the deferral list.
 * MOVED is never set on the volume root.
 *
 * NOTE: modify_tid is different from MODIFIED.  modify_tid is updated
 *       only when a chain is specifically modified, and not updated
 *       for copy-on-write propagations.  MODIFIED is set on any modification
 *       including copy-on-write propagations.
 */
static void
hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain)
{
        hammer2_mount_t *hmp;
        hammer2_blockref_t *bref;
        hammer2_off_t pbase;
        hammer2_off_t pmask;
        hammer2_tid_t saved_sync;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_core_t *core;
        size_t psize;
        size_t boff;
        char *bdata;
        struct buf *bp;
        int error;
        int wasmodified;
        int diddeferral = 0;

        hmp = chain->hmp;

#if FLUSH_DEBUG
        if (info->parent)
                kprintf("flush_core %p->%p.%d %08x (%s)\n",
                        info->parent, chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
        else
                kprintf("flush_core NULL->%p.%d %08x (%s)\n",
                        chain, chain->bref.type,
                        chain->flags,
                        ((chain->bref.type == HAMMER2_BREF_TYPE_INODE) ?
                                chain->data->ipdata.filename : "?"));
#endif
        /*
         * Ignore chains modified beyond the current flush point.  These
         * will be treated as if they did not exist.
         */
        if (chain->modify_tid > info->sync_tid)
                return;

        /*
         * Deleted chains which have not been destroyed must be retained,
         * and we probably have to recurse to clean-up any sub-trees.
         * However, restricted flushes can stop processing here because
         * the chain cleanup will be handled by a later normal flush.
         *
         * The MODIFIED bit can likely be cleared in this situation and we
         * will do so later on in this procedure.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (trans->flags & HAMMER2_TRANS_RESTRICTED)
                        return;
        }

        saved_sync = info->sync_tid;
        core = chain->core;

        /*
         * If SUBMODIFIED is set we recurse the flush and adjust the
         * blockrefs accordingly.
         *
         * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
         *       finishing in the face of filesystem activity.
         */
        if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                hammer2_chain_t *saved_parent;
                hammer2_tid_t saved_mirror;
                hammer2_chain_layer_t *layer;

                /*
                 * Clear SUBMODIFIED to catch races.  Note that any child
                 * with MODIFIED, DELETED, or MOVED set during scan2, or
                 * which tries to lastdrop but can't free its structures,
                 * or which gets deferred, will cause SUBMODIFIED to be set
                 * again.
                 *
                 * We don't want to set our chain to MODIFIED gratuitously.
                 *
                 * We need an extra ref on chain because we are going to
                 * release its lock temporarily in our child loop.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
                hammer2_chain_ref(chain);

                /*
                 * Run two passes.  The first pass handles MODIFIED and
                 * SUBMODIFIED chains and recurses while the second pass
                 * handles MOVED chains on the way back up.
                 *
                 * If the stack gets too deep we defer scan1, but must
                 * be sure to still run scan2 if on the next loop the
                 * deferred chain has been flushed and now needs MOVED
                 * handling on the way back up.
                 *
                 * Scan1 is recursive.
                 *
                 * NOTE: The act of handling a modified/submodified chain can
                 *       cause the MOVED flag to be set.  It can also be set
                 *       via hammer2_chain_delete() and in other situations.
                 *
                 * NOTE: RB_SCAN() must be used instead of RB_FOREACH()
                 *       because children can be physically removed during
                 *       the scan.
                 */
                saved_parent = info->parent;
                saved_mirror = info->mirror_tid;
                info->parent = chain;
                info->mirror_tid = chain->bref.mirror_tid;

                if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                        if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
                                hammer2_chain_ref(chain);
                                TAILQ_INSERT_TAIL(&info->flush_list,
                                                  chain, flush_node);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_DEFERRED);
                        }
                        diddeferral = 1;
                } else {
                        info->diddeferral = 0;
                        spin_lock(&core->cst.spin);
                        KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
                        TAILQ_FOREACH_REVERSE(layer, &core->layerq,
                                              h2_layer_list, entry) {
                                ++layer->refs;
                                KKASSERT(layer->good == 0xABCD);
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan1, info);
                                --layer->refs;
                                diddeferral += info->diddeferral;
                        }
                        spin_unlock(&core->cst.spin);
                }

                /*
                 * Handle successfully flushed children who are in the MOVED
                 * state on the way back up the recursion.  This can have
                 * the side-effect of clearing MOVED.
                 *
                 * Scan2 is non-recursive.
                 */
                if (diddeferral) {
                        atomic_set_int(&chain->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                        spin_lock(&core->cst.spin);
                } else {
                        spin_lock(&core->cst.spin);
                        KKASSERT(core->good == 0x1234 && core->sharecnt > 0);
                        TAILQ_FOREACH_REVERSE(layer, &core->layerq,
                                              h2_layer_list, entry) {
                                info->pass = 1;
                                ++layer->refs;
                                KKASSERT(layer->good == 0xABCD);
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan2, info);
                                info->pass = 2;
                                RB_SCAN(hammer2_chain_tree, &layer->rbtree,
                                        NULL, hammer2_chain_flush_scan2, info);
                                /*diddeferral += info->diddeferral; n/a*/
                                --layer->refs;
                        }
                }
                hammer2_chain_layer_check_locked(chain->hmp, core);
                spin_unlock(&core->cst.spin);

                chain->bref.mirror_tid = info->mirror_tid;
                info->mirror_tid = saved_mirror;
                info->parent = saved_parent;
                KKASSERT(chain->refs > 1);
                hammer2_chain_drop(chain);
        }

        /*
         * Restore sync_tid in case it was restricted by a delete/duplicate.
         */
        info->sync_tid = saved_sync;

        /*
         * Rollup diddeferral for caller.  Note direct assignment, not +=.
         */
        info->diddeferral = diddeferral;

        /*
         * Do not flush chain if there were any deferrals.  It will be
         * retried later after the deferrals are independently handled.
         */
        if (diddeferral) {
                if (hammer2_debug & 0x0008) {
                        kprintf("%*.*s} %p/%d %04x (deferred)",
                                info->depth, info->depth, "",
                                chain, chain->refs, chain->flags);
                }
                return;
        }

        /*
         * If we encounter a deleted chain within our flush we can clear
         * the MODIFIED bit and avoid flushing it whether it has been
         * destroyed or not.  We must make sure that the chain is flagged
         * MOVED in this situation so the parent picks up the deletion.
         *
         * Note that scan2 has already executed above so statistics have
         * already been rolled up.
         */
        if (chain->delete_tid <= info->sync_tid) {
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
                                hammer2_chain_ref(chain);
                                atomic_set_int(&chain->flags,
                                               HAMMER2_CHAIN_MOVED);
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#if 0
        if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
            (chain->flags & HAMMER2_CHAIN_DELETED) &&
            (trans->flags & HAMMER2_TRANS_RESTRICTED) == 0) {
                /*
                 * Throw-away the MODIFIED flag
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        if (chain->bp) {
                                if (chain->bytes == chain->bp->b_bufsize)
                                        chain->bp->b_flags |= B_INVAL|B_RELBUF;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
                        hammer2_chain_drop(chain);
                }
                return;
        }
#endif

        /*
         * A degenerate flush might not have flushed anything and thus not
         * processed modified blocks on the way back up.  Detect the case.
         *
         * Note that MOVED can be set without MODIFIED being set due to
         * a deletion, in which case it is handled by Scan2 later on.
         *
         * Both bits can be set along with DELETED if the chain was modified
         * within the synchronization zone and then deleted beyond the zone,
         * in which case we still have to flush it for synchronization point
         * consistency.  Otherwise DELETED and MODIFIED are treated as
         * separate flags.
         */
        if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0)
                return;

        /*
         * Issue flush.
         *
         * A DESTROYED node that reaches this point must be flushed for
         * synchronization point consistency.
         */

        /*
         * Update mirror_tid, clear MODIFIED, and set MOVED.
         *
         * The caller will update the parent's reference to this chain
         * by testing MOVED as long as the modification was in-bounds.
         *
         * MOVED is never set on the volume root as there is no parent
         * to adjust.
         */
        if (chain->bref.mirror_tid < info->sync_tid)
                chain->bref.mirror_tid = info->sync_tid;
        wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
        if (chain == &hmp->vchain)
                kprintf("(FLUSHED VOLUME HEADER)\n");
        if (chain == &hmp->fchain)
                kprintf("(FLUSHED FREEMAP HEADER)\n");

        if ((chain->flags & HAMMER2_CHAIN_MOVED) ||
            chain == &hmp->vchain ||
            chain == &hmp->fchain) {
                /*
                 * Drop the ref from the MODIFIED bit we cleared.
                 */
                if (wasmodified)
                        hammer2_chain_drop(chain);
        } else {
                /*
                 * If we were MODIFIED we inherit the ref from clearing
                 * that bit, otherwise we need another ref.
                 */
                if (wasmodified == 0)
                        hammer2_chain_ref(chain);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
        }

        /*
         * If this is part of a recursive flush we can go ahead and write
         * out the buffer cache buffer and pass a new bref back up the chain
         * via the MOVED bit.
         *
         * Volume headers are NOT flushed here as they require special
         * processing.
         */
        switch(chain->bref.type) {
        case HAMMER2_BREF_TYPE_FREEMAP:
                hammer2_modify_volume(hmp);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                /*
                 * We should flush the free block table before we calculate
                 * CRCs and copy voldata -> volsync.
                 *
                 * To prevent SMP races, fchain must remain locked until
                 * voldata is copied to volsync.
                 */
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                         HAMMER2_CHAIN_SUBMODIFIED)) {
                        /* this will modify vchain as a side effect */
                        hammer2_chain_flush(info->trans, &hmp->fchain);
                }

                /*
                 * The volume header is flushed manually by the syncer, not
                 * here.  All we do is adjust the crc's.
                 */
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                kprintf("volume header mirror_tid %jd\n",
                        hmp->voldata.mirror_tid);

                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC1_OFF,
                                HAMMER2_VOLUME_ICRC1_SIZE);
                hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRC0_OFF,
                                HAMMER2_VOLUME_ICRC0_SIZE);
                hmp->voldata.icrc_volheader =
                        hammer2_icrc32(
                                (char *)&hmp->voldata +
                                 HAMMER2_VOLUME_ICRCVH_OFF,
                                HAMMER2_VOLUME_ICRCVH_SIZE);
                hmp->volsync = hmp->voldata;
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                hammer2_chain_unlock(&hmp->fchain);
                break;
        case HAMMER2_BREF_TYPE_DATA:
                /*
                 * Data elements have already been flushed via the logical
                 * file buffer cache.  Their hash was set in the bref by
                 * the vop_write code.
                 *
                 * Make sure any device buffer(s) have been flushed out here.
                 * (there aren't usually any to flush).
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = chain->bref.data_off & ~pmask;
                boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
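
                /*
                 * Worked example with assumed values: if hammer2_devblksize()
                 * returns 64KB then psize = 0x10000 and pmask = 0xffff.  For
                 * a bref.data_off whose offset portion is 0x52340 this gives
                 * pbase = 0x50000 (the device buffer's base) and boff =
                 * 0x2340 (the chain's offset within that buffer).
                 * HAMMER2_OFF_MASK excludes the radix bits encoded in the
                 * low bits of data_off.
                 */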

                bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0);
                if (bp) {
                        if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
                            (B_CACHE | B_DIRTY)) {
                                cluster_awrite(bp);
                        } else {
                                bp->b_flags |= B_RELBUF;
                                brelse(bp);
                        }
                }
                break;
#if 0
        case HAMMER2_BREF_TYPE_INDIRECT:
                /*
                 * Indirect blocks may be in an INITIAL state.  Use the
                 * chain_lock() call to ensure that the buffer has been
                 * instantiated (even though it is already locked the buffer
                 * might not have been instantiated).
                 *
                 * Only write the buffer out if it is dirty, it is possible
                 * the operating system had already written out the buffer.
                 */
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
                KKASSERT(chain->bp != NULL);

                bp = chain->bp;
                if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
                    (bp->b_flags & B_DIRTY)) {
                        bdwrite(chain->bp);
                } else {
                        brelse(chain->bp);
                }
                chain->bp = NULL;
                chain->data = NULL;
                hammer2_chain_unlock(chain);
                break;
#endif
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                /*
                 * Device-backed.  Buffer will be flushed by the sync
                 * code XXX.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                break;
        case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
        default:
                /*
                 * Embedded elements have to be flushed out.
                 * (Basically just BREF_TYPE_INODE).
                 */
                KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                KKASSERT(chain->data != NULL);
                KKASSERT(chain->bp == NULL);
                bref = &chain->bref;

                KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
                KKASSERT(HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_ISCSI32 ||
                         HAMMER2_DEC_CHECK(chain->bref.methods) ==
                         HAMMER2_CHECK_FREEMAP);

                /*
                 * The data is embedded, we have to acquire the
                 * buffer cache buffer and copy the data into it.
                 */
                psize = hammer2_devblksize(chain->bytes);
                pmask = (hammer2_off_t)psize - 1;
                pbase = bref->data_off & ~pmask;
                boff = bref->data_off & (HAMMER2_OFF_MASK & pmask);

                /*
                 * The getblk() optimization can only be used if the
                 * physical block size matches the request.
                 */
                error = bread(hmp->devvp, pbase, psize, &bp);
                KKASSERT(error == 0);

                bdata = (char *)bp->b_data + boff;

                /*
                 * Copy the data to the buffer, mark the buffer
                 * dirty, and convert the chain to unmodified.
                 */
                bcopy(chain->data, bdata, chain->bytes);
                bp->b_flags |= B_CLUSTEROK;
                bdwrite(bp);
                bp = NULL;

                switch(HAMMER2_DEC_CHECK(chain->bref.methods)) {
                case HAMMER2_CHECK_FREEMAP:
                        chain->bref.check.freemap.icrc32 =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                case HAMMER2_CHECK_ISCSI32:
                        chain->bref.check.iscsi32.value =
                                hammer2_icrc32(chain->data, chain->bytes);
                        break;
                default:
                        panic("hammer2_flush_core: bad crc type");
                        break; /* NOT REACHED */
                }
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
                        ++hammer2_iod_meta_write;
                else
                        ++hammer2_iod_indr_write;
        }
}

/*
 * Flush helper scan1 (recursive)
 *
 * Flushes the children of the caller's chain (parent) and updates
 * the blockref, restricted by sync_tid.
 *
 * Ripouts during the loop should not cause any problems.  Because we are
 * flushing to a synchronization point, modification races will occur after
 * sync_tid and do not have to be flushed anyway.
 *
 * It is also ok if the parent is chain_duplicate()'d while unlocked because
 * the delete/duplication will install a delete_tid that is still larger than
 * our current sync_tid.
 */
static int
hammer2_chain_flush_scan1(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_trans_t *trans = info->trans;
        hammer2_chain_t *parent = info->parent;
        int diddeferral;

        /*
         * We should only need to recurse if SUBMODIFIED is set, but as
         * a safety we also recurse if MODIFIED is set.
         *
         * Return early if neither bit is set.  We must re-assert the
         * SUBMODIFIED flag in the parent if any child covered by the
         * parent (via delete_tid) is skipped.
         */
        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        hammer2_chain_ref(child);
        spin_unlock(&parent->core->cst.spin);

        /*
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.  Re-check the flags before
         * continuing.
         */
        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        if ((child->flags & (HAMMER2_CHAIN_MODIFIED |
                             HAMMER2_CHAIN_SUBMODIFIED)) == 0) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                return (0);
        }
        if (child->modify_tid > trans->sync_tid) {
                hammer2_chain_unlock(child);
                hammer2_chain_drop(child);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
                spin_lock(&parent->core->cst.spin);
                if (parent->delete_tid > trans->sync_tid) {
                        atomic_set_int(&parent->flags,
                                       HAMMER2_CHAIN_SUBMODIFIED);
                }
                return (0);
        }

        /*
         * The DESTROYED flag can only be initially set on an unreferenced
         * deleted inode and will propagate downward via the mechanism below.
         * Such inode chains have been deleted for good and should no longer
         * be subject to delete/duplication.
         *
         * This optimization allows the inode reclaim (destroy unlinked file
         * on vnode reclamation after last close) to be flagged by simply
         * setting HAMMER2_CHAIN_DESTROYED at the top level, which then
         * causes the chains to be terminated and related buffers to be
         * invalidated rather than flushed out.
         *
         * We have to be careful not to propagate the DESTROYED flag if
         * the destruction occurred after our flush sync_tid.
         */
        if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
            (child->flags & HAMMER2_CHAIN_DELETED) &&
            (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
                atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }

        /*
         * Recurse and collect deferral data.
         */
        diddeferral = info->diddeferral;
        ++info->depth;
        hammer2_chain_flush_core(info, child);
#if FLUSH_DEBUG
        kprintf("flush_core_done parent=%p flags=%08x child=%p.%d %08x\n",
                parent, parent->flags, child, child->bref.type, child->flags);
#endif
        --info->depth;
        info->diddeferral += diddeferral;

        if (child->flags & HAMMER2_CHAIN_SUBMODIFIED)
                atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);

        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);

        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);

        spin_lock(&parent->core->cst.spin);

        return (0);
}

/*
 * Flush helper scan2 (non-recursive)
 *
 * This pass on a chain's children propagates any MOVED or DELETED
 * elements back up the chain towards the root after those elements have
 * been fully flushed.  Unlike scan1, this function is NOT recursive and
 * the parent remains locked across the entire scan.
 *
 * SCAN2 is called twice, once with pass set to 1 and once with it set to 2.
 * We have to do this so base[] elements can be deleted in pass 1 to make
 * room for adding new elements in pass 2.
 *
 * This function also rolls up storage statistics.
 *
 * NOTE!  We must re-set SUBMODIFIED on the parent(s) as appropriate, and
 *        due to the above conditions it is possible to do this and still
 *        have some children flagged MOVED depending on the synchronization.
 *
 * NOTE!  A deletion is a visibility issue, there can still be references to
 *        deleted elements (for example, to an unlinked file which is still
 *        open), and there can also be multiple chains pointing to the same
 *        bref where some are deleted and some are not (for example due to
 *        a rename).   So a chain marked for deletion is basically considered
 *        to be live until it is explicitly destroyed or until its ref-count
 *        reaches zero (also implying that MOVED and MODIFIED are clear).
 */
static int
hammer2_chain_flush_scan2(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        hammer2_chain_t *parent = info->parent;
        hammer2_chain_core_t *above = child->above;
        hammer2_mount_t *hmp = child->hmp;
        hammer2_trans_t *trans = info->trans;
        hammer2_blockref_t *base;
        int count;
        int ok;

        /*
         * Inodes with stale children that have been converted to DIRECTDATA
         * mode (file extension or hardlink conversion typically) need to be
         * skipped right now before we start messing with a non-existent
         * block table.
         */
#if 0
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
            (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
                goto finalize;
        }
#endif

        /*
         * Ignore children created after our flush point, treating them as
         * if they did not exist.  These children will not cause the parent
         * to be updated.
         *
         * When we encounter such children and the parent chain has not been
         * deleted, delete/duplicated, or delete/duplicated-for-move, then
         * the parent may be used to funnel through several flush points.
         * We must re-set the SUBMODIFIED flag in the parent to ensure that
         * those flushes have visibility.  A simple test of delete_tid
         * suffices to determine if the parent spans beyond our current
         * flush.
         */
        if (child->modify_tid > trans->sync_tid) {
                goto finalize;
        }

        /*
         * Ignore children which have not changed.  The parent's block table
         * is already correct.
         *
         * XXX The MOVED bit is only cleared when all multi-homed parents
         *     have flushed, creating a situation where a re-flush can occur
         *     via a parent which has already flushed.  The hammer2_base_*()
         *     functions currently have a hack to deal with this case but
         *     we need something better.
         */
        if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
                goto finalize;
        }

        /*
         * Make sure child is referenced before we unlock.
         */
        hammer2_chain_ref(child);
        spin_unlock(&above->cst.spin);

        /*
         * A parent re-flushed after the child has passed it by should be
         * skipped due to the modify_tid test. XXX
         */
        hammer2_chain_lock(child, HAMMER2_RESOLVE_NEVER);
        KKASSERT(child->above == above);
        KKASSERT(parent->core == above);

        /*
         * The parent's blockref to the child must be deleted or updated.
         *
         * This point is not reached on successful DESTROYED optimizations
         * but can be reached on recursive deletions and restricted flushes.
         *
         * Because flushes are ordered we do not have to make a
         * modify/duplicate of indirect blocks.  That is, the flush
         * code does not have to kmalloc or duplicate anything.  We
         * can adjust the indirect block table in-place and reuse the
         * chain.  It IS possible that the chain has already been duplicated
         * or may wind up being duplicated on-the-fly by modifying code
         * on the frontend.  We simply use the original and ignore such
         * chains.  However, it does mean we can't clear the MOVED bit.
         *
         * XXX recursive deletions not optimized.
         */
        hammer2_chain_modify(trans, &parent,
                             HAMMER2_MODIFY_NO_MODIFY_TID |
                             HAMMER2_MODIFY_ASSERTNOCOPY);

        switch(parent->bref.type) {
        case HAMMER2_BREF_TYPE_INODE:
                /*
                 * XXX Should assert that OPFLAG_DIRECTDATA is 0 once we
                 * properly duplicate the inode headers and do proper flush
                 * range checks (all the children should be beyond the flush
                 * point).  For now just don't sync the non-applicable
                 * children.
                 *
                 * XXX Can also occur due to hardlink consolidation.  We
                 * set OPFLAG_DIRECTDATA to prevent the indirect and data
                 * blocks from syncing to the hardlink pointer.
                 */
#if 0
                KKASSERT((parent->data->ipdata.op_flags &
                          HAMMER2_OPFLAG_DIRECTDATA) == 0);
#endif
#if 0
                if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
                        base = NULL;
                } else
#endif
                {
                        base = &parent->data->ipdata.u.blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                }
                break;
        case HAMMER2_BREF_TYPE_INDIRECT:
        case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                if (parent->data) {
                        base = &parent->data->npdata[0];
                } else {
                        base = NULL;
                        KKASSERT(child->flags & HAMMER2_CHAIN_DELETED);
                }
                count = parent->bytes / sizeof(hammer2_blockref_t);
                break;
        case HAMMER2_BREF_TYPE_VOLUME:
                base = &hmp->voldata.sroot_blockset.blockref[0];
                count = HAMMER2_SET_COUNT;
                break;
        case HAMMER2_BREF_TYPE_FREEMAP:
                base = &parent->data->npdata[0];
                count = HAMMER2_SET_COUNT;
                break;
        default:
                base = NULL;
                count = 0;
                panic("hammer2_chain_flush_scan2: "
                      "unrecognized blockref type: %d",
                      parent->bref.type);
        }

        /*
         * Don't bother updating a deleted parent's blockrefs (caller will
         * optimize-out the disk write).  Note that this is not optional,
         * a deleted parent's blockref array might not be synchronized at
         * all so calling hammer2_base*() functions could result in a panic.
         *
         * Otherwise, we need to be COUNTEDBREFS synchronized for the
         * hammer2_base_*() functions.
         */
        if (parent->delete_tid <= trans->sync_tid)
                base = NULL;
        else if ((parent->flags & HAMMER2_CHAIN_COUNTEDBREFS) == 0)
                hammer2_chain_countbrefs(parent, base, count);

        /*
         * Update the parent's blockref table and propagate mirror_tid.
         *
         * NOTE! Children with modify_tid's beyond our flush point are
         *       considered to not exist for the purposes of updating the
         *       parent's blockref array.
         *
         * NOTE! Updates to a parent's blockref table do not adjust the
         *       parent's bref.modify_tid, only its bref.mirror_tid.
         */
        if (info->pass == 1 && child->delete_tid <= trans->sync_tid) {
                /*
                 * Deleting.  Only adjust the block array if it contains
                 * the child's entry (child's REPLACE flag is set).  Clear
                 * the child's REPLACE flag only once all possible parents
                 * have been updated.
                 */
                ok = 1;
                if (base && (child->flags & HAMMER2_CHAIN_REPLACE)) {
                        hammer2_rollup_stats(parent, child, -1);
                        spin_lock(&above->cst.spin);
                        hammer2_base_delete(parent, base, count,
                                            &info->cache_index, &child->bref);
                        if (TAILQ_NEXT(parent, core_entry) == NULL) {
                                atomic_clear_int(&child->flags,
                                               HAMMER2_CHAIN_REPLACE);
                        }
                        spin_unlock(&above->cst.spin);
                }
                if (info->mirror_tid < child->delete_tid)
                        info->mirror_tid = child->delete_tid;
        } else if (info->pass == 2 && child->delete_tid > trans->sync_tid) {
                /*
                 * Inserting.  Only set the child's REPLACE flag indicating
                 * that the parent's blockref array entry is valid once all
                 * possible parents have been updated.
                 */
                ok = 1;
                if (base) {
                        if (child->flags & HAMMER2_CHAIN_REPLACE)
                                hammer2_rollup_stats(parent, child, 0);
                        else
                                hammer2_rollup_stats(parent, child, 1);
                        spin_lock(&above->cst.spin);
                        hammer2_base_insert(parent, base, count,
                                            &info->cache_index, &child->bref,
                                            child->flags);
                        if (TAILQ_NEXT(parent, core_entry) == NULL) {
                                atomic_set_int(&child->flags,
                                               HAMMER2_CHAIN_REPLACE);
                        }
                        spin_unlock(&above->cst.spin);
                }
                if (info->mirror_tid < child->modify_tid)
                        info->mirror_tid = child->modify_tid;
        } else {
                ok = 0;
        }

        if (info->mirror_tid < child->bref.mirror_tid) {
                info->mirror_tid = child->bref.mirror_tid;
        }
        if ((parent->bref.type == HAMMER2_BREF_TYPE_VOLUME ||
             parent->bref.type == HAMMER2_BREF_TYPE_FREEMAP) &&
            hmp->voldata.mirror_tid < child->bref.mirror_tid) {
                hmp->voldata.mirror_tid = child->bref.mirror_tid;
        }

        /*
         * Only clear MOVED once all possible parents have been flushed.
         *
         * When can we safely clear the MOVED flag?  Flushes down duplicate
         * paths can occur out of order, for example if an inode is moved
         * as part of a hardlink consolidation or if an inode is moved into
         * an indirect block indexed before the inode.
         */
        if (ok && (child->flags & HAMMER2_CHAIN_MOVED)) {
                hammer2_chain_t *scan;
                int ok = 1;

                spin_lock(&above->cst.spin);
                TAILQ_FOREACH(scan, &above->ownerq, core_entry) {
                        /*
                         * XXX weird code also checked at the top of scan2,
                         *     I would like to fix this by detaching the core
                         *     on initial hardlink consolidation (1->2 nlinks).
                         */
#if 0
                        if (scan->bref.type == HAMMER2_BREF_TYPE_INODE &&
                            (scan->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA)) {
                                continue;
                        }
#endif
                        if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED) {
                                ok = 0;
                                break;
                        }
                }
                spin_unlock(&above->cst.spin);
                if (ok) {
                        atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
                        hammer2_chain_drop(child);      /* flag */
                }
        }

        /*
         * Unlock the child.  This can wind up dropping the child's
         * last ref, removing it from the parent's RB tree, and deallocating
         * the structure.  The RB_SCAN() our caller is doing handles the
         * situation.
         */
        hammer2_chain_unlock(child);
        hammer2_chain_drop(child);
        spin_lock(&above->cst.spin);

        /*
         * The parent cleared SUBMODIFIED prior to the scan.  If the child
         * still requires a flush (possibly due to being outside the current
         * synchronization zone), we must re-set SUBMODIFIED on the way back
         * up.
         */
finalize:
        return (0);
}

static
void
hammer2_rollup_stats(hammer2_chain_t *parent, hammer2_chain_t *child, int how)
{
#if 0
        hammer2_chain_t *grandp;
#endif

        parent->data_count += child->data_count;
        parent->inode_count += child->inode_count;
        child->data_count = 0;
        child->inode_count = 0;
        if (how < 0) {
                parent->data_count -= child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count -= 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count -= child->data->ipdata.data_count;
                        parent->inode_count -= child->data->ipdata.inode_count;
#endif
                }
        } else if (how > 0) {
                parent->data_count += child->bytes;
                if (child->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        parent->inode_count += 1;
#if 0
                        /* XXX child->data may be NULL atm */
                        parent->data_count += child->data->ipdata.data_count;
                        parent->inode_count += child->data->ipdata.inode_count;
#endif
                }
        }
        if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
                parent->data->ipdata.data_count += parent->data_count;
                parent->data->ipdata.inode_count += parent->inode_count;
#if 0
                for (grandp = parent->above->first_parent;
                     grandp;
                     grandp = grandp->next_parent) {
                        grandp->data_count += parent->data_count;
                        grandp->inode_count += parent->inode_count;
                }
#endif
                parent->data_count = 0;
                parent->inode_count = 0;
        }
}