258096f0ec10be4f97f9c20854710a485e204b14
[dragonfly.git] / sys / vfs / hammer2 / hammer2_flush.c
1 /*
2  * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  *                      TRANSACTION AND FLUSH HANDLING
37  *
38  * Deceptively simple but actually fairly difficult to implement properly is
39  * how I would describe it.
40  *
41  * Flushing generally occurs bottom-up but requires a top-down scan to
42  * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
43  * tells how to recurse downward to find these chains.
44  */
45
46 #include <sys/cdefs.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/types.h>
50 #include <sys/lock.h>
51 #include <sys/uuid.h>
52
53 #include "hammer2.h"
54
55 #define FLUSH_DEBUG 0
56
57 #define HAMMER2_FLUSH_DEPTH_LIMIT       60      /* stack recursion limit */
58
59
60 /*
61  * Recursively flush the specified chain.  The chain is locked and
62  * referenced by the caller and will remain so on return.  The chain
63  * will remain referenced throughout but can temporarily lose its
64  * lock during the recursion to avoid unnecessarily stalling user
65  * processes.
66  */
/*
 * Per-flush bookkeeping passed down the recursive flush.  One instance
 * lives on the stack of hammer2_flush() and is threaded through
 * hammer2_flush_core() / hammer2_flush_recurse().
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* current parent; referenced, not locked */
	int		depth;		/* recursion depth vs FLUSH_DEPTH_LIMIT */
	int		error;		/* cumulative error */
	int		flags;		/* HAMMER2_FLUSH_* flags (TOP cleared) */
#ifdef HAMMER2_SCAN_DEBUG
	/* statistics: children scanned and which flag/type triggered work */
	long		scan_count;
	long		scan_mod_count;
	long		scan_upd_count;
	long		scan_onf_count;
	long		scan_del_count;
	long		scan_btype[7];
#endif
	hammer2_chain_t *debug;		/* first skipped chain when debug 0x200 */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static int hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int flags);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
88
89 /*
90  * Any per-pfs transaction initialization goes here.
91  */
/*
 * Any per-pfs transaction initialization goes here.
 *
 * Currently a no-op placeholder; kept so callers have a stable hook
 * for future per-pfs transaction state setup.
 */
void
hammer2_trans_manage_init(hammer2_pfs_t *pmp)
{
}
96
97 /*
98  * Transaction support for any modifying operation.  Transactions are used
99  * in the pmp layer by the frontend and in the spmp layer by the backend.
100  *
101  * 0                    - Normal transaction.  No interlock currently.
102  *
103  * TRANS_ISFLUSH        - Flush transaction.  Interlocks against other flush
104  *                        transactions.
105  *
106  * TRANS_BUFCACHE       - Buffer cache transaction.  No interlock.
107  *
108  * TRANS_SIDEQ          - Run the sideq (only tested in trans_done())
109  *
110  * Initializing a new transaction allocates a transaction ID.  Typically
111  * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
112  * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
113  * media target.  The latter mode is used by the recovery code.
114  */
void
hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;
	int dowait;

	/*
	 * Lockless CAS loop on pmp->trans.flags.  The low bits form a
	 * transaction count (see "+ 1" below); the high bits carry the
	 * HAMMER2_TRANS_* state flags.
	 */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();		/* prevent re-load of trans.flags */
		dowait = 0;

		if (flags & HAMMER2_TRANS_ISFLUSH) {
			/*
			 * Interlock against other flush transactions.
			 * If one is already running, advertise WAITING
			 * and block until woken.
			 */
			if (oflags & HAMMER2_TRANS_ISFLUSH) {
				nflags = oflags | HAMMER2_TRANS_WAITING;
				dowait = 1;
			} else {
				nflags = (oflags | flags) + 1;
			}
		} else if (flags & HAMMER2_TRANS_BUFCACHE) {
			/*
			 * Requesting strategy transaction from buffer-cache,
			 * or a VM getpages/putpages through the buffer cache.
			 * We must allow such transactions in all situations
			 * to avoid deadlocks.
			 */
			nflags = (oflags | flags) + 1;
		} else {
			/*
			 * Normal transaction.  We currently only interlock
			 * against COPYQ.  We do not interlock against
			 * BUFCACHE or ISFLUSH.  COPYQ is used to interlock
			 * the transfer of SIDEQ into SYNCQ.
			 *
			 * Note that vnode locks may be held going into
			 * this call.
			 *
			 * NOTE: Remember that non-modifying operations
			 *       such as read, stat, readdir, etc, do
			 *       not use transactions.
			 */
			if (oflags & HAMMER2_TRANS_COPYQ) {
				nflags = oflags | HAMMER2_TRANS_WAITING;
				dowait = 1;
			} else {
				nflags = (oflags | flags) + 1;
			}
		}
		/*
		 * The interlock must be armed before the CAS publishes
		 * WAITING, otherwise a wakeup between the CAS and tsleep
		 * could be lost.
		 */
		if (dowait)
			tsleep_interlock(&pmp->trans.sync_wait, 0);
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			if (dowait == 0)
				break;
			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
			       "h2trans", hz);
			/* retry */
		} else {
			cpu_pause();
			/* retry */
		}
		/* retry */
	}
}
181
182 /*
183  * Start a sub-transaction, there is no 'subdone' function.  This will
184  * issue a new modify_tid (mtid) for the current transaction, which is a
185  * CLC (cluster level change) id and not a per-node id.
186  *
187  * This function must be called for each XOP when multiple XOPs are run in
188  * sequence within a transaction.
189  *
190  * Callers typically update the inode with the transaction mtid manually
191  * to enforce sequencing.
192  */
193 hammer2_tid_t
194 hammer2_trans_sub(hammer2_pfs_t *pmp)
195 {
196         hammer2_tid_t mtid;
197
198         mtid = atomic_fetchadd_64(&pmp->modify_tid, 1);
199
200         return (mtid);
201 }
202
/*
 * Atomically set the specified HAMMER2_TRANS_* flags on the pfs
 * transaction state.  Counterpart to hammer2_trans_clearflags().
 */
void
hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags)
{
	atomic_set_int(&pmp->trans.flags, flags);
}
208
/*
 * Atomically clear the specified HAMMER2_TRANS_* flags on the pfs
 * transaction state.  If clearing drops the WAITING flag, wake up any
 * threads blocked in hammer2_trans_init().
 */
void
hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;

	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();		/* prevent re-load of trans.flags */
		nflags = oflags & ~flags;
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			/* wake waiters only on the WAITING 1->0 transition */
			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
				wakeup(&pmp->trans.sync_wait);
			break;
		}
		cpu_pause();
		/* retry */
	}
}
228
/*
 * Terminate a transaction started by hammer2_trans_init(), decrementing
 * the transaction count and clearing the passed-in flags.  May kick the
 * syncer to process the sideq, and wakes up any WAITING threads when
 * appropriate.
 */
void
hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;

	/*
	 * Modifying ops on the front-end can cause dirty inodes to
	 * build up in the sideq.  We don't flush these on inactive/reclaim
	 * due to potential deadlocks, so we have to deal with them from
	 * inside other nominal modifying front-end transactions.
	 */
	if ((flags & HAMMER2_TRANS_SIDEQ) &&
	    pmp->sideq_count > (pmp->inum_count >> 3) &&
	    pmp->mp) {
		/* sideq_count exceeds 1/8 of inum_count: nudge the syncer */
		speedup_syncer(pmp->mp);
	}

	/*
	 * Clean-up the transaction.  CAS loop: decrement the embedded
	 * transaction count and clear the caller's flags in one shot.
	 */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();		/* prevent re-load of trans.flags */
		KKASSERT(oflags & HAMMER2_TRANS_MASK);

		nflags = (oflags - 1) & ~flags;
		if (flags & HAMMER2_TRANS_ISFLUSH) {
			/* flush completion releases flush waiters */
			nflags &= ~HAMMER2_TRANS_WAITING;
		}
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			/* wake waiters only on the WAITING 1->0 transition */
			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
				wakeup(&pmp->trans.sync_wait);
			break;
		}
		cpu_pause();
		/* retry */
	}
}
268
269 /*
270  * Obtain new, unique inode number (not serialized by caller).
271  */
272 hammer2_tid_t
273 hammer2_trans_newinum(hammer2_pfs_t *pmp)
274 {
275         hammer2_tid_t tid;
276
277         tid = atomic_fetchadd_64(&pmp->inode_tid, 1);
278
279         return tid;
280 }
281
282 /*
283  * Assert that a strategy call is ok here.  Currently we allow strategy
284  * calls in all situations, including during flushes.  Previously:
285  *      (old) (1) In a normal transaction.
286  *      (old) (2) In a flush transaction only if PREFLUSH is also set.
287  */
/*
 * Assert that a strategy call is ok here.  Currently we allow strategy
 * calls in all situations, including during flushes, so this is a no-op.
 * Previously:
 *      (old) (1) In a normal transaction.
 *      (old) (2) In a flush transaction only if PREFLUSH is also set.
 */
void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
#if 0
	/* retained for reference; re-enable to restore the old policy */
	KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0 ||
		 (pmp->trans.flags & HAMMER2_TRANS_PREFLUSH));
#endif
}
296
297 /*
298  * Flush the chain and all modified sub-chains through the specified
299  * synchronization point, propagating blockref updates back up.  As
300  * part of this propagation, mirror_tid and inode/data usage statistics
301  * propagates back upward.
302  *
303  * Returns a HAMMER2 error code, 0 if no error.  Note that I/O errors from
304  * buffers dirtied during the flush operation can occur later.
305  *
306  * modify_tid (clc - cluster level change) is not propagated.
307  *
308  * update_tid (clc) is used for validation and is not propagated by this
309  * function.
310  *
311  * This routine can be called from several places but the most important
312  * is from VFS_SYNC (frontend) via hammer2_xop_inode_flush (backend).
313  *
314  * chain is locked on call and will remain locked on return.  The chain's
315  * UPDATE flag indicates that its parent's block table (which is not yet
316  * part of the flush) should be updated.
317  *
318  * flags:
319  *      HAMMER2_FLUSH_TOP       Indicates that this is the top of the flush.
320  *                              Is cleared for the recursion.
321  *
322  *      HAMMER2_FLUSH_ALL       Recurse everything
323  *
324  *      HAMMER2_FLUSH_INODE_STOP
325  *                              Stop at PFS inode or normal inode boundary
326  */
int
hammer2_flush(hammer2_chain_t *chain, int flags)
{
	hammer2_flush_info_t info;
	hammer2_dev_t *hmp;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	/* TOP applies only to this level; never propagated into recursion */
	info.flags = flags & ~HAMMER2_FLUSH_TOP;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	hmp = chain->hmp;
	loops = 0;

	for (;;) {
		/*
		 * [re]flush chain as the deep recursion may have generated
		 * additional modifications.
		 */
		if (info.parent != chain->parent) {
			/*
			 * Parent changed under us (race); re-acquire the
			 * ref against the actual parent before retrying.
			 */
			if (hammer2_debug & 0x0040) {
				kprintf("LOST CHILD4 %p->%p "
					"(actual parent %p)\n",
					info.parent, chain, chain->parent);
			}
			hammer2_chain_drop(info.parent);
			info.parent = chain->parent;
			hammer2_chain_ref(info.parent);
		}
		/* non-zero return means the parent changed: retry */
		if (hammer2_flush_core(&info, chain, flags) == 0)
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
#ifdef HAMMER2_SCAN_DEBUG
	if (info.scan_count >= 10)
	kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) "
		"bt(%ld,%ld,%ld,%ld,%ld,%ld)\n",
		info.scan_count,
		info.scan_mod_count,
		info.scan_upd_count,
		info.scan_onf_count,
		info.scan_del_count,
		info.scan_btype[1],
		info.scan_btype[2],
		info.scan_btype[3],
		info.scan_btype[4],
		info.scan_btype[5],
		info.scan_btype[6]);
#endif
	/* drop the extra refs acquired above; caller's lock/ref remain */
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
	return (info.error);
}
407
408 /*
409  * This is the core of the chain flushing code.  The chain is locked by the
410  * caller and must also have an extra ref on it by the caller, and remains
411  * locked and will have an extra ref on return.  info.parent is referenced
412  * but not locked.
413  *
414  * Upon return, the caller can test the UPDATE bit on the chain to determine
415  * if the parent needs updating.
416  *
417  * If non-zero is returned, the chain's parent changed during the flush and
418  * the caller must retry the operation.
419  *
420  * (1) Determine if this node is a candidate for the flush, return if it is
421  *     not.  fchain and vchain are always candidates for the flush.
422  *
423  * (2) If we recurse too deep the chain is entered onto the deferral list and
424  *     the current flush stack is aborted until after the deferral list is
425  *     run.
426  *
427  * (3) Recursively flush live children (rbtree).  This can create deferrals.
428  *     A successful flush clears the MODIFIED and UPDATE bits on the children
429  *     and typically causes the parent to be marked MODIFIED as the children
430  *     update the parent's block table.  A parent might already be marked
 431  *     MODIFIED due to a deletion (whose blocktable update in the parent is
432  *     handled by the frontend), or if the parent itself is modified by the
433  *     frontend for other reasons.
434  *
435  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
436  *     Deleted-but-open inodes can still be individually flushed via the
437  *     filesystem syncer.
438  *
439  * (5) Delete parents on the way back up if they are normal indirect blocks
440  *     and have no children.
441  *
442  * (6) Note that an unmodified child may still need the block table in its
443  *     parent updated (e.g. rename/move).  The child will have UPDATE set
444  *     in this case.
445  *
446  *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
447  *
448  * blockref.modify_tid is consistent only within a PFS, and will not be
449  * consistent during synchronization.  mirror_tid is consistent across the
450  * block device regardless of the PFS.
451  */
452 static int
453 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
454                    int flags)
455 {
456         hammer2_chain_t *parent;
457         hammer2_dev_t *hmp;
458         int save_error;
459         int retry;
460
461         retry = 0;
462
463         /*
464          * (1) Optimize downward recursion to locate nodes needing action.
465          *     Nothing to do if none of these flags are set.
466          */
467         if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
468                 if (hammer2_debug & 0x200) {
469                         if (info->debug == NULL)
470                                 info->debug = chain;
471                 } else {
472                         return 0;
473                 }
474         }
475
476         hmp = chain->hmp;
477
478         /*
479          * NOTE: parent can be NULL, usually due to destroy races.
480          */
481         parent = info->parent;
482         KKASSERT(chain->parent == parent);
483
484         /*
485          * Downward search recursion
486          *
487          * We must be careful on cold stops.  If CHAIN_UPDATE is set and
488          * we stop cold, the update can wind up never being applied.  This
489          * situation most typically occurs on inode boundaries due to the way
490          * hammer2_vfs_sync() breaks-up the flush.  As a safety, we
491          * flush-through such situations. XXX removed
492          */
493         if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
494                    /* (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && */
495                    (flags & HAMMER2_FLUSH_ALL) == 0 &&
496                    (flags & HAMMER2_FLUSH_TOP) == 0 &&
497                    chain->pmp && chain->pmp->mp) {
498                 /*
499                  * If FLUSH_ALL is not specified the caller does not want
500                  * to recurse through PFS roots that have been mounted.
501                  *
502                  * (If the PFS has not been mounted there may not be
503                  *  anything monitoring its chains and its up to us
504                  *  to flush it).
505                  *
506                  * The typical sequence is to flush dirty PFS's starting at
507                  * their root downward, then flush the device root (vchain).
508                  * It is this second flush that typically leaves out the
509                  * ALL flag.
510                  *
511                  * However we must still process the PFSROOT chains for block
512                  * table updates in their parent (which IS part of our flush).
513                  *
514                  * NOTE: The volume root, vchain, does not set PFSBOUNDARY.
515                  *
516                  * NOTE: We must re-set ONFLUSH in the parent to retain if
517                  *       this chain (that we are skipping) requires work.
518                  */
519                 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
520                                     HAMMER2_CHAIN_DESTROY |
521                                     HAMMER2_CHAIN_MODIFIED)) {
522                         hammer2_chain_setflush(parent);
523                 }
524                 goto done;
525         } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
526                    /* (chain->flags & HAMMER2_CHAIN_UPDATE) == 0 && */
527                    (flags & HAMMER2_FLUSH_INODE_STOP) &&
528                    (flags & HAMMER2_FLUSH_ALL) == 0 &&
529                    (flags & HAMMER2_FLUSH_TOP) == 0 &&
530                    chain->pmp && chain->pmp->mp) {
531                 /*
532                  * When FLUSH_INODE_STOP is specified we are being asked not
533                  * to include any inode changes for inodes we encounter,
534                  * with the exception of the inode that the flush began with.
535                  * So: INODE, INODE_STOP, and TOP==0 basically.
536                  */
537                 goto done;
538 #if 0
539                 /*
540                  * If FLUSH_INODE_STOP is specified and both ALL and TOP
541                  * are clear, we must not flush the chain.  The chain should
542                  * have already been flushed and any further ONFLUSH/UPDATE
543                  * setting will be related to the next flush.
544                  *
545                  * This features allows us to flush inodes independently of
546                  * each other and meta-data above the inodes separately.
547                  */
548                 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
549                                     HAMMER2_CHAIN_DESTROY |
550                                     HAMMER2_CHAIN_MODIFIED)) {
551                         if (parent)
552                                 hammer2_chain_setflush(parent);
553                 }
554 #endif
555         } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
556                 /*
557                  * Recursion depth reached.
558                  */
559                 panic("hammer2: flush depth limit");
560         } else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
561                                    HAMMER2_CHAIN_DESTROY)) {
562                 /*
563                  * Downward recursion search (actual flush occurs bottom-up).
564                  * pre-clear ONFLUSH.  It can get set again due to races or
565                  * flush errors, which we want so the scan finds us again in
566                  * the next flush.
567                  *
568                  * We must also recurse if DESTROY is set so we can finally
569                  * get rid of the related children, otherwise the node will
570                  * just get re-flushed on lastdrop.
571                  *
572                  * WARNING!  The recursion will unlock/relock info->parent
573                  *           (which is 'chain'), potentially allowing it
574                  *           to be ripped up.
575                  */
576                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
577                 save_error = info->error;
578                 info->error = 0;
579                 info->parent = chain;
580
581                 /*
582                  * We may have to do this twice to catch any indirect
583                  * block maintenance that occurs.
584                  */
585                 hammer2_spin_ex(&chain->core.spin);
586                 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
587                         NULL, hammer2_flush_recurse, info);
588                 if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
589                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
590                         RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
591                                 NULL, hammer2_flush_recurse, info);
592                 }
593                 hammer2_spin_unex(&chain->core.spin);
594                 info->parent = parent;
595
596                 /*
597                  * Re-set the flush bits if the flush was incomplete or
598                  * an error occurred.  If an error occurs it is typically
599                  * an allocation error.  Errors do not cause deferrals.
600                  */
601                 if (info->error)
602                         hammer2_chain_setflush(chain);
603                 info->error |= save_error;
604
605                 /*
606                  * If we lost the parent->chain association we have to
607                  * stop processing this chain because it is no longer
608                  * in this recursion.  If it moved, it will be handled
609                  * by the ONFLUSH flag elsewhere.
610                  */
611                 if (chain->parent != parent) {
612                         kprintf("LOST CHILD2 %p->%p (actual parent %p)\n",
613                                 parent, chain, chain->parent);
614                         goto done;
615                 }
616         }
617
618         /*
619          * Now we are in the bottom-up part of the recursion.
620          *
621          * We continue to try to update the chain on lower-level errors, but
622          * the flush code may decide not to flush the volume root.
623          *
624          * XXX should we continue to try to update the chain if an error
625          *     occurred?
626          */
627
628         /*
629          * Both parent and chain must be locked in order to flush chain,
630          * in order to properly update the parent under certain conditions.
631          *
632          * In addition, we can't safely unlock/relock the chain once we
633          * start flushing the chain itself, which we would have to do later
634          * on in order to lock the parent if we didn't do that now.
635          */
636         hammer2_chain_ref_hold(chain);
637         hammer2_chain_unlock(chain);
638         if (parent)
639                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
640         hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
641         hammer2_chain_drop_unhold(chain);
642
643         /*
644          * Can't process if we can't access their content.
645          */
646         if ((parent && parent->error) || chain->error) {
647                 kprintf("hammer2: chain error during flush\n");
648                 info->error |= chain->error;
649                 if (parent) {
650                         info->error |= parent->error;
651                         hammer2_chain_unlock(parent);
652                 }
653                 goto done;
654         }
655
656         if (chain->parent != parent) {
657                 if (hammer2_debug & 0x0040) {
658                         kprintf("LOST CHILD3 %p->%p (actual parent %p)\n",
659                                 parent, chain, chain->parent);
660                 }
661                 KKASSERT(parent != NULL);
662                 hammer2_chain_unlock(parent);
663                 retry = 1;
664                 goto done;
665         }
666
667         /*
668          * Propagate the DESTROY flag downwards.  This dummies up the flush
669          * code and tries to invalidate related buffer cache buffers to
670          * avoid the disk write.
671          */
672         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
673                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
674
675         /*
676          * Dispose of the modified bit.
677          *
678          * If parent is present, the UPDATE bit should already be set.
679          * UPDATE should already be set.
680          * bref.mirror_tid should already be set.
681          */
682         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
683                 KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
684                          chain->parent == NULL);
685                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
686                 atomic_add_long(&hammer2_count_modified_chains, -1);
687
688                 /*
689                  * Manage threads waiting for excessive dirty memory to
690                  * be retired.
691                  */
692                 if (chain->pmp)
693                         hammer2_pfs_memory_wakeup(chain->pmp);
694
695 #if 0
696                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
697                     chain != &hmp->vchain &&
698                     chain != &hmp->fchain) {
699                         /*
700                          * Set UPDATE bit indicating that the parent block
701                          * table requires updating.
702                          */
703                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
704                 }
705 #endif
706
707                 /*
708                  * Issue the flush.  This is indirect via the DIO.
709                  *
710                  * NOTE: A DELETED node that reaches this point must be
711                  *       flushed for synchronization point consistency.
712                  *
713                  * NOTE: Even though MODIFIED was already set, the related DIO
714                  *       might not be dirty due to a system buffer cache
715                  *       flush and must be set dirty if we are going to make
716                  *       further modifications to the buffer.  Chains with
717                  *       embedded data don't need this.
718                  */
719                 if (hammer2_debug & 0x1000) {
720                         kprintf("Flush %p.%d %016jx/%d data=%016jx\n",
721                                 chain, chain->bref.type,
722                                 (uintmax_t)chain->bref.key,
723                                 chain->bref.keybits,
724                                 (uintmax_t)chain->bref.data_off);
725                 }
726                 if (hammer2_debug & 0x2000) {
727                         Debugger("Flush hell");
728                 }
729
730                 /*
731                  * Update chain CRCs for flush.
732                  *
733                  * NOTE: Volume headers are NOT flushed here as they require
734                  *       special processing.
735                  */
736                 switch(chain->bref.type) {
737                 case HAMMER2_BREF_TYPE_FREEMAP:
738                         /*
739                          * Update the volume header's freemap_tid to the
740                          * freemap's flushing mirror_tid.
741                          *
742                          * (note: embedded data, do not call setdirty)
743                          */
744                         KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
745                         KKASSERT(chain == &hmp->fchain);
746                         hmp->voldata.freemap_tid = chain->bref.mirror_tid;
747                         if (hammer2_debug & 0x8000) {
748                                 /* debug only, avoid syslogd loop */
749                                 kprintf("sync freemap mirror_tid %08jx\n",
750                                         (intmax_t)chain->bref.mirror_tid);
751                         }
752
753                         /*
754                          * The freemap can be flushed independently of the
755                          * main topology, but for the case where it is
756                          * flushed in the same transaction, and flushed
757                          * before vchain (a case we want to allow for
758                          * performance reasons), make sure modifications
759                          * made during the flush under vchain use a new
760                          * transaction id.
761                          *
762                          * Otherwise the mount recovery code will get confused.
763                          */
764                         ++hmp->voldata.mirror_tid;
765                         break;
766                 case HAMMER2_BREF_TYPE_VOLUME:
767                         /*
768                          * The free block table is flushed by
769                          * hammer2_vfs_sync() before it flushes vchain.
770                          * We must still hold fchain locked while copying
771                          * voldata to volsync, however.
772                          *
			 * These do not error per se since their data does
774                          * not need to be re-read from media on lock.
775                          *
776                          * (note: embedded data, do not call setdirty)
777                          */
778                         hammer2_chain_lock(&hmp->fchain,
779                                            HAMMER2_RESOLVE_ALWAYS);
780                         hammer2_voldata_lock(hmp);
781                         if (hammer2_debug & 0x8000) {
782                                 /* debug only, avoid syslogd loop */
783                                 kprintf("sync volume  mirror_tid %08jx\n",
784                                         (intmax_t)chain->bref.mirror_tid);
785                         }
786
787                         /*
788                          * Update the volume header's mirror_tid to the
789                          * main topology's flushing mirror_tid.  It is
790                          * possible that voldata.mirror_tid is already
791                          * beyond bref.mirror_tid due to the bump we made
792                          * above in BREF_TYPE_FREEMAP.
793                          */
794                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
795                                 hmp->voldata.mirror_tid =
796                                         chain->bref.mirror_tid;
797                         }
798
799                         /*
800                          * The volume header is flushed manually by the
801                          * syncer, not here.  All we do here is adjust the
802                          * crc's.
803                          */
804                         KKASSERT(chain->data != NULL);
805                         KKASSERT(chain->dio == NULL);
806
807                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
808                                 hammer2_icrc32(
809                                         (char *)&hmp->voldata +
810                                          HAMMER2_VOLUME_ICRC1_OFF,
811                                         HAMMER2_VOLUME_ICRC1_SIZE);
812                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
813                                 hammer2_icrc32(
814                                         (char *)&hmp->voldata +
815                                          HAMMER2_VOLUME_ICRC0_OFF,
816                                         HAMMER2_VOLUME_ICRC0_SIZE);
817                         hmp->voldata.icrc_volheader =
818                                 hammer2_icrc32(
819                                         (char *)&hmp->voldata +
820                                          HAMMER2_VOLUME_ICRCVH_OFF,
821                                         HAMMER2_VOLUME_ICRCVH_SIZE);
822
823                         if (hammer2_debug & 0x8000) {
824                                 /* debug only, avoid syslogd loop */
825                                 kprintf("syncvolhdr %016jx %016jx\n",
826                                         hmp->voldata.mirror_tid,
827                                         hmp->vchain.bref.mirror_tid);
828                         }
829                         hmp->volsync = hmp->voldata;
830                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
831                         hammer2_voldata_unlock(hmp);
832                         hammer2_chain_unlock(&hmp->fchain);
833                         break;
834                 case HAMMER2_BREF_TYPE_DATA:
835                         /*
836                          * Data elements have already been flushed via the
837                          * logical file buffer cache.  Their hash was set in
838                          * the bref by the vop_write code.  Do not re-dirty.
839                          *
840                          * Make sure any device buffer(s) have been flushed
841                          * out here (there aren't usually any to flush) XXX.
842                          */
843                         break;
844                 case HAMMER2_BREF_TYPE_INDIRECT:
845                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
846                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
847                         /*
848                          * Buffer I/O will be cleaned up when the volume is
849                          * flushed (but the kernel is free to flush it before
850                          * then, as well).
851                          */
852                         KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
853                         hammer2_chain_setcheck(chain, chain->data);
854                         break;
855                 case HAMMER2_BREF_TYPE_DIRENT:
856                         /*
857                          * A directory entry can use the check area to store
858                          * the filename for filenames <= 64 bytes, don't blow
859                          * it up!
860                          */
861                         KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
862                         if (chain->bytes)
863                                 hammer2_chain_setcheck(chain, chain->data);
864                         break;
865                 case HAMMER2_BREF_TYPE_INODE:
866                         /*
867                          * NOTE: We must call io_setdirty() to make any late
868                          *       changes to the inode data, the system might
869                          *       have already flushed the buffer.
870                          */
871                         if (chain->data->ipdata.meta.op_flags &
872                             HAMMER2_OPFLAG_PFSROOT) {
873                                 /*
874                                  * non-NULL pmp if mounted as a PFS.  We must
875                                  * sync fields cached in the pmp? XXX
876                                  */
877                                 hammer2_inode_data_t *ipdata;
878
879                                 hammer2_io_setdirty(chain->dio);
880                                 ipdata = &chain->data->ipdata;
881                                 if (chain->pmp) {
882                                         ipdata->meta.pfs_inum =
883                                                 chain->pmp->inode_tid;
884                                 }
885                         } else {
886                                 /* can't be mounted as a PFS */
887                         }
888
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
895                 default:
896                         KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
897                         panic("hammer2_flush_core: unsupported "
898                               "embedded bref %d",
899                               chain->bref.type);
900                         /* NOT REACHED */
901                 }
902
903                 /*
904                  * If the chain was destroyed try to avoid unnecessary I/O
905                  * that might not have yet occurred.  Remove the data range
		 * from dedup candidacy and attempt to invalidate the
907                  * potentially dirty portion of the I/O buffer.
908                  */
909                 if (chain->flags & HAMMER2_CHAIN_DESTROY) {
910                         hammer2_io_dedup_delete(hmp,
911                                                 chain->bref.type,
912                                                 chain->bref.data_off,
913                                                 chain->bytes);
914 #if 0
915                         hammer2_io_t *dio;
916                         if (chain->dio) {
917                                 hammer2_io_inval(chain->dio,
918                                                  chain->bref.data_off,
919                                                  chain->bytes);
920                         } else if ((dio = hammer2_io_getquick(hmp,
921                                                   chain->bref.data_off,
922                                                   chain->bytes,
923                                                   1)) != NULL) {
924                                 hammer2_io_inval(dio,
925                                                  chain->bref.data_off,
926                                                  chain->bytes);
927                                 hammer2_io_putblk(&dio);
928                         }
929 #endif
930                 }
931         }
932
933         /*
934          * If UPDATE is set the parent block table may need to be updated.
935          * This can fail if the hammer2_chain_modify() fails.
936          *
937          * NOTE: UPDATE may be set on vchain or fchain in which case
938          *       parent could be NULL, or on an inode that has not yet
939          *       been inserted into the radix tree.  It's easiest to allow
940          *       the case and test for NULL.  parent can also wind up being
941          *       NULL due to a deletion so we need to handle the case anyway.
942          *
943          * NOTE: UPDATE can be set when chains are renamed into or out of
944          *       an indirect block, without the chain itself being flagged
945          *       MODIFIED.
946          *
947          * If no parent exists we can just clear the UPDATE bit.  If the
948          * chain gets reattached later on the bit will simply get set
949          * again.
950          */
951         if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL)
952                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
953
954         /*
955          * When flushing an inode outside of a FLUSH_FSSYNC we must NOT
956          * update the parent block table to point at the flushed inode.
957          * The block table should only ever be updated by the filesystem
958          * sync code.  If we do, inode<->inode dependencies (such as
959          * directory entries vs inode nlink count) can wind up not being
960          * flushed together and result in a broken topology if a crash/reboot
961          * occurs at the wrong time.
962          */
963         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
964             (flags & HAMMER2_FLUSH_FSSYNC) == 0 &&
965             (flags & HAMMER2_FLUSH_ALL) == 0 &&
966             chain->pmp && chain->pmp->mp) {
967                 goto skipupdate;
968         }
969
970         /*
971          * The chain may need its blockrefs updated in the parent, normal
972          * path.
973          */
974         if (chain->flags & HAMMER2_CHAIN_UPDATE) {
975                 hammer2_blockref_t *base;
976                 int count;
977
978                 /*
979                  * Clear UPDATE flag, mark parent modified, update its
980                  * modify_tid if necessary, and adjust the parent blockmap.
981                  */
982                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
983
984                 /*
985                  * (optional code)
986                  *
987                  * Avoid actually modifying and updating the parent if it
988                  * was flagged for destruction.  This can greatly reduce
989                  * disk I/O in large tree removals because the
990                  * hammer2_io_setinval() call in the upward recursion
991                  * (see MODIFIED code above) can only handle a few cases.
992                  */
993                 if (parent->flags & HAMMER2_CHAIN_DESTROY) {
994                         if (parent->bref.modify_tid < chain->bref.modify_tid) {
995                                 parent->bref.modify_tid =
996                                         chain->bref.modify_tid;
997                         }
998                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
999                                                         HAMMER2_CHAIN_BMAPUPD);
1000                         goto skipupdate;
1001                 }
1002
1003                 /*
1004                  * The flusher is responsible for deleting empty indirect
1005                  * blocks at this point.  If we don't do this, no major harm
1006                  * will be done but the empty indirect blocks will stay in
		 * the topology and make it messy and inefficient.
1008                  *
1009                  * The flusher is also responsible for collapsing the
1010                  * content of an indirect block into its parent whenever
1011                  * possible (with some hysteresis).  Not doing this will also
1012                  * not harm the topology, but would make it messy and
1013                  * inefficient.
1014                  */
1015                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1016                         if (hammer2_chain_indirect_maintenance(parent, chain))
1017                                 goto skipupdate;
1018                 }
1019
1020                 /*
1021                  * We are updating the parent's blockmap, the parent must
1022                  * be set modified.  If this fails we re-set the UPDATE flag
1023                  * in the child.
1024                  *
1025                  * NOTE! A modification error can be ENOSPC.  We still want
1026                  *       to flush modified chains recursively, not break out,
1027                  *       so we just skip the update in this situation and
1028                  *       continue.  That is, we still need to try to clean
1029                  *       out dirty chains and buffers.
1030                  *
1031                  *       This may not help bulkfree though. XXX
1032                  */
1033                 save_error = hammer2_chain_modify(parent, 0, 0, 0);
1034                 if (save_error) {
1035                         info->error |= save_error;
1036                         kprintf("hammer2_flush: %016jx.%02x error=%08x\n",
1037                                 parent->bref.data_off, parent->bref.type,
1038                                 save_error);
1039                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1040                         goto skipupdate;
1041                 }
1042                 if (parent->bref.modify_tid < chain->bref.modify_tid)
1043                         parent->bref.modify_tid = chain->bref.modify_tid;
1044
1045                 /*
1046                  * Calculate blockmap pointer
1047                  */
1048                 switch(parent->bref.type) {
1049                 case HAMMER2_BREF_TYPE_INODE:
1050                         /*
1051                          * Access the inode's block array.  However, there is
1052                          * no block array if the inode is flagged DIRECTDATA.
1053                          */
1054                         if (parent->data &&
1055                             (parent->data->ipdata.meta.op_flags &
1056                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1057                                 base = &parent->data->
1058                                         ipdata.u.blockset.blockref[0];
1059                         } else {
1060                                 base = NULL;
1061                         }
1062                         count = HAMMER2_SET_COUNT;
1063                         break;
1064                 case HAMMER2_BREF_TYPE_INDIRECT:
1065                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1066                         if (parent->data)
1067                                 base = &parent->data->npdata[0];
1068                         else
1069                                 base = NULL;
1070                         count = parent->bytes / sizeof(hammer2_blockref_t);
1071                         break;
1072                 case HAMMER2_BREF_TYPE_VOLUME:
1073                         base = &chain->hmp->voldata.sroot_blockset.blockref[0];
1074                         count = HAMMER2_SET_COUNT;
1075                         break;
1076                 case HAMMER2_BREF_TYPE_FREEMAP:
1077                         base = &parent->data->npdata[0];
1078                         count = HAMMER2_SET_COUNT;
1079                         break;
1080                 default:
1081                         base = NULL;
1082                         count = 0;
1083                         panic("hammer2_flush_core: "
1084                               "unrecognized blockref type: %d",
1085                               parent->bref.type);
1086                 }
1087
1088                 /*
1089                  * Blocktable updates
1090                  *
1091                  * We synchronize pending statistics at this time.  Delta
1092                  * adjustments designated for the current and upper level
1093                  * are synchronized.
1094                  */
1095                 if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
1096                         if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
1097                                 hammer2_spin_ex(&parent->core.spin);
1098                                 hammer2_base_delete(parent, base, count, chain,
1099                                                     NULL);
1100                                 hammer2_spin_unex(&parent->core.spin);
1101                                 /* base_delete clears both bits */
1102                         } else {
1103                                 atomic_clear_int(&chain->flags,
1104                                                  HAMMER2_CHAIN_BMAPUPD);
1105                         }
1106                 }
1107                 if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
1108                         hammer2_spin_ex(&parent->core.spin);
1109                         hammer2_base_insert(parent, base, count,
1110                                             chain, &chain->bref);
1111                         hammer2_spin_unex(&parent->core.spin);
1112                         /* base_insert sets BMAPPED */
1113                 }
1114         }
1115 skipupdate:
1116         if (parent)
1117                 hammer2_chain_unlock(parent);
1118
1119         /*
1120          * Final cleanup after flush
1121          */
1122 done:
1123         KKASSERT(chain->refs > 0);
1124         if (hammer2_debug & 0x200) {
1125                 if (info->debug == chain)
1126                         info->debug = NULL;
1127         }
1128         return retry;
1129 }
1130
1131 /*
1132  * Flush recursion helper, called from flush_core, calls flush_core.
1133  *
1134  * Flushes the children of the caller's chain (info->parent), restricted
1135  * by sync_tid.  Set info->domodify if the child's blockref must propagate
1136  * back up to the parent.
1137  *
1138  * This function may set info->error as a side effect.
1139  *
1140  * Ripouts can move child from rbtree to dbtree or dbq but the caller's
1141  * flush scan order prevents any chains from being lost.  A child can be
1142  * executes more than once.
1143  *
1144  * WARNING! If we do not call hammer2_flush_core() we must update
1145  *          bref.mirror_tid ourselves to indicate that the flush has
1146  *          processed the child.
1147  *
1148  * WARNING! parent->core spinlock is held on entry and return.
1149  */
1150 static int
1151 hammer2_flush_recurse(hammer2_chain_t *child, void *data)
1152 {
1153         hammer2_flush_info_t *info = data;
1154         hammer2_chain_t *parent = info->parent;
1155
1156 #ifdef HAMMER2_SCAN_DEBUG
1157         ++info->scan_count;
1158         if (child->flags & HAMMER2_CHAIN_MODIFIED)
1159                 ++info->scan_mod_count;
1160         if (child->flags & HAMMER2_CHAIN_UPDATE)
1161                 ++info->scan_upd_count;
1162         if (child->flags & HAMMER2_CHAIN_ONFLUSH)
1163                 ++info->scan_onf_count;
1164 #endif
1165
1166         /*
1167          * (child can never be fchain or vchain so a special check isn't
1168          *  needed).
1169          *
1170          * We must ref the child before unlocking the spinlock.
1171          *
1172          * The caller has added a ref to the parent so we can temporarily
1173          * unlock it in order to lock the child.  However, if it no longer
1174          * winds up being the child of the parent we must skip this child.
1175          *
1176          * NOTE! chain locking errors are fatal.  They are never out-of-space
1177          *       errors.
1178          */
1179         hammer2_chain_ref(child);
1180         hammer2_spin_unex(&parent->core.spin);
1181
1182         hammer2_chain_ref_hold(parent);
1183         hammer2_chain_unlock(parent);
1184         hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
1185         if (child->parent != parent) {
1186                 kprintf("LOST CHILD1 %p->%p (actual parent %p)\n",
1187                         parent, child, child->parent);
1188                 goto done;
1189         }
1190         if (child->error) {
1191                 kprintf("CHILD ERROR DURING FLUSH LOCK %p->%p\n",
1192                         parent, child);
1193                 info->error |= child->error;
1194                 goto done;
1195         }
1196
1197         /*
1198          * Must propagate the DESTROY flag downwards, otherwise the
1199          * parent could end up never being removed because it will
1200          * be requeued to the flusher if it survives this run due to
1201          * the flag.
1202          */
1203         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
1204                 atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROY);
1205 #ifdef HAMMER2_SCAN_DEBUG
1206         if (child->flags & HAMMER2_CHAIN_DESTROY)
1207                 ++info->scan_del_count;
1208 #endif
1209         /*
1210          * Special handling of the root inode.  Because the root inode
1211          * contains an index of all the inodes in the PFS in addition to
1212          * its normal directory entries, any flush that is not part of a
1213          * filesystem sync must only flush the directory entries, and not
1214          * anything else.
1215          *
1216          * The child might be an indirect block, but H2 guarantees that
1217          * the key-range will fully partition the inode index from the
1218          * directory entries so the case just works naturally.
1219          */
1220         if ((parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
1221             (child->flags & HAMMER2_CHAIN_DESTROY) == 0 &&
1222             parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
1223             (info->flags & HAMMER2_FLUSH_FSSYNC) == 0) {
1224                 if ((child->bref.key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1225                         if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1226                                 hammer2_chain_setflush(parent);
1227                         }
1228                         goto done;
1229                 }
1230         }
1231
1232         /*
1233          * Recurse and collect deferral data.  We're in the media flush,
1234          * this can cross PFS boundaries.
1235          */
1236         if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1237 #ifdef HAMMER2_SCAN_DEBUG
1238                 if (child->bref.type < 7)
1239                         ++info->scan_btype[child->bref.type];
1240 #endif
1241                 ++info->depth;
1242                 hammer2_flush_core(info, child, info->flags);
1243                 --info->depth;
1244         } else if (hammer2_debug & 0x200) {
1245                 if (info->debug == NULL)
1246                         info->debug = child;
1247                 ++info->depth;
1248                 hammer2_flush_core(info, child, info->flags);
1249                 --info->depth;
1250                 if (info->debug == child)
1251                         info->debug = NULL;
1252         }
1253
1254 done:
1255         /*
1256          * Relock to continue the loop.
1257          */
1258         hammer2_chain_unlock(child);
1259         hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
1260         hammer2_chain_drop_unhold(parent);
1261         if (parent->error) {
1262                 kprintf("PARENT ERROR DURING FLUSH LOCK %p->%p\n",
1263                         parent, child);
1264                 info->error |= parent->error;
1265         }
1266         hammer2_chain_drop(child);
1267         KKASSERT(info->parent == parent);
1268         hammer2_spin_ex(&parent->core.spin);
1269
1270         return (0);
1271 }
1272
1273 /*
1274  * flush helper (backend threaded)
1275  *
1276  * Flushes chain topology for the specified inode.
1277  *
1278  * HAMMER2_XOP_INODE_STOP       The flush recursion stops at inode boundaries.
1279  *                              Inodes belonging to the same flush are flushed
1280  *                              separately.
1281  *
1282  * HAMMER2_XOP_PARENTONFLUSH    After flushing if the starting chain indicates
1283  *                              a parent update is needed, we setflush the
 *                              parent to propagate the flush request across
1285  *                              the inode.
1286  *
1287  * chain->parent can be NULL, usually due to destroy races or detached inodes.
1288  *
1289  * Primarily called from vfs_sync().
1290  */
1291 void
1292 hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
1293 {
1294         hammer2_xop_flush_t *xop = &arg->xop_flush;
1295         hammer2_chain_t *chain;
1296         hammer2_dev_t *hmp;
1297         int flush_error = 0;
1298         int fsync_error = 0;
1299         int total_error = 0;
1300         int j;
1301         int xflags;
1302         int ispfsroot = 0;
1303
1304         xflags = HAMMER2_FLUSH_TOP;
1305         if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
1306                 xflags |= HAMMER2_FLUSH_INODE_STOP;
1307         if (xop->head.flags & HAMMER2_XOP_FSSYNC)
1308                 xflags |= HAMMER2_FLUSH_FSSYNC;
1309
1310         /*
1311          * Flush core chains
1312          */
1313         chain = hammer2_inode_chain(xop->head.ip1, clindex,
1314                                     HAMMER2_RESOLVE_ALWAYS);
1315         if (chain) {
1316                 hmp = chain->hmp;
1317                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1318                         /*
1319                          * Due to flush partitioning the chain topology
1320                          * above the inode's chain may no longer be flagged.
1321                          * When asked to flush an inode, remark the topology
1322                          * leading to that inode.
1323                          */
1324                         if (chain->parent)
1325                                 hammer2_chain_setflush(chain->parent);
1326                         hammer2_flush(chain, xflags);
1327
1328 #if 0
1329                         /*
1330                          * Propogate upwards but only cross an inode boundary
1331                          * for inodes associated with the current filesystem
1332                          * sync.
1333                          */
1334                         if ((xop->head.flags & HAMMER2_XOP_PARENTONFLUSH) ||
1335                             chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1336                                 parent = chain->parent;
1337                                 if (parent)
1338                                         hammer2_chain_setflush(parent);
1339                         }
1340 #endif
1341                 }
1342                 if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1343                         ispfsroot = 1;
1344                 hammer2_chain_unlock(chain);
1345                 hammer2_chain_drop(chain);
1346                 chain = NULL;
1347         } else {
1348                 hmp = NULL;
1349         }
1350
1351         /*
1352          * Only flush the volume header if asked to, plus the inode must also
1353          * be the PFS root.
1354          */
1355         if ((xop->head.flags & HAMMER2_XOP_VOLHDR) == 0)
1356                 goto skip;
1357         if (ispfsroot == 0)
1358                 goto skip;
1359
1360         /*
1361          * Flush volume roots.  Avoid replication, we only want to
1362          * flush each hammer2_dev (hmp) once.
1363          */
1364         for (j = clindex - 1; j >= 0; --j) {
1365                 if ((chain = xop->head.ip1->cluster.array[j].chain) != NULL) {
1366                         if (chain->hmp == hmp) {
1367                                 chain = NULL;   /* safety */
1368                                 goto skip;
1369                         }
1370                 }
1371         }
1372         chain = NULL;   /* safety */
1373
1374         /*
1375          * spmp transaction.  The super-root is never directly mounted so
1376          * there shouldn't be any vnodes, let alone any dirty vnodes
1377          * associated with it, so we shouldn't have to mess around with any
1378          * vnode flushes here.
1379          */
1380         hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1381
1382         /*
1383          * Media mounts have two 'roots', vchain for the topology
1384          * and fchain for the free block table.  Flush both.
1385          *
1386          * Note that the topology and free block table are handled
1387          * independently, so the free block table can wind up being
1388          * ahead of the topology.  We depend on the bulk free scan
1389          * code to deal with any loose ends.
1390          *
1391          * vchain and fchain do not error on-lock since their data does
1392          * not have to be re-read from media.
1393          */
1394         hammer2_chain_ref(&hmp->vchain);
1395         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1396         hammer2_chain_ref(&hmp->fchain);
1397         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1398         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1399                 /*
1400                  * This will also modify vchain as a side effect,
1401                  * mark vchain as modified now.
1402                  */
1403                 hammer2_voldata_modify(hmp);
1404                 chain = &hmp->fchain;
1405                 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1406                 KKASSERT(chain == &hmp->fchain);
1407         }
1408         hammer2_chain_unlock(&hmp->fchain);
1409         hammer2_chain_unlock(&hmp->vchain);
1410         hammer2_chain_drop(&hmp->fchain);
1411         /* vchain dropped down below */
1412
1413         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1414         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1415                 chain = &hmp->vchain;
1416                 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1417                 KKASSERT(chain == &hmp->vchain);
1418         }
1419         hammer2_chain_unlock(&hmp->vchain);
1420         hammer2_chain_drop(&hmp->vchain);
1421
1422         /*
1423          * We can't safely flush the volume header until we have
1424          * flushed any device buffers which have built up.
1425          *
1426          * XXX this isn't being incremental
1427          */
1428         vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
1429         fsync_error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
1430         vn_unlock(hmp->devvp);
1431         if (fsync_error || flush_error) {
1432                 kprintf("hammer2: sync error fsync=%d h2flush=0x%04x dev=%s\n",
1433                         fsync_error, flush_error, hmp->devrepname);
1434         }
1435
1436         /*
1437          * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1438          * volume header needs synchronization via hmp->volsync.
1439          *
1440          * XXX synchronize the flag & data with only this flush XXX
1441          */
1442         if (fsync_error == 0 && flush_error == 0 &&
1443             (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1444                 struct buf *bp;
1445                 int vol_error = 0;
1446
1447                 /*
1448                  * Synchronize the disk before flushing the volume
1449                  * header.
1450                  */
1451                 bp = getpbuf(NULL);
1452                 bp->b_bio1.bio_offset = 0;
1453                 bp->b_bufsize = 0;
1454                 bp->b_bcount = 0;
1455                 bp->b_cmd = BUF_CMD_FLUSH;
1456                 bp->b_bio1.bio_done = biodone_sync;
1457                 bp->b_bio1.bio_flags |= BIO_SYNC;
1458                 vn_strategy(hmp->devvp, &bp->b_bio1);
1459                 fsync_error = biowait(&bp->b_bio1, "h2vol");
1460                 relpbuf(bp, NULL);
1461
1462                 /*
1463                  * Then we can safely flush the version of the
1464                  * volume header synchronized by the flush code.
1465                  */
1466                 j = hmp->volhdrno + 1;
1467                 if (j < 0)
1468                         j = 0;
1469                 if (j >= HAMMER2_NUM_VOLHDRS)
1470                         j = 0;
1471                 if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1472                     hmp->volsync.volu_size) {
1473                         j = 0;
1474                 }
1475                 if (hammer2_debug & 0x8000) {
1476                         /* debug only, avoid syslogd loop */
1477                         kprintf("sync volhdr %d %jd\n",
1478                                 j, (intmax_t)hmp->volsync.volu_size);
1479                 }
1480                 bp = getblk(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1481                             HAMMER2_PBUFSIZE, GETBLK_KVABIO, 0);
1482                 atomic_clear_int(&hmp->vchain.flags,
1483                                  HAMMER2_CHAIN_VOLUMESYNC);
1484                 bkvasync(bp);
1485                 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1486                 vol_error = bwrite(bp);
1487                 hmp->volhdrno = j;
1488                 if (vol_error)
1489                         fsync_error = vol_error;
1490         }
1491         if (flush_error)
1492                 total_error = flush_error;
1493         if (fsync_error)
1494                 total_error = hammer2_errno_to_error(fsync_error);
1495
1496         /* spmp trans */
1497         hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1498 skip:
1499         hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1500 }