Merge branch 'vendor/DHCPCD'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_flush.c
1 /*
2  * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  *                      TRANSACTION AND FLUSH HANDLING
37  *
38  * Deceptively simple but actually fairly difficult to implement properly is
39  * how I would describe it.
40  *
41  * Flushing generally occurs bottom-up but requires a top-down scan to
42  * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
43  * tells how to recurse downward to find these chains.
44  */
45
46 #include <sys/cdefs.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/types.h>
50 #include <sys/lock.h>
51 #include <sys/vnode.h>
52 #include <sys/buf.h>
53
54 #include "hammer2.h"
55
56 #define HAMMER2_FLUSH_DEPTH_LIMIT       60      /* stack recursion limit */
57
58
59 /*
60  * Recursively flush the specified chain.  The chain is locked and
61  * referenced by the caller and will remain so on return.  The chain
62  * will remain referenced throughout but can temporarily lose its
63  * lock during the recursion to avoid unnecessarily stalling user
64  * processes.
65  */
/*
 * Per-flush state.  Zeroed by hammer2_flush() and threaded through
 * hammer2_flush_core() and (via the RB_SCAN callback argument)
 * hammer2_flush_recurse().
 */
struct hammer2_flush_info {
	hammer2_chain_t *parent;	/* current parent during the recursion */
	int		depth;		/* recursion depth, bounded by
					   HAMMER2_FLUSH_DEPTH_LIMIT */
	int		error;		/* cumulative error */
	int		flags;		/* HAMMER2_FLUSH_* control flags */
#ifdef HAMMER2_SCAN_DEBUG
	/*
	 * Debug statistics reported by hammer2_flush() for large scans
	 * (scan_count >= 10).  NOTE(review): presumably incremented in
	 * hammer2_flush_recurse(), which is not visible in this chunk.
	 */
	long		scan_count;
	long		scan_mod_count;
	long		scan_upd_count;
	long		scan_onf_count;
	long		scan_del_count;
	long		scan_btype[7];	/* per-bref-type counts; [0] is not
					   included in the debug printout */
#endif
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

/* forward declarations for the recursive flush core */
static int hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int flags);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
86
/*
 * Any per-pfs transaction initialization goes here.
 * (currently intentionally empty -- placeholder)
 */
void
hammer2_trans_manage_init(hammer2_pfs_t *pmp)
{
}
94
95 /*
96  * Transaction support for any modifying operation.  Transactions are used
97  * in the pmp layer by the frontend and in the spmp layer by the backend.
98  *
99  * 0                    - Normal transaction.  Interlocks against just the
100  *                        COPYQ portion of an ISFLUSH transaction.
101  *
102  * TRANS_ISFLUSH        - Flush transaction.  Interlocks against other flush
103  *                        transactions.
104  *
105  *                        When COPYQ is also specified, waits for the count
106  *                        to drop to 1.
107  *
108  * TRANS_BUFCACHE       - Buffer cache transaction.  No interlock.
109  *
110  * TRANS_SIDEQ          - Run the sideq (only tested in trans_done())
111  *
112  * Initializing a new transaction allocates a transaction ID.  Typically
113  * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
114  * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
115  * media target.  The latter mode is used by the recovery code.
116  */
void
hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;
	int dowait;

	/*
	 * Lock-free update loop: snapshot trans.flags, compute the desired
	 * new state, and install it with a CAS, retrying on contention.
	 * The low bits (HAMMER2_TRANS_MASK) count active transactions,
	 * hence the "+ 1" on entry.
	 */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();		/* pin the snapshot read */
		dowait = 0;

		if (flags & HAMMER2_TRANS_ISFLUSH) {
			/*
			 * Interlock against other flush transactions.
			 */
			if (oflags & HAMMER2_TRANS_ISFLUSH) {
				/* another flush is active: register as a
				   waiter and sleep instead of entering */
				nflags = oflags | HAMMER2_TRANS_WAITING;
				dowait = 1;
			} else {
				nflags = (oflags | flags) + 1;
			}
		} else if (flags & HAMMER2_TRANS_BUFCACHE) {
			/*
			 * Requesting strategy transaction from buffer-cache,
			 * or a VM getpages/putpages through the buffer cache.
			 * We must allow such transactions in all situations
			 * to avoid deadlocks.
			 */
			nflags = (oflags | flags) + 1;
		} else {
			/*
			 * Normal transaction.  We do not interlock against
			 * BUFCACHE or ISFLUSH.
			 *
			 * Note that vnode locks may be held going into
			 * this call.
			 *
			 * NOTE: Remember that non-modifying operations
			 *       such as read, stat, readdir, etc, do
			 *       not use transactions.
			 */
			nflags = (oflags | flags) + 1;
		}
		/*
		 * Register on the sleep interlock BEFORE the CAS so a
		 * wakeup arriving between the CAS and tsleep() cannot
		 * be lost.
		 */
		if (dowait)
			tsleep_interlock(&pmp->trans.sync_wait, 0);
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			if (dowait == 0)
				break;	/* new state installed, done */
			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
			       "h2trans", hz);
			/* retry */
		} else {
			cpu_pause();
			/* retry */
		}
		/* retry */
	}

#if 0
	/*
	 * When entering a FLUSH transaction with COPYQ set, wait for the
	 * transaction count to drop to 1 (our flush transaction only)
	 * before proceeding.
	 *
	 * This waits for all non-flush transactions to complete and blocks
	 * new non-flush transactions from starting until COPYQ is cleared.
	 * (the flush will then proceed after clearing COPYQ).  This should
	 * be a very short stall on modifying operations.
	 */
	while ((flags & HAMMER2_TRANS_ISFLUSH) &&
	       (flags & HAMMER2_TRANS_COPYQ)) {
		oflags = pmp->trans.flags;
		cpu_ccfence();
		if ((oflags & HAMMER2_TRANS_MASK) == 1)
			break;
		nflags = oflags | HAMMER2_TRANS_WAITING;
		tsleep_interlock(&pmp->trans.sync_wait, 0);
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
			       "h2trans2", hz);
		}
	}
#endif
}
202
203 /*
204  * Start a sub-transaction, there is no 'subdone' function.  This will
205  * issue a new modify_tid (mtid) for the current transaction, which is a
206  * CLC (cluster level change) id and not a per-node id.
207  *
208  * This function must be called for each XOP when multiple XOPs are run in
209  * sequence within a transaction.
210  *
211  * Callers typically update the inode with the transaction mtid manually
212  * to enforce sequencing.
213  */
214 hammer2_tid_t
215 hammer2_trans_sub(hammer2_pfs_t *pmp)
216 {
217         hammer2_tid_t mtid;
218
219         mtid = atomic_fetchadd_64(&pmp->modify_tid, 1);
220
221         return (mtid);
222 }
223
/*
 * Atomically set bits in the transaction flags.  Unlike
 * hammer2_trans_clearflags(), no wakeup is issued here.
 */
void
hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags)
{
	atomic_set_int(&pmp->trans.flags, flags);
}
229
/*
 * Typically used to clear trans flags asynchronously.  If TRANS_WAITING
 * is in the mask, and was previously set, this function will wake up
 * any waiters.
 */
void
hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;

	/* CAS retry loop: snapshot, clear requested bits, install */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();		/* pin the snapshot read */
		nflags = oflags & ~flags;
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			/* wake waiters only if WAITING actually cleared */
			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
				wakeup(&pmp->trans.sync_wait);
			break;
		}
		cpu_pause();
		/* retry */
	}
}
254
void
hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
{
	uint32_t oflags;
	uint32_t nflags;

#if 0
	/*
	 * Modifying ops on the front-end can cause dirty inodes to
	 * build up in the sideq.  We don't flush these on inactive/reclaim
	 * due to potential deadlocks, so we have to deal with them from
	 * inside other nominal modifying front-end transactions.
	 */
	if ((flags & HAMMER2_TRANS_SIDEQ) &&
	    pmp->sideq_count > hammer2_limit_dirty_inodes / 2 &&
	    pmp->sideq_count > (pmp->inum_count >> 3) &&
	    pmp->mp) {
		speedup_syncer(pmp->mp);
	}
#endif

	/*
	 * Clean-up the transaction.  Wakeup any waiters when finishing
	 * a flush transaction or transitioning the non-flush transaction
	 * count from 2->1 while a flush transaction is pending.
	 */
	for (;;) {
		oflags = pmp->trans.flags;
		cpu_ccfence();		/* pin the snapshot read */

		/* at least one transaction must be active to finish one */
		KKASSERT(oflags & HAMMER2_TRANS_MASK);

		/* drop the count (low bits) and clear the caller's flags */
		nflags = (oflags - 1) & ~flags;
		if (flags & HAMMER2_TRANS_ISFLUSH) {
			/* our flush is done, release any waiting flushers */
			nflags &= ~HAMMER2_TRANS_WAITING;
		}
		if ((oflags & (HAMMER2_TRANS_ISFLUSH|HAMMER2_TRANS_MASK)) ==
		    (HAMMER2_TRANS_ISFLUSH|2)) {
			/*
			 * Count transitions 2->1 with a flush pending:
			 * the pending flush may now proceed, so release
			 * the waiter.
			 */
			nflags &= ~HAMMER2_TRANS_WAITING;
		}
		if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
			/* wake only if WAITING was actually cleared */
			if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
				wakeup(&pmp->trans.sync_wait);
			break;
		}
		cpu_pause();
		/* retry */
	}
}
303
304 /*
305  * Obtain new, unique inode number (not serialized by caller).
306  */
307 hammer2_tid_t
308 hammer2_trans_newinum(hammer2_pfs_t *pmp)
309 {
310         hammer2_tid_t tid;
311
312         tid = atomic_fetchadd_64(&pmp->inode_tid, 1);
313
314         return tid;
315 }
316
317 /*
318  * Assert that a strategy call is ok here.  Currently we allow strategy
319  * calls in all situations, including during flushes.  Previously:
320  *      (old) (1) In a normal transaction.
321  */
void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
#if 0
	/* old assertion: strategy calls were disallowed during a flush */
	KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0);
#endif
}
329
330 /*
331  * Flush the chain and all modified sub-chains through the specified
332  * synchronization point, propagating blockref updates back up.  As
333  * part of this propagation, mirror_tid and inode/data usage statistics
334  * propagates back upward.
335  *
336  * Returns a HAMMER2 error code, 0 if no error.  Note that I/O errors from
337  * buffers dirtied during the flush operation can occur later.
338  *
339  * modify_tid (clc - cluster level change) is not propagated.
340  *
341  * update_tid (clc) is used for validation and is not propagated by this
342  * function.
343  *
344  * This routine can be called from several places but the most important
345  * is from VFS_SYNC (frontend) via hammer2_xop_inode_flush (backend).
346  *
347  * chain is locked on call and will remain locked on return.  The chain's
348  * UPDATE flag indicates that its parent's block table (which is not yet
349  * part of the flush) should be updated.
350  *
351  * flags:
352  *      HAMMER2_FLUSH_TOP       Indicates that this is the top of the flush.
353  *                              Is cleared for the recursion.
354  *
355  *      HAMMER2_FLUSH_ALL       Recurse everything
356  *
357  *      HAMMER2_FLUSH_INODE_STOP
358  *                              Stop at PFS inode or normal inode boundary
359  */
360 int
361 hammer2_flush(hammer2_chain_t *chain, int flags)
362 {
363         hammer2_flush_info_t info;
364         hammer2_dev_t *hmp;
365         int loops;
366
367         /*
368          * Execute the recursive flush and handle deferrals.
369          *
370          * Chains can be ridiculously long (thousands deep), so to
371          * avoid blowing out the kernel stack the recursive flush has a
372          * depth limit.  Elements at the limit are placed on a list
373          * for re-execution after the stack has been popped.
374          */
375         bzero(&info, sizeof(info));
376         info.flags = flags & ~HAMMER2_FLUSH_TOP;
377
378         /*
379          * Calculate parent (can be NULL), if not NULL the flush core
380          * expects the parent to be referenced so it can easily lock/unlock
381          * it without it getting ripped up.
382          */
383         if ((info.parent = chain->parent) != NULL)
384                 hammer2_chain_ref(info.parent);
385
386         /*
387          * Extra ref needed because flush_core expects it when replacing
388          * chain.
389          */
390         hammer2_chain_ref(chain);
391         hmp = chain->hmp;
392         loops = 0;
393
394         for (;;) {
395                 /*
396                  * [re]flush chain as the deep recursion may have generated
397                  * additional modifications.
398                  */
399                 if (info.parent != chain->parent) {
400                         if (hammer2_debug & 0x0040) {
401                                 kprintf("LOST CHILD4 %p->%p "
402                                         "(actual parent %p)\n",
403                                         info.parent, chain, chain->parent);
404                         }
405                         hammer2_chain_drop(info.parent);
406                         info.parent = chain->parent;
407                         hammer2_chain_ref(info.parent);
408                 }
409                 if (hammer2_flush_core(&info, chain, flags) == 0)
410                         break;
411
412                 if (++loops % 1000 == 0) {
413                         kprintf("hammer2_flush: excessive loops on %p\n",
414                                 chain);
415                         if (hammer2_debug & 0x100000)
416                                 Debugger("hell4");
417                 }
418         }
419 #ifdef HAMMER2_SCAN_DEBUG
420         if (info.scan_count >= 10)
421         kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) "
422                 "bt(%ld,%ld,%ld,%ld,%ld,%ld)\n",
423                 info.scan_count,
424                 info.scan_mod_count,
425                 info.scan_upd_count,
426                 info.scan_onf_count,
427                 info.scan_del_count,
428                 info.scan_btype[1],
429                 info.scan_btype[2],
430                 info.scan_btype[3],
431                 info.scan_btype[4],
432                 info.scan_btype[5],
433                 info.scan_btype[6]);
434 #endif
435         hammer2_chain_drop(chain);
436         if (info.parent)
437                 hammer2_chain_drop(info.parent);
438         return (info.error);
439 }
440
441 /*
442  * This is the core of the chain flushing code.  The chain is locked by the
443  * caller and must also have an extra ref on it by the caller, and remains
444  * locked and will have an extra ref on return.  info.parent is referenced
445  * but not locked.
446  *
447  * Upon return, the caller can test the UPDATE bit on the chain to determine
448  * if the parent needs updating.
449  *
450  * If non-zero is returned, the chain's parent changed during the flush and
451  * the caller must retry the operation.
452  *
453  * (1) Determine if this node is a candidate for the flush, return if it is
454  *     not.  fchain and vchain are always candidates for the flush.
455  *
456  * (2) If we recurse too deep the chain is entered onto the deferral list and
457  *     the current flush stack is aborted until after the deferral list is
458  *     run.
459  *
460  * (3) Recursively flush live children (rbtree).  This can create deferrals.
461  *     A successful flush clears the MODIFIED and UPDATE bits on the children
462  *     and typically causes the parent to be marked MODIFIED as the children
463  *     update the parent's block table.  A parent might already be marked
464  *     MODIFIED due to a deletion (whos blocktable update in the parent is
465  *     handled by the frontend), or if the parent itself is modified by the
466  *     frontend for other reasons.
467  *
468  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
469  *     Deleted-but-open inodes can still be individually flushed via the
470  *     filesystem syncer.
471  *
472  * (5) Delete parents on the way back up if they are normal indirect blocks
473  *     and have no children.
474  *
475  * (6) Note that an unmodified child may still need the block table in its
476  *     parent updated (e.g. rename/move).  The child will have UPDATE set
477  *     in this case.
478  *
479  *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
480  *
481  * blockref.modify_tid is consistent only within a PFS, and will not be
482  * consistent during synchronization.  mirror_tid is consistent across the
483  * block device regardless of the PFS.
484  */
485 static int
486 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
487                    int flags)
488 {
489         hammer2_chain_t *parent;
490         hammer2_dev_t *hmp;
491         int save_error;
492         int retry;
493
494         retry = 0;
495
496         /*
497          * (1) Optimize downward recursion to locate nodes needing action.
498          *     Nothing to do if none of these flags are set.
499          */
500         if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0)
501                 return 0;
502
503         hmp = chain->hmp;
504
505         /*
506          * NOTE: parent can be NULL, usually due to destroy races.
507          */
508         parent = info->parent;
509         KKASSERT(chain->parent == parent);
510
511         /*
512          * Downward search recursion
513          *
514          * We must be careful on cold stops, which often occur on inode
515          * boundaries due to the way hammer2_vfs_sync() sequences the flush.
516          * Be sure to issue an appropriate chain_setflush()
517          */
518         if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
519             (flags & HAMMER2_FLUSH_ALL) == 0 &&
520             (flags & HAMMER2_FLUSH_TOP) == 0 &&
521             chain->pmp && chain->pmp->mp) {
522                 /*
523                  * If FLUSH_ALL is not specified the caller does not want
524                  * to recurse through PFS roots that have been mounted.
525                  *
526                  * (If the PFS has not been mounted there may not be
527                  *  anything monitoring its chains and its up to us
528                  *  to flush it).
529                  *
530                  * The typical sequence is to flush dirty PFS's starting at
531                  * their root downward, then flush the device root (vchain).
532                  * It is this second flush that typically leaves out the
533                  * ALL flag.
534                  *
535                  * However we must still process the PFSROOT chains for block
536                  * table updates in their parent (which IS part of our flush).
537                  *
538                  * NOTE: The volume root, vchain, does not set PFSBOUNDARY.
539                  *
540                  * NOTE: We must re-set ONFLUSH in the parent to retain if
541                  *       this chain (that we are skipping) requires work.
542                  */
543                 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
544                                     HAMMER2_CHAIN_DESTROY |
545                                     HAMMER2_CHAIN_MODIFIED)) {
546                         hammer2_chain_setflush(parent);
547                 }
548                 goto done;
549         } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
550                    (flags & HAMMER2_FLUSH_INODE_STOP) &&
551                    (flags & HAMMER2_FLUSH_ALL) == 0 &&
552                    (flags & HAMMER2_FLUSH_TOP) == 0 &&
553                    chain->pmp && chain->pmp->mp) {
554                 /*
555                  * When FLUSH_INODE_STOP is specified we are being asked not
556                  * to include any inode changes for inodes we encounter,
557                  * with the exception of the inode that the flush began with.
558                  * So: INODE, INODE_STOP, and TOP==0 basically.
559                  *
560                  * Dirty inodes are flushed based on the hammer2_inode
561                  * in-memory structure, issuing a chain_setflush() here
562                  * will only cause unnecessary traversals of the topology.
563                  */
564                 goto done;
565 #if 0
566                 /*
567                  * If FLUSH_INODE_STOP is specified and both ALL and TOP
568                  * are clear, we must not flush the chain.  The chain should
569                  * have already been flushed and any further ONFLUSH/UPDATE
570                  * setting will be related to the next flush.
571                  *
572                  * This features allows us to flush inodes independently of
573                  * each other and meta-data above the inodes separately.
574                  */
575                 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
576                                     HAMMER2_CHAIN_DESTROY |
577                                     HAMMER2_CHAIN_MODIFIED)) {
578                         if (parent)
579                                 hammer2_chain_setflush(parent);
580                 }
581 #endif
582         } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
583                 /*
584                  * Recursion depth reached.
585                  */
586                 panic("hammer2: flush depth limit");
587         } else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
588                                    HAMMER2_CHAIN_DESTROY)) {
589                 /*
590                  * Downward recursion search (actual flush occurs bottom-up).
591                  * pre-clear ONFLUSH.  It can get set again due to races or
592                  * flush errors, which we want so the scan finds us again in
593                  * the next flush.
594                  *
595                  * We must also recurse if DESTROY is set so we can finally
596                  * get rid of the related children, otherwise the node will
597                  * just get re-flushed on lastdrop.
598                  *
599                  * WARNING!  The recursion will unlock/relock info->parent
600                  *           (which is 'chain'), potentially allowing it
601                  *           to be ripped up.
602                  */
603                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
604                 save_error = info->error;
605                 info->error = 0;
606                 info->parent = chain;
607
608                 /*
609                  * We may have to do this twice to catch any indirect
610                  * block maintenance that occurs.
611                  */
612                 hammer2_spin_ex(&chain->core.spin);
613                 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
614                         NULL, hammer2_flush_recurse, info);
615                 if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
616                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
617                         RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
618                                 NULL, hammer2_flush_recurse, info);
619                 }
620                 hammer2_spin_unex(&chain->core.spin);
621                 info->parent = parent;
622
623                 /*
624                  * Re-set the flush bits if the flush was incomplete or
625                  * an error occurred.  If an error occurs it is typically
626                  * an allocation error.  Errors do not cause deferrals.
627                  */
628                 if (info->error)
629                         hammer2_chain_setflush(chain);
630                 info->error |= save_error;
631
632                 /*
633                  * If we lost the parent->chain association we have to
634                  * stop processing this chain because it is no longer
635                  * in this recursion.  If it moved, it will be handled
636                  * by the ONFLUSH flag elsewhere.
637                  */
638                 if (chain->parent != parent) {
639                         kprintf("LOST CHILD2 %p->%p (actual parent %p)\n",
640                                 parent, chain, chain->parent);
641                         goto done;
642                 }
643         }
644
645         /*
646          * Now we are in the bottom-up part of the recursion.
647          *
648          * We continue to try to update the chain on lower-level errors, but
649          * the flush code may decide not to flush the volume root.
650          *
651          * XXX should we continue to try to update the chain if an error
652          *     occurred?
653          */
654
655         /*
656          * Both parent and chain must be locked in order to flush chain,
657          * in order to properly update the parent under certain conditions.
658          *
659          * In addition, we can't safely unlock/relock the chain once we
660          * start flushing the chain itself, which we would have to do later
661          * on in order to lock the parent if we didn't do that now.
662          */
663         hammer2_chain_ref_hold(chain);
664         hammer2_chain_unlock(chain);
665         if (parent)
666                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
667         hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
668         hammer2_chain_drop_unhold(chain);
669
670         /*
671          * Can't process if we can't access their content.
672          */
673         if ((parent && parent->error) || chain->error) {
674                 kprintf("hammer2: chain error during flush\n");
675                 info->error |= chain->error;
676                 if (parent) {
677                         info->error |= parent->error;
678                         hammer2_chain_unlock(parent);
679                 }
680                 goto done;
681         }
682
683         if (chain->parent != parent) {
684                 if (hammer2_debug & 0x0040) {
685                         kprintf("LOST CHILD3 %p->%p (actual parent %p)\n",
686                                 parent, chain, chain->parent);
687                 }
688                 KKASSERT(parent != NULL);
689                 hammer2_chain_unlock(parent);
690                 retry = 1;
691                 goto done;
692         }
693
694         /*
695          * Propagate the DESTROY flag downwards.  This dummies up the flush
696          * code and tries to invalidate related buffer cache buffers to
697          * avoid the disk write.
698          */
699         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
700                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
701
702         /*
703          * Dispose of the modified bit.
704          *
705          * If parent is present, the UPDATE bit should already be set.
706          * UPDATE should already be set.
707          * bref.mirror_tid should already be set.
708          */
709         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
710                 KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
711                          chain->parent == NULL);
712                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
713                 atomic_add_long(&hammer2_count_modified_chains, -1);
714
715                 /*
716                  * Manage threads waiting for excessive dirty memory to
717                  * be retired.
718                  */
719                 if (chain->pmp)
720                         hammer2_pfs_memory_wakeup(chain->pmp, -1);
721
722 #if 0
723                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
724                     chain != &hmp->vchain &&
725                     chain != &hmp->fchain) {
726                         /*
727                          * Set UPDATE bit indicating that the parent block
728                          * table requires updating.
729                          */
730                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
731                 }
732 #endif
733
734                 /*
735                  * Issue the flush.  This is indirect via the DIO.
736                  *
737                  * NOTE: A DELETED node that reaches this point must be
738                  *       flushed for synchronization point consistency.
739                  *
740                  * NOTE: Even though MODIFIED was already set, the related DIO
741                  *       might not be dirty due to a system buffer cache
742                  *       flush and must be set dirty if we are going to make
743                  *       further modifications to the buffer.  Chains with
744                  *       embedded data don't need this.
745                  */
746                 if (hammer2_debug & 0x1000) {
747                         kprintf("Flush %p.%d %016jx/%d data=%016jx\n",
748                                 chain, chain->bref.type,
749                                 (uintmax_t)chain->bref.key,
750                                 chain->bref.keybits,
751                                 (uintmax_t)chain->bref.data_off);
752                 }
753
754                 /*
755                  * Update chain CRCs for flush.
756                  *
757                  * NOTE: Volume headers are NOT flushed here as they require
758                  *       special processing.
759                  */
760                 switch(chain->bref.type) {
761                 case HAMMER2_BREF_TYPE_FREEMAP:
762                         /*
763                          * Update the volume header's freemap_tid to the
764                          * freemap's flushing mirror_tid.
765                          *
766                          * (note: embedded data, do not call setdirty)
767                          */
768                         KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
769                         KKASSERT(chain == &hmp->fchain);
770                         hmp->voldata.freemap_tid = chain->bref.mirror_tid;
771                         if (hammer2_debug & 0x8000) {
772                                 /* debug only, avoid syslogd loop */
773                                 kprintf("sync freemap mirror_tid %08jx\n",
774                                         (intmax_t)chain->bref.mirror_tid);
775                         }
776
777                         /*
778                          * The freemap can be flushed independently of the
779                          * main topology, but for the case where it is
780                          * flushed in the same transaction, and flushed
781                          * before vchain (a case we want to allow for
782                          * performance reasons), make sure modifications
783                          * made during the flush under vchain use a new
784                          * transaction id.
785                          *
786                          * Otherwise the mount recovery code will get confused.
787                          */
788                         ++hmp->voldata.mirror_tid;
789                         break;
790                 case HAMMER2_BREF_TYPE_VOLUME:
791                         /*
792                          * The free block table is flushed by
793                          * hammer2_vfs_sync() before it flushes vchain.
794                          * We must still hold fchain locked while copying
795                          * voldata to volsync, however.
796                          *
			 * These do not error per se since their data does
			 * not need to be re-read from media on lock.
799                          *
800                          * (note: embedded data, do not call setdirty)
801                          */
802                         hammer2_chain_lock(&hmp->fchain,
803                                            HAMMER2_RESOLVE_ALWAYS);
804                         hammer2_voldata_lock(hmp);
805                         if (hammer2_debug & 0x8000) {
806                                 /* debug only, avoid syslogd loop */
807                                 kprintf("sync volume  mirror_tid %08jx\n",
808                                         (intmax_t)chain->bref.mirror_tid);
809                         }
810
811                         /*
812                          * Update the volume header's mirror_tid to the
813                          * main topology's flushing mirror_tid.  It is
814                          * possible that voldata.mirror_tid is already
815                          * beyond bref.mirror_tid due to the bump we made
816                          * above in BREF_TYPE_FREEMAP.
817                          */
818                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
819                                 hmp->voldata.mirror_tid =
820                                         chain->bref.mirror_tid;
821                         }
822
823                         /*
824                          * The volume header is flushed manually by the
825                          * syncer, not here.  All we do here is adjust the
826                          * crc's.
827                          */
828                         KKASSERT(chain->data != NULL);
829                         KKASSERT(chain->dio == NULL);
830
831                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
832                                 hammer2_icrc32(
833                                         (char *)&hmp->voldata +
834                                          HAMMER2_VOLUME_ICRC1_OFF,
835                                         HAMMER2_VOLUME_ICRC1_SIZE);
836                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
837                                 hammer2_icrc32(
838                                         (char *)&hmp->voldata +
839                                          HAMMER2_VOLUME_ICRC0_OFF,
840                                         HAMMER2_VOLUME_ICRC0_SIZE);
841                         hmp->voldata.icrc_volheader =
842                                 hammer2_icrc32(
843                                         (char *)&hmp->voldata +
844                                          HAMMER2_VOLUME_ICRCVH_OFF,
845                                         HAMMER2_VOLUME_ICRCVH_SIZE);
846
847                         if (hammer2_debug & 0x8000) {
848                                 /* debug only, avoid syslogd loop */
849                                 kprintf("syncvolhdr %016jx %016jx\n",
850                                         hmp->voldata.mirror_tid,
851                                         hmp->vchain.bref.mirror_tid);
852                         }
853                         hmp->volsync = hmp->voldata;
854                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
855                         hammer2_voldata_unlock(hmp);
856                         hammer2_chain_unlock(&hmp->fchain);
857                         break;
858                 case HAMMER2_BREF_TYPE_DATA:
859                         /*
860                          * Data elements have already been flushed via the
861                          * logical file buffer cache.  Their hash was set in
862                          * the bref by the vop_write code.  Do not re-dirty.
863                          *
864                          * Make sure any device buffer(s) have been flushed
865                          * out here (there aren't usually any to flush) XXX.
866                          */
867                         break;
868                 case HAMMER2_BREF_TYPE_INDIRECT:
869                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
870                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
871                         /*
872                          * Buffer I/O will be cleaned up when the volume is
873                          * flushed (but the kernel is free to flush it before
874                          * then, as well).
875                          */
876                         hammer2_chain_setcheck(chain, chain->data);
877                         break;
878                 case HAMMER2_BREF_TYPE_DIRENT:
879                         /*
880                          * A directory entry can use the check area to store
881                          * the filename for filenames <= 64 bytes, don't blow
882                          * it up!
883                          */
884                         if (chain->bytes)
885                                 hammer2_chain_setcheck(chain, chain->data);
886                         break;
887                 case HAMMER2_BREF_TYPE_INODE:
888                         /*
889                          * NOTE: We must call io_setdirty() to make any late
890                          *       changes to the inode data, the system might
891                          *       have already flushed the buffer.
892                          */
893                         if (chain->data->ipdata.meta.op_flags &
894                             HAMMER2_OPFLAG_PFSROOT) {
895                                 /*
896                                  * non-NULL pmp if mounted as a PFS.  We must
897                                  * sync fields cached in the pmp? XXX
898                                  */
899                                 hammer2_inode_data_t *ipdata;
900
901                                 hammer2_io_setdirty(chain->dio);
902                                 ipdata = &chain->data->ipdata;
903                                 if (chain->pmp) {
904                                         ipdata->meta.pfs_inum =
905                                                 chain->pmp->inode_tid;
906                                 }
907                         } else {
908                                 /* can't be mounted as a PFS */
909                         }
910
911                         hammer2_chain_setcheck(chain, chain->data);
912                         break;
913                 default:
914                         panic("hammer2_flush_core: unsupported "
915                               "embedded bref %d",
916                               chain->bref.type);
917                         /* NOT REACHED */
918                 }
919
920                 /*
921                  * If the chain was destroyed try to avoid unnecessary I/O
922                  * that might not have yet occurred.  Remove the data range
		 * from dedup candidacy and attempt to invalidate the
		 * potentially dirty portion of the I/O buffer.
925                  */
926                 if (chain->flags & HAMMER2_CHAIN_DESTROY) {
927                         hammer2_io_dedup_delete(hmp,
928                                                 chain->bref.type,
929                                                 chain->bref.data_off,
930                                                 chain->bytes);
931 #if 0
932                         hammer2_io_t *dio;
933                         if (chain->dio) {
934                                 hammer2_io_inval(chain->dio,
935                                                  chain->bref.data_off,
936                                                  chain->bytes);
937                         } else if ((dio = hammer2_io_getquick(hmp,
938                                                   chain->bref.data_off,
939                                                   chain->bytes,
940                                                   1)) != NULL) {
941                                 hammer2_io_inval(dio,
942                                                  chain->bref.data_off,
943                                                  chain->bytes);
944                                 hammer2_io_putblk(&dio);
945                         }
946 #endif
947                 }
948         }
949
950         /*
951          * If UPDATE is set the parent block table may need to be updated.
952          * This can fail if the hammer2_chain_modify() fails.
953          *
954          * NOTE: UPDATE may be set on vchain or fchain in which case
955          *       parent could be NULL, or on an inode that has not yet
956          *       been inserted into the radix tree.  It's easiest to allow
957          *       the case and test for NULL.  parent can also wind up being
958          *       NULL due to a deletion so we need to handle the case anyway.
959          *
960          * NOTE: UPDATE can be set when chains are renamed into or out of
961          *       an indirect block, without the chain itself being flagged
962          *       MODIFIED.
963          *
964          * If no parent exists we can just clear the UPDATE bit.  If the
965          * chain gets reattached later on the bit will simply get set
966          * again.
967          */
968         if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL)
969                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
970
971         /*
972          * When flushing an inode outside of a FLUSH_FSSYNC we must NOT
973          * update the parent block table to point at the flushed inode.
974          * The block table should only ever be updated by the filesystem
975          * sync code.  If we do, inode<->inode dependencies (such as
976          * directory entries vs inode nlink count) can wind up not being
977          * flushed together and result in a broken topology if a crash/reboot
978          * occurs at the wrong time.
979          */
980         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
981             (flags & HAMMER2_FLUSH_INODE_STOP) &&
982             (flags & HAMMER2_FLUSH_FSSYNC) == 0 &&
983             (flags & HAMMER2_FLUSH_ALL) == 0 &&
984             chain->pmp && chain->pmp->mp) {
985 #ifdef HAMMER2_DEBUG_SYNC
986                 kprintf("inum %ld do not update parent, non-fssync\n",
987                         (long)chain->bref.key);
988 #endif
989                 goto skipupdate;
990         }
991 #ifdef HAMMER2_DEBUG_SYNC
992         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
993                 kprintf("inum %ld update parent\n", (long)chain->bref.key);
994 #endif
995
996         /*
997          * The chain may need its blockrefs updated in the parent, normal
998          * path.
999          */
1000         if (chain->flags & HAMMER2_CHAIN_UPDATE) {
1001                 hammer2_blockref_t *base;
1002                 int count;
1003
1004                 /*
1005                  * Clear UPDATE flag, mark parent modified, update its
1006                  * modify_tid if necessary, and adjust the parent blockmap.
1007                  */
1008                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1009
1010                 /*
1011                  * (optional code)
1012                  *
1013                  * Avoid actually modifying and updating the parent if it
1014                  * was flagged for destruction.  This can greatly reduce
1015                  * disk I/O in large tree removals because the
1016                  * hammer2_io_setinval() call in the upward recursion
1017                  * (see MODIFIED code above) can only handle a few cases.
1018                  */
1019                 if (parent->flags & HAMMER2_CHAIN_DESTROY) {
1020                         if (parent->bref.modify_tid < chain->bref.modify_tid) {
1021                                 parent->bref.modify_tid =
1022                                         chain->bref.modify_tid;
1023                         }
1024                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
1025                                                         HAMMER2_CHAIN_BLKMAPUPD);
1026                         goto skipupdate;
1027                 }
1028
1029                 /*
1030                  * The flusher is responsible for deleting empty indirect
1031                  * blocks at this point.  If we don't do this, no major harm
1032                  * will be done but the empty indirect blocks will stay in
		 * the topology and make it messy and inefficient.
1034                  *
1035                  * The flusher is also responsible for collapsing the
1036                  * content of an indirect block into its parent whenever
1037                  * possible (with some hysteresis).  Not doing this will also
1038                  * not harm the topology, but would make it messy and
1039                  * inefficient.
1040                  */
1041                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1042                         if (hammer2_chain_indirect_maintenance(parent, chain))
1043                                 goto skipupdate;
1044                 }
1045
1046                 /*
1047                  * We are updating the parent's blockmap, the parent must
1048                  * be set modified.  If this fails we re-set the UPDATE flag
1049                  * in the child.
1050                  *
1051                  * NOTE! A modification error can be ENOSPC.  We still want
1052                  *       to flush modified chains recursively, not break out,
1053                  *       so we just skip the update in this situation and
1054                  *       continue.  That is, we still need to try to clean
1055                  *       out dirty chains and buffers.
1056                  *
1057                  *       This may not help bulkfree though. XXX
1058                  */
1059                 save_error = hammer2_chain_modify(parent, 0, 0, 0);
1060                 if (save_error) {
1061                         info->error |= save_error;
1062                         kprintf("hammer2_flush: %016jx.%02x error=%08x\n",
1063                                 parent->bref.data_off, parent->bref.type,
1064                                 save_error);
1065                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1066                         goto skipupdate;
1067                 }
1068                 if (parent->bref.modify_tid < chain->bref.modify_tid)
1069                         parent->bref.modify_tid = chain->bref.modify_tid;
1070
1071                 /*
1072                  * Calculate blockmap pointer
1073                  */
1074                 switch(parent->bref.type) {
1075                 case HAMMER2_BREF_TYPE_INODE:
1076                         /*
1077                          * Access the inode's block array.  However, there is
1078                          * no block array if the inode is flagged DIRECTDATA.
1079                          */
1080                         if (parent->data &&
1081                             (parent->data->ipdata.meta.op_flags &
1082                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1083                                 base = &parent->data->
1084                                         ipdata.u.blockset.blockref[0];
1085                         } else {
1086                                 base = NULL;
1087                         }
1088                         count = HAMMER2_SET_COUNT;
1089                         break;
1090                 case HAMMER2_BREF_TYPE_INDIRECT:
1091                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1092                         if (parent->data)
1093                                 base = &parent->data->npdata[0];
1094                         else
1095                                 base = NULL;
1096                         count = parent->bytes / sizeof(hammer2_blockref_t);
1097                         break;
1098                 case HAMMER2_BREF_TYPE_VOLUME:
1099                         base = &chain->hmp->voldata.sroot_blockset.blockref[0];
1100                         count = HAMMER2_SET_COUNT;
1101                         break;
1102                 case HAMMER2_BREF_TYPE_FREEMAP:
1103                         base = &parent->data->npdata[0];
1104                         count = HAMMER2_SET_COUNT;
1105                         break;
1106                 default:
1107                         base = NULL;
1108                         count = 0;
1109                         panic("hammer2_flush_core: "
1110                               "unrecognized blockref type: %d",
1111                               parent->bref.type);
1112                         break;
1113                 }
1114
1115                 /*
1116                  * Blocktable updates
1117                  *
1118                  * We synchronize pending statistics at this time.  Delta
1119                  * adjustments designated for the current and upper level
1120                  * are synchronized.
1121                  */
1122                 if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPUPD)) {
1123                         if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
1124                                 hammer2_spin_ex(&parent->core.spin);
1125                                 hammer2_base_delete(parent, base, count, chain,
1126                                                     NULL);
1127                                 hammer2_spin_unex(&parent->core.spin);
1128                                 /* base_delete clears both bits */
1129                         } else {
1130                                 atomic_clear_int(&chain->flags,
1131                                                  HAMMER2_CHAIN_BLKMAPUPD);
1132                         }
1133                 }
1134                 if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) {
1135                         hammer2_spin_ex(&parent->core.spin);
1136                         hammer2_base_insert(parent, base, count,
1137                                             chain, &chain->bref);
1138                         hammer2_spin_unex(&parent->core.spin);
1139                         /* base_insert sets BLKMAPPED */
1140                 }
1141         }
1142 skipupdate:
1143         if (parent)
1144                 hammer2_chain_unlock(parent);
1145
1146         /*
1147          * Final cleanup after flush
1148          */
1149 done:
1150         KKASSERT(chain->refs > 0);
1151
1152         return retry;
1153 }
1154
/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.
 *
 * This function may set info->error as a side effect.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *          bref.mirror_tid ourselves to indicate that the flush has
 *          processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	hammer2_chain_t *parent = info->parent;

#ifdef HAMMER2_SCAN_DEBUG
	/*
	 * Scan statistics (debug builds only).
	 */
	++info->scan_count;
	if (child->flags & HAMMER2_CHAIN_MODIFIED)
		++info->scan_mod_count;
	if (child->flags & HAMMER2_CHAIN_UPDATE)
		++info->scan_upd_count;
	if (child->flags & HAMMER2_CHAIN_ONFLUSH)
		++info->scan_onf_count;
#endif

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 *  needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.  However, if it no longer
	 * winds up being the child of the parent we must skip this child.
	 *
	 * NOTE! chain locking errors are fatal.  They are never out-of-space
	 *       errors.
	 */
	hammer2_chain_ref(child);
	hammer2_spin_unex(&parent->core.spin);

	/*
	 * Hold the parent across the temporary unlock (the hold is
	 * released at done: via hammer2_chain_drop_unhold()).
	 */
	hammer2_chain_ref_hold(parent);
	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
	if (child->parent != parent) {
		/*
		 * Child was reparented while the parent was unlocked,
		 * skip it.
		 */
		kprintf("LOST CHILD1 %p->%p (actual parent %p)\n",
			parent, child, child->parent);
		goto done;
	}
	if (child->error) {
		/* accumulate the child's lock error and skip it */
		kprintf("CHILD ERROR DURING FLUSH LOCK %p->%p\n",
			parent, child);
		info->error |= child->error;
		goto done;
	}

	/*
	 * Must propagate the DESTROY flag downwards, otherwise the
	 * parent could end up never being removed because it will
	 * be requeued to the flusher if it survives this run due to
	 * the flag.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROY);
#ifdef HAMMER2_SCAN_DEBUG
	if (child->flags & HAMMER2_CHAIN_DESTROY)
		++info->scan_del_count;
#endif
	/*
	 * Special handling of the root inode.  Because the root inode
	 * contains an index of all the inodes in the PFS in addition to
	 * its normal directory entries, any flush that is not part of a
	 * filesystem sync must only flush the directory entries, and not
	 * anything else.
	 *
	 * The child might be an indirect block, but H2 guarantees that
	 * the key-range will fully partition the inode index from the
	 * directory entries so the case just works naturally.
	 */
	if ((parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
	    (child->flags & HAMMER2_CHAIN_DESTROY) == 0 &&
	    parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
	    (info->flags & HAMMER2_FLUSH_FSSYNC) == 0) {
		if ((child->bref.key & HAMMER2_DIRHASH_VISIBLE) == 0) {
			/*
			 * Not a visible directory entry.  Do not recurse;
			 * if the child is still dirty, re-flag the parent
			 * so a later filesystem sync picks it up.
			 */
			if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
				hammer2_chain_setflush(parent);
			}
			goto done;
		}
	}

	/*
	 * Recurse and collect deferral data.  We're in the media flush,
	 * this can cross PFS boundaries.
	 */
	if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
#ifdef HAMMER2_SCAN_DEBUG
		if (child->bref.type < 7)
			++info->scan_btype[child->bref.type];
#endif
		++info->depth;
		hammer2_flush_core(info, child, info->flags);
		--info->depth;
	}

done:
	/*
	 * Relock to continue the loop.  Any error from re-locking the
	 * parent is also accumulated into info->error.
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop_unhold(parent);
	if (parent->error) {
		kprintf("PARENT ERROR DURING FLUSH LOCK %p->%p\n",
			parent, child);
		info->error |= parent->error;
	}
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	hammer2_spin_ex(&parent->core.spin);

	return (0);
}
1283
1284 /*
1285  * flush helper (backend threaded)
1286  *
1287  * Flushes chain topology for the specified inode.
1288  *
1289  * HAMMER2_XOP_INODE_STOP       The flush recursion stops at inode boundaries.
1290  *                              Inodes belonging to the same flush are flushed
1291  *                              separately.
1292  *
1293  * chain->parent can be NULL, usually due to destroy races or detached inodes.
1294  *
1295  * Primarily called from vfs_sync().
1296  */
1297 void
1298 hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
1299 {
1300         hammer2_xop_flush_t *xop = &arg->xop_flush;
1301         hammer2_chain_t *chain;
1302         hammer2_inode_t *ip;
1303         hammer2_dev_t *hmp;
1304         hammer2_pfs_t *pmp;
1305         hammer2_devvp_t *e;
1306         struct vnode *devvp;
1307         int flush_error = 0;
1308         int fsync_error = 0;
1309         int total_error = 0;
1310         int j;
1311         int xflags;
1312         int ispfsroot = 0;
1313
1314         xflags = HAMMER2_FLUSH_TOP;
1315         if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
1316                 xflags |= HAMMER2_FLUSH_INODE_STOP;
1317         if (xop->head.flags & HAMMER2_XOP_FSSYNC)
1318                 xflags |= HAMMER2_FLUSH_FSSYNC;
1319
1320         /*
1321          * Flush core chains
1322          */
1323         ip = xop->head.ip1;
1324         pmp = ip->pmp;
1325         chain = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
1326         if (chain) {
1327                 hmp = chain->hmp;
1328                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1329                         /*
1330                          * Due to flush partitioning the chain topology
1331                          * above the inode's chain may no longer be flagged.
1332                          * When asked to flush an inode, remark the topology
1333                          * leading to that inode.
1334                          */
1335                         if (chain->parent)
1336                                 hammer2_chain_setflush(chain->parent);
1337                         hammer2_flush(chain, xflags);
1338
1339                         /* XXX cluster */
1340                         if (ip == pmp->iroot && pmp != hmp->spmp) {
1341                                 hammer2_spin_ex(&pmp->inum_spin);
1342                                 pmp->pfs_iroot_blocksets[clindex] =
1343                                         chain->data->ipdata.u.blockset;
1344                                 hammer2_spin_unex(&pmp->inum_spin);
1345                         }
1346
1347 #if 0
1348                         /*
			 * Propagate upwards but only cross an inode boundary
1350                          * for inodes associated with the current filesystem
1351                          * sync.
1352                          */
1353                         if ((xop->head.flags & HAMMER2_XOP_PARENTONFLUSH) ||
1354                             chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1355                                 parent = chain->parent;
1356                                 if (parent)
1357                                         hammer2_chain_setflush(parent);
1358                         }
1359 #endif
1360                 }
1361                 if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1362                         ispfsroot = 1;
1363                 hammer2_chain_unlock(chain);
1364                 hammer2_chain_drop(chain);
1365                 chain = NULL;
1366         } else {
1367                 hmp = NULL;
1368         }
1369
1370         /*
1371          * Only flush the volume header if asked to, plus the inode must also
1372          * be the PFS root.
1373          */
1374         if ((xop->head.flags & HAMMER2_XOP_VOLHDR) == 0)
1375                 goto skip;
1376         if (ispfsroot == 0)
1377                 goto skip;
1378
1379         /*
1380          * Flush volume roots.  Avoid replication, we only want to
1381          * flush each hammer2_dev (hmp) once.
1382          */
1383         for (j = clindex - 1; j >= 0; --j) {
1384                 if ((chain = ip->cluster.array[j].chain) != NULL) {
1385                         if (chain->hmp == hmp) {
1386                                 chain = NULL;   /* safety */
1387                                 goto skip;
1388                         }
1389                 }
1390         }
1391         chain = NULL;   /* safety */
1392
1393         /*
1394          * spmp transaction.  The super-root is never directly mounted so
1395          * there shouldn't be any vnodes, let alone any dirty vnodes
1396          * associated with it, so we shouldn't have to mess around with any
1397          * vnode flushes here.
1398          */
1399         hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1400
1401         /*
1402          * We must flush the superroot down to the PFS iroot.  Remember
1403          * that hammer2_chain_setflush() stops at inode boundaries, so
1404          * the pmp->iroot has been flushed and flagged down to the superroot,
1405          * but the volume root (vchain) probably has not yet been flagged.
1406          */
1407         if (hmp->spmp->iroot) {
1408                 chain = hmp->spmp->iroot->cluster.array[0].chain;
1409                 if (chain) {
1410                         hammer2_chain_ref(chain);
1411                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1412                         flush_error |=
1413                                 hammer2_flush(chain,
1414                                               HAMMER2_FLUSH_TOP |
1415                                               HAMMER2_FLUSH_INODE_STOP |
1416                                               HAMMER2_FLUSH_FSSYNC);
1417                         hammer2_chain_unlock(chain);
1418                         hammer2_chain_drop(chain);
1419                 }
1420         }
1421
1422         /*
1423          * Media mounts have two 'roots', vchain for the topology
1424          * and fchain for the free block table.  Flush both.
1425          *
1426          * Note that the topology and free block table are handled
1427          * independently, so the free block table can wind up being
1428          * ahead of the topology.  We depend on the bulk free scan
1429          * code to deal with any loose ends.
1430          *
1431          * vchain and fchain do not error on-lock since their data does
1432          * not have to be re-read from media.
1433          */
1434         hammer2_chain_ref(&hmp->vchain);
1435         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1436         hammer2_chain_ref(&hmp->fchain);
1437         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1438         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1439                 /*
1440                  * This will also modify vchain as a side effect,
1441                  * mark vchain as modified now.
1442                  */
1443                 hammer2_voldata_modify(hmp);
1444                 chain = &hmp->fchain;
1445                 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1446                 KKASSERT(chain == &hmp->fchain);
1447         }
1448         hammer2_chain_unlock(&hmp->fchain);
1449         hammer2_chain_unlock(&hmp->vchain);
1450         hammer2_chain_drop(&hmp->fchain);
1451         /* vchain dropped down below */
1452
1453         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1454         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1455                 chain = &hmp->vchain;
1456                 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1457                 KKASSERT(chain == &hmp->vchain);
1458         }
1459         hammer2_chain_unlock(&hmp->vchain);
1460         hammer2_chain_drop(&hmp->vchain);
1461
1462         /*
1463          * We can't safely flush the volume header until we have
1464          * flushed any device buffers which have built up.
1465          *
1466          * XXX this isn't being incremental
1467          */
1468         TAILQ_FOREACH(e, &hmp->devvpl, entry) {
1469                 devvp = e->devvp;
1470                 KKASSERT(devvp);
1471                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1472                 fsync_error = VOP_FSYNC(devvp, MNT_WAIT, 0);
1473                 vn_unlock(devvp);
1474                 if (fsync_error || flush_error) {
1475                         kprintf("hammer2: sync error fsync=%d h2flush=0x%04x dev=%s\n",
1476                                 fsync_error, flush_error, e->path);
1477                 }
1478         }
1479
1480         /*
1481          * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1482          * volume header needs synchronization via hmp->volsync.
1483          *
1484          * XXX synchronize the flag & data with only this flush XXX
1485          */
1486         if (fsync_error == 0 && flush_error == 0 &&
1487             (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1488                 struct buf *bp;
1489                 int vol_error = 0;
1490
1491                 /*
1492                  * Synchronize the disk before flushing the volume
1493                  * header.
1494                  */
1495                 bp = getpbuf(NULL);
1496                 bp->b_bio1.bio_offset = 0;
1497                 bp->b_bufsize = 0;
1498                 bp->b_bcount = 0;
1499                 bp->b_cmd = BUF_CMD_FLUSH;
1500                 bp->b_bio1.bio_done = biodone_sync;
1501                 bp->b_bio1.bio_flags |= BIO_SYNC;
1502                 vn_strategy(hmp->devvp, &bp->b_bio1);
1503                 fsync_error = biowait(&bp->b_bio1, "h2vol");
1504                 relpbuf(bp, NULL);
1505
1506                 /*
1507                  * Then we can safely flush the version of the
1508                  * volume header synchronized by the flush code.
1509                  */
1510                 j = hmp->volhdrno + 1;
1511                 if (j < 0)
1512                         j = 0;
1513                 if (j >= HAMMER2_NUM_VOLHDRS)
1514                         j = 0;
1515                 if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1516                     hmp->volsync.volu_size) {
1517                         j = 0;
1518                 }
1519                 if (hammer2_debug & 0x8000) {
1520                         /* debug only, avoid syslogd loop */
1521                         kprintf("sync volhdr %d %jd\n",
1522                                 j, (intmax_t)hmp->volsync.volu_size);
1523                 }
1524                 bp = getblk(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1525                             HAMMER2_PBUFSIZE, GETBLK_KVABIO, 0);
1526                 atomic_clear_int(&hmp->vchain.flags,
1527                                  HAMMER2_CHAIN_VOLUMESYNC);
1528                 bkvasync(bp);
1529                 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1530                 vol_error = bwrite(bp);
1531                 hmp->volhdrno = j;
1532                 if (vol_error)
1533                         fsync_error = vol_error;
1534         }
1535         if (flush_error)
1536                 total_error = flush_error;
1537         if (fsync_error)
1538                 total_error = hammer2_errno_to_error(fsync_error);
1539
1540         /* spmp trans */
1541         hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1542 skip:
1543         hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1544 }