sys/vfs/hammer2: Remove -Wunused-but-set-variable local variables
[dragonfly.git] / sys / vfs / hammer2 / hammer2_flush.c
1 /*
2  * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  *                      TRANSACTION AND FLUSH HANDLING
37  *
38  * Deceptively simple but actually fairly difficult to implement properly is
39  * how I would describe it.
40  *
41  * Flushing generally occurs bottom-up but requires a top-down scan to
42  * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
43  * tells how to recurse downward to find these chains.
44  */
45
46 #include <sys/cdefs.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/types.h>
50 #include <sys/lock.h>
51 #include <sys/vnode.h>
52 #include <sys/buf.h>
53
54 #include "hammer2.h"
55
/*
 * Recursion depth bound for the flush.  hammer2_flush_core() panics when
 * info->depth equals this value.
 */
#define HAMMER2_FLUSH_DEPTH_LIMIT       60      /* stack recursion limit */
57
58
/*
 * State shared by one hammer2_flush() call and the recursion beneath it.
 *
 * hammer2_flush() recursively flushes the specified chain.  The chain is
 * locked and referenced by the caller and will remain so on return.  The
 * chain will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        /*
         * Parent of the chain currently being flushed.  Held referenced
         * but not locked by hammer2_flush(); may be NULL.
         */
        hammer2_chain_t *parent;
        /* Current recursion depth, tested against HAMMER2_FLUSH_DEPTH_LIMIT */
        int             depth;
        int             error;                  /* cumulative error */
        /* Caller's flush flags with HAMMER2_FLUSH_TOP cleared */
        int             flags;
#ifdef HAMMER2_SCAN_DEBUG
        /*
         * Debug-only statistics, printed by hammer2_flush() once
         * scan_count reaches 10.  NOTE(review): the counters are
         * updated outside this view; the names suggest scanned,
         * modified, update, onflush and deleted chains plus a per
         * bref-type tally -- confirm against hammer2_flush_recurse().
         */
        long            scan_count;
        long            scan_mod_count;
        long            scan_upd_count;
        long            scan_onf_count;
        long            scan_del_count;
        long            scan_btype[7];          /* slots 1-6 are reported */
#endif
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

/*
 * Flush one chain; a non-zero return tells the caller that the chain's
 * parent changed and the operation must be retried.
 */
static int hammer2_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain, int flags);
/*
 * Per-child callback handed to RB_SCAN() by hammer2_flush_core(), with the
 * hammer2_flush_info_t passed as 'data'.
 */
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
86
87 /*
88  * Any per-pfs transaction initialization goes here.
89  */
void
hammer2_trans_manage_init(hammer2_pfs_t *pmp)
{
        /* Nothing to initialize at present; pmp is not examined. */
}
94
95 /*
96  * Transaction support for any modifying operation.  Transactions are used
97  * in the pmp layer by the frontend and in the spmp layer by the backend.
98  *
99  * 0                    - Normal transaction.  Interlocks against just the
100  *                        COPYQ portion of an ISFLUSH transaction.
101  *
102  * TRANS_ISFLUSH        - Flush transaction.  Interlocks against other flush
103  *                        transactions.
104  *
105  *                        When COPYQ is also specified, waits for the count
106  *                        to drop to 1.
107  *
108  * TRANS_BUFCACHE       - Buffer cache transaction.  No interlock.
109  *
110  * TRANS_SIDEQ          - Run the sideq (only tested in trans_done())
111  *
 * Initializing a new transaction allocates a transaction ID.  The
 * transaction is always started against a pmp, which must not be NULL
 * because it is dereferenced unconditionally; this function takes no
 * separate hmp argument.
116  */
117 void
118 hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
119 {
120         uint32_t oflags;
121         uint32_t nflags;
122         int dowait;
123
124         for (;;) {
125                 oflags = pmp->trans.flags;
126                 cpu_ccfence();
127                 dowait = 0;
128
129                 if (flags & HAMMER2_TRANS_ISFLUSH) {
130                         /*
131                          * Interlock against other flush transactions.
132                          */
133                         if (oflags & HAMMER2_TRANS_ISFLUSH) {
134                                 nflags = oflags | HAMMER2_TRANS_WAITING;
135                                 dowait = 1;
136                         } else {
137                                 nflags = (oflags | flags) + 1;
138                         }
139                 } else if (flags & HAMMER2_TRANS_BUFCACHE) {
140                         /*
141                          * Requesting strategy transaction from buffer-cache,
142                          * or a VM getpages/putpages through the buffer cache.
143                          * We must allow such transactions in all situations
144                          * to avoid deadlocks.
145                          */
146                         nflags = (oflags | flags) + 1;
147                 } else {
148                         /*
149                          * Normal transaction.  We do not interlock against
150                          * BUFCACHE or ISFLUSH.
151                          *
152                          * Note that vnode locks may be held going into
153                          * this call.
154                          *
155                          * NOTE: Remember that non-modifying operations
156                          *       such as read, stat, readdir, etc, do
157                          *       not use transactions.
158                          */
159                         nflags = (oflags | flags) + 1;
160                 }
161                 if (dowait)
162                         tsleep_interlock(&pmp->trans.sync_wait, 0);
163                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
164                         if (dowait == 0)
165                                 break;
166                         tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
167                                "h2trans", hz);
168                         /* retry */
169                 } else {
170                         cpu_pause();
171                         /* retry */
172                 }
173                 /* retry */
174         }
175
176 #if 0
177         /*
178          * When entering a FLUSH transaction with COPYQ set, wait for the
179          * transaction count to drop to 1 (our flush transaction only)
180          * before proceeding.
181          *
182          * This waits for all non-flush transactions to complete and blocks
183          * new non-flush transactions from starting until COPYQ is cleared.
184          * (the flush will then proceed after clearing COPYQ).  This should
185          * be a very short stall on modifying operations.
186          */
187         while ((flags & HAMMER2_TRANS_ISFLUSH) &&
188                (flags & HAMMER2_TRANS_COPYQ)) {
189                 oflags = pmp->trans.flags;
190                 cpu_ccfence();
191                 if ((oflags & HAMMER2_TRANS_MASK) == 1)
192                         break;
193                 nflags = oflags | HAMMER2_TRANS_WAITING;
194                 tsleep_interlock(&pmp->trans.sync_wait, 0);
195                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
196                         tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
197                                "h2trans2", hz);
198                 }
199         }
200 #endif
201 }
202
203 /*
204  * Start a sub-transaction, there is no 'subdone' function.  This will
205  * issue a new modify_tid (mtid) for the current transaction, which is a
206  * CLC (cluster level change) id and not a per-node id.
207  *
208  * This function must be called for each XOP when multiple XOPs are run in
209  * sequence within a transaction.
210  *
211  * Callers typically update the inode with the transaction mtid manually
212  * to enforce sequencing.
213  */
214 hammer2_tid_t
215 hammer2_trans_sub(hammer2_pfs_t *pmp)
216 {
217         hammer2_tid_t mtid;
218
219         mtid = atomic_fetchadd_64(&pmp->modify_tid, 1);
220
221         return (mtid);
222 }
223
/*
 * Atomically set the given flag bits in pmp->trans.flags.  Unlike
 * hammer2_trans_clearflags() this issues no wakeup.
 */
void
hammer2_trans_setflags(hammer2_pfs_t *pmp, uint32_t flags)
{
        atomic_set_int(&pmp->trans.flags, flags);
}
229
230 /*
231  * Typically used to clear trans flags asynchronously.  If TRANS_WAITING
232  * is in the mask, and was previously set, this function will wake up
233  * any waiters.
234  */
235 void
236 hammer2_trans_clearflags(hammer2_pfs_t *pmp, uint32_t flags)
237 {
238         uint32_t oflags;
239         uint32_t nflags;
240
241         for (;;) {
242                 oflags = pmp->trans.flags;
243                 cpu_ccfence();
244                 nflags = oflags & ~flags;
245                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
246                         if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
247                                 wakeup(&pmp->trans.sync_wait);
248                         break;
249                 }
250                 cpu_pause();
251                 /* retry */
252         }
253 }
254
255 void
256 hammer2_trans_done(hammer2_pfs_t *pmp, uint32_t flags)
257 {
258         uint32_t oflags;
259         uint32_t nflags;
260
261 #if 0
262         /*
263          * Modifying ops on the front-end can cause dirty inodes to
264          * build up in the sideq.  We don't flush these on inactive/reclaim
265          * due to potential deadlocks, so we have to deal with them from
266          * inside other nominal modifying front-end transactions.
267          */
268         if ((flags & HAMMER2_TRANS_SIDEQ) &&
269             pmp->sideq_count > hammer2_limit_dirty_inodes / 2 &&
270             pmp->sideq_count > (pmp->inum_count >> 3) &&
271             pmp->mp) {
272                 speedup_syncer(pmp->mp);
273         }
274 #endif
275
276         /*
277          * Clean-up the transaction.  Wakeup any waiters when finishing
278          * a flush transaction or transitioning the non-flush transaction
279          * count from 2->1 while a flush transaction is pending.
280          */
281         for (;;) {
282                 oflags = pmp->trans.flags;
283                 cpu_ccfence();
284                 KKASSERT(oflags & HAMMER2_TRANS_MASK);
285
286                 nflags = (oflags - 1) & ~flags;
287                 if (flags & HAMMER2_TRANS_ISFLUSH) {
288                         nflags &= ~HAMMER2_TRANS_WAITING;
289                 }
290                 if ((oflags & (HAMMER2_TRANS_ISFLUSH|HAMMER2_TRANS_MASK)) ==
291                     (HAMMER2_TRANS_ISFLUSH|2)) {
292                         nflags &= ~HAMMER2_TRANS_WAITING;
293                 }
294                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
295                         if ((oflags ^ nflags) & HAMMER2_TRANS_WAITING)
296                                 wakeup(&pmp->trans.sync_wait);
297                         break;
298                 }
299                 cpu_pause();
300                 /* retry */
301         }
302 }
303
304 /*
305  * Obtain new, unique inode number (not serialized by caller).
306  */
307 hammer2_tid_t
308 hammer2_trans_newinum(hammer2_pfs_t *pmp)
309 {
310         hammer2_tid_t tid;
311
312         tid = atomic_fetchadd_64(&pmp->inode_tid, 1);
313
314         return tid;
315 }
316
317 /*
318  * Assert that a strategy call is ok here.  Currently we allow strategy
319  * calls in all situations, including during flushes.  Previously:
320  *      (old) (1) In a normal transaction.
321  */
void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
        /*
         * Intentionally a no-op: the only check is compiled out, so pmp
         * is not examined.  The disabled assertion would reject a
         * strategy call made while a flush transaction is active.
         */
#if 0
        KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0);
#endif
}
329
330 /*
331  * Flush the chain and all modified sub-chains through the specified
332  * synchronization point, propagating blockref updates back up.  As
333  * part of this propagation, mirror_tid and inode/data usage statistics
334  * propagates back upward.
335  *
336  * Returns a HAMMER2 error code, 0 if no error.  Note that I/O errors from
337  * buffers dirtied during the flush operation can occur later.
338  *
339  * modify_tid (clc - cluster level change) is not propagated.
340  *
341  * update_tid (clc) is used for validation and is not propagated by this
342  * function.
343  *
344  * This routine can be called from several places but the most important
345  * is from VFS_SYNC (frontend) via hammer2_xop_inode_flush (backend).
346  *
347  * chain is locked on call and will remain locked on return.  The chain's
348  * UPDATE flag indicates that its parent's block table (which is not yet
349  * part of the flush) should be updated.
350  *
351  * flags:
352  *      HAMMER2_FLUSH_TOP       Indicates that this is the top of the flush.
353  *                              Is cleared for the recursion.
354  *
355  *      HAMMER2_FLUSH_ALL       Recurse everything
356  *
357  *      HAMMER2_FLUSH_INODE_STOP
358  *                              Stop at PFS inode or normal inode boundary
359  */
360 int
361 hammer2_flush(hammer2_chain_t *chain, int flags)
362 {
363         hammer2_flush_info_t info;
364         int loops;
365
366         /*
367          * Execute the recursive flush and handle deferrals.
368          *
369          * Chains can be ridiculously long (thousands deep), so to
370          * avoid blowing out the kernel stack the recursive flush has a
371          * depth limit.  Elements at the limit are placed on a list
372          * for re-execution after the stack has been popped.
373          */
374         bzero(&info, sizeof(info));
375         info.flags = flags & ~HAMMER2_FLUSH_TOP;
376
377         /*
378          * Calculate parent (can be NULL), if not NULL the flush core
379          * expects the parent to be referenced so it can easily lock/unlock
380          * it without it getting ripped up.
381          */
382         if ((info.parent = chain->parent) != NULL)
383                 hammer2_chain_ref(info.parent);
384
385         /*
386          * Extra ref needed because flush_core expects it when replacing
387          * chain.
388          */
389         hammer2_chain_ref(chain);
390         loops = 0;
391
392         for (;;) {
393                 /*
394                  * [re]flush chain as the deep recursion may have generated
395                  * additional modifications.
396                  */
397                 if (info.parent != chain->parent) {
398                         if (hammer2_debug & 0x0040) {
399                                 kprintf("LOST CHILD4 %p->%p "
400                                         "(actual parent %p)\n",
401                                         info.parent, chain, chain->parent);
402                         }
403                         hammer2_chain_drop(info.parent);
404                         info.parent = chain->parent;
405                         hammer2_chain_ref(info.parent);
406                 }
407                 if (hammer2_flush_core(&info, chain, flags) == 0)
408                         break;
409
410                 if (++loops % 1000 == 0) {
411                         kprintf("hammer2_flush: excessive loops on %p\n",
412                                 chain);
413                         if (hammer2_debug & 0x100000)
414                                 Debugger("hell4");
415                 }
416         }
417 #ifdef HAMMER2_SCAN_DEBUG
418         if (info.scan_count >= 10)
419         kprintf("hammer2_flush: scan_count %ld (%ld,%ld,%ld,%ld) "
420                 "bt(%ld,%ld,%ld,%ld,%ld,%ld)\n",
421                 info.scan_count,
422                 info.scan_mod_count,
423                 info.scan_upd_count,
424                 info.scan_onf_count,
425                 info.scan_del_count,
426                 info.scan_btype[1],
427                 info.scan_btype[2],
428                 info.scan_btype[3],
429                 info.scan_btype[4],
430                 info.scan_btype[5],
431                 info.scan_btype[6]);
432 #endif
433         hammer2_chain_drop(chain);
434         if (info.parent)
435                 hammer2_chain_drop(info.parent);
436         return (info.error);
437 }
438
439 /*
440  * This is the core of the chain flushing code.  The chain is locked by the
441  * caller and must also have an extra ref on it by the caller, and remains
442  * locked and will have an extra ref on return.  info.parent is referenced
443  * but not locked.
444  *
445  * Upon return, the caller can test the UPDATE bit on the chain to determine
446  * if the parent needs updating.
447  *
448  * If non-zero is returned, the chain's parent changed during the flush and
449  * the caller must retry the operation.
450  *
451  * (1) Determine if this node is a candidate for the flush, return if it is
452  *     not.  fchain and vchain are always candidates for the flush.
453  *
 * (2) If we recurse too deep (info->depth reaches
 *     HAMMER2_FLUSH_DEPTH_LIMIT) the flush panics rather than deferring
 *     the chain.
457  *
458  * (3) Recursively flush live children (rbtree).  This can create deferrals.
459  *     A successful flush clears the MODIFIED and UPDATE bits on the children
460  *     and typically causes the parent to be marked MODIFIED as the children
461  *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose block table update in the parent is
463  *     handled by the frontend), or if the parent itself is modified by the
464  *     frontend for other reasons.
465  *
466  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
467  *     Deleted-but-open inodes can still be individually flushed via the
468  *     filesystem syncer.
469  *
470  * (5) Delete parents on the way back up if they are normal indirect blocks
471  *     and have no children.
472  *
473  * (6) Note that an unmodified child may still need the block table in its
474  *     parent updated (e.g. rename/move).  The child will have UPDATE set
475  *     in this case.
476  *
477  *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
478  *
479  * blockref.modify_tid is consistent only within a PFS, and will not be
480  * consistent during synchronization.  mirror_tid is consistent across the
481  * block device regardless of the PFS.
482  */
483 static int
484 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
485                    int flags)
486 {
487         hammer2_chain_t *parent;
488         hammer2_dev_t *hmp;
489         int save_error;
490         int retry;
491
492         retry = 0;
493
494         /*
495          * (1) Optimize downward recursion to locate nodes needing action.
496          *     Nothing to do if none of these flags are set.
497          */
498         if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0)
499                 return 0;
500
501         hmp = chain->hmp;
502
503         /*
504          * NOTE: parent can be NULL, usually due to destroy races.
505          */
506         parent = info->parent;
507         KKASSERT(chain->parent == parent);
508
509         /*
510          * Downward search recursion
511          *
512          * We must be careful on cold stops, which often occur on inode
513          * boundaries due to the way hammer2_vfs_sync() sequences the flush.
514          * Be sure to issue an appropriate chain_setflush()
515          */
516         if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) &&
517             (flags & HAMMER2_FLUSH_ALL) == 0 &&
518             (flags & HAMMER2_FLUSH_TOP) == 0 &&
519             chain->pmp && chain->pmp->mp) {
520                 /*
521                  * If FLUSH_ALL is not specified the caller does not want
522                  * to recurse through PFS roots that have been mounted.
523                  *
524                  * (If the PFS has not been mounted there may not be
525                  *  anything monitoring its chains and its up to us
526                  *  to flush it).
527                  *
528                  * The typical sequence is to flush dirty PFS's starting at
529                  * their root downward, then flush the device root (vchain).
530                  * It is this second flush that typically leaves out the
531                  * ALL flag.
532                  *
533                  * However we must still process the PFSROOT chains for block
534                  * table updates in their parent (which IS part of our flush).
535                  *
536                  * NOTE: The volume root, vchain, does not set PFSBOUNDARY.
537                  *
538                  * NOTE: We must re-set ONFLUSH in the parent to retain if
539                  *       this chain (that we are skipping) requires work.
540                  */
541                 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
542                                     HAMMER2_CHAIN_DESTROY |
543                                     HAMMER2_CHAIN_MODIFIED)) {
544                         hammer2_chain_setflush(parent);
545                 }
546                 goto done;
547         } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
548                    (flags & HAMMER2_FLUSH_INODE_STOP) &&
549                    (flags & HAMMER2_FLUSH_ALL) == 0 &&
550                    (flags & HAMMER2_FLUSH_TOP) == 0 &&
551                    chain->pmp && chain->pmp->mp) {
552                 /*
553                  * When FLUSH_INODE_STOP is specified we are being asked not
554                  * to include any inode changes for inodes we encounter,
555                  * with the exception of the inode that the flush began with.
556                  * So: INODE, INODE_STOP, and TOP==0 basically.
557                  *
558                  * Dirty inodes are flushed based on the hammer2_inode
559                  * in-memory structure, issuing a chain_setflush() here
560                  * will only cause unnecessary traversals of the topology.
561                  */
562                 goto done;
563 #if 0
564                 /*
565                  * If FLUSH_INODE_STOP is specified and both ALL and TOP
566                  * are clear, we must not flush the chain.  The chain should
567                  * have already been flushed and any further ONFLUSH/UPDATE
568                  * setting will be related to the next flush.
569                  *
570                  * This features allows us to flush inodes independently of
571                  * each other and meta-data above the inodes separately.
572                  */
573                 if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
574                                     HAMMER2_CHAIN_DESTROY |
575                                     HAMMER2_CHAIN_MODIFIED)) {
576                         if (parent)
577                                 hammer2_chain_setflush(parent);
578                 }
579 #endif
580         } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
581                 /*
582                  * Recursion depth reached.
583                  */
584                 panic("hammer2: flush depth limit");
585         } else if (chain->flags & (HAMMER2_CHAIN_ONFLUSH |
586                                    HAMMER2_CHAIN_DESTROY)) {
587                 /*
588                  * Downward recursion search (actual flush occurs bottom-up).
589                  * pre-clear ONFLUSH.  It can get set again due to races or
590                  * flush errors, which we want so the scan finds us again in
591                  * the next flush.
592                  *
593                  * We must also recurse if DESTROY is set so we can finally
594                  * get rid of the related children, otherwise the node will
595                  * just get re-flushed on lastdrop.
596                  *
597                  * WARNING!  The recursion will unlock/relock info->parent
598                  *           (which is 'chain'), potentially allowing it
599                  *           to be ripped up.
600                  */
601                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
602                 save_error = info->error;
603                 info->error = 0;
604                 info->parent = chain;
605
606                 /*
607                  * We may have to do this twice to catch any indirect
608                  * block maintenance that occurs.
609                  */
610                 hammer2_spin_ex(&chain->core.spin);
611                 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
612                         NULL, hammer2_flush_recurse, info);
613                 if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
614                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
615                         RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
616                                 NULL, hammer2_flush_recurse, info);
617                 }
618                 hammer2_spin_unex(&chain->core.spin);
619                 info->parent = parent;
620
621                 /*
622                  * Re-set the flush bits if the flush was incomplete or
623                  * an error occurred.  If an error occurs it is typically
624                  * an allocation error.  Errors do not cause deferrals.
625                  */
626                 if (info->error)
627                         hammer2_chain_setflush(chain);
628                 info->error |= save_error;
629
630                 /*
631                  * If we lost the parent->chain association we have to
632                  * stop processing this chain because it is no longer
633                  * in this recursion.  If it moved, it will be handled
634                  * by the ONFLUSH flag elsewhere.
635                  */
636                 if (chain->parent != parent) {
637                         kprintf("LOST CHILD2 %p->%p (actual parent %p)\n",
638                                 parent, chain, chain->parent);
639                         goto done;
640                 }
641         }
642
643         /*
644          * Now we are in the bottom-up part of the recursion.
645          *
646          * We continue to try to update the chain on lower-level errors, but
647          * the flush code may decide not to flush the volume root.
648          *
649          * XXX should we continue to try to update the chain if an error
650          *     occurred?
651          */
652
653         /*
654          * Both parent and chain must be locked in order to flush chain,
655          * in order to properly update the parent under certain conditions.
656          *
657          * In addition, we can't safely unlock/relock the chain once we
658          * start flushing the chain itself, which we would have to do later
659          * on in order to lock the parent if we didn't do that now.
660          */
661         hammer2_chain_ref_hold(chain);
662         hammer2_chain_unlock(chain);
663         if (parent)
664                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
665         hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
666         hammer2_chain_drop_unhold(chain);
667
668         /*
669          * Can't process if we can't access their content.
670          */
671         if ((parent && parent->error) || chain->error) {
672                 kprintf("hammer2: chain error during flush\n");
673                 info->error |= chain->error;
674                 if (parent) {
675                         info->error |= parent->error;
676                         hammer2_chain_unlock(parent);
677                 }
678                 goto done;
679         }
680
681         if (chain->parent != parent) {
682                 if (hammer2_debug & 0x0040) {
683                         kprintf("LOST CHILD3 %p->%p (actual parent %p)\n",
684                                 parent, chain, chain->parent);
685                 }
686                 KKASSERT(parent != NULL);
687                 hammer2_chain_unlock(parent);
688                 retry = 1;
689                 goto done;
690         }
691
692         /*
693          * Propagate the DESTROY flag downwards.  This dummies up the flush
694          * code and tries to invalidate related buffer cache buffers to
695          * avoid the disk write.
696          */
697         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
698                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
699
700         /*
701          * Dispose of the modified bit.
702          *
703          * If parent is present, the UPDATE bit should already be set.
704          * UPDATE should already be set.
705          * bref.mirror_tid should already be set.
706          */
707         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
708                 KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
709                          chain->parent == NULL);
710                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
711                 atomic_add_long(&hammer2_count_modified_chains, -1);
712
713                 /*
714                  * Manage threads waiting for excessive dirty memory to
715                  * be retired.
716                  */
717                 if (chain->pmp)
718                         hammer2_pfs_memory_wakeup(chain->pmp, -1);
719
720 #if 0
721                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) == 0 &&
722                     chain != &hmp->vchain &&
723                     chain != &hmp->fchain) {
724                         /*
725                          * Set UPDATE bit indicating that the parent block
726                          * table requires updating.
727                          */
728                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
729                 }
730 #endif
731
732                 /*
733                  * Issue the flush.  This is indirect via the DIO.
734                  *
735                  * NOTE: A DELETED node that reaches this point must be
736                  *       flushed for synchronization point consistency.
737                  *
738                  * NOTE: Even though MODIFIED was already set, the related DIO
739                  *       might not be dirty due to a system buffer cache
740                  *       flush and must be set dirty if we are going to make
741                  *       further modifications to the buffer.  Chains with
742                  *       embedded data don't need this.
743                  */
744                 if (hammer2_debug & 0x1000) {
745                         kprintf("Flush %p.%d %016jx/%d data=%016jx\n",
746                                 chain, chain->bref.type,
747                                 (uintmax_t)chain->bref.key,
748                                 chain->bref.keybits,
749                                 (uintmax_t)chain->bref.data_off);
750                 }
751
752                 /*
753                  * Update chain CRCs for flush.
754                  *
755                  * NOTE: Volume headers are NOT flushed here as they require
756                  *       special processing.
757                  */
758                 switch(chain->bref.type) {
759                 case HAMMER2_BREF_TYPE_FREEMAP:
760                         /*
761                          * Update the volume header's freemap_tid to the
762                          * freemap's flushing mirror_tid.
763                          *
764                          * (note: embedded data, do not call setdirty)
765                          */
766                         KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
767                         KKASSERT(chain == &hmp->fchain);
768                         hmp->voldata.freemap_tid = chain->bref.mirror_tid;
769                         if (hammer2_debug & 0x8000) {
770                                 /* debug only, avoid syslogd loop */
771                                 kprintf("sync freemap mirror_tid %08jx\n",
772                                         (intmax_t)chain->bref.mirror_tid);
773                         }
774
775                         /*
776                          * The freemap can be flushed independently of the
777                          * main topology, but for the case where it is
778                          * flushed in the same transaction, and flushed
779                          * before vchain (a case we want to allow for
780                          * performance reasons), make sure modifications
781                          * made during the flush under vchain use a new
782                          * transaction id.
783                          *
784                          * Otherwise the mount recovery code will get confused.
785                          */
786                         ++hmp->voldata.mirror_tid;
787                         break;
788                 case HAMMER2_BREF_TYPE_VOLUME:
789                         /*
790                          * The free block table is flushed by
791                          * hammer2_vfs_sync() before it flushes vchain.
792                          * We must still hold fchain locked while copying
793                          * voldata to volsync, however.
794                          *
795                          * These do not error per se since their data does
796                          * not need to be re-read from media on lock.
797                          *
798                          * (note: embedded data, do not call setdirty)
799                          */
800                         hammer2_chain_lock(&hmp->fchain,
801                                            HAMMER2_RESOLVE_ALWAYS);
802                         hammer2_voldata_lock(hmp);
803                         if (hammer2_debug & 0x8000) {
804                                 /* debug only, avoid syslogd loop */
805                                 kprintf("sync volume  mirror_tid %08jx\n",
806                                         (intmax_t)chain->bref.mirror_tid);
807                         }
808
809                         /*
810                          * Update the volume header's mirror_tid to the
811                          * main topology's flushing mirror_tid.  It is
812                          * possible that voldata.mirror_tid is already
813                          * beyond bref.mirror_tid due to the bump we made
814                          * above in BREF_TYPE_FREEMAP.
815                          */
816                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
817                                 hmp->voldata.mirror_tid =
818                                         chain->bref.mirror_tid;
819                         }
820
821                         /*
822                          * The volume header is flushed manually by the
823                          * syncer, not here.  All we do here is adjust the
824                          * crc's.
825                          */
826                         KKASSERT(chain->data != NULL);
827                         KKASSERT(chain->dio == NULL);
828
829                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
830                                 hammer2_icrc32(
831                                         (char *)&hmp->voldata +
832                                          HAMMER2_VOLUME_ICRC1_OFF,
833                                         HAMMER2_VOLUME_ICRC1_SIZE);
834                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
835                                 hammer2_icrc32(
836                                         (char *)&hmp->voldata +
837                                          HAMMER2_VOLUME_ICRC0_OFF,
838                                         HAMMER2_VOLUME_ICRC0_SIZE);
839                         hmp->voldata.icrc_volheader =
840                                 hammer2_icrc32(
841                                         (char *)&hmp->voldata +
842                                          HAMMER2_VOLUME_ICRCVH_OFF,
843                                         HAMMER2_VOLUME_ICRCVH_SIZE);
844
845                         if (hammer2_debug & 0x8000) {
846                                 /* debug only, avoid syslogd loop */
847                                 kprintf("syncvolhdr %016jx %016jx\n",
848                                         hmp->voldata.mirror_tid,
849                                         hmp->vchain.bref.mirror_tid);
850                         }
851                         hmp->volsync = hmp->voldata;
852                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
853                         hammer2_voldata_unlock(hmp);
854                         hammer2_chain_unlock(&hmp->fchain);
855                         break;
856                 case HAMMER2_BREF_TYPE_DATA:
857                         /*
858                          * Data elements have already been flushed via the
859                          * logical file buffer cache.  Their hash was set in
860                          * the bref by the vop_write code.  Do not re-dirty.
861                          *
862                          * Make sure any device buffer(s) have been flushed
863                          * out here (there aren't usually any to flush) XXX.
864                          */
865                         break;
866                 case HAMMER2_BREF_TYPE_INDIRECT:
867                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
868                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
869                         /*
870                          * Buffer I/O will be cleaned up when the volume is
871                          * flushed (but the kernel is free to flush it before
872                          * then, as well).
873                          */
874                         hammer2_chain_setcheck(chain, chain->data);
875                         break;
876                 case HAMMER2_BREF_TYPE_DIRENT:
877                         /*
878                          * A directory entry can use the check area to store
879                          * the filename for filenames <= 64 bytes, don't blow
880                          * it up!
881                          */
882                         if (chain->bytes)
883                                 hammer2_chain_setcheck(chain, chain->data);
884                         break;
885                 case HAMMER2_BREF_TYPE_INODE:
886                         /*
887                          * NOTE: We must call io_setdirty() to make any late
888                          *       changes to the inode data, the system might
889                          *       have already flushed the buffer.
890                          */
891                         if (chain->data->ipdata.meta.op_flags &
892                             HAMMER2_OPFLAG_PFSROOT) {
893                                 /*
894                                  * non-NULL pmp if mounted as a PFS.  We must
895                                  * sync fields cached in the pmp? XXX
896                                  */
897                                 hammer2_inode_data_t *ipdata;
898
899                                 hammer2_io_setdirty(chain->dio);
900                                 ipdata = &chain->data->ipdata;
901                                 if (chain->pmp) {
902                                         ipdata->meta.pfs_inum =
903                                                 chain->pmp->inode_tid;
904                                 }
905                         } else {
906                                 /* can't be mounted as a PFS */
907                         }
908
909                         hammer2_chain_setcheck(chain, chain->data);
910                         break;
911                 default:
912                         panic("hammer2_flush_core: unsupported "
913                               "embedded bref %d",
914                               chain->bref.type);
915                         /* NOT REACHED */
916                 }
917
918                 /*
919                  * If the chain was destroyed try to avoid unnecessary I/O
920                  * that might not have yet occurred.  Remove the data range
921                  * from dedup candidacy and attempt to invalidate the
922                  * potentially dirty portion of the I/O buffer.
923                  */
924                 if (chain->flags & HAMMER2_CHAIN_DESTROY) {
925                         hammer2_io_dedup_delete(hmp,
926                                                 chain->bref.type,
927                                                 chain->bref.data_off,
928                                                 chain->bytes);
929 #if 0
930                         hammer2_io_t *dio;
931                         if (chain->dio) {
932                                 hammer2_io_inval(chain->dio,
933                                                  chain->bref.data_off,
934                                                  chain->bytes);
935                         } else if ((dio = hammer2_io_getquick(hmp,
936                                                   chain->bref.data_off,
937                                                   chain->bytes,
938                                                   1)) != NULL) {
939                                 hammer2_io_inval(dio,
940                                                  chain->bref.data_off,
941                                                  chain->bytes);
942                                 hammer2_io_putblk(&dio);
943                         }
944 #endif
945                 }
946         }
947
948         /*
949          * If UPDATE is set the parent block table may need to be updated.
950          * This can fail if the hammer2_chain_modify() fails.
951          *
952          * NOTE: UPDATE may be set on vchain or fchain in which case
953          *       parent could be NULL, or on an inode that has not yet
954          *       been inserted into the radix tree.  It's easiest to allow
955          *       the case and test for NULL.  parent can also wind up being
956          *       NULL due to a deletion so we need to handle the case anyway.
957          *
958          * NOTE: UPDATE can be set when chains are renamed into or out of
959          *       an indirect block, without the chain itself being flagged
960          *       MODIFIED.
961          *
962          * If no parent exists we can just clear the UPDATE bit.  If the
963          * chain gets reattached later on the bit will simply get set
964          * again.
965          */
966         if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL)
967                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
968
969         /*
970          * When flushing an inode outside of a FLUSH_FSSYNC we must NOT
971          * update the parent block table to point at the flushed inode.
972          * The block table should only ever be updated by the filesystem
973          * sync code.  If we do, inode<->inode dependencies (such as
974          * directory entries vs inode nlink count) can wind up not being
975          * flushed together and result in a broken topology if a crash/reboot
976          * occurs at the wrong time.
977          */
978         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
979             (flags & HAMMER2_FLUSH_INODE_STOP) &&
980             (flags & HAMMER2_FLUSH_FSSYNC) == 0 &&
981             (flags & HAMMER2_FLUSH_ALL) == 0 &&
982             chain->pmp && chain->pmp->mp) {
983 #ifdef HAMMER2_DEBUG_SYNC
984                 kprintf("inum %ld do not update parent, non-fssync\n",
985                         (long)chain->bref.key);
986 #endif
987                 goto skipupdate;
988         }
989 #ifdef HAMMER2_DEBUG_SYNC
990         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
991                 kprintf("inum %ld update parent\n", (long)chain->bref.key);
992 #endif
993
994         /*
995          * The chain may need its blockrefs updated in the parent, normal
996          * path.
997          */
998         if (chain->flags & HAMMER2_CHAIN_UPDATE) {
999                 hammer2_blockref_t *base;
1000                 int count;
1001
1002                 /*
1003                  * Clear UPDATE flag, mark parent modified, update its
1004                  * modify_tid if necessary, and adjust the parent blockmap.
1005                  */
1006                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1007
1008                 /*
1009                  * (optional code)
1010                  *
1011                  * Avoid actually modifying and updating the parent if it
1012                  * was flagged for destruction.  This can greatly reduce
1013                  * disk I/O in large tree removals because the
1014                  * hammer2_io_setinval() call in the upward recursion
1015                  * (see MODIFIED code above) can only handle a few cases.
1016                  */
1017                 if (parent->flags & HAMMER2_CHAIN_DESTROY) {
1018                         if (parent->bref.modify_tid < chain->bref.modify_tid) {
1019                                 parent->bref.modify_tid =
1020                                         chain->bref.modify_tid;
1021                         }
1022                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BLKMAPPED |
1023                                                         HAMMER2_CHAIN_BLKMAPUPD);
1024                         goto skipupdate;
1025                 }
1026
1027                 /*
1028                  * The flusher is responsible for deleting empty indirect
1029                  * blocks at this point.  If we don't do this, no major harm
1030                  * will be done but the empty indirect blocks will stay in
1031                  * the topology and make it messy and inefficient.
1032                  *
1033                  * The flusher is also responsible for collapsing the
1034                  * content of an indirect block into its parent whenever
1035                  * possible (with some hysteresis).  Not doing this will also
1036                  * not harm the topology, but would make it messy and
1037                  * inefficient.
1038                  */
1039                 if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1040                         if (hammer2_chain_indirect_maintenance(parent, chain))
1041                                 goto skipupdate;
1042                 }
1043
1044                 /*
1045                  * We are updating the parent's blockmap, the parent must
1046                  * be set modified.  If this fails we re-set the UPDATE flag
1047                  * in the child.
1048                  *
1049                  * NOTE! A modification error can be ENOSPC.  We still want
1050                  *       to flush modified chains recursively, not break out,
1051                  *       so we just skip the update in this situation and
1052                  *       continue.  That is, we still need to try to clean
1053                  *       out dirty chains and buffers.
1054                  *
1055                  *       This may not help bulkfree though. XXX
1056                  */
1057                 save_error = hammer2_chain_modify(parent, 0, 0, 0);
1058                 if (save_error) {
1059                         info->error |= save_error;
1060                         kprintf("hammer2_flush: %016jx.%02x error=%08x\n",
1061                                 parent->bref.data_off, parent->bref.type,
1062                                 save_error);
1063                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
1064                         goto skipupdate;
1065                 }
1066                 if (parent->bref.modify_tid < chain->bref.modify_tid)
1067                         parent->bref.modify_tid = chain->bref.modify_tid;
1068
1069                 /*
1070                  * Calculate blockmap pointer
1071                  */
1072                 switch(parent->bref.type) {
1073                 case HAMMER2_BREF_TYPE_INODE:
1074                         /*
1075                          * Access the inode's block array.  However, there is
1076                          * no block array if the inode is flagged DIRECTDATA.
1077                          */
1078                         if (parent->data &&
1079                             (parent->data->ipdata.meta.op_flags &
1080                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1081                                 base = &parent->data->
1082                                         ipdata.u.blockset.blockref[0];
1083                         } else {
1084                                 base = NULL;
1085                         }
1086                         count = HAMMER2_SET_COUNT;
1087                         break;
1088                 case HAMMER2_BREF_TYPE_INDIRECT:
1089                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
1090                         if (parent->data)
1091                                 base = &parent->data->npdata[0];
1092                         else
1093                                 base = NULL;
1094                         count = parent->bytes / sizeof(hammer2_blockref_t);
1095                         break;
1096                 case HAMMER2_BREF_TYPE_VOLUME:
1097                         base = &chain->hmp->voldata.sroot_blockset.blockref[0];
1098                         count = HAMMER2_SET_COUNT;
1099                         break;
1100                 case HAMMER2_BREF_TYPE_FREEMAP:
1101                         base = &parent->data->npdata[0];
1102                         count = HAMMER2_SET_COUNT;
1103                         break;
1104                 default:
1105                         base = NULL;
1106                         count = 0;
1107                         panic("hammer2_flush_core: "
1108                               "unrecognized blockref type: %d",
1109                               parent->bref.type);
1110                         break;
1111                 }
1112
1113                 /*
1114                  * Blocktable updates
1115                  *
1116                  * We synchronize pending statistics at this time.  Delta
1117                  * adjustments designated for the current and upper level
1118                  * are synchronized.
1119                  */
1120                 if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPUPD)) {
1121                         if (chain->flags & HAMMER2_CHAIN_BLKMAPPED) {
1122                                 hammer2_spin_ex(&parent->core.spin);
1123                                 hammer2_base_delete(parent, base, count, chain,
1124                                                     NULL);
1125                                 hammer2_spin_unex(&parent->core.spin);
1126                                 /* base_delete clears both bits */
1127                         } else {
1128                                 atomic_clear_int(&chain->flags,
1129                                                  HAMMER2_CHAIN_BLKMAPUPD);
1130                         }
1131                 }
1132                 if (base && (chain->flags & HAMMER2_CHAIN_BLKMAPPED) == 0) {
1133                         hammer2_spin_ex(&parent->core.spin);
1134                         hammer2_base_insert(parent, base, count,
1135                                             chain, &chain->bref);
1136                         hammer2_spin_unex(&parent->core.spin);
1137                         /* base_insert sets BLKMAPPED */
1138                 }
1139         }
1140 skipupdate:
1141         if (parent)
1142                 hammer2_chain_unlock(parent);
1143
1144         /*
1145          * Final cleanup after flush
1146          */
1147 done:
1148         KKASSERT(chain->refs > 0);
1149
1150         return retry;
1151 }
1152
1153 /*
1154  * Flush recursion helper, called from flush_core, calls flush_core.
1155  *
1156  * Flushes the children of the caller's chain (info->parent), restricted
1157  * by sync_tid.
1158  *
1159  * This function may set info->error as a side effect.
1160  *
1161  * WARNING! If we do not call hammer2_flush_core() we must update
1162  *          bref.mirror_tid ourselves to indicate that the flush has
1163  *          processed the child.
1164  *
1165  * WARNING! parent->core spinlock is held on entry and return.
1166  */
1167 static int
1168 hammer2_flush_recurse(hammer2_chain_t *child, void *data)
1169 {
1170         hammer2_flush_info_t *info = data;
1171         hammer2_chain_t *parent = info->parent;
1172
1173 #ifdef HAMMER2_SCAN_DEBUG
1174         ++info->scan_count;
1175         if (child->flags & HAMMER2_CHAIN_MODIFIED)
1176                 ++info->scan_mod_count;
1177         if (child->flags & HAMMER2_CHAIN_UPDATE)
1178                 ++info->scan_upd_count;
1179         if (child->flags & HAMMER2_CHAIN_ONFLUSH)
1180                 ++info->scan_onf_count;
1181 #endif
1182
1183         /*
1184          * (child can never be fchain or vchain so a special check isn't
1185          *  needed).
1186          *
1187          * We must ref the child before unlocking the spinlock.
1188          *
1189          * The caller has added a ref to the parent so we can temporarily
1190          * unlock it in order to lock the child.  However, if it no longer
1191          * winds up being the child of the parent we must skip this child.
1192          *
1193          * NOTE! chain locking errors are fatal.  They are never out-of-space
1194          *       errors.
1195          */
1196         hammer2_chain_ref(child);
1197         hammer2_spin_unex(&parent->core.spin);
1198
1199         hammer2_chain_ref_hold(parent);
1200         hammer2_chain_unlock(parent);
1201         hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
1202         if (child->parent != parent) {
1203                 kprintf("LOST CHILD1 %p->%p (actual parent %p)\n",
1204                         parent, child, child->parent);
1205                 goto done;
1206         }
1207         if (child->error) {
1208                 kprintf("CHILD ERROR DURING FLUSH LOCK %p->%p\n",
1209                         parent, child);
1210                 info->error |= child->error;
1211                 goto done;
1212         }
1213
1214         /*
1215          * Must propagate the DESTROY flag downwards, otherwise the
1216          * parent could end up never being removed because it will
1217          * be requeued to the flusher if it survives this run due to
1218          * the flag.
1219          */
1220         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
1221                 atomic_set_int(&child->flags, HAMMER2_CHAIN_DESTROY);
1222 #ifdef HAMMER2_SCAN_DEBUG
1223         if (child->flags & HAMMER2_CHAIN_DESTROY)
1224                 ++info->scan_del_count;
1225 #endif
1226         /*
1227          * Special handling of the root inode.  Because the root inode
1228          * contains an index of all the inodes in the PFS in addition to
1229          * its normal directory entries, any flush that is not part of a
1230          * filesystem sync must only flush the directory entries, and not
1231          * anything else.
1232          *
1233          * The child might be an indirect block, but H2 guarantees that
1234          * the key-range will fully partition the inode index from the
1235          * directory entries so the case just works naturally.
1236          */
1237         if ((parent->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
1238             (child->flags & HAMMER2_CHAIN_DESTROY) == 0 &&
1239             parent->bref.type == HAMMER2_BREF_TYPE_INODE &&
1240             (info->flags & HAMMER2_FLUSH_FSSYNC) == 0) {
1241                 if ((child->bref.key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1242                         if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1243                                 hammer2_chain_setflush(parent);
1244                         }
1245                         goto done;
1246                 }
1247         }
1248
1249         /*
1250          * Recurse and collect deferral data.  We're in the media flush,
1251          * this can cross PFS boundaries.
1252          */
1253         if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1254 #ifdef HAMMER2_SCAN_DEBUG
1255                 if (child->bref.type < 7)
1256                         ++info->scan_btype[child->bref.type];
1257 #endif
1258                 ++info->depth;
1259                 hammer2_flush_core(info, child, info->flags);
1260                 --info->depth;
1261         }
1262
1263 done:
1264         /*
1265          * Relock to continue the loop.
1266          */
1267         hammer2_chain_unlock(child);
1268         hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
1269         hammer2_chain_drop_unhold(parent);
1270         if (parent->error) {
1271                 kprintf("PARENT ERROR DURING FLUSH LOCK %p->%p\n",
1272                         parent, child);
1273                 info->error |= parent->error;
1274         }
1275         hammer2_chain_drop(child);
1276         KKASSERT(info->parent == parent);
1277         hammer2_spin_ex(&parent->core.spin);
1278
1279         return (0);
1280 }
1281
1282 /*
1283  * flush helper (backend threaded)
1284  *
1285  * Flushes chain topology for the specified inode.
1286  *
1287  * HAMMER2_XOP_INODE_STOP       The flush recursion stops at inode boundaries.
1288  *                              Inodes belonging to the same flush are flushed
1289  *                              separately.
1290  *
1291  * chain->parent can be NULL, usually due to destroy races or detached inodes.
1292  *
1293  * Primarily called from vfs_sync().
1294  */
1295 void
1296 hammer2_xop_inode_flush(hammer2_xop_t *arg, void *scratch __unused, int clindex)
1297 {
1298         hammer2_xop_flush_t *xop = &arg->xop_flush;
1299         hammer2_chain_t *chain;
1300         hammer2_inode_t *ip;
1301         hammer2_dev_t *hmp;
1302         hammer2_pfs_t *pmp;
1303         hammer2_devvp_t *e;
1304         struct vnode *devvp;
1305         int flush_error = 0;
1306         int fsync_error = 0;
1307         int total_error = 0;
1308         int j;
1309         int xflags;
1310         int ispfsroot = 0;
1311
1312         xflags = HAMMER2_FLUSH_TOP;
1313         if (xop->head.flags & HAMMER2_XOP_INODE_STOP)
1314                 xflags |= HAMMER2_FLUSH_INODE_STOP;
1315         if (xop->head.flags & HAMMER2_XOP_FSSYNC)
1316                 xflags |= HAMMER2_FLUSH_FSSYNC;
1317
1318         /*
1319          * Flush core chains
1320          */
1321         ip = xop->head.ip1;
1322         pmp = ip->pmp;
1323         chain = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
1324         if (chain) {
1325                 hmp = chain->hmp;
1326                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1327                         /*
1328                          * Due to flush partitioning the chain topology
1329                          * above the inode's chain may no longer be flagged.
1330                          * When asked to flush an inode, remark the topology
1331                          * leading to that inode.
1332                          */
1333                         if (chain->parent)
1334                                 hammer2_chain_setflush(chain->parent);
1335                         hammer2_flush(chain, xflags);
1336
1337                         /* XXX cluster */
1338                         if (ip == pmp->iroot && pmp != hmp->spmp) {
1339                                 hammer2_spin_ex(&pmp->inum_spin);
1340                                 pmp->pfs_iroot_blocksets[clindex] =
1341                                         chain->data->ipdata.u.blockset;
1342                                 hammer2_spin_unex(&pmp->inum_spin);
1343                         }
1344
1345 #if 0
1346                         /*
1347                          * Propagate upwards but only cross an inode boundary
1348                          * for inodes associated with the current filesystem
1349                          * sync.
1350                          */
1351                         if ((xop->head.flags & HAMMER2_XOP_PARENTONFLUSH) ||
1352                             chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1353                                 parent = chain->parent;
1354                                 if (parent)
1355                                         hammer2_chain_setflush(parent);
1356                         }
1357 #endif
1358                 }
1359                 if (chain->flags & HAMMER2_CHAIN_PFSBOUNDARY)
1360                         ispfsroot = 1;
1361                 hammer2_chain_unlock(chain);
1362                 hammer2_chain_drop(chain);
1363                 chain = NULL;
1364         } else {
1365                 hmp = NULL;
1366         }
1367
1368         /*
1369          * Only flush the volume header if asked to, plus the inode must also
1370          * be the PFS root.
1371          */
1372         if ((xop->head.flags & HAMMER2_XOP_VOLHDR) == 0)
1373                 goto skip;
1374         if (ispfsroot == 0)
1375                 goto skip;
1376
1377         /*
1378          * Flush volume roots.  Avoid replication, we only want to
1379          * flush each hammer2_dev (hmp) once.
1380          */
1381         for (j = clindex - 1; j >= 0; --j) {
1382                 if ((chain = ip->cluster.array[j].chain) != NULL) {
1383                         if (chain->hmp == hmp) {
1384                                 chain = NULL;   /* safety */
1385                                 goto skip;
1386                         }
1387                 }
1388         }
1389         chain = NULL;   /* safety */
1390
1391         /*
1392          * spmp transaction.  The super-root is never directly mounted so
1393          * there shouldn't be any vnodes, let alone any dirty vnodes
1394          * associated with it, so we shouldn't have to mess around with any
1395          * vnode flushes here.
1396          */
1397         hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1398
1399         /*
1400          * We must flush the superroot down to the PFS iroot.  Remember
1401          * that hammer2_chain_setflush() stops at inode boundaries, so
1402          * the pmp->iroot has been flushed and flagged down to the superroot,
1403          * but the volume root (vchain) probably has not yet been flagged.
1404          */
1405         if (hmp->spmp->iroot) {
1406                 chain = hmp->spmp->iroot->cluster.array[0].chain;
1407                 if (chain) {
1408                         hammer2_chain_ref(chain);
1409                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
1410                         flush_error |=
1411                                 hammer2_flush(chain,
1412                                               HAMMER2_FLUSH_TOP |
1413                                               HAMMER2_FLUSH_INODE_STOP |
1414                                               HAMMER2_FLUSH_FSSYNC);
1415                         hammer2_chain_unlock(chain);
1416                         hammer2_chain_drop(chain);
1417                 }
1418         }
1419
1420         /*
1421          * Media mounts have two 'roots', vchain for the topology
1422          * and fchain for the free block table.  Flush both.
1423          *
1424          * Note that the topology and free block table are handled
1425          * independently, so the free block table can wind up being
1426          * ahead of the topology.  We depend on the bulk free scan
1427          * code to deal with any loose ends.
1428          *
1429          * vchain and fchain do not error on-lock since their data does
1430          * not have to be re-read from media.
1431          */
1432         hammer2_chain_ref(&hmp->vchain);
1433         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1434         hammer2_chain_ref(&hmp->fchain);
1435         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1436         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1437                 /*
1438                  * This will also modify vchain as a side effect,
1439                  * mark vchain as modified now.
1440                  */
1441                 hammer2_voldata_modify(hmp);
1442                 chain = &hmp->fchain;
1443                 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1444                 KKASSERT(chain == &hmp->fchain);
1445         }
1446         hammer2_chain_unlock(&hmp->fchain);
1447         hammer2_chain_unlock(&hmp->vchain);
1448         hammer2_chain_drop(&hmp->fchain);
1449         /* vchain dropped down below */
1450
1451         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1452         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1453                 chain = &hmp->vchain;
1454                 flush_error |= hammer2_flush(chain, HAMMER2_FLUSH_TOP);
1455                 KKASSERT(chain == &hmp->vchain);
1456         }
1457         hammer2_chain_unlock(&hmp->vchain);
1458         hammer2_chain_drop(&hmp->vchain);
1459
1460         /*
1461          * We can't safely flush the volume header until we have
1462          * flushed any device buffers which have built up.
1463          *
1464          * XXX this isn't being incremental
1465          */
1466         TAILQ_FOREACH(e, &hmp->devvpl, entry) {
1467                 devvp = e->devvp;
1468                 KKASSERT(devvp);
1469                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1470                 fsync_error = VOP_FSYNC(devvp, MNT_WAIT, 0);
1471                 vn_unlock(devvp);
1472                 if (fsync_error || flush_error) {
1473                         kprintf("hammer2: sync error fsync=%d h2flush=0x%04x dev=%s\n",
1474                                 fsync_error, flush_error, e->path);
1475                 }
1476         }
1477
1478         /*
1479          * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1480          * volume header needs synchronization via hmp->volsync.
1481          *
1482          * XXX synchronize the flag & data with only this flush XXX
1483          */
1484         if (fsync_error == 0 && flush_error == 0 &&
1485             (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1486                 struct buf *bp;
1487                 int vol_error = 0;
1488
1489                 /*
1490                  * Synchronize the disk before flushing the volume
1491                  * header.
1492                  */
1493                 bp = getpbuf(NULL);
1494                 bp->b_bio1.bio_offset = 0;
1495                 bp->b_bufsize = 0;
1496                 bp->b_bcount = 0;
1497                 bp->b_cmd = BUF_CMD_FLUSH;
1498                 bp->b_bio1.bio_done = biodone_sync;
1499                 bp->b_bio1.bio_flags |= BIO_SYNC;
1500                 vn_strategy(hmp->devvp, &bp->b_bio1);
1501                 fsync_error = biowait(&bp->b_bio1, "h2vol");
1502                 relpbuf(bp, NULL);
1503
1504                 /*
1505                  * Then we can safely flush the version of the
1506                  * volume header synchronized by the flush code.
1507                  */
1508                 j = hmp->volhdrno + 1;
1509                 if (j < 0)
1510                         j = 0;
1511                 if (j >= HAMMER2_NUM_VOLHDRS)
1512                         j = 0;
1513                 if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1514                     hmp->volsync.volu_size) {
1515                         j = 0;
1516                 }
1517                 if (hammer2_debug & 0x8000) {
1518                         /* debug only, avoid syslogd loop */
1519                         kprintf("sync volhdr %d %jd\n",
1520                                 j, (intmax_t)hmp->volsync.volu_size);
1521                 }
1522                 bp = getblk(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1523                             HAMMER2_PBUFSIZE, GETBLK_KVABIO, 0);
1524                 atomic_clear_int(&hmp->vchain.flags,
1525                                  HAMMER2_CHAIN_VOLUMESYNC);
1526                 bkvasync(bp);
1527                 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1528                 vol_error = bwrite(bp);
1529                 hmp->volhdrno = j;
1530                 if (vol_error)
1531                         fsync_error = vol_error;
1532         }
1533         if (flush_error)
1534                 total_error = flush_error;
1535         if (fsync_error)
1536                 total_error = hammer2_errno_to_error(fsync_error);
1537
1538         /* spmp trans */
1539         hammer2_trans_done(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1540 skip:
1541         hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1542 }