hammer2 - Refactor frontend part 13/many
[dragonfly.git] / sys / vfs / hammer2 / hammer2_flush.c
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  *                      TRANSACTION AND FLUSH HANDLING
37  *
38  * Deceptively simple but actually fairly difficult to implement properly is
39  * how I would describe it.
40  *
41  * Flushing generally occurs bottom-up but requires a top-down scan to
42  * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
43  * tells how to recurse downward to find these chains.
44  */
45
46 #include <sys/cdefs.h>
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/types.h>
50 #include <sys/lock.h>
51 #include <sys/uuid.h>
52
53 #include "hammer2.h"
54
55 #define FLUSH_DEBUG 0
56
57 #define HAMMER2_FLUSH_DEPTH_LIMIT       10      /* stack recursion limit */
58
59
60 /*
61  * Recursively flush the specified chain.  The chain is locked and
62  * referenced by the caller and will remain so on return.  The chain
63  * will remain referenced throughout but can temporarily lose its
64  * lock during the recursion to avoid unnecessarily stalling user
65  * processes.
66  */
67 struct hammer2_flush_info {
68         hammer2_chain_t *parent;
69         int             depth;
70         int             diddeferral;
71         int             cache_index;
72         struct h2_flush_list flushq;
73         hammer2_chain_t *debug;
74 };
75
76 typedef struct hammer2_flush_info hammer2_flush_info_t;
77
78 static void hammer2_flush_core(hammer2_flush_info_t *info,
79                                 hammer2_chain_t *chain, int deleting);
80 static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
81
82 /*
83  * Any per-pfs transaction initialization goes here.
84  */
85 void
86 hammer2_trans_manage_init(hammer2_pfs_t *pmp)
87 {
88 }
89
90 /*
91  * Transaction support for any modifying operation.  Transactions are used
92  * in the pmp layer by the frontend and in the spmp layer by the backend.
93  *
94  * 0                    - Normal transaction, interlocked against flush
95  *                        transaction.
96  *
97  * TRANS_ISFLUSH        - Flush transaction, interlocked against normal
98  *                        transaction.
99  *
100  * TRANS_BUFCACHE       - Buffer cache transaction, no interlock.
101  *
102  * Initializing a new transaction allocates a transaction ID.  Typically
103  * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
104  * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
105  * media target.  The latter mode is used by the recovery code.
106  *
107  * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
108  * other is a set of any number of concurrent filesystem operations.  We
109  * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
110  * or we can have <running_flush> + <concurrent_fs_ops>.
111  *
112  * During a flush, new fs_ops are only blocked until the fs_ops prior to
113  * the flush complete.  The new fs_ops can then run concurrent with the flush.
114  *
115  * Buffer-cache transactions operate as fs_ops but never block.  A
116  * buffer-cache flush will run either before or after the current pending
117  * flush depending on its state.
118  */
119 void
120 hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
121 {
122         uint32_t oflags;
123         uint32_t nflags;
124         int dowait;
125
126         for (;;) {
127                 oflags = pmp->trans.flags;
128                 cpu_ccfence();
129                 dowait = 0;
130
131                 if (flags & HAMMER2_TRANS_ISFLUSH) {
132                         /*
133                          * Requesting flush transaction.  Wait for all
134                          * currently running transactions to finish.
135                          */
136                         if (oflags & HAMMER2_TRANS_MASK) {
137                                 nflags = oflags | HAMMER2_TRANS_FPENDING |
138                                                   HAMMER2_TRANS_WAITING;
139                                 dowait = 1;
140                         } else {
141                                 nflags = (oflags | flags) + 1;
142                         }
143                         ++pmp->modify_tid;
144                 } else if (flags & HAMMER2_TRANS_BUFCACHE) {
145                         /*
146                          * Requesting strategy transaction.  Generally
147                          * allowed in all situations unless a flush
148                          * is running without the preflush flag.
149                          */
150                         if ((oflags & (HAMMER2_TRANS_ISFLUSH |
151                                        HAMMER2_TRANS_PREFLUSH)) ==
152                             HAMMER2_TRANS_ISFLUSH) {
153                                 nflags = oflags | HAMMER2_TRANS_WAITING;
154                                 dowait = 1;
155                         } else {
156                                 nflags = (oflags | flags) + 1;
157                         }
158                 } else {
159                         /*
160                          * Requesting normal transaction.  Wait for any
161                          * flush to finish before allowing.
162                          */
163                         if (oflags & HAMMER2_TRANS_ISFLUSH) {
164                                 nflags = oflags | HAMMER2_TRANS_WAITING;
165                                 dowait = 1;
166                         } else {
167                                 nflags = (oflags | flags) + 1;
168                         }
169                 }
170                 if (dowait)
171                         tsleep_interlock(&pmp->trans.sync_wait, 0);
172                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
173                         if (dowait == 0)
174                                 break;
175                         tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
176                                "h2trans", hz);
177                 } else {
178                         cpu_pause();
179                 }
180                 /* retry */
181         }
182 }
183
184 void
185 hammer2_trans_done(hammer2_pfs_t *pmp)
186 {
187         uint32_t oflags;
188         uint32_t nflags;
189
190         for (;;) {
191                 oflags = pmp->trans.flags;
192                 cpu_ccfence();
193                 KKASSERT(oflags & HAMMER2_TRANS_MASK);
194                 if ((oflags & HAMMER2_TRANS_MASK) == 1) {
195                         /*
196                          * This was the last transaction
197                          */
198                         nflags = (oflags - 1) & ~(HAMMER2_TRANS_ISFLUSH |
199                                                   HAMMER2_TRANS_BUFCACHE |
200                                                   HAMMER2_TRANS_PREFLUSH |
201                                                   HAMMER2_TRANS_FPENDING |
202                                                   HAMMER2_TRANS_WAITING);
203                 } else {
204                         /*
205                          * Still transactions pending
206                          */
207                         nflags = oflags - 1;
208                 }
209                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
210                         if ((nflags & HAMMER2_TRANS_MASK) == 0 &&
211                             (oflags & HAMMER2_TRANS_WAITING)) {
212                                 wakeup(&pmp->trans.sync_wait);
213                         }
214                         break;
215                 } else {
216                         cpu_pause();
217                 }
218                 /* retry */
219         }
220 }
221
222 /*
223  * Obtain new, unique inode number (not serialized by caller).
224  */
225 hammer2_tid_t
226 hammer2_trans_newinum(hammer2_pfs_t *pmp)
227 {
228         hammer2_tid_t tid;
229
230         KKASSERT(sizeof(long) == 8);
231         tid = atomic_fetchadd_long(&pmp->inode_tid, 1);
232
233         return tid;
234 }
235
236 /*
237  * Assert that a strategy call is ok here.  Strategy calls are legal
238  *
239  * (1) In a normal transaction.
240  * (2) In a flush transaction only if PREFLUSH is also set.
241  */
242 void
243 hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
244 {
245         KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0 ||
246                  (pmp->trans.flags & HAMMER2_TRANS_PREFLUSH));
247 }
248
249
250 /*
251  * Chains undergoing destruction are removed from the in-memory topology.
252  * To avoid getting lost these chains are placed on the delayed flush
253  * queue which will properly dispose of them.
254  *
255  * We do this instead of issuing an immediate flush in order to give
256  * recursive deletions (rm -rf, etc) a chance to remove more of the
257  * hierarchy, potentially allowing an enormous amount of write I/O to
258  * be avoided.
259  */
260 void
261 hammer2_delayed_flush(hammer2_chain_t *chain)
262 {
263         if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
264                 hammer2_spin_ex(&chain->hmp->list_spin);
265                 if ((chain->flags & (HAMMER2_CHAIN_DELAYED |
266                                      HAMMER2_CHAIN_DEFERRED)) == 0) {
267                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED |
268                                                       HAMMER2_CHAIN_DEFERRED);
269                         TAILQ_INSERT_TAIL(&chain->hmp->flushq,
270                                           chain, flush_node);
271                         hammer2_chain_ref(chain);
272                 }
273                 hammer2_spin_unex(&chain->hmp->list_spin);
274         }
275 }
276
277 /*
278  * Flush the chain and all modified sub-chains through the specified
279  * synchronization point, propagating parent chain modifications, modify_tid,
280  * and mirror_tid updates back up as needed.
281  *
282  * Caller must have already vetted synchronization points to ensure they
283  * are properly flushed.  Only snapshots and cluster flushes can create
284  * these sorts of synchronization points.
285  *
286  * This routine can be called from several places but the most important
287  * is from VFS_SYNC.
288  *
289  * chain is locked on call and will remain locked on return.  The chain's
290  * UPDATE flag indicates that its parent's block table (which is not yet
291  * part of the flush) should be updated.  The chain may be replaced by
292  * the call if it was modified.
293  */
294 void
295 hammer2_flush(hammer2_chain_t *chain, int istop)
296 {
297         hammer2_chain_t *scan;
298         hammer2_flush_info_t info;
299         hammer2_dev_t *hmp;
300         int loops;
301
302         /*
303          * Execute the recursive flush and handle deferrals.
304          *
305          * Chains can be ridiculously long (thousands deep), so to
306          * avoid blowing out the kernel stack the recursive flush has a
307          * depth limit.  Elements at the limit are placed on a list
308          * for re-execution after the stack has been popped.
309          */
310         bzero(&info, sizeof(info));
311         TAILQ_INIT(&info.flushq);
312         info.cache_index = -1;
313
314         /*
315          * Calculate parent (can be NULL), if not NULL the flush core
316          * expects the parent to be referenced so it can easily lock/unlock
317          * it without it getting ripped up.
318          */
319         if ((info.parent = chain->parent) != NULL)
320                 hammer2_chain_ref(info.parent);
321
322         /*
323          * Extra ref needed because flush_core expects it when replacing
324          * chain.
325          */
326         hammer2_chain_ref(chain);
327         hmp = chain->hmp;
328         loops = 0;
329
330         for (;;) {
331                 /*
332                  * Move hmp->flushq to info.flushq if non-empty so it can
333                  * be processed.
334                  */
335                 if (TAILQ_FIRST(&hmp->flushq) != NULL) {
336                         hammer2_spin_ex(&chain->hmp->list_spin);
337                         TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node);
338                         hammer2_spin_unex(&chain->hmp->list_spin);
339                 }
340
341                 /*
342                  * Unwind deep recursions which had been deferred.  This
343                  * can leave the FLUSH_* bits set for these chains, which
344                  * will be handled when we [re]flush chain after the unwind.
345                  */
346                 while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
347                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
348                         TAILQ_REMOVE(&info.flushq, scan, flush_node);
349                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED |
350                                                        HAMMER2_CHAIN_DELAYED);
351
352                         /*
353                          * Now that we've popped back up we can do a secondary
354                          * recursion on the deferred elements.
355                          *
356                          * NOTE: hammer2_flush() may replace scan.
357                          */
358                         if (hammer2_debug & 0x0040)
359                                 kprintf("deferred flush %p\n", scan);
360                         hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
361                         hammer2_flush(scan, 0);
362                         hammer2_chain_unlock(scan);
363                         hammer2_chain_drop(scan);       /* ref from deferral */
364                 }
365
366                 /*
367                  * [re]flush chain.
368                  */
369                 info.diddeferral = 0;
370                 hammer2_flush_core(&info, chain, istop);
371
372                 /*
373                  * Only loop if deep recursions have been deferred.
374                  */
375                 if (TAILQ_EMPTY(&info.flushq))
376                         break;
377
378                 if (++loops % 1000 == 0) {
379                         kprintf("hammer2_flush: excessive loops on %p\n",
380                                 chain);
381                         if (hammer2_debug & 0x100000)
382                                 Debugger("hell4");
383                 }
384         }
385         hammer2_chain_drop(chain);
386         if (info.parent)
387                 hammer2_chain_drop(info.parent);
388 }
389
390 /*
391  * This is the core of the chain flushing code.  The chain is locked by the
392  * caller and must also have an extra ref on it by the caller, and remains
393  * locked and will have an extra ref on return.  Upon return, the caller can
394  * test the UPDATE bit on the child to determine if the parent needs updating.
395  *
396  * (1) Determine if this node is a candidate for the flush, return if it is
397  *     not.  fchain and vchain are always candidates for the flush.
398  *
399  * (2) If we recurse too deep the chain is entered onto the deferral list and
400  *     the current flush stack is aborted until after the deferral list is
401  *     run.
402  *
403  * (3) Recursively flush live children (rbtree).  This can create deferrals.
404  *     A successful flush clears the MODIFIED and UPDATE bits on the children
405  *     and typically causes the parent to be marked MODIFIED as the children
406  *     update the parent's block table.  A parent might already be marked
407  *     MODIFIED due to a deletion (whose blocktable update in the parent is
408  *     handled by the frontend), or if the parent itself is modified by the
409  *     frontend for other reasons.
410  *
411  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
412  *     Deleted-but-open inodes can still be individually flushed via the
413  *     filesystem syncer.
414  *
415  * (5) Note that an unmodified child may still need the block table in its
416  *     parent updated (e.g. rename/move).  The child will have UPDATE set
417  *     in this case.
418  *
419  *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
420  *
421  * blockref.modify_tid is consistent only within a PFS, and will not be
422  * consistent during synchronization.  mirror_tid is consistent across the
423  * block device regardless of the PFS.
424  */
425 static void
426 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
427                    int istop)
428 {
429         hammer2_chain_t *parent;
430         hammer2_dev_t *hmp;
431         int diddeferral;
432
433         /*
434          * (1) Optimize downward recursion to locate nodes needing action.
435          *     Nothing to do if none of these flags are set.
436          */
437         if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
438                 if (hammer2_debug & 0x200) {
439                         if (info->debug == NULL)
440                                 info->debug = chain;
441                 } else {
442                         return;
443                 }
444         }
445
446         hmp = chain->hmp;
447         diddeferral = info->diddeferral;
448         parent = info->parent;          /* can be NULL */
449
450         /*
451          * Downward search recursion
452          */
453         if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
454                 /*
455                  * Already deferred.
456                  */
457                 ++info->diddeferral;
458         } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
459                 /*
460                  * Recursion depth reached.
461                  */
462                 KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0);
463                 hammer2_chain_ref(chain);
464                 TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
465                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
466                 ++info->diddeferral;
467         } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && istop == 0) {
468                 /*
469                  * We do not recurse through PFSROOTs.  PFSROOT flushes are
470                  * handled by the related pmp's (whether mounted or not,
471                  * including during recovery).
472                  *
473                  * But we must still process the PFSROOT chains for block
474                  * table updates in their parent (which IS part of our flush).
475                  *
476                  * Note that the volume root, vchain, does not set this flag.
477                  */
478                 ;
479         } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
480                 /*
481                  * Downward recursion search (actual flush occurs bottom-up).
482                  * pre-clear ONFLUSH.  It can get set again due to races,
483                  * which we want so the scan finds us again in the next flush.
484                  * These races can also include 
485                  *
486                  * Flush recursions stop at PFSROOT boundaries.  Each PFS
487                  * must be individually flushed and then the root must
488                  * be flushed.
489                  */
490                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
491                 info->parent = chain;
492                 hammer2_spin_ex(&chain->core.spin);
493                 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
494                         NULL, hammer2_flush_recurse, info);
495                 hammer2_spin_unex(&chain->core.spin);
496                 info->parent = parent;
497                 if (info->diddeferral)
498                         hammer2_chain_setflush(chain);
499         }
500
501         /*
502          * Now we are in the bottom-up part of the recursion.
503          *
504          * Do not update chain if lower layers were deferred.
505          */
506         if (info->diddeferral)
507                 goto done;
508
509         /*
510          * Propagate the DESTROY flag downwards.  This dummies up the flush
511          * code and tries to invalidate related buffer cache buffers to
512          * avoid the disk write.
513          */
514         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
515                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
516
517         /*
518          * Chain was already modified or has become modified, flush it out.
519          */
520 again:
521         if ((hammer2_debug & 0x200) &&
522             info->debug &&
523             (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
524                 hammer2_chain_t *scan = chain;
525
526                 kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
527                 while (scan) {
528                         kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
529                                 scan, scan->flags,
530                                 scan->bref.key, scan->bref.type);
531                         if (scan == info->debug)
532                                 break;
533                         scan = scan->parent;
534                 }
535         }
536
537         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
538                 /*
539                  * Dispose of the modified bit.
540                  *
541                  * UPDATE should already be set.
542                  * bref.mirror_tid should already be set.
543                  */
544                 KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
545                          chain == &hmp->vchain);
546                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
547
548                 /*
549                  * Manage threads waiting for excessive dirty memory to
550                  * be retired.
551                  */
552                 if (chain->pmp)
553                         hammer2_pfs_memory_wakeup(chain->pmp);
554
555                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
556                     chain == &hmp->vchain ||
557                     chain == &hmp->fchain) {
558                         /*
559                          * Drop the ref from the MODIFIED bit we cleared,
560                          * net -1 ref.
561                          */
562                         hammer2_chain_drop(chain);
563                 } else {
564                         /*
565                          * Drop the ref from the MODIFIED bit we cleared and
566                          * set a ref for the UPDATE bit we are setting.  Net
567                          * 0 refs.
568                          */
569                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
570                 }
571
572                 /*
573                  * Issue the flush.  This is indirect via the DIO.
574                  *
575                  * NOTE: A DELETED node that reaches this point must be
576                  *       flushed for synchronization point consistency.
577                  *
578                  * NOTE: Even though MODIFIED was already set, the related DIO
579                  *       might not be dirty due to a system buffer cache
580                  *       flush and must be set dirty if we are going to make
581                  *       further modifications to the buffer.  Chains with
582                  *       embedded data don't need this.
583                  */
584                 if (hammer2_debug & 0x1000) {
585                         kprintf("Flush %p.%d %016jx/%d data=%016jx",
586                                 chain, chain->bref.type,
587                                 (uintmax_t)chain->bref.key,
588                                 chain->bref.keybits,
589                                 (uintmax_t)chain->bref.data_off);
590                 }
591                 if (hammer2_debug & 0x2000) {
592                         Debugger("Flush hell");
593                 }
594
595                 /*
596                  * Update chain CRCs for flush.
597                  *
598                  * NOTE: Volume headers are NOT flushed here as they require
599                  *       special processing.
600                  */
601                 switch(chain->bref.type) {
602                 case HAMMER2_BREF_TYPE_FREEMAP:
603                         /*
604                          * Update the volume header's freemap_tid to the
605                          * freemap's flushing mirror_tid.
606                          *
607                          * (note: embedded data, do not call setdirty)
608                          */
609                         KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
610                         KKASSERT(chain == &hmp->fchain);
611                         hmp->voldata.freemap_tid = chain->bref.mirror_tid;
612                         kprintf("sync freemap mirror_tid %08jx\n",
613                                 (intmax_t)chain->bref.mirror_tid);
614
615                         /*
616                          * The freemap can be flushed independently of the
617                          * main topology, but for the case where it is
618                          * flushed in the same transaction, and flushed
619                          * before vchain (a case we want to allow for
620                          * performance reasons), make sure modifications
621                          * made during the flush under vchain use a new
622                          * transaction id.
623                          *
624                          * Otherwise the mount recovery code will get confused.
625                          */
626                         ++hmp->voldata.mirror_tid;
627                         break;
628                 case HAMMER2_BREF_TYPE_VOLUME:
629                         /*
630                          * The free block table is flushed by
631                          * hammer2_vfs_sync() before it flushes vchain.
632                          * We must still hold fchain locked while copying
633                          * voldata to volsync, however.
634                          *
635                          * (note: embedded data, do not call setdirty)
636                          */
637                         hammer2_chain_lock(&hmp->fchain,
638                                            HAMMER2_RESOLVE_ALWAYS);
639                         hammer2_voldata_lock(hmp);
640                         kprintf("sync volume  mirror_tid %08jx\n",
641                                 (intmax_t)chain->bref.mirror_tid);
642
643                         /*
644                          * Update the volume header's mirror_tid to the
645                          * main topology's flushing mirror_tid.  It is
646                          * possible that voldata.mirror_tid is already
647                          * beyond bref.mirror_tid due to the bump we made
648                          * above in BREF_TYPE_FREEMAP.
649                          */
650                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
651                                 hmp->voldata.mirror_tid =
652                                         chain->bref.mirror_tid;
653                         }
654
655                         /*
656                          * The volume header is flushed manually by the
657                          * syncer, not here.  All we do here is adjust the
658                          * crc's.
659                          */
660                         KKASSERT(chain->data != NULL);
661                         KKASSERT(chain->dio == NULL);
662
663                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
664                                 hammer2_icrc32(
665                                         (char *)&hmp->voldata +
666                                          HAMMER2_VOLUME_ICRC1_OFF,
667                                         HAMMER2_VOLUME_ICRC1_SIZE);
668                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
669                                 hammer2_icrc32(
670                                         (char *)&hmp->voldata +
671                                          HAMMER2_VOLUME_ICRC0_OFF,
672                                         HAMMER2_VOLUME_ICRC0_SIZE);
673                         hmp->voldata.icrc_volheader =
674                                 hammer2_icrc32(
675                                         (char *)&hmp->voldata +
676                                          HAMMER2_VOLUME_ICRCVH_OFF,
677                                         HAMMER2_VOLUME_ICRCVH_SIZE);
678
679                         kprintf("syncvolhdr %016jx %016jx\n",
680                                 hmp->voldata.mirror_tid,
681                                 hmp->vchain.bref.mirror_tid);
682                         hmp->volsync = hmp->voldata;
683                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
684                         hammer2_voldata_unlock(hmp);
685                         hammer2_chain_unlock(&hmp->fchain);
686                         break;
687                 case HAMMER2_BREF_TYPE_DATA:
688                         /*
689                          * Data elements have already been flushed via the
690                          * logical file buffer cache.  Their hash was set in
691                          * the bref by the vop_write code.  Do not re-dirty.
692                          *
693                          * Make sure any device buffer(s) have been flushed
694                          * out here (there aren't usually any to flush) XXX.
695                          */
696                         break;
697                 case HAMMER2_BREF_TYPE_INDIRECT:
698                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
699                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
700                         /*
701                          * Buffer I/O will be cleaned up when the volume is
702                          * flushed (but the kernel is free to flush it before
703                          * then, as well).
704                          */
705                         KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
706                         hammer2_chain_setcheck(chain, chain->data);
707                         break;
708                 case HAMMER2_BREF_TYPE_INODE:
709                         /*
710                          * NOTE: We must call io_setdirty() to make any late
711                          *       changes to the inode data, the system might
712                          *       have already flushed the buffer.
713                          */
714                         if (chain->data->ipdata.meta.op_flags &
715                             HAMMER2_OPFLAG_PFSROOT) {
716                                 /*
717                                  * non-NULL pmp if mounted as a PFS.  We must
718                                  * sync fields cached in the pmp? XXX
719                                  */
720                                 hammer2_inode_data_t *ipdata;
721
722                                 hammer2_io_setdirty(chain->dio);
723                                 ipdata = &chain->data->ipdata;
724                                 if (chain->pmp) {
725                                         ipdata->meta.pfs_inum =
726                                                 chain->pmp->inode_tid;
727                                 }
728                         } else {
729                                 /* can't be mounted as a PFS */
730                         }
731
732                         KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
733                         hammer2_chain_setcheck(chain, chain->data);
734                         break;
735                 default:
736                         KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
737                         panic("hammer2_flush_core: unsupported "
738                               "embedded bref %d",
739                               chain->bref.type);
740                         /* NOT REACHED */
741                 }
742
743                 /*
744                  * If the chain was destroyed try to avoid unnecessary I/O.
745                  * (this only really works if the DIO system buffer is the
746                  * same size as chain->bytes).
747                  */
748                 if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
749                         hammer2_io_setinval(chain->dio, chain->bytes);
750                 }
751         }
752
753         /*
754          * If UPDATE is set the parent block table may need to be updated.
755          *
756          * NOTE: UPDATE may be set on vchain or fchain in which case
757          *       parent could be NULL.  It's easiest to allow the case
758          *       and test for NULL.  parent can also wind up being NULL
759          *       due to a deletion so we need to handle the case anyway.
760          *
761          * If no parent exists we can just clear the UPDATE bit.  If the
762          * chain gets reattached later on the bit will simply get set
763          * again.
764          */
765         if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
766                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
767                 hammer2_chain_drop(chain);
768         }
769
770         /*
771          * The chain may need its blockrefs updated in the parent.  This
772          * requires some fancy footwork.
773          */
774         if (chain->flags & HAMMER2_CHAIN_UPDATE) {
775                 hammer2_blockref_t *base;
776                 int count;
777
778                 /*
779                  * Both parent and chain must be locked.  This requires
780                  * temporarily unlocking the chain.  We have to deal with
781                  * the case where the chain might be reparented or modified
782                  * while it was unlocked.
783                  */
784                 hammer2_chain_unlock(chain);
785                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
786                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
787                 if (chain->parent != parent) {
788                         kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
789                                 chain, chain->parent, parent);
790                         hammer2_chain_unlock(parent);
791                         goto done;
792                 }
793
794                 /*
795                  * Check race condition.  If someone got in and modified
796                  * it again while it was unlocked, we have to loop up.
797                  */
798                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
799                         hammer2_chain_unlock(parent);
800                         kprintf("hammer2_flush: chain %p flush-mod race\n",
801                                 chain);
802                         goto again;
803                 }
804
805                 /*
806                  * Clear UPDATE flag, mark parent modified, update its
807                  * modify_tid if necessary, and adjust the parent blockmap.
808                  */
809                 if (chain->flags & HAMMER2_CHAIN_UPDATE) {
810                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
811                         hammer2_chain_drop(chain);
812                 }
813
814                 /*
815                  * (optional code)
816                  *
817                  * Avoid actually modifying and updating the parent if it
818                  * was flagged for destruction.  This can greatly reduce
819                  * disk I/O in large tree removals because the
820                  * hammer2_io_setinval() call in the upward recursion
821                  * (see MODIFIED code above) can only handle a few cases.
822                  */
823                 if (parent->flags & HAMMER2_CHAIN_DESTROY) {
824                         if (parent->bref.modify_tid < chain->bref.modify_tid) {
825                                 parent->bref.modify_tid =
826                                         chain->bref.modify_tid;
827                         }
828                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
829                                                         HAMMER2_CHAIN_BMAPUPD);
830                         hammer2_chain_unlock(parent);
831                         goto skipupdate;
832                 }
833
834                 /*
835                  * We are updating the parent's blockmap, the parent must
836                  * be set modified.
837                  */
838                 hammer2_chain_modify(parent, HAMMER2_MODIFY_KEEPMODIFY);
839                 if (parent->bref.modify_tid < chain->bref.modify_tid)
840                         parent->bref.modify_tid = chain->bref.modify_tid;
841
842                 /*
843                  * Calculate blockmap pointer
844                  */
845                 switch(parent->bref.type) {
846                 case HAMMER2_BREF_TYPE_INODE:
847                         /*
848                          * Access the inode's block array.  However, there is
849                          * no block array if the inode is flagged DIRECTDATA.
850                          */
851                         if (parent->data &&
852                             (parent->data->ipdata.meta.op_flags &
853                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
854                                 base = &parent->data->
855                                         ipdata.u.blockset.blockref[0];
856                         } else {
857                                 base = NULL;
858                         }
859                         count = HAMMER2_SET_COUNT;
860                         break;
861                 case HAMMER2_BREF_TYPE_INDIRECT:
862                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
863                         if (parent->data)
864                                 base = &parent->data->npdata[0];
865                         else
866                                 base = NULL;
867                         count = parent->bytes / sizeof(hammer2_blockref_t);
868                         break;
869                 case HAMMER2_BREF_TYPE_VOLUME:
870                         base = &chain->hmp->voldata.sroot_blockset.blockref[0];
871                         count = HAMMER2_SET_COUNT;
872                         break;
873                 case HAMMER2_BREF_TYPE_FREEMAP:
874                         base = &parent->data->npdata[0];
875                         count = HAMMER2_SET_COUNT;
876                         break;
877                 default:
878                         base = NULL;
879                         count = 0;
880                         panic("hammer2_flush_core: "
881                               "unrecognized blockref type: %d",
882                               parent->bref.type);
883                 }
884
885                 /*
886                  * Blocktable updates
887                  *
888                  * We synchronize pending statistics at this time.  Delta
889                  * adjustments designated for the current and upper level
890                  * are synchronized.
891                  */
892                 if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
893                         if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
894                                 hammer2_spin_ex(&parent->core.spin);
895                                 hammer2_base_delete(parent, base, count,
896                                                     &info->cache_index, chain);
897                                 hammer2_spin_unex(&parent->core.spin);
898                                 /* base_delete clears both bits */
899                         } else {
900                                 atomic_clear_int(&chain->flags,
901                                                  HAMMER2_CHAIN_BMAPUPD);
902                         }
903                 }
904                 if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
905                         hammer2_spin_ex(&parent->core.spin);
906                         hammer2_base_insert(parent, base, count,
907                                             &info->cache_index, chain);
908                         hammer2_spin_unex(&parent->core.spin);
909                         /* base_insert sets BMAPPED */
910                 }
911                 hammer2_chain_unlock(parent);
912         }
913 skipupdate:
914         ;
915
916         /*
917          * Final cleanup after flush
918          */
919 done:
920         KKASSERT(chain->refs > 0);
921         if (hammer2_debug & 0x200) {
922                 if (info->debug == chain)
923                         info->debug = NULL;
924         }
925 }
926
927 /*
928  * Flush recursion helper, called from flush_core, calls flush_core.
929  *
930  * Flushes the children of the caller's chain (info->parent), restricted
931  * by sync_tid.  Set info->domodify if the child's blockref must propagate
932  * back up to the parent.
933  *
934  * Ripouts can move child from rbtree to dbtree or dbq but the caller's
935  * flush scan order prevents any chains from being lost.  A child can be
936  * executes more than once.
937  *
938  * WARNING! If we do not call hammer2_flush_core() we must update
939  *          bref.mirror_tid ourselves to indicate that the flush has
940  *          processed the child.
941  *
942  * WARNING! parent->core spinlock is held on entry and return.
943  */
944 static int
945 hammer2_flush_recurse(hammer2_chain_t *child, void *data)
946 {
947         hammer2_flush_info_t *info = data;
948         hammer2_chain_t *parent = info->parent;
949
950         /*
951          * (child can never be fchain or vchain so a special check isn't
952          *  needed).
953          *
954          * We must ref the child before unlocking the spinlock.
955          *
956          * The caller has added a ref to the parent so we can temporarily
957          * unlock it in order to lock the child.
958          */
959         hammer2_chain_ref(child);
960         hammer2_spin_unex(&parent->core.spin);
961
962         hammer2_chain_unlock(parent);
963         hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
964
965         /*
966          * Recurse and collect deferral data.  We're in the media flush,
967          * this can cross PFS boundaries.
968          */
969         if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
970                 ++info->depth;
971                 hammer2_flush_core(info, child, 0);
972                 --info->depth;
973         } else if (hammer2_debug & 0x200) {
974                 if (info->debug == NULL)
975                         info->debug = child;
976                 ++info->depth;
977                 hammer2_flush_core(info, child, 0);
978                 --info->depth;
979                 if (info->debug == child)
980                         info->debug = NULL;
981         }
982
983         /*
984          * Relock to continue the loop
985          */
986         hammer2_chain_unlock(child);
987         hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
988         hammer2_chain_drop(child);
989         KKASSERT(info->parent == parent);
990         hammer2_spin_ex(&parent->core.spin);
991
992         return (0);
993 }
994
995 /*
996  * flush helper (backend threaded)
997  *
998  * Flushes core chains, issues disk sync, flushes volume roots.
999  *
1000  * Primarily called from vfs_sync().
1001  */
1002 void
1003 hammer2_inode_xop_flush(hammer2_xop_t *arg, int clindex)
1004 {
1005         hammer2_xop_flush_t *xop = &arg->xop_flush;
1006         hammer2_chain_t *chain;
1007         hammer2_chain_t *parent;
1008         hammer2_dev_t *hmp;
1009         int error = 0;
1010         int total_error = 0;
1011         int j;
1012
1013         /*
1014          * Flush core chains
1015          */
1016         chain = hammer2_inode_chain(xop->head.ip, clindex,
1017                                     HAMMER2_RESOLVE_ALWAYS);
1018         if (chain) {
1019                 hmp = chain->hmp;
1020                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1021                         hammer2_flush(chain, 1);
1022                         parent = chain->parent;
1023                         KKASSERT(chain->pmp != parent->pmp);
1024                         hammer2_chain_setflush(parent);
1025                 }
1026                 hammer2_chain_unlock(chain);
1027                 hammer2_chain_drop(chain);
1028                 chain = NULL;
1029         } else {
1030                 hmp = NULL;
1031         }
1032
1033         /*
1034          * Flush volume roots.  Avoid replication, we only want to
1035          * flush each hammer2_dev (hmp) once.
1036          */
1037         for (j = clindex - 1; j >= 0; --j) {
1038                 if ((chain = xop->head.ip->cluster.array[j].chain) != NULL) {
1039                         if (chain->hmp == hmp) {
1040                                 chain = NULL;   /* safety */
1041                                 goto skip;
1042                         }
1043                 }
1044         }
1045         chain = NULL;   /* safety */
1046
1047         /*
1048          * spmp transaction.  The super-root is never directly mounted so
1049          * there shouldn't be any vnodes, let alone any dirty vnodes
1050          * associated with it.
1051          */
1052         hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1053
1054         /*
1055          * Media mounts have two 'roots', vchain for the topology
1056          * and fchain for the free block table.  Flush both.
1057          *
1058          * Note that the topology and free block table are handled
1059          * independently, so the free block table can wind up being
1060          * ahead of the topology.  We depend on the bulk free scan
1061          * code to deal with any loose ends.
1062          */
1063         hammer2_chain_ref(&hmp->vchain);
1064         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1065         hammer2_chain_ref(&hmp->fchain);
1066         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1067         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1068                 /*
1069                  * This will also modify vchain as a side effect,
1070                  * mark vchain as modified now.
1071                  */
1072                 hammer2_voldata_modify(hmp);
1073                 chain = &hmp->fchain;
1074                 hammer2_flush(chain, 1);
1075                 KKASSERT(chain == &hmp->fchain);
1076         }
1077         hammer2_chain_unlock(&hmp->fchain);
1078         hammer2_chain_unlock(&hmp->vchain);
1079         hammer2_chain_drop(&hmp->fchain);
1080         /* vchain dropped down below */
1081
1082         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1083         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1084                 chain = &hmp->vchain;
1085                 hammer2_flush(chain, 1);
1086                 KKASSERT(chain == &hmp->vchain);
1087         }
1088         hammer2_chain_unlock(&hmp->vchain);
1089         hammer2_chain_drop(&hmp->vchain);
1090
1091         error = 0;
1092
1093         /*
1094          * We can't safely flush the volume header until we have
1095          * flushed any device buffers which have built up.
1096          *
1097          * XXX this isn't being incremental
1098          */
1099         vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
1100         error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
1101         vn_unlock(hmp->devvp);
1102
1103         /*
1104          * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1105          * volume header needs synchronization via hmp->volsync.
1106          *
1107          * XXX synchronize the flag & data with only this flush XXX
1108          */
1109         if (error == 0 &&
1110             (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1111                 struct buf *bp;
1112
1113                 /*
1114                  * Synchronize the disk before flushing the volume
1115                  * header.
1116                  */
1117                 bp = getpbuf(NULL);
1118                 bp->b_bio1.bio_offset = 0;
1119                 bp->b_bufsize = 0;
1120                 bp->b_bcount = 0;
1121                 bp->b_cmd = BUF_CMD_FLUSH;
1122                 bp->b_bio1.bio_done = biodone_sync;
1123                 bp->b_bio1.bio_flags |= BIO_SYNC;
1124                 vn_strategy(hmp->devvp, &bp->b_bio1);
1125                 biowait(&bp->b_bio1, "h2vol");
1126                 relpbuf(bp, NULL);
1127
1128                 /*
1129                  * Then we can safely flush the version of the
1130                  * volume header synchronized by the flush code.
1131                  */
1132                 j = hmp->volhdrno + 1;
1133                 if (j >= HAMMER2_NUM_VOLHDRS)
1134                         j = 0;
1135                 if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1136                     hmp->volsync.volu_size) {
1137                         j = 0;
1138                 }
1139                 kprintf("sync volhdr %d %jd\n",
1140                         j, (intmax_t)hmp->volsync.volu_size);
1141                 bp = getblk(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1142                             HAMMER2_PBUFSIZE, 0, 0);
1143                 atomic_clear_int(&hmp->vchain.flags,
1144                                  HAMMER2_CHAIN_VOLUMESYNC);
1145                 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1146                 bawrite(bp);
1147                 hmp->volhdrno = j;
1148         }
1149         if (error)
1150                 total_error = error;
1151
1152         hammer2_trans_done(hmp->spmp);  /* spmp trans */
1153 skip:
1154         error = hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1155 }