hammer2 - Retool flushing and use of mirror_tid, more cluster work.
sys/vfs/hammer2/hammer2_flush.c
/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 *                      TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells the scan how to recurse downward to find these chains.
 */
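
/*
 * A rough sketch of the resulting control flow (illustrative only, not
 * literal code; see hammer2_flush() and hammer2_flush_core() below for
 * the real implementation):
 *
 *      flush(chain):
 *              if (chain->flags & ONFLUSH)             (top-down search)
 *                      for each child of chain:
 *                              flush(child)
 *              if (chain->flags & MODIFIED)            (bottom-up flush)
 *                      write the chain out, clear MODIFIED
 *              if (chain->flags & UPDATE)
 *                      update the parent's block table
 */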

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT       10      /* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
        int             cache_index;
        struct h2_flush_list flushq;
        hammer2_xid_t   sync_xid;       /* memory synchronization point */
        hammer2_tid_t   mirror_tid;     /* avoid digging through hmp */
        hammer2_tid_t   modify_tid;
        hammer2_chain_t *debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_dev or
 * hammer2_pfs.
 */
static hammer2_trans_manage_t   tmanage;

void
hammer2_trans_manage_init(void)
{
        lockinit(&tmanage.translk, "h2trans", 0, 0);
        TAILQ_INIT(&tmanage.transq);
        tmanage.flush_xid = 1;
        tmanage.alloc_xid = tmanage.flush_xid + 1;
}

hammer2_xid_t
hammer2_trans_newxid(hammer2_pfs_t *pmp __unused)
{
        hammer2_xid_t xid;

        for (;;) {
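                /* retry if the counter wrapped to 0; 0 is never a valid xid */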
                xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
                if (xid)
                        break;
        }
        return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;

        tman = &tmanage;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        trans->flags = flags;
        trans->td = curthread;

        lockmgr(&tman->translk, LK_EXCLUSIVE);

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn.  All flushes are serialized.
                 *
                 * We queue ourselves and then wait to become the head
                 * of the queue, allowing all prior flushes to complete.
                 *
                 * Multiple normal transactions can share the current
                 * transaction id but a flush transaction needs its own
                 * unique TID for proper block table update accounting.
                 */
                ++tman->flushcnt;
                ++pmp->modify_tid;
                tman->flush_xid = hammer2_trans_newxid(pmp);
                trans->sync_xid = tman->flush_xid;
                trans->modify_tid = pmp->modify_tid;
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                if (TAILQ_FIRST(&tman->transq) != trans) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        } else if (tman->flushcnt == 0) {
                /*
                 * No flushes are pending, we can go.  Use prior flush_xid + 1.
                 *
                 * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                trans->sync_xid = tman->flush_xid + 1;

                /* XXX improve/optimize inode allocation */
        } else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
                /*
                 * A buffer cache transaction is requested while a flush
                 * is in progress.  The flush's PREFLUSH flag must be set
                 * in this situation.
                 *
                 * The buffer cache flush takes on the main flush's
                 * transaction id.
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
                trans->flags |= HAMMER2_TRANS_PREFLUSH;
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid;
                trans->modify_tid = head->modify_tid;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;
                /* not allowed to block */
        } else {
                /*
                 * A normal transaction is requested while a flush is in
                 * progress.  We insert after the current flush and may
                 * block.
                 *
                 * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid + 1;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;

                /*
                 * XXX for now we must block new transactions, synchronous
                 * flush mode is on by default.
                 *
                 * If synchronous flush mode is enabled concurrent
                 * frontend transactions during the flush are not
                 * allowed (except we don't have a choice for buffer
                 * cache ops).
                 */
                if (hammer2_synchronous_flush > 0 ||
                    TAILQ_FIRST(&tman->transq) != head) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        }
        if (flags & HAMMER2_TRANS_NEWINODE) {
                if (pmp->spmp_hmp) {
                        /*
                         * Super-root transaction, all new inodes have an
                         * inode number of 1.  Normal pfs inode cache
                         * semantics are not used.
                         */
                        trans->inode_tid = 1;
                } else {
                        /*
                         * Normal transaction
                         */
                        if (pmp->inode_tid < HAMMER2_INODE_START)
                                pmp->inode_tid = HAMMER2_INODE_START;
                        trans->inode_tid = pmp->inode_tid++;
                }
        }

        lockmgr(&tman->translk, LK_RELEASE);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;
        hammer2_trans_t *scan;

        tman = &tmanage;

        /*
         * Remove.
         */
        lockmgr(&tman->translk, LK_EXCLUSIVE);
        TAILQ_REMOVE(&tman->transq, trans, entry);
        head = TAILQ_FIRST(&tman->transq);

        /*
         * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
         * up through the next flush.  (If the head is a flush then we
         * stop there, unlike the unblock code following this section).
         */
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --tman->flushcnt;
                scan = head;
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        atomic_clear_int(&scan->flags,
                                         HAMMER2_TRANS_CONCURRENT);
                        scan = TAILQ_NEXT(scan, entry);
                }
        }

        /*
         * Unblock the head of the queue and any additional transactions
         * up to the next flush.  The head can be a flush and it will be
         * unblocked along with the non-flush transactions following it
         * (which are allowed to run concurrently with it).
         *
         * In synchronous flush mode we stop if the head transaction is
         * a flush.
         */
        if (head && head->blocked) {
                head->blocked = 0;
                wakeup(&head->sync_xid);

                if (hammer2_synchronous_flush > 0)
                        scan = head;
                else
                        scan = TAILQ_NEXT(head, entry);
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        if (scan->blocked) {
                                scan->blocked = 0;
                                wakeup(&scan->sync_xid);
                        }
                        scan = TAILQ_NEXT(scan, entry);
                }
        }
        lockmgr(&tman->translk, LK_RELEASE);
}
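
/*
 * Usage sketch (hypothetical caller; a flags value of 0 denotes a normal
 * non-flush transaction as handled above):
 *
 *      hammer2_trans_t trans;
 *
 *      hammer2_trans_init(&trans, pmp, 0);
 *      ... perform modifying chain operations ...
 *      hammer2_trans_done(&trans);
 *
 * A flush instead passes HAMMER2_TRANS_ISFLUSH and is serialized against
 * other flushes by the queueing logic above.
 */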

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_flush_info_t info;
        int loops;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flushq);
        info.trans = trans;
        info.sync_xid = trans->sync_xid;
        info.cache_index = -1;

        /*
         * Calculate parent (can be NULL), if not NULL the flush core
         * expects the parent to be referenced so it can easily lock/unlock
         * it without it getting ripped up.
         */
        if ((info.parent = chain->parent) != NULL)
                hammer2_chain_ref(info.parent);

        /*
         * Extra ref needed because flush_core expects it when replacing
         * chain.
         */
        hammer2_chain_ref(chain);
        loops = 0;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave the FLUSH_* bits set for these chains, which
                 * will be handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flushq, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         *
                         * NOTE: hammer2_flush() may replace scan.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * [re]flush chain.
                 */
                info.diddeferral = 0;
                hammer2_flush_core(&info, chain, 0);

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flushq))
                        break;

                if (++loops % 1000 == 0) {
                        kprintf("hammer2_flush: excessive loops on %p\n",
                                chain);
                        if (hammer2_debug & 0x100000)
                                Debugger("hell4");
                }
        }
        hammer2_chain_drop(chain);
        if (info.parent)
                hammer2_chain_drop(info.parent);
}
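
/*
 * Usage sketch (hypothetical, modeled on the VFS_SYNC path described
 * above; exact callers may differ):
 *
 *      hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);
 *      hammer2_chain_ref(chain);
 *      hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
 *      hammer2_flush(&trans, chain);
 *      hammer2_chain_unlock(chain);
 *      hammer2_chain_drop(chain);
 *      hammer2_trans_done(&trans);
 */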

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid is consistent only within a PFS, and will not be
 * consistent during synchronization.  mirror_tid is consistent across the
 * block device regardless of the PFS.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                   int deleting)
{
        hammer2_chain_t *parent;
        hammer2_dev_t *hmp;
        int diddeferral;

        /*
         * (1) Optimize downward recursion to locate nodes needing action.
         *     Nothing to do if none of these flags are set.
         */
        if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
                if (hammer2_debug & 0x200) {
                        if (info->debug == NULL)
                                info->debug = chain;
                } else {
                        return;
                }
        }

        hmp = chain->hmp;
        diddeferral = info->diddeferral;
        parent = info->parent;          /* can be NULL */

        /*
         * Downward search recursion
         */
        if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
                /*
                 * Already deferred.
                 */
                ++info->diddeferral;
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                /*
                 * Recursion depth reached.
                 */
                hammer2_chain_ref(chain);
                TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
                ++info->diddeferral;
        } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
                /*
                 * Downward recursion search (actual flush occurs bottom-up).
                 * pre-clear ONFLUSH.  It can get set again due to races,
                 * which we want so the scan finds us again in the next flush.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
                info->parent = chain;
                hammer2_spin_ex(&chain->core.spin);
                RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
                        NULL, hammer2_flush_recurse, info);
                hammer2_spin_unex(&chain->core.spin);
                info->parent = parent;
                if (info->diddeferral)
                        hammer2_chain_setflush(info->trans, chain);
        }

        /*
         * Now we are in the bottom-up part of the recursion.
         *
         * Do not update chain if lower layers were deferred.
         */
        if (info->diddeferral)
                goto done;

        /*
         * Propagate the DESTROY flag downwards.  This dummies up the flush
         * code and tries to invalidate related buffer cache buffers to
         * avoid the disk write.
         */
        if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

        /*
         * Chain was already modified or has become modified, flush it out.
         */
again:
        if ((hammer2_debug & 0x200) &&
            info->debug &&
            (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
                hammer2_chain_t *scan = chain;

                kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
                while (scan) {
                        kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
                                scan, scan->flags,
                                scan->bref.key, scan->bref.type);
                        if (scan == info->debug)
                                break;
                        scan = scan->parent;
                }
        }

        if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                /*
                 * Dispose of the modified bit.
                 *
                 * UPDATE should already be set.
                 * bref.mirror_tid should already be set.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                         chain == &hmp->vchain);
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

                /*
                 * Manage threads waiting for excessive dirty memory to
                 * be retired.
                 */
                if (chain->pmp)
                        hammer2_pfs_memory_wakeup(chain->pmp);

                if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                    chain == &hmp->vchain ||
                    chain == &hmp->fchain) {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared,
                         * net -1 ref.
                         */
                        hammer2_chain_drop(chain);
                } else {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared and
                         * set a ref for the UPDATE bit we are setting.  Net
                         * 0 refs.
                         */
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }

                /*
                 * Issue the flush.  This is indirect via the DIO.
                 *
                 * NOTE: A DELETED node that reaches this point must be
                 *       flushed for synchronization point consistency.
                 *
                 * NOTE: Even though MODIFIED was already set, the related DIO
                 *       might not be dirty due to a system buffer cache
                 *       flush and must be set dirty if we are going to make
                 *       further modifications to the buffer.  Chains with
                 *       embedded data don't need this.
                 */
                if (hammer2_debug & 0x1000) {
                        kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
                                "data=%016jx\n",
                                chain, chain->bref.type,
                                chain->bref.key, chain->bref.keybits,
                                info->sync_xid,
                                chain->bref.data_off);
                }
                if (hammer2_debug & 0x2000) {
                        Debugger("Flush hell");
                }

                /*
                 * Update chain CRCs for flush.
                 *
                 * NOTE: Volume headers are NOT flushed here as they require
                 *       special processing.
                 */
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_FREEMAP:
                        /*
                         * Update the volume header's freemap_tid to the
                         * freemap's flushing mirror_tid.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
                        KKASSERT(chain == &hmp->fchain);
                        hmp->voldata.freemap_tid = chain->bref.mirror_tid;
                        kprintf("sync freemap mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * The freemap can be flushed independently of the
                         * main topology, but for the case where it is
                         * flushed in the same transaction, and flushed
                         * before vchain (a case we want to allow for
                         * performance reasons), make sure modifications
                         * made during the flush under vchain use a new
                         * transaction id.
                         *
                         * Otherwise the mount recovery code will get confused.
                         */
                        ++hmp->voldata.mirror_tid;
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        /*
                         * The free block table is flushed by
                         * hammer2_vfs_sync() before it flushes vchain.
                         * We must still hold fchain locked while copying
                         * voldata to volsync, however.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        hammer2_voldata_lock(hmp);
                        hammer2_chain_lock(&hmp->fchain,
                                           HAMMER2_RESOLVE_ALWAYS);
                        kprintf("sync volume  mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * Update the volume header's mirror_tid to the
                         * main topology's flushing mirror_tid.  It is
                         * possible that voldata.mirror_tid is already
                         * beyond bref.mirror_tid due to the bump we made
                         * above in BREF_TYPE_FREEMAP.
                         */
                        if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
                                hmp->voldata.mirror_tid =
                                        chain->bref.mirror_tid;
                        }

                        /*
                         * The volume header is flushed manually by the
                         * syncer, not here.  All we do here is adjust the
                         * crc's.
                         */
                        KKASSERT(chain->data != NULL);
                        KKASSERT(chain->dio == NULL);

                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC1_OFF,
                                        HAMMER2_VOLUME_ICRC1_SIZE);
                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC0_OFF,
                                        HAMMER2_VOLUME_ICRC0_SIZE);
                        hmp->voldata.icrc_volheader =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRCVH_OFF,
                                        HAMMER2_VOLUME_ICRCVH_SIZE);

                        kprintf("syncvolhdr %016jx %016jx\n",
                                hmp->voldata.mirror_tid,
                                hmp->vchain.bref.mirror_tid);
                        hmp->volsync = hmp->voldata;
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                        hammer2_chain_unlock(&hmp->fchain);
                        hammer2_voldata_unlock(hmp);
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        /*
                         * Data elements have already been flushed via the
                         * logical file buffer cache.  Their hash was set in
                         * the bref by the vop_write code.  Do not re-dirty.
                         *
                         * Make sure any device buffer(s) have been flushed
                         * out here (there aren't usually any to flush) XXX.
                         */
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                        /*
                         * Buffer I/O will be cleaned up when the volume is
                         * flushed (but the kernel is free to flush it before
                         * then, as well).
                         */
                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * NOTE: We must call io_setdirty() to make any late
                         *       changes to the inode data, the system might
                         *       have already flushed the buffer.
                         */
                        if (chain->data->ipdata.op_flags &
                            HAMMER2_OPFLAG_PFSROOT) {
                                /*
                                 * non-NULL pmp if mounted as a PFS.  We must
                                 * sync fields cached in the pmp? XXX
                                 */
                                hammer2_inode_data_t *ipdata;

                                hammer2_io_setdirty(chain->dio);
                                ipdata = &chain->data->ipdata;
                                if (chain->pmp) {
                                        ipdata->pfs_inum =
                                                chain->pmp->inode_tid;
                                }
                        } else {
                                /* can't be mounted as a PFS */
                        }

                        /*
                         * Update inode statistics.  Pending stats in chain
                         * are cleared out on UPDATE so expect that bit to
                         * be set here too or the statistics will not be
                         * rolled-up properly.
                         */
                        if (chain->data_count || chain->inode_count) {
                                hammer2_inode_data_t *ipdata;

                                KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
                                hammer2_io_setdirty(chain->dio);
                                ipdata = &chain->data->ipdata;
                                ipdata->data_count += chain->data_count;
                                ipdata->inode_count += chain->inode_count;
                        }
                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                default:
                        KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                        panic("hammer2_flush_core: unsupported "
                              "embedded bref %d",
                              chain->bref.type);
                        /* NOT REACHED */
                }

                /*
                 * If the chain was destroyed try to avoid unnecessary I/O.
                 * (this only really works if the DIO system buffer is the
                 * same size as chain->bytes).
                 */
                if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
                        hammer2_io_setinval(chain->dio, chain->bytes);
                }
        }

        /*
         * If UPDATE is set the parent block table may need to be updated.
         *
         * NOTE: UPDATE may be set on vchain or fchain in which case
         *       parent could be NULL.  It's easiest to allow the case
         *       and test for NULL.  parent can also wind up being NULL
         *       due to a deletion so we need to handle the case anyway.
         *
         * If no parent exists we can just clear the UPDATE bit.  If the
         * chain gets reattached later on the bit will simply get set
         * again.
         */
        if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                hammer2_chain_drop(chain);
        }

        /*
         * The chain may need its blockrefs updated in the parent.  This
         * requires some fancy footwork.
         */
        if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                hammer2_blockref_t *base;
                int count;

                /*
                 * Both parent and chain must be locked.  This requires
                 * temporarily unlocking the chain.  We have to deal with
                 * the case where the chain might be reparented or modified
                 * while it was unlocked.
                 */
                hammer2_chain_unlock(chain);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
                if (chain->parent != parent) {
                        kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
                                chain, chain->parent, parent);
                        hammer2_chain_unlock(parent);
                        goto done;
                }

                /*
                 * Check race condition.  If someone got in and modified
                 * it again while it was unlocked, we have to loop up.
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        hammer2_chain_unlock(parent);
                        kprintf("hammer2_flush: chain %p flush-mod race\n",
                                chain);
                        goto again;
                }

                /*
                 * Clear UPDATE flag
                 */
                if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                        hammer2_chain_drop(chain);
                }
                hammer2_chain_modify(info->trans, parent, 0);

                /*
                 * Calculate blockmap pointer
                 */
                switch(parent->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * Access the inode's block array.  However, there is
                         * no block array if the inode is flagged DIRECTDATA.
                         */
                        if (parent->data &&
                            (parent->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
                                base = &parent->data->
                                        ipdata.u.blockset.blockref[0];
                        } else {
                                base = NULL;
                        }
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                        if (parent->data)
                                base = &parent->data->npdata[0];
                        else
                                base = NULL;
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        base = &chain->hmp->voldata.sroot_blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP:
                        base = &parent->data->npdata[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                default:
                        base = NULL;
                        count = 0;
                        panic("hammer2_flush_core: "
                              "unrecognized blockref type: %d",
                              parent->bref.type);
                }

                /*
                 * Blocktable updates
                 *
                 * We synchronize pending statistics at this time.  Delta
                 * adjustments designated for the current and upper level
                 * are synchronized.
                 */
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
                        if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
                                hammer2_base_delete(info->trans, parent,
                                                    base, count,
                                                    &info->cache_index, chain);
                                /* base_delete clears both bits */
                        } else {
                                atomic_clear_int(&chain->flags,
                                                 HAMMER2_CHAIN_BMAPUPD);
                        }
                }
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
                        parent->data_count += chain->data_count +
                                              chain->data_count_up;
                        parent->inode_count += chain->inode_count +
                                               chain->inode_count_up;
                        chain->data_count = 0;
                        chain->inode_count = 0;
                        chain->data_count_up = 0;
                        chain->inode_count_up = 0;
                        hammer2_base_insert(info->trans, parent,
                                            base, count,
                                            &info->cache_index, chain);
                        /* base_insert sets BMAPPED */
                }
                hammer2_chain_unlock(parent);
        }

        /*
         * Final cleanup after flush
         */
done:
        KKASSERT(chain->refs > 0);
        if (hammer2_debug & 0x200) {
                if (info->debug == chain)
                        info->debug = NULL;
        }
}
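
/*
 * Summary of the flag/ref accounting performed by hammer2_flush_core()
 * (descriptive only, derived from the code above):
 *
 *      MODIFIED - chain content is dirty; holds a ref.  Cleared by the
 *                 flush, which either drops the ref or converts it into
 *                 UPDATE's ref.
 *      UPDATE   - the parent's block table needs this chain's bref; also
 *                 holds a ref, dropped when the bit is cleared.
 *      ONFLUSH  - top-down search hint; pre-cleared before the downward
 *                 recursion and re-set via hammer2_chain_setflush() when
 *                 children had to be deferred.
 */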

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_xid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *          bref.mirror_tid ourselves to indicate that the flush has
 *          processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *          not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        /*hammer2_trans_t *trans = info->trans;*/
        hammer2_chain_t *parent = info->parent;

        /*
         * (child can never be fchain or vchain so a special check isn't
         *  needed).
         *
         * We must ref the child before unlocking the spinlock.
         *
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.
         */
        hammer2_chain_ref(child);
        hammer2_spin_unex(&parent->core.spin);

        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        /*
         * Recurse and collect deferral data.  We're in the media flush,
         * which can cross PFS boundaries.
         */
        if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
                ++info->depth;
                hammer2_flush_core(info, child, 0); /* XXX deleting */
                --info->depth;
        } else if (hammer2_debug & 0x200) {
                if (info->debug == NULL)
                        info->debug = child;
                ++info->depth;
                hammer2_flush_core(info, child, 0); /* XXX deleting */
                --info->depth;
                if (info->debug == child)
                        info->debug = NULL;
        }

        /*
         * Relock to continue the loop
         */
        hammer2_chain_unlock(child);
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        hammer2_chain_drop(child);
        KKASSERT(info->parent == parent);
        hammer2_spin_ex(&parent->core.spin);

        return (0);
}