/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 *			TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * The biggest issue is that each PFS may belong to a cluster so its media
 * modify_tid and mirror_tid fields are in a completely different domain
 * than the topology related to the super-root.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells how to recurse downward to find these chains.
 */
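
/*
 * Informal summary of the chain flags driving the flush (see hammer2.h
 * for the authoritative definitions):
 *
 *	MODIFIED - the chain's media data was modified and the chain itself
 *		   must be flushed.
 *	UPDATE	 - the parent's block table must be updated to point at the
 *		   chain's current media block.
 *	ONFLUSH	 - set on parents so the top-down scan recurses into them
 *		   to find the MODIFIED/UPDATE chains below.
 */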

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT	10	/* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	hammer2_chain_t	*parent;
	hammer2_trans_t	*trans;
	int		depth;
	int		diddeferral;
	int		cache_index;
	struct h2_flush_list flushq;
	hammer2_xid_t	sync_xid;	/* memory synchronization point */
	hammer2_chain_t	*debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
				hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_mount or
 * hammer2_pfsmount.
 */
static hammer2_trans_manage_t	tmanage;

void
hammer2_trans_manage_init(void)
{
	lockinit(&tmanage.translk, "h2trans", 0, 0);
	TAILQ_INIT(&tmanage.transq);
	tmanage.flush_xid = 1;
	tmanage.alloc_xid = tmanage.flush_xid + 1;
}

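/*
 * Allocate a new XID from the global manager.  XID 0 is never returned
 * (the loop simply retries across a counter wrap), leaving it free to
 * serve as an unset/reserved value.
 */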
hammer2_xid_t
hammer2_trans_newxid(hammer2_pfsmount_t *pmp __unused)
{
	hammer2_xid_t xid;

	for (;;) {
		xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
		if (xid)
			break;
	}
	return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfsmount_t *pmp, int flags)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;

	tman = &tmanage;

	bzero(trans, sizeof(*trans));
	trans->pmp = pmp;
	trans->flags = flags;
	trans->td = curthread;

	lockmgr(&tman->translk, LK_EXCLUSIVE);

	if (flags & HAMMER2_TRANS_ISFLUSH) {
		/*
		 * If multiple flushes are trying to run we have to
		 * wait until it is our turn.  All flushes are serialized.
		 *
		 * We queue ourselves and then wait to become the head
		 * of the queue, allowing all prior flushes to complete.
		 *
		 * Multiple normal transactions can share the current
		 * transaction id but a flush transaction needs its own
		 * unique TID for proper block table update accounting.
		 */
		++tman->flushcnt;
		++pmp->alloc_tid;
		pmp->flush_tid = pmp->alloc_tid;
		tman->flush_xid = hammer2_trans_newxid(pmp);
		trans->sync_xid = tman->flush_xid;
		++pmp->alloc_tid;
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		if (TAILQ_FIRST(&tman->transq) != trans) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid, &tman->translk,
					0, "h2multf", hz);
			}
		}
	} else if (tman->flushcnt == 0) {
		/*
		 * No flushes are pending, we can go.  Use prior flush_xid + 1.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
		trans->sync_xid = tman->flush_xid + 1;

		/* XXX improve/optimize inode allocation */
	} else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
		/*
		 * A buffer cache transaction is requested while a flush
		 * is in progress.  The flush's PREFLUSH flag must be set
		 * in this situation.
		 *
		 * The buffer cache flush takes on the main flush's
		 * transaction id.
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
		trans->flags |= HAMMER2_TRANS_PREFLUSH;
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;
		/* not allowed to block */
	} else {
		/*
		 * A normal transaction is requested while a flush is in
		 * progress.  We insert after the current flush and may
		 * block.
		 *
		 * WARNING!  Also see hammer2_chain_setflush()
		 */
		TAILQ_FOREACH(head, &tman->transq, entry) {
			if (head->flags & HAMMER2_TRANS_ISFLUSH)
				break;
		}
		KKASSERT(head);
		TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
		trans->sync_xid = head->sync_xid + 1;
		trans->flags |= HAMMER2_TRANS_CONCURRENT;

		/*
		 * XXX for now we must block new transactions, synchronous
		 * flush mode is on by default.
		 *
		 * If synchronous flush mode is enabled concurrent
		 * frontend transactions during the flush are not
		 * allowed (except we don't have a choice for buffer
		 * cache ops).
		 */
		if (hammer2_synchronous_flush > 0 ||
		    TAILQ_FIRST(&tman->transq) != head) {
			trans->blocked = 1;
			while (trans->blocked) {
				lksleep(&trans->sync_xid,
					&tman->translk, 0,
					"h2multf", hz);
			}
		}
	}
	if (flags & HAMMER2_TRANS_NEWINODE) {
		if (pmp->spmp_hmp) {
			/*
			 * Super-root transaction, all new inodes have an
			 * inode number of 1.  Normal pfs inode cache
			 * semantics are not used.
			 */
			trans->inode_tid = 1;
		} else {
			/*
			 * Normal transaction
			 */
			if (pmp->inode_tid < HAMMER2_INODE_START)
				pmp->inode_tid = HAMMER2_INODE_START;
			trans->inode_tid = pmp->inode_tid++;
		}
	}

	lockmgr(&tman->translk, LK_RELEASE);
}
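
/*
 * Illustrative sketch only (assumed usage, not part of this file's
 * control flow): a modifying frontend operation brackets its work in a
 * normal transaction,
 *
 *	hammer2_trans_t trans;
 *
 *	hammer2_trans_init(&trans, pmp, 0);
 *	...modify chains...
 *	hammer2_trans_done(&trans);
 *
 * while a flush passes HAMMER2_TRANS_ISFLUSH and is serialized against
 * all other flushes (see hammer2_flush() below).
 */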

/*
 * This may only be called while in a flush transaction.  It's a bit of a
 * hack but after flushing a PFS we need to flush each volume root as part
 * of the same transaction.
 */
void
hammer2_trans_spmp(hammer2_trans_t *trans, hammer2_pfsmount_t *spmp)
{
	++spmp->alloc_tid;
	spmp->flush_tid = spmp->alloc_tid;
	++spmp->alloc_tid;
	trans->pmp = spmp;
}


void
hammer2_trans_done(hammer2_trans_t *trans)
{
	hammer2_trans_manage_t *tman;
	hammer2_trans_t *head;
	hammer2_trans_t *scan;

	tman = &tmanage;

	/*
	 * Remove.
	 */
	lockmgr(&tman->translk, LK_EXCLUSIVE);
	TAILQ_REMOVE(&tman->transq, trans, entry);
	head = TAILQ_FIRST(&tman->transq);

	/*
	 * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
	 * up through the next flush.  (If the head is a flush then we
	 * stop there, unlike the unblock code following this section).
	 */
	if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
		--tman->flushcnt;
		scan = head;
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			atomic_clear_int(&scan->flags,
					 HAMMER2_TRANS_CONCURRENT);
			scan = TAILQ_NEXT(scan, entry);
		}
	}

	/*
	 * Unblock the head of the queue and any additional transactions
	 * up to the next flush.  The head can be a flush and it will be
	 * unblocked along with the non-flush transactions following it
	 * (which are allowed to run concurrently with it).
	 *
	 * In synchronous flush mode we stop if the head transaction is
	 * a flush.
	 */
	if (head && head->blocked) {
		head->blocked = 0;
		wakeup(&head->sync_xid);

		if (hammer2_synchronous_flush > 0)
			scan = head;
		else
			scan = TAILQ_NEXT(head, entry);
		while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
			if (scan->blocked) {
				scan->blocked = 0;
				wakeup(&scan->sync_xid);
			}
			scan = TAILQ_NEXT(scan, entry);
		}
	}
	lockmgr(&tman->translk, LK_RELEASE);
}
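
/*
 * Hypothetical queue example: with transq = [ F1, T2, T3, F4 ] (F = flush,
 * T = normal fs_op), completing F1 wakes T2 and T3 but leaves F4 and
 * everything after it blocked until T2/T3 are done.  If the new head is
 * itself a flush, synchronous flush mode wakes only that head, keeping
 * the transactions queued behind it blocked.
 */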

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications and
 * mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
	hammer2_chain_t *scan;
	hammer2_flush_info_t info;
	int loops;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flushq);
	info.trans = trans;
	info.sync_xid = trans->sync_xid;
	info.cache_index = -1;

	/*
	 * Calculate parent (can be NULL), if not NULL the flush core
	 * expects the parent to be referenced so it can easily lock/unlock
	 * it without it getting ripped up.
	 */
	if ((info.parent = chain->parent) != NULL)
		hammer2_chain_ref(info.parent);

	/*
	 * Extra ref needed because flush_core expects it when replacing
	 * chain.
	 */
	hammer2_chain_ref(chain);
	loops = 0;

	for (;;) {
		/*
		 * Unwind deep recursions which had been deferred.  This
		 * can leave the FLUSH_* bits set for these chains, which
		 * will be handled when we [re]flush chain after the unwind.
		 */
		while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flushq, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 *
			 * NOTE: hammer2_flush() may replace scan.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("deferred flush %p\n", scan);
			hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(scan);	/* ref from deferral */
			hammer2_flush(trans, scan);
			hammer2_chain_unlock(scan);
		}

		/*
		 * [re]flush chain.
		 */
		info.diddeferral = 0;
		hammer2_flush_core(&info, chain, 0);

		/*
		 * Only loop if deep recursions have been deferred.
		 */
		if (TAILQ_EMPTY(&info.flushq))
			break;

		if (++loops % 1000 == 0) {
			kprintf("hammer2_flush: excessive loops on %p\n",
				chain);
			if (hammer2_debug & 0x100000)
				Debugger("hell4");
		}
	}
	hammer2_chain_drop(chain);
	if (info.parent)
		hammer2_chain_drop(info.parent);
}
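
/*
 * Hypothetical caller sketch (assumed usage): the chain must be locked
 * and the caller must be inside a flush transaction, roughly what a
 * VFS_SYNC-style caller would do:
 *
 *	hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);
 *	hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
 *	hammer2_flush(&trans, chain);
 *	hammer2_chain_unlock(chain);
 *	hammer2_trans_done(&trans);
 */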

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 *			WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid and blockref.mirror_tid are consistent only within a
 * PFS.  This is why we cannot cache sync_tid in the transaction structure.
 * Instead we access it from the pmp.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
		   int deleting)
{
	hammer2_chain_t *parent;
	hammer2_mount_t *hmp;
	hammer2_pfsmount_t *pmp;
	int diddeferral;

	/*
	 * (1) Optimize downward recursion to locate nodes needing action.
	 *     Nothing to do if none of these flags are set.
	 */
	if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
		if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = chain;
		} else {
			return;
		}
	}

	hmp = chain->hmp;
	pmp = chain->pmp;		/* can be NULL */
	diddeferral = info->diddeferral;
	parent = info->parent;		/* can be NULL */

	/*
	 * mirror_tid should not be forward-indexed
	 */
	KKASSERT(pmp == NULL || chain->bref.mirror_tid <= pmp->flush_tid);

	/*
	 * Downward search recursion
	 */
	if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
		/*
		 * Already deferred.
		 */
		++info->diddeferral;
	} else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
		/*
		 * Recursion depth reached.
		 */
		hammer2_chain_ref(chain);
		TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		++info->diddeferral;
	} else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
		/*
		 * Downward recursion search (actual flush occurs bottom-up).
		 * pre-clear ONFLUSH.  It can get set again due to races,
		 * which we want so the scan finds us again in the next flush.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
		info->parent = chain;
		spin_lock(&chain->core.cst.spin);
		RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
			NULL, hammer2_flush_recurse, info);
		spin_unlock(&chain->core.cst.spin);
		info->parent = parent;
		if (info->diddeferral)
			hammer2_chain_setflush(info->trans, chain);
	}

	/*
	 * Now we are in the bottom-up part of the recursion.
	 *
	 * Do not update chain if lower layers were deferred.
	 */
	if (info->diddeferral)
		goto done;

	/*
	 * Propagate the DESTROY flag downwards.  This dummies up the flush
	 * code and tries to invalidate related buffer cache buffers to
	 * avoid the disk write.
	 */
	if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

	/*
	 * Chain was already modified or has become modified, flush it out.
	 */
again:
	if ((hammer2_debug & 0x200) &&
	    info->debug &&
	    (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
		hammer2_chain_t *scan = chain;

		kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
		while (scan) {
			kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
				scan, scan->flags,
				scan->bref.key, scan->bref.type);
			if (scan == info->debug)
				break;
			scan = scan->parent;
		}
	}

	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		/*
		 * Dispose of the modified bit.  UPDATE should already be
		 * set.
		 */
		KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
			 chain == &hmp->vchain);
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
		if (pmp) {
			hammer2_pfs_memory_wakeup(pmp);
			chain->bref.mirror_tid = pmp->flush_tid;
		}

		if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
		    chain == &hmp->vchain ||
		    chain == &hmp->fchain) {
			/*
			 * Drop the ref from the MODIFIED bit we cleared,
			 * net -1 ref.
			 */
			hammer2_chain_drop(chain);
		} else {
			/*
			 * Drop the ref from the MODIFIED bit we cleared and
			 * set a ref for the UPDATE bit we are setting.  Net
			 * 0 refs.
			 */
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		}

		/*
		 * Issue the flush.  This is indirect via the DIO.
		 *
		 * NOTE: A DELETED node that reaches this point must be
		 *	 flushed for synchronization point consistency.
		 *
		 * NOTE: Even though MODIFIED was already set, the related DIO
		 *	 might not be dirty due to a system buffer cache
		 *	 flush and must be set dirty if we are going to make
		 *	 further modifications to the buffer.  Chains with
		 *	 embedded data don't need this.
		 *
		 * Update bref.mirror_tid, clear MODIFIED, and set UPDATE.
		 */
		if (hammer2_debug & 0x1000) {
			kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
				"data=%016jx\n",
				chain, chain->bref.type,
				chain->bref.key, chain->bref.keybits,
				info->sync_xid,
				chain->bref.data_off);
		}
		if (hammer2_debug & 0x2000) {
			Debugger("Flush hell");
		}

		/*
		 * Update chain CRCs for flush.
		 *
		 * NOTE: Volume headers are NOT flushed here as they require
		 *	 special processing.
		 */
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_FREEMAP:
			/*
			 * (note: embedded data, do not call setdirty)
			 */
			KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			/*
			 * The free block table is flushed by hammer2_vfs_sync()
			 * before it flushes vchain.  We must still hold fchain
			 * locked while copying voldata to volsync, however.
			 *
			 * (note: embedded data, do not call setdirty)
			 */
			hammer2_voldata_lock(hmp);
			hammer2_chain_lock(&hmp->fchain,
					   HAMMER2_RESOLVE_ALWAYS);
			/*
			 * There is no parent to our root vchain and fchain to
			 * synchronize the bref to, their updated mirror_tid's
			 * must be synchronized to the volume header.
			 */
			hmp->voldata.mirror_tid = chain->bref.mirror_tid;
			hmp->voldata.freemap_tid = hmp->fchain.bref.mirror_tid;
			kprintf("mirror_tid %08jx\n",
				(intmax_t)chain->bref.mirror_tid);

			/*
			 * The volume header is flushed manually by the
			 * syncer, not here.  All we do here is adjust the
			 * crc's.
			 */
			KKASSERT(chain->data != NULL);
			KKASSERT(chain->dio == NULL);

			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC1_OFF,
					HAMMER2_VOLUME_ICRC1_SIZE);
			hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRC0_OFF,
					HAMMER2_VOLUME_ICRC0_SIZE);
			hmp->voldata.icrc_volheader =
				hammer2_icrc32(
					(char *)&hmp->voldata +
					 HAMMER2_VOLUME_ICRCVH_OFF,
					HAMMER2_VOLUME_ICRCVH_SIZE);
			hmp->volsync = hmp->voldata;
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
			hammer2_chain_unlock(&hmp->fchain);
			hammer2_voldata_unlock(hmp);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Data elements have already been flushed via the
			 * logical file buffer cache.  Their hash was set in
			 * the bref by the vop_write code.  Do not re-dirty.
			 *
			 * Make sure any device buffer(s) have been flushed
			 * out here (there aren't usually any to flush) XXX.
			 */
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
		case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
			/*
			 * Buffer I/O will be cleaned up when the volume is
			 * flushed (but the kernel is free to flush it before
			 * then, as well).
			 */
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * NOTE: We must call io_setdirty() to make any late
			 *	 changes to the inode data, the system might
			 *	 have already flushed the buffer.
			 */
			if (chain->data->ipdata.op_flags &
			    HAMMER2_OPFLAG_PFSROOT) {
				/*
				 * non-NULL pmp if mounted as a PFS.  We must
				 * sync fields cached in the pmp? XXX
				 */
				hammer2_inode_data_t *ipdata;

				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				if (pmp)
					ipdata->pfs_inum = pmp->inode_tid;
			} else {
				/* can't be mounted as a PFS */
			}

			/*
			 * Update inode statistics.  Pending stats in chain
			 * are cleared out on UPDATE so expect that bit to
			 * be set here too or the statistics will not be
			 * rolled-up properly.
			 */
			if (chain->data_count || chain->inode_count) {
				hammer2_inode_data_t *ipdata;

				KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
				hammer2_io_setdirty(chain->dio);
				ipdata = &chain->data->ipdata;
				ipdata->data_count += chain->data_count;
				ipdata->inode_count += chain->inode_count;
			}
			KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
			hammer2_chain_setcheck(chain, chain->data);
			break;
		default:
			KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
			panic("hammer2_flush_core: unsupported "
			      "embedded bref %d",
			      chain->bref.type);
			/* NOT REACHED */
		}

		/*
		 * If the chain was destroyed try to avoid unnecessary I/O.
		 * (this only really works if the DIO system buffer is the
		 * same size as chain->bytes).
		 */
		if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
			hammer2_io_setinval(chain->dio, chain->bytes);
		}
	}

	/*
	 * If UPDATE is set the parent block table may need to be updated.
	 *
	 * NOTE: UPDATE may be set on vchain or fchain in which case
	 *	 parent could be NULL.  It's easiest to allow the case
	 *	 and test for NULL.  parent can also wind up being NULL
	 *	 due to a deletion so we need to handle the case anyway.
	 *
	 * If no parent exists we can just clear the UPDATE bit.  If the
	 * chain gets reattached later on the bit will simply get set
	 * again.
	 */
	if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
		hammer2_chain_drop(chain);
	}

	/*
	 * The chain may need its blockrefs updated in the parent.  This
	 * requires some fancy footwork.
	 */
	if (chain->flags & HAMMER2_CHAIN_UPDATE) {
		hammer2_blockref_t *base;
		int count;

		/*
		 * Both parent and chain must be locked.  This requires
		 * temporarily unlocking the chain.  We have to deal with
		 * the case where the chain might be reparented or modified
		 * while it was unlocked.
		 */
		hammer2_chain_unlock(chain);
		hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
		hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
		if (chain->parent != parent) {
			kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
				chain, chain->parent, parent);
			hammer2_chain_unlock(parent);
			goto done;
		}

		/*
		 * Check race condition.  If someone got in and modified
		 * it again while it was unlocked, we have to loop up.
		 */
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			hammer2_chain_unlock(parent);
			kprintf("hammer2_flush: chain %p flush-mod race\n",
				chain);
			goto again;
		}

		/*
		 * Clear UPDATE flag
		 */
		if (chain->flags & HAMMER2_CHAIN_UPDATE) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
			hammer2_chain_drop(chain);
		}
		hammer2_chain_modify(info->trans, parent, 0);

		/*
		 * Calculate blockmap pointer
		 */
		switch(parent->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Access the inode's block array.  However, there is
			 * no block array if the inode is flagged DIRECTDATA.
			 */
			if (parent->data &&
			    (parent->data->ipdata.op_flags &
			     HAMMER2_OPFLAG_DIRECTDATA) == 0) {
				base = &parent->data->
					ipdata.u.blockset.blockref[0];
			} else {
				base = NULL;
			}
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
		case HAMMER2_BREF_TYPE_FREEMAP_NODE:
			if (parent->data)
				base = &parent->data->npdata[0];
			else
				base = NULL;
			count = parent->bytes / sizeof(hammer2_blockref_t);
			break;
		case HAMMER2_BREF_TYPE_VOLUME:
			base = &chain->hmp->voldata.sroot_blockset.blockref[0];
			count = HAMMER2_SET_COUNT;
			break;
		case HAMMER2_BREF_TYPE_FREEMAP:
			base = &parent->data->npdata[0];
			count = HAMMER2_SET_COUNT;
			break;
		default:
			base = NULL;
			count = 0;
			panic("hammer2_flush_core: "
			      "unrecognized blockref type: %d",
			      parent->bref.type);
		}

		/*
		 * Blocktable updates
		 *
		 * We synchronize pending statistics at this time.  Delta
		 * adjustments designated for the current and upper level
		 * are synchronized.
		 */
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
			if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
				hammer2_base_delete(info->trans, parent,
						    base, count,
						    &info->cache_index, chain);
				/* base_delete clears both bits */
			} else {
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_BMAPUPD);
			}
		}
		if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
			parent->data_count += chain->data_count +
					      chain->data_count_up;
			parent->inode_count += chain->inode_count +
					       chain->inode_count_up;
			chain->data_count = 0;
			chain->inode_count = 0;
			chain->data_count_up = 0;
			chain->inode_count_up = 0;
			hammer2_base_insert(info->trans, parent,
					    base, count,
					    &info->cache_index, chain);
			/* base_insert sets BMAPPED */
		}
		hammer2_chain_unlock(parent);
	}

	/*
	 * Final cleanup after flush
	 */
done:
	KKASSERT(chain->refs > 1);
	KKASSERT(pmp == NULL ||
		 chain->bref.mirror_tid <= chain->pmp->flush_tid);
	if (hammer2_debug & 0x200) {
		if (info->debug == chain)
			info->debug = NULL;
	}
}

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_tid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move a child from the rbtree to the dbtree or dbq but the
 * caller's flush scan order prevents any chains from being lost.  A child
 * can be executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *	    bref.mirror_tid ourselves to indicate that the flush has
 *	    processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *	    not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
	hammer2_flush_info_t *info = data;
	/*hammer2_trans_t *trans = info->trans;*/
	hammer2_chain_t *parent = info->parent;

	/*
	 * (child can never be fchain or vchain so a special check isn't
	 *  needed).
	 *
	 * We must ref the child before unlocking the spinlock.
	 *
	 * The caller has added a ref to the parent so we can temporarily
	 * unlock it in order to lock the child.
	 */
	hammer2_chain_ref(child);
	spin_unlock(&parent->core.cst.spin);

	hammer2_chain_unlock(parent);
	hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

	/*
	 * Never recurse across a mounted PFS boundary.
	 *
	 * Recurse and collect deferral data.
	 */
	if ((child->flags & HAMMER2_CHAIN_PFSBOUNDARY) == 0 ||
	    child->pmp == NULL) {
		if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
		} else if (hammer2_debug & 0x200) {
			if (info->debug == NULL)
				info->debug = child;
			++info->depth;
			hammer2_flush_core(info, child, 0); /* XXX deleting */
			--info->depth;
			if (info->debug == child)
				info->debug = NULL;
		}
	}

	/*
	 * Relock to continue the loop
	 */
	hammer2_chain_unlock(child);
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_drop(child);
	KKASSERT(info->parent == parent);
	spin_lock(&parent->core.cst.spin);

	return (0);
}