hammer2 - Refactor frontend part 3/many
sys/vfs/hammer2/hammer2_flush.c
/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 *                      TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells the scan how to recurse downward to find these chains.
 */
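
/*
 * Illustrative sketch of the scan strategy (toy types; toy_node,
 * TOY_ONFLUSH and toy_write_back() are made up for the example and are
 * not part of this filesystem).  An ONFLUSH-style bit prunes the
 * top-down descent while the actual write-back happens bottom-up on
 * the way back out:
 *
 *	struct toy_node {
 *		int		flags;		// TOY_ONFLUSH etc
 *		struct toy_node	*first;		// first child
 *		struct toy_node	*next;		// next sibling
 *	};
 *
 *	static void
 *	toy_flush(struct toy_node *node)
 *	{
 *		struct toy_node *child;
 *
 *		if ((node->flags & TOY_ONFLUSH) == 0)
 *			return;			// nothing dirty at/below
 *		node->flags &= ~TOY_ONFLUSH;
 *		for (child = node->first; child; child = child->next)
 *			toy_flush(child);	// top-down descent
 *		toy_write_back(node);		// bottom-up flush
 *	}
 */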

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT       10      /* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
        int             cache_index;
        struct h2_flush_list flushq;
        hammer2_xid_t   sync_xid;       /* memory synchronization point */
        hammer2_chain_t *debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain, int istop);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_dev or
 * hammer2_pfs.
 */
void
hammer2_trans_manage_init(hammer2_trans_manage_t *tman)
{
        lockinit(&tman->translk, "h2trans", 0, 0);
        TAILQ_INIT(&tman->transq);
        tman->flush_xid = 1;
        tman->alloc_xid = tman->flush_xid + 1;
}

hammer2_xid_t
hammer2_trans_newxid(hammer2_pfs_t *pmp)
{
        hammer2_xid_t xid;

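        /*
         * (note: an XID of 0 is treated as invalid/unset; the retry loop
         *  below skips it when the 32-bit alloc_xid counter wraps around)
         */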
        for (;;) {
                xid = atomic_fetchadd_int(&pmp->tmanage.alloc_xid, 1);
                if (xid)
                        break;
        }
        return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;

        tman = &pmp->tmanage;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        trans->flags = flags;
        trans->td = curthread;

        lockmgr(&tman->translk, LK_EXCLUSIVE);

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn.  All flushes are serialized.
                 *
                 * We queue ourselves and then wait to become the head
                 * of the queue, allowing all prior flushes to complete.
                 *
                 * Multiple normal transactions can share the current
                 * transaction id but a flush transaction needs its own
                 * unique TID for proper block table update accounting.
                 */
                ++tman->flushcnt;
                ++pmp->modify_tid;
                tman->flush_xid = hammer2_trans_newxid(pmp);
                trans->sync_xid = tman->flush_xid;
                trans->modify_tid = pmp->modify_tid;
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                if (TAILQ_FIRST(&tman->transq) != trans) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        } else if (tman->flushcnt == 0) {
                /*
                 * No flushes are pending, we can go.  Use prior flush_xid + 1.
                 *
                 * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                trans->sync_xid = tman->flush_xid + 1;

                /* XXX improve/optimize inode allocation */
        } else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
                /*
                 * A buffer cache transaction is requested while a flush
                 * is in progress.  The flush's PREFLUSH flag must be set
                 * in this situation.
                 *
                 * The buffer cache flush takes on the main flush's
                 * transaction id.
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
                trans->flags |= HAMMER2_TRANS_PREFLUSH;
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid;
                trans->modify_tid = head->modify_tid;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;
                /* not allowed to block */
        } else {
                /*
                 * A normal transaction is requested while a flush is in
                 * progress.  We insert after the current flush and may
                 * block.
                 *
                 * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid + 1;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;

                /*
                 * XXX for now we must block new transactions, synchronous
                 * flush mode is on by default.
                 *
                 * If synchronous flush mode is enabled concurrent
                 * frontend transactions during the flush are not
                 * allowed (except we don't have a choice for buffer
                 * cache ops).
                 */
                if (hammer2_synchronous_flush > 0 ||
                    TAILQ_FIRST(&tman->transq) != head) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        }
        if (flags & HAMMER2_TRANS_NEWINODE) {
                if (pmp->spmp_hmp) {
                        /*
                         * Super-root transaction, all new inodes have an
                         * inode number of 1.  Normal pfs inode cache
                         * semantics are not used.
                         */
                        trans->inode_tid = 1;
                } else {
                        /*
                         * Normal transaction
                         */
                        if (pmp->inode_tid < HAMMER2_INODE_START)
                                pmp->inode_tid = HAMMER2_INODE_START;
                        trans->inode_tid = pmp->inode_tid++;
                }
        }

        lockmgr(&tman->translk, LK_RELEASE);
}
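
/*
 * Illustrative usage sketch (assumed caller, not part of this file):
 * frontend operations bracket their modifications with a normal
 * transaction, while a flush passes HAMMER2_TRANS_ISFLUSH and is
 * serialized against other flushes by hammer2_trans_init().
 *
 *	hammer2_trans_t trans;
 *
 *	hammer2_trans_init(&trans, pmp, 0);	// normal fs_op
 *	// ...modify chains...
 *	hammer2_trans_done(&trans);
 *
 *	hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);
 *	hammer2_flush(&trans, chain, 1);	// see hammer2_flush() below
 *	hammer2_trans_done(&trans);
 */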

void
hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;

        tman = &pmp->tmanage;
        lockmgr(&tman->translk, LK_EXCLUSIVE);
        if (tman->flushcnt) {
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
        }
        lockmgr(&tman->translk, LK_RELEASE);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;
        hammer2_trans_t *scan;

        tman = &trans->pmp->tmanage;

        /*
         * Remove.
         */
        lockmgr(&tman->translk, LK_EXCLUSIVE);
        TAILQ_REMOVE(&tman->transq, trans, entry);
        head = TAILQ_FIRST(&tman->transq);

        /*
         * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
         * up through the next flush.  (If the head is a flush then we
         * stop there, unlike the unblock code following this section).
         */
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --tman->flushcnt;
                scan = head;
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        atomic_clear_int(&scan->flags,
                                         HAMMER2_TRANS_CONCURRENT);
                        scan = TAILQ_NEXT(scan, entry);
                }
        }

        /*
         * Unblock the head of the queue and any additional transactions
         * up to the next flush.  The head can be a flush and it will be
         * unblocked along with the non-flush transactions following it
         * (which are allowed to run concurrently with it).
         *
         * In synchronous flush mode we stop if the head transaction is
         * a flush.
         */
        if (head && head->blocked) {
                head->blocked = 0;
                wakeup(&head->sync_xid);

                if (hammer2_synchronous_flush > 0)
                        scan = head;
                else
                        scan = TAILQ_NEXT(head, entry);
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        if (scan->blocked) {
                                scan->blocked = 0;
                                wakeup(&scan->sync_xid);
                        }
                        scan = TAILQ_NEXT(scan, entry);
                }
        }
        lockmgr(&tman->translk, LK_RELEASE);
}

/*
 * Chains undergoing destruction are removed from the in-memory topology.
 * To avoid getting lost these chains are placed on the delayed flush
 * queue which will properly dispose of them.
 *
 * We do this instead of issuing an immediate flush in order to give
 * recursive deletions (rm -rf, etc) a chance to remove more of the
 * hierarchy, potentially allowing an enormous amount of write I/O to
 * be avoided.
 */
void
hammer2_delayed_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
                hammer2_spin_ex(&chain->hmp->list_spin);
                if ((chain->flags & (HAMMER2_CHAIN_DELAYED |
                                     HAMMER2_CHAIN_DEFERRED)) == 0) {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED |
                                                      HAMMER2_CHAIN_DEFERRED);
                        TAILQ_INSERT_TAIL(&chain->hmp->flushq,
                                          chain, flush_node);
                        hammer2_chain_ref(chain);
                }
                hammer2_spin_unex(&chain->hmp->list_spin);
        }
}
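
/*
 * (note: chains queued on hmp->flushq above are picked up by
 *  hammer2_flush() below, which concatenates the queue onto its local
 *  deferral list and [re]flushes each entry)
 */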

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain, int istop)
{
        hammer2_chain_t *scan;
        hammer2_flush_info_t info;
        hammer2_dev_t *hmp;
        int loops;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flushq);
        info.trans = trans;
        info.sync_xid = trans->sync_xid;
        info.cache_index = -1;

        /*
         * Calculate parent (can be NULL), if not NULL the flush core
         * expects the parent to be referenced so it can easily lock/unlock
         * it without it getting ripped up.
         */
        if ((info.parent = chain->parent) != NULL)
                hammer2_chain_ref(info.parent);

        /*
         * Extra ref needed because flush_core expects it when replacing
         * chain.
         */
        hammer2_chain_ref(chain);
        hmp = chain->hmp;
        loops = 0;

        for (;;) {
                /*
                 * Move hmp->flushq to info.flushq if non-empty so it can
                 * be processed.
                 */
                if (TAILQ_FIRST(&hmp->flushq) != NULL) {
                        hammer2_spin_ex(&chain->hmp->list_spin);
                        TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node);
                        hammer2_spin_unex(&chain->hmp->list_spin);
                }

                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave the FLUSH_* bits set for these chains, which
                 * will be handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flushq, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED |
                                                       HAMMER2_CHAIN_DELAYED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         *
                         * NOTE: hammer2_flush() may replace scan.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_flush(trans, scan, 0);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * [re]flush chain.
                 */
                info.diddeferral = 0;
                hammer2_flush_core(&info, chain, istop);

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flushq))
                        break;

                if (++loops % 1000 == 0) {
                        kprintf("hammer2_flush: excessive loops on %p\n",
                                chain);
                        if (hammer2_debug & 0x100000)
                                Debugger("hell4");
                }
        }
        hammer2_chain_drop(chain);
        if (info.parent)
                hammer2_chain_drop(info.parent);
}
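
/*
 * Toy model of the deferral loop above (toy types made up for the
 * example, not part of this filesystem).  Recursion past the depth
 * limit queues the element instead of descending further, and the
 * caller repeats the whole pass from depth 0 until nothing was
 * deferred:
 *
 *	static void
 *	toy_flush_limited(struct toy_node *node, struct toy_queue *q,
 *			  int depth)
 *	{
 *		struct toy_node *child;
 *
 *		if (depth == TOY_DEPTH_LIMIT) {
 *			toy_enqueue(q, node);	// re-run later from depth 0
 *			return;
 *		}
 *		for (child = node->first; child; child = child->next)
 *			toy_flush_limited(child, q, depth + 1);
 *		toy_write_back(node);
 *	}
 *
 * hammer2_flush() additionally re-flushes the top-level chain after
 * each drain of the queue so deferred subtrees get their parent block
 * tables updated.
 */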

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid is consistent only within a PFS, and will not be
 * consistent during synchronization.  mirror_tid is consistent across the
 * block device regardless of the PFS.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                   int istop)
{
        hammer2_chain_t *parent;
        hammer2_dev_t *hmp;
        int diddeferral;

        /*
         * (1) Optimize downward recursion to locate nodes needing action.
         *     Nothing to do if none of these flags are set.
         */
        if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
                if (hammer2_debug & 0x200) {
                        if (info->debug == NULL)
                                info->debug = chain;
                } else {
                        return;
                }
        }

        hmp = chain->hmp;
        diddeferral = info->diddeferral;
        parent = info->parent;          /* can be NULL */

        /*
         * Downward search recursion
         */
        if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
                /*
                 * Already deferred.
                 */
                ++info->diddeferral;
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                /*
                 * Recursion depth reached.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0);
                hammer2_chain_ref(chain);
                TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
                ++info->diddeferral;
        } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && istop == 0) {
                /*
                 * We do not recurse through PFSROOTs.  PFSROOT flushes are
                 * handled by the related pmp's (whether mounted or not,
                 * including during recovery).
                 *
                 * But we must still process the PFSROOT chains for block
                 * table updates in their parent (which IS part of our flush).
                 *
                 * Note that the volume root, vchain, does not set this flag.
                 */
                ;
        } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
                /*
                 * Downward recursion search (actual flush occurs bottom-up).
                 * pre-clear ONFLUSH.  It can get set again due to races,
                 * which we want so the scan finds us again in the next flush.
                 *
                 * Flush recursions stop at PFSROOT boundaries.  Each PFS
                 * must be individually flushed and then the root must
                 * be flushed.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
                info->parent = chain;
                hammer2_spin_ex(&chain->core.spin);
                RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
                        NULL, hammer2_flush_recurse, info);
                hammer2_spin_unex(&chain->core.spin);
                info->parent = parent;
                if (info->diddeferral)
                        hammer2_chain_setflush(info->trans, chain);
        }

        /*
         * Now we are in the bottom-up part of the recursion.
         *
         * Do not update chain if lower layers were deferred.
         */
        if (info->diddeferral)
                goto done;

        /*
         * Propagate the DESTROY flag downwards.  This dummies up the flush
         * code and tries to invalidate related buffer cache buffers to
         * avoid the disk write.
         */
        if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

        /*
         * Chain was already modified or has become modified, flush it out.
         */
again:
        if ((hammer2_debug & 0x200) &&
            info->debug &&
            (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
                hammer2_chain_t *scan = chain;

                kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
                while (scan) {
                        kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
                                scan, scan->flags,
                                scan->bref.key, scan->bref.type);
                        if (scan == info->debug)
                                break;
                        scan = scan->parent;
                }
        }

        if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                /*
                 * Dispose of the modified bit.
                 *
                 * UPDATE should already be set.
                 * bref.mirror_tid should already be set.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                         chain == &hmp->vchain);
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

                /*
                 * Manage threads waiting for excessive dirty memory to
                 * be retired.
                 */
                if (chain->pmp)
                        hammer2_pfs_memory_wakeup(chain->pmp);

                if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                    chain == &hmp->vchain ||
                    chain == &hmp->fchain) {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared,
                         * net -1 ref.
                         */
                        hammer2_chain_drop(chain);
                } else {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared and
                         * set a ref for the UPDATE bit we are setting.  Net
                         * 0 refs.
                         */
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }

                /*
                 * Issue the flush.  This is indirect via the DIO.
                 *
                 * NOTE: A DELETED node that reaches this point must be
                 *       flushed for synchronization point consistency.
                 *
                 * NOTE: Even though MODIFIED was already set, the related DIO
                 *       might not be dirty due to a system buffer cache
                 *       flush and must be set dirty if we are going to make
                 *       further modifications to the buffer.  Chains with
                 *       embedded data don't need this.
                 */
                if (hammer2_debug & 0x1000) {
                        kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
                                "data=%016jx\n",
                                chain, chain->bref.type,
                                chain->bref.key, chain->bref.keybits,
                                info->sync_xid,
                                chain->bref.data_off);
                }
                if (hammer2_debug & 0x2000) {
                        Debugger("Flush hell");
                }

                /*
                 * Update chain CRCs for flush.
                 *
                 * NOTE: Volume headers are NOT flushed here as they require
                 *       special processing.
                 */
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_FREEMAP:
                        /*
                         * Update the volume header's freemap_tid to the
                         * freemap's flushing mirror_tid.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
                        KKASSERT(chain == &hmp->fchain);
                        hmp->voldata.freemap_tid = chain->bref.mirror_tid;
                        kprintf("sync freemap mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * The freemap can be flushed independently of the
                         * main topology, but for the case where it is
                         * flushed in the same transaction, and flushed
                         * before vchain (a case we want to allow for
                         * performance reasons), make sure modifications
                         * made during the flush under vchain use a new
                         * transaction id.
                         *
                         * Otherwise the mount recovery code will get confused.
                         */
                        ++hmp->voldata.mirror_tid;
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        /*
                         * The free block table is flushed by
                         * hammer2_vfs_sync() before it flushes vchain.
                         * We must still hold fchain locked while copying
                         * voldata to volsync, however.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        hammer2_chain_lock(&hmp->fchain,
                                           HAMMER2_RESOLVE_ALWAYS);
                        hammer2_voldata_lock(hmp);
                        kprintf("sync volume  mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * Update the volume header's mirror_tid to the
                         * main topology's flushing mirror_tid.  It is
                         * possible that voldata.mirror_tid is already
                         * beyond bref.mirror_tid due to the bump we made
                         * above in BREF_TYPE_FREEMAP.
                         */
                        if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
                                hmp->voldata.mirror_tid =
                                        chain->bref.mirror_tid;
                        }

                        /*
                         * The volume header is flushed manually by the
                         * syncer, not here.  All we do here is adjust the
                         * crc's.
                         */
                        KKASSERT(chain->data != NULL);
                        KKASSERT(chain->dio == NULL);

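                        /*
                         * (note: the whole-header CRC below appears to be
                         *  computed last because its coverage includes the
                         *  section CRCs computed first)
                         */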
                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC1_OFF,
                                        HAMMER2_VOLUME_ICRC1_SIZE);
                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC0_OFF,
                                        HAMMER2_VOLUME_ICRC0_SIZE);
                        hmp->voldata.icrc_volheader =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRCVH_OFF,
                                        HAMMER2_VOLUME_ICRCVH_SIZE);

                        kprintf("syncvolhdr %016jx %016jx\n",
                                hmp->voldata.mirror_tid,
                                hmp->vchain.bref.mirror_tid);
                        hmp->volsync = hmp->voldata;
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                        hammer2_voldata_unlock(hmp);
                        hammer2_chain_unlock(&hmp->fchain);
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        /*
                         * Data elements have already been flushed via the
                         * logical file buffer cache.  Their hash was set in
                         * the bref by the vop_write code.  Do not re-dirty.
                         *
                         * Make sure any device buffer(s) have been flushed
                         * out here (there aren't usually any to flush) XXX.
                         */
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                        /*
                         * Buffer I/O will be cleaned up when the volume is
                         * flushed (but the kernel is free to flush it before
                         * then, as well).
                         */
                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * NOTE: We must call io_setdirty() to make any late
                         *       changes to the inode data, the system might
                         *       have already flushed the buffer.
                         */
                        if (chain->data->ipdata.meta.op_flags &
                            HAMMER2_OPFLAG_PFSROOT) {
                                /*
                                 * non-NULL pmp if mounted as a PFS.  We must
                                 * sync fields cached in the pmp? XXX
                                 */
                                hammer2_inode_data_t *ipdata;

                                hammer2_io_setdirty(chain->dio);
                                ipdata = &chain->data->ipdata;
                                if (chain->pmp) {
                                        ipdata->meta.pfs_inum =
                                                chain->pmp->inode_tid;
                                }
                        } else {
                                /* can't be mounted as a PFS */
                        }

                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                default:
                        KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                        panic("hammer2_flush_core: unsupported "
                              "embedded bref %d",
                              chain->bref.type);
                        /* NOT REACHED */
                }

                /*
                 * If the chain was destroyed try to avoid unnecessary I/O.
                 * (this only really works if the DIO system buffer is the
                 * same size as chain->bytes).
                 */
                if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
                        hammer2_io_setinval(chain->dio, chain->bytes);
                }
        }

        /*
         * If UPDATE is set the parent block table may need to be updated.
         *
         * NOTE: UPDATE may be set on vchain or fchain in which case
         *       parent could be NULL.  It's easiest to allow the case
         *       and test for NULL.  parent can also wind up being NULL
         *       due to a deletion so we need to handle the case anyway.
         *
         * If no parent exists we can just clear the UPDATE bit.  If the
         * chain gets reattached later on the bit will simply get set
         * again.
         */
        if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                hammer2_chain_drop(chain);
        }

        /*
         * The chain may need its blockrefs updated in the parent.  This
         * requires some fancy footwork.
         */
        if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                hammer2_blockref_t *base;
                int count;

                /*
                 * Both parent and chain must be locked.  This requires
                 * temporarily unlocking the chain.  We have to deal with
                 * the case where the chain might be reparented or modified
                 * while it was unlocked.
                 */
                hammer2_chain_unlock(chain);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
                if (chain->parent != parent) {
                        kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
                                chain, chain->parent, parent);
                        hammer2_chain_unlock(parent);
                        goto done;
                }

                /*
                 * Check race condition.  If someone got in and modified
                 * it again while it was unlocked, we have to loop up.
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        hammer2_chain_unlock(parent);
                        kprintf("hammer2_flush: chain %p flush-mod race\n",
                                chain);
                        goto again;
                }

                /*
                 * Clear UPDATE flag, mark parent modified, update its
                 * modify_tid if necessary, and adjust the parent blockmap.
                 */
                if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                        hammer2_chain_drop(chain);
                }

                /*
                 * (optional code)
                 *
                 * Avoid actually modifying and updating the parent if it
                 * was flagged for destruction.  This can greatly reduce
                 * disk I/O in large tree removals because the
                 * hammer2_io_setinval() call in the upward recursion
                 * (see MODIFIED code above) can only handle a few cases.
                 */
                if (parent->flags & HAMMER2_CHAIN_DESTROY) {
                        if (parent->bref.modify_tid < chain->bref.modify_tid) {
                                parent->bref.modify_tid =
                                        chain->bref.modify_tid;
                        }
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
                                                        HAMMER2_CHAIN_BMAPUPD);
                        hammer2_chain_unlock(parent);
                        goto skipupdate;
                }

                /*
                 * We are updating the parent's blockmap, the parent must
                 * be set modified.
                 */
                hammer2_chain_modify(info->trans, parent, 0);
                if (parent->bref.modify_tid < chain->bref.modify_tid)
                        parent->bref.modify_tid = chain->bref.modify_tid;

                /*
                 * Calculate blockmap pointer
                 */
                switch(parent->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * Access the inode's block array.  However, there is
                         * no block array if the inode is flagged DIRECTDATA.
                         */
                        if (parent->data &&
                            (parent->data->ipdata.meta.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
                                base = &parent->data->
                                        ipdata.u.blockset.blockref[0];
                        } else {
                                base = NULL;
                        }
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                        if (parent->data)
                                base = &parent->data->npdata[0];
                        else
                                base = NULL;
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        base = &chain->hmp->voldata.sroot_blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP:
                        base = &parent->data->npdata[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                default:
                        base = NULL;
                        count = 0;
                        panic("hammer2_flush_core: "
                              "unrecognized blockref type: %d",
                              parent->bref.type);
                }

                /*
                 * Blocktable updates
                 *
                 * We synchronize pending statistics at this time.  Delta
                 * adjustments designated for the current and upper level
                 * are synchronized.
                 */
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
                        if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
                                hammer2_spin_ex(&parent->core.spin);
                                hammer2_base_delete(info->trans, parent,
                                                    base, count,
                                                    &info->cache_index, chain);
                                hammer2_spin_unex(&parent->core.spin);
                                /* base_delete clears both bits */
                        } else {
                                atomic_clear_int(&chain->flags,
                                                 HAMMER2_CHAIN_BMAPUPD);
                        }
                }
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
                        hammer2_spin_ex(&parent->core.spin);
                        hammer2_base_insert(info->trans, parent,
                                            base, count,
                                            &info->cache_index, chain);
                        hammer2_spin_unex(&parent->core.spin);
                        /* base_insert sets BMAPPED */
                }
                hammer2_chain_unlock(parent);
        }
skipupdate:
        ;

        /*
         * Final cleanup after flush
         */
done:
        KKASSERT(chain->refs > 0);
        if (hammer2_debug & 0x200) {
                if (info->debug == chain)
                        info->debug = NULL;
        }
}

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_xid.  A child whose blockref must propagate back up to the
 * parent is left with its UPDATE flag set.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *          bref.mirror_tid ourselves to indicate that the flush has
 *          processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        /*hammer2_trans_t *trans = info->trans;*/
        hammer2_chain_t *parent = info->parent;

        /*
         * (child can never be fchain or vchain so a special check isn't
         *  needed).
         *
         * We must ref the child before unlocking the spinlock.
         *
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.
         */
        hammer2_chain_ref(child);
        hammer2_spin_unex(&parent->core.spin);

        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        /*
         * Recurse and collect deferral data.  We're in the media flush,
         * this can cross PFS boundaries.
         */
        if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
                ++info->depth;
                hammer2_flush_core(info, child, 0);
                --info->depth;
        } else if (hammer2_debug & 0x200) {
                if (info->debug == NULL)
                        info->debug = child;
                ++info->depth;
                hammer2_flush_core(info, child, 0);
                --info->depth;
                if (info->debug == child)
                        info->debug = NULL;
        }

        /*
         * Relock to continue the loop
         */
        hammer2_chain_unlock(child);
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        hammer2_chain_drop(child);
        KKASSERT(info->parent == parent);
        hammer2_spin_ex(&parent->core.spin);

        return (0);
}