hammer2 - Retool flushing and use of mirror_tid, more cluster work.
sys/vfs/hammer2/hammer2_flush.c
/*
 * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 *                      TRANSACTION AND FLUSH HANDLING
 *
 * Deceptively simple but actually fairly difficult to implement properly is
 * how I would describe it.
 *
 * Flushing generally occurs bottom-up but requires a top-down scan to
 * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
 * tells the scan how to recurse downward to find these chains.
 */
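
/*
 * A rough sketch of the resulting control flow (illustrative only, not
 * literal code; see hammer2_flush() and hammer2_flush_core() below for
 * the real implementation):
 *
 *      flush(chain):
 *              if (chain->flags & ONFLUSH)             (top-down search)
 *                      for each child of chain:
 *                              flush(child)
 *              if (chain->flags & MODIFIED)            (bottom-up flush)
 *                      write the chain out, clear MODIFIED
 *              if (chain->flags & UPDATE)
 *                      update the parent's block table
 */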

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

#define FLUSH_DEBUG 0

#define HAMMER2_FLUSH_DEPTH_LIMIT       10      /* stack recursion limit */


/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
        hammer2_chain_t *parent;
        hammer2_trans_t *trans;
        int             depth;
        int             diddeferral;
        int             cache_index;
        struct h2_flush_list flushq;
        hammer2_xid_t   sync_xid;       /* memory synchronization point */
        hammer2_tid_t   mirror_tid;     /* avoid digging through hmp */
        hammer2_tid_t   modify_tid;
        hammer2_chain_t *debug;
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

static void hammer2_flush_core(hammer2_flush_info_t *info,
                                hammer2_chain_t *chain, int deleting);
static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);

/*
 * For now use a global transaction manager.  What we ultimately want to do
 * is give each non-overlapping hmp/pmp group its own transaction manager.
 *
 * Transactions govern XID tracking on the physical media (the hmp), but they
 * also govern TID tracking which is per-PFS and thus might cross multiple
 * hmp's.  So we can't just stuff tmanage into hammer2_dev or
 * hammer2_pfs.
 */
static hammer2_trans_manage_t   tmanage;

void
hammer2_trans_manage_init(void)
{
        lockinit(&tmanage.translk, "h2trans", 0, 0);
        TAILQ_INIT(&tmanage.transq);
        tmanage.flush_xid = 1;
        tmanage.alloc_xid = tmanage.flush_xid + 1;
}

hammer2_xid_t
hammer2_trans_newxid(hammer2_pfs_t *pmp __unused)
{
        hammer2_xid_t xid;

        for (;;) {
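                /* retry if the counter wrapped to 0; 0 is never a valid xid */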
                xid = atomic_fetchadd_int(&tmanage.alloc_xid, 1);
                if (xid)
                        break;
        }
        return xid;
}

/*
 * Transaction support functions for writing to the filesystem.
 *
 * Initializing a new transaction allocates a transaction ID.  Typically
 * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 * media target.  The latter mode is used by the recovery code.
 *
 * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 * other is a set of any number of concurrent filesystem operations.  We
 * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 * or we can have <running_flush> + <concurrent_fs_ops>.
 *
 * During a flush, new fs_ops are only blocked until the fs_ops prior to
 * the flush complete.  The new fs_ops can then run concurrent with the flush.
 *
 * Buffer-cache transactions operate as fs_ops but never block.  A
 * buffer-cache flush will run either before or after the current pending
 * flush depending on its state.
 */
void
hammer2_trans_init(hammer2_trans_t *trans, hammer2_pfs_t *pmp, int flags)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;

        tman = &tmanage;

        bzero(trans, sizeof(*trans));
        trans->pmp = pmp;
        trans->flags = flags;
        trans->td = curthread;

        lockmgr(&tman->translk, LK_EXCLUSIVE);

        if (flags & HAMMER2_TRANS_ISFLUSH) {
                /*
                 * If multiple flushes are trying to run we have to
                 * wait until it is our turn.  All flushes are serialized.
                 *
                 * We queue ourselves and then wait to become the head
                 * of the queue, allowing all prior flushes to complete.
                 *
                 * Multiple normal transactions can share the current
                 * transaction id but a flush transaction needs its own
                 * unique TID for proper block table update accounting.
                 */
                ++tman->flushcnt;
                ++pmp->modify_tid;
                tman->flush_xid = hammer2_trans_newxid(pmp);
                trans->sync_xid = tman->flush_xid;
                trans->modify_tid = pmp->modify_tid;
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                if (TAILQ_FIRST(&tman->transq) != trans) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        } else if (tman->flushcnt == 0) {
                /*
                 * No flushes are pending, we can go.  Use prior flush_xid + 1.
                 *
                 * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_INSERT_TAIL(&tman->transq, trans, entry);
                trans->sync_xid = tman->flush_xid + 1;

                /* XXX improve/optimize inode allocation */
        } else if (trans->flags & HAMMER2_TRANS_BUFCACHE) {
                /*
                 * A buffer cache transaction is requested while a flush
                 * is in progress.  The flush's PREFLUSH flag must be set
                 * in this situation.
                 *
                 * The buffer cache flush takes on the main flush's
                 * transaction id.
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                KKASSERT(head->flags & HAMMER2_TRANS_PREFLUSH);
                trans->flags |= HAMMER2_TRANS_PREFLUSH;
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid;
                trans->modify_tid = head->modify_tid;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;
                /* not allowed to block */
        } else {
                /*
                 * A normal transaction is requested while a flush is in
                 * progress.  We insert after the current flush and may
                 * block.
                 *
                 * WARNING!  Also see hammer2_chain_setflush()
                 */
                TAILQ_FOREACH(head, &tman->transq, entry) {
                        if (head->flags & HAMMER2_TRANS_ISFLUSH)
                                break;
                }
                KKASSERT(head);
                TAILQ_INSERT_AFTER(&tman->transq, head, trans, entry);
                trans->sync_xid = head->sync_xid + 1;
                trans->flags |= HAMMER2_TRANS_CONCURRENT;

                /*
                 * XXX for now we must block new transactions, synchronous
                 * flush mode is on by default.
                 *
                 * If synchronous flush mode is enabled concurrent
                 * frontend transactions during the flush are not
                 * allowed (except we don't have a choice for buffer
                 * cache ops).
                 */
                if (hammer2_synchronous_flush > 0 ||
                    TAILQ_FIRST(&tman->transq) != head) {
                        trans->blocked = 1;
                        while (trans->blocked) {
                                lksleep(&trans->sync_xid, &tman->translk,
                                        0, "h2multf", hz);
                        }
                }
        }
        if (flags & HAMMER2_TRANS_NEWINODE) {
                if (pmp->spmp_hmp) {
                        /*
                         * Super-root transaction, all new inodes have an
                         * inode number of 1.  Normal pfs inode cache
                         * semantics are not used.
                         */
                        trans->inode_tid = 1;
                } else {
                        /*
                         * Normal transaction
                         */
                        if (pmp->inode_tid < HAMMER2_INODE_START)
                                pmp->inode_tid = HAMMER2_INODE_START;
                        trans->inode_tid = pmp->inode_tid++;
                }
        }

        lockmgr(&tman->translk, LK_RELEASE);
}

void
hammer2_trans_done(hammer2_trans_t *trans)
{
        hammer2_trans_manage_t *tman;
        hammer2_trans_t *head;
        hammer2_trans_t *scan;

        tman = &tmanage;

        /*
         * Remove.
         */
        lockmgr(&tman->translk, LK_EXCLUSIVE);
        TAILQ_REMOVE(&tman->transq, trans, entry);
        head = TAILQ_FIRST(&tman->transq);

        /*
         * Adjust flushcnt if this was a flush, clear TRANS_CONCURRENT
         * up through the next flush.  (If the head is a flush then we
         * stop there, unlike the unblock code following this section).
         */
        if (trans->flags & HAMMER2_TRANS_ISFLUSH) {
                --tman->flushcnt;
                scan = head;
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        atomic_clear_int(&scan->flags,
                                         HAMMER2_TRANS_CONCURRENT);
                        scan = TAILQ_NEXT(scan, entry);
                }
        }

        /*
         * Unblock the head of the queue and any additional transactions
         * up to the next flush.  The head can be a flush and it will be
         * unblocked along with the non-flush transactions following it
         * (which are allowed to run concurrently with it).
         *
         * In synchronous flush mode we stop if the head transaction is
         * a flush.
         */
        if (head && head->blocked) {
                head->blocked = 0;
                wakeup(&head->sync_xid);

                if (hammer2_synchronous_flush > 0)
                        scan = head;
                else
                        scan = TAILQ_NEXT(head, entry);
                while (scan && (scan->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                        if (scan->blocked) {
                                scan->blocked = 0;
                                wakeup(&scan->sync_xid);
                        }
                        scan = TAILQ_NEXT(scan, entry);
                }
        }
        lockmgr(&tman->translk, LK_RELEASE);
}
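
/*
 * Usage sketch (hypothetical caller; a flags value of 0 denotes a normal
 * non-flush transaction as handled above):
 *
 *      hammer2_trans_t trans;
 *
 *      hammer2_trans_init(&trans, pmp, 0);
 *      ... perform modifying chain operations ...
 *      hammer2_trans_done(&trans);
 *
 * A flush instead passes HAMMER2_TRANS_ISFLUSH and is serialized against
 * other flushes by the queueing logic above.
 */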

/*
 * Flush the chain and all modified sub-chains through the specified
 * synchronization point, propagating parent chain modifications, modify_tid,
 * and mirror_tid updates back up as needed.
 *
 * Caller must have interlocked against any non-flush-related modifying
 * operations in progress whose XXX values are less than or equal
 * to the passed sync_xid.
 *
 * Caller must have already vetted synchronization points to ensure they
 * are properly flushed.  Only snapshots and cluster flushes can create
 * these sorts of synchronization points.
 *
 * This routine can be called from several places but the most important
 * is from VFS_SYNC.
 *
 * chain is locked on call and will remain locked on return.  The chain's
 * UPDATE flag indicates that its parent's block table (which is not yet
 * part of the flush) should be updated.  The chain may be replaced by
 * the call if it was modified.
 */
void
hammer2_flush(hammer2_trans_t *trans, hammer2_chain_t *chain)
{
        hammer2_chain_t *scan;
        hammer2_flush_info_t info;
        int loops;

        /*
         * Execute the recursive flush and handle deferrals.
         *
         * Chains can be ridiculously long (thousands deep), so to
         * avoid blowing out the kernel stack the recursive flush has a
         * depth limit.  Elements at the limit are placed on a list
         * for re-execution after the stack has been popped.
         */
        bzero(&info, sizeof(info));
        TAILQ_INIT(&info.flushq);
        info.trans = trans;
        info.sync_xid = trans->sync_xid;
        info.cache_index = -1;

        /*
         * Calculate parent (can be NULL), if not NULL the flush core
         * expects the parent to be referenced so it can easily lock/unlock
         * it without it getting ripped up.
         */
        if ((info.parent = chain->parent) != NULL)
                hammer2_chain_ref(info.parent);

        /*
         * Extra ref needed because flush_core expects it when replacing
         * chain.
         */
        hammer2_chain_ref(chain);
        loops = 0;

        for (;;) {
                /*
                 * Unwind deep recursions which had been deferred.  This
                 * can leave the FLUSH_* bits set for these chains, which
                 * will be handled when we [re]flush chain after the unwind.
                 */
                while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
                        KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
                        TAILQ_REMOVE(&info.flushq, scan, flush_node);
                        atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

                        /*
                         * Now that we've popped back up we can do a secondary
                         * recursion on the deferred elements.
                         *
                         * NOTE: hammer2_flush() may replace scan.
                         */
                        if (hammer2_debug & 0x0040)
                                kprintf("deferred flush %p\n", scan);
                        hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
                        hammer2_flush(trans, scan);
                        hammer2_chain_unlock(scan);
                        hammer2_chain_drop(scan);       /* ref from deferral */
                }

                /*
                 * [re]flush chain.
                 */
                info.diddeferral = 0;
                hammer2_flush_core(&info, chain, 0);

                /*
                 * Only loop if deep recursions have been deferred.
                 */
                if (TAILQ_EMPTY(&info.flushq))
                        break;

                if (++loops % 1000 == 0) {
                        kprintf("hammer2_flush: excessive loops on %p\n",
                                chain);
                        if (hammer2_debug & 0x100000)
                                Debugger("hell4");
                }
        }
        hammer2_chain_drop(chain);
        if (info.parent)
                hammer2_chain_drop(info.parent);
}
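
/*
 * Usage sketch (hypothetical, modeled on the VFS_SYNC path described
 * above; exact callers may differ):
 *
 *      hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_ISFLUSH);
 *      hammer2_chain_ref(chain);
 *      hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
 *      hammer2_flush(&trans, chain);
 *      hammer2_chain_unlock(chain);
 *      hammer2_chain_drop(chain);
 *      hammer2_trans_done(&trans);
 */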

/*
 * This is the core of the chain flushing code.  The chain is locked by the
 * caller and must also have an extra ref on it by the caller, and remains
 * locked and will have an extra ref on return.  Upon return, the caller can
 * test the UPDATE bit on the child to determine if the parent needs updating.
 *
 * (1) Determine if this node is a candidate for the flush, return if it is
 *     not.  fchain and vchain are always candidates for the flush.
 *
 * (2) If we recurse too deep the chain is entered onto the deferral list and
 *     the current flush stack is aborted until after the deferral list is
 *     run.
 *
 * (3) Recursively flush live children (rbtree).  This can create deferrals.
 *     A successful flush clears the MODIFIED and UPDATE bits on the children
 *     and typically causes the parent to be marked MODIFIED as the children
 *     update the parent's block table.  A parent might already be marked
 *     MODIFIED due to a deletion (whose blocktable update in the parent is
 *     handled by the frontend), or if the parent itself is modified by the
 *     frontend for other reasons.
 *
 * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 *     Deleted-but-open inodes can still be individually flushed via the
 *     filesystem syncer.
 *
 * (5) Note that an unmodified child may still need the block table in its
 *     parent updated (e.g. rename/move).  The child will have UPDATE set
 *     in this case.
 *
 *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
 *
 * blockref.modify_tid is consistent only within a PFS, and will not be
 * consistent during synchronization.  mirror_tid is consistent across the
 * block device regardless of the PFS.
 */
static void
hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
                   int deleting)
{
        hammer2_chain_t *parent;
        hammer2_dev_t *hmp;
        int diddeferral;

        /*
         * (1) Optimize downward recursion to locate nodes needing action.
         *     Nothing to do if none of these flags are set.
         */
        if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
                if (hammer2_debug & 0x200) {
                        if (info->debug == NULL)
                                info->debug = chain;
                } else {
                        return;
                }
        }

        hmp = chain->hmp;
        diddeferral = info->diddeferral;
        parent = info->parent;          /* can be NULL */

        /*
         * Downward search recursion
         */
        if (chain->flags & HAMMER2_CHAIN_DEFERRED) {
                /*
                 * Already deferred.
                 */
                ++info->diddeferral;
        } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
                /*
                 * Recursion depth reached.
                 */
                hammer2_chain_ref(chain);
                TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
                ++info->diddeferral;
        } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
                /*
                 * Downward recursion search (actual flush occurs bottom-up).
                 * pre-clear ONFLUSH.  It can get set again due to races,
                 * which we want so the scan finds us again in the next flush.
                 */
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
                info->parent = chain;
                hammer2_spin_ex(&chain->core.spin);
                RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
                        NULL, hammer2_flush_recurse, info);
                hammer2_spin_unex(&chain->core.spin);
                info->parent = parent;
                if (info->diddeferral)
                        hammer2_chain_setflush(info->trans, chain);
        }

        /*
         * Now we are in the bottom-up part of the recursion.
         *
         * Do not update chain if lower layers were deferred.
         */
        if (info->diddeferral)
                goto done;

        /*
         * Propagate the DESTROY flag downwards.  This dummies up the flush
         * code and tries to invalidate related buffer cache buffers to
         * avoid the disk write.
         */
        if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);

        /*
         * Chain was already modified or has become modified, flush it out.
         */
again:
        if ((hammer2_debug & 0x200) &&
            info->debug &&
            (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
                hammer2_chain_t *scan = chain;

                kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
                while (scan) {
                        kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
                                scan, scan->flags,
                                scan->bref.key, scan->bref.type);
                        if (scan == info->debug)
                                break;
                        scan = scan->parent;
                }
        }

        if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                /*
                 * Dispose of the modified bit.
                 *
                 * UPDATE should already be set.
                 * bref.mirror_tid should already be set.
                 */
                KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                         chain == &hmp->vchain);
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);

                /*
                 * Manage threads waiting for excessive dirty memory to
                 * be retired.
                 */
                if (chain->pmp)
                        hammer2_pfs_memory_wakeup(chain->pmp);

                if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
                    chain == &hmp->vchain ||
                    chain == &hmp->fchain) {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared,
                         * net -1 ref.
                         */
                        hammer2_chain_drop(chain);
                } else {
                        /*
                         * Drop the ref from the MODIFIED bit we cleared and
                         * set a ref for the UPDATE bit we are setting.  Net
                         * 0 refs.
                         */
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                }

                /*
                 * Issue the flush.  This is indirect via the DIO.
                 *
                 * NOTE: A DELETED node that reaches this point must be
                 *       flushed for synchronization point consistency.
                 *
                 * NOTE: Even though MODIFIED was already set, the related DIO
                 *       might not be dirty due to a system buffer cache
                 *       flush and must be set dirty if we are going to make
                 *       further modifications to the buffer.  Chains with
                 *       embedded data don't need this.
                 */
                if (hammer2_debug & 0x1000) {
                        kprintf("Flush %p.%d %016jx/%d sync_xid=%08x "
                                "data=%016jx\n",
                                chain, chain->bref.type,
                                chain->bref.key, chain->bref.keybits,
                                info->sync_xid,
                                chain->bref.data_off);
                }
                if (hammer2_debug & 0x2000) {
                        Debugger("Flush hell");
                }

                /*
                 * Update chain CRCs for flush.
                 *
                 * NOTE: Volume headers are NOT flushed here as they require
                 *       special processing.
                 */
                switch(chain->bref.type) {
                case HAMMER2_BREF_TYPE_FREEMAP:
                        /*
                         * Update the volume header's freemap_tid to the
                         * freemap's flushing mirror_tid.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
                        KKASSERT(chain == &hmp->fchain);
                        hmp->voldata.freemap_tid = chain->bref.mirror_tid;
                        kprintf("sync freemap mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * The freemap can be flushed independently of the
                         * main topology, but for the case where it is
                         * flushed in the same transaction, and flushed
                         * before vchain (a case we want to allow for
                         * performance reasons), make sure modifications
                         * made during the flush under vchain use a new
                         * transaction id.
                         *
                         * Otherwise the mount recovery code will get confused.
                         */
                        ++hmp->voldata.mirror_tid;
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        /*
                         * The free block table is flushed by
                         * hammer2_vfs_sync() before it flushes vchain.
                         * We must still hold fchain locked while copying
                         * voldata to volsync, however.
                         *
                         * (note: embedded data, do not call setdirty)
                         */
                        hammer2_voldata_lock(hmp);
                        hammer2_chain_lock(&hmp->fchain,
                                           HAMMER2_RESOLVE_ALWAYS);
                        kprintf("sync volume  mirror_tid %08jx\n",
                                (intmax_t)chain->bref.mirror_tid);

                        /*
                         * Update the volume header's mirror_tid to the
                         * main topology's flushing mirror_tid.  It is
                         * possible that voldata.mirror_tid is already
                         * beyond bref.mirror_tid due to the bump we made
                         * above in BREF_TYPE_FREEMAP.
                         */
                        if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
                                hmp->voldata.mirror_tid =
                                        chain->bref.mirror_tid;
                        }

                        /*
                         * The volume header is flushed manually by the
                         * syncer, not here.  All we do here is adjust the
                         * crc's.
                         */
                        KKASSERT(chain->data != NULL);
                        KKASSERT(chain->dio == NULL);

                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC1_OFF,
                                        HAMMER2_VOLUME_ICRC1_SIZE);
                        hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRC0_OFF,
                                        HAMMER2_VOLUME_ICRC0_SIZE);
                        hmp->voldata.icrc_volheader =
                                hammer2_icrc32(
                                        (char *)&hmp->voldata +
                                         HAMMER2_VOLUME_ICRCVH_OFF,
                                        HAMMER2_VOLUME_ICRCVH_SIZE);

                        kprintf("syncvolhdr %016jx %016jx\n",
                                hmp->voldata.mirror_tid,
                                hmp->vchain.bref.mirror_tid);
                        hmp->volsync = hmp->voldata;
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
                        hammer2_chain_unlock(&hmp->fchain);
                        hammer2_voldata_unlock(hmp);
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        /*
                         * Data elements have already been flushed via the
                         * logical file buffer cache.  Their hash was set in
                         * the bref by the vop_write code.  Do not re-dirty.
                         *
                         * Make sure any device buffer(s) have been flushed
                         * out here (there aren't usually any to flush) XXX.
                         */
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
                        /*
                         * Buffer I/O will be cleaned up when the volume is
                         * flushed (but the kernel is free to flush it before
                         * then, as well).
                         */
                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * NOTE: We must call io_setdirty() to make any late
                         *       changes to the inode data, the system might
                         *       have already flushed the buffer.
                         */
                        if (chain->data->ipdata.op_flags &
                            HAMMER2_OPFLAG_PFSROOT) {
                                /*
                                 * non-NULL pmp if mounted as a PFS.  We must
                                 * sync fields cached in the pmp? XXX
                                 */
                                hammer2_inode_data_t *ipdata;

                                hammer2_io_setdirty(chain->dio);
                                ipdata = &chain->data->ipdata;
                                if (chain->pmp) {
                                        ipdata->pfs_inum =
                                                chain->pmp->inode_tid;
                                }
                        } else {
                                /* can't be mounted as a PFS */
                        }

                        /*
                         * Update inode statistics.  Pending stats in chain
                         * are cleared out on UPDATE so expect that bit to
                         * be set here too or the statistics will not be
                         * rolled-up properly.
                         */
                        if (chain->data_count || chain->inode_count) {
                                hammer2_inode_data_t *ipdata;

                                KKASSERT(chain->flags & HAMMER2_CHAIN_UPDATE);
                                hammer2_io_setdirty(chain->dio);
                                ipdata = &chain->data->ipdata;
                                ipdata->data_count += chain->data_count;
                                ipdata->inode_count += chain->inode_count;
                        }
                        KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
                        hammer2_chain_setcheck(chain, chain->data);
                        break;
                default:
                        KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
                        panic("hammer2_flush_core: unsupported "
                              "embedded bref %d",
                              chain->bref.type);
                        /* NOT REACHED */
                }

                /*
                 * If the chain was destroyed try to avoid unnecessary I/O.
                 * (this only really works if the DIO system buffer is the
                 * same size as chain->bytes).
                 */
                if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
                        hammer2_io_setinval(chain->dio, chain->bytes);
                }
        }

        /*
         * If UPDATE is set the parent block table may need to be updated.
         *
         * NOTE: UPDATE may be set on vchain or fchain in which case
         *       parent could be NULL.  It's easiest to allow the case
         *       and test for NULL.  parent can also wind up being NULL
         *       due to a deletion so we need to handle the case anyway.
         *
         * If no parent exists we can just clear the UPDATE bit.  If the
         * chain gets reattached later on the bit will simply get set
         * again.
         */
        if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                hammer2_chain_drop(chain);
        }

        /*
         * The chain may need its blockrefs updated in the parent.  This
         * requires some fancy footwork.
         */
        if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                hammer2_blockref_t *base;
                int count;

                /*
                 * Both parent and chain must be locked.  This requires
                 * temporarily unlocking the chain.  We have to deal with
                 * the case where the chain might be reparented or modified
                 * while it was unlocked.
                 */
                hammer2_chain_unlock(chain);
                hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
                if (chain->parent != parent) {
                        kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
                                chain, chain->parent, parent);
                        hammer2_chain_unlock(parent);
                        goto done;
                }

                /*
                 * Check race condition.  If someone got in and modified
                 * it again while it was unlocked, we have to loop up.
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
                        hammer2_chain_unlock(parent);
                        kprintf("hammer2_flush: chain %p flush-mod race\n",
                                chain);
                        goto again;
                }

                /*
                 * Clear UPDATE flag
                 */
                if (chain->flags & HAMMER2_CHAIN_UPDATE) {
                        atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
                        hammer2_chain_drop(chain);
                }
                hammer2_chain_modify(info->trans, parent, 0);

                /*
                 * Calculate blockmap pointer
                 */
                switch(parent->bref.type) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * Access the inode's block array.  However, there is
                         * no block array if the inode is flagged DIRECTDATA.
                         */
                        if (parent->data &&
                            (parent->data->ipdata.op_flags &
                             HAMMER2_OPFLAG_DIRECTDATA) == 0) {
                                base = &parent->data->
                                        ipdata.u.blockset.blockref[0];
                        } else {
                                base = NULL;
                        }
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_INDIRECT:
                case HAMMER2_BREF_TYPE_FREEMAP_NODE:
                        if (parent->data)
                                base = &parent->data->npdata[0];
                        else
                                base = NULL;
                        count = parent->bytes / sizeof(hammer2_blockref_t);
                        break;
                case HAMMER2_BREF_TYPE_VOLUME:
                        base = &chain->hmp->voldata.sroot_blockset.blockref[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                case HAMMER2_BREF_TYPE_FREEMAP:
                        base = &parent->data->npdata[0];
                        count = HAMMER2_SET_COUNT;
                        break;
                default:
                        base = NULL;
                        count = 0;
                        panic("hammer2_flush_core: "
                              "unrecognized blockref type: %d",
                              parent->bref.type);
                }

                /*
                 * Blocktable updates
                 *
                 * We synchronize pending statistics at this time.  Delta
                 * adjustments designated for the current and upper level
                 * are synchronized.
                 */
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
                        if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
                                hammer2_base_delete(info->trans, parent,
                                                    base, count,
                                                    &info->cache_index, chain);
                                /* base_delete clears both bits */
                        } else {
                                atomic_clear_int(&chain->flags,
                                                 HAMMER2_CHAIN_BMAPUPD);
                        }
                }
                if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
                        parent->data_count += chain->data_count +
                                              chain->data_count_up;
                        parent->inode_count += chain->inode_count +
                                               chain->inode_count_up;
                        chain->data_count = 0;
                        chain->inode_count = 0;
                        chain->data_count_up = 0;
                        chain->inode_count_up = 0;
                        hammer2_base_insert(info->trans, parent,
                                            base, count,
                                            &info->cache_index, chain);
                        /* base_insert sets BMAPPED */
                }
                hammer2_chain_unlock(parent);
        }

        /*
         * Final cleanup after flush
         */
done:
        KKASSERT(chain->refs > 0);
        if (hammer2_debug & 0x200) {
                if (info->debug == chain)
                        info->debug = NULL;
        }
}
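
/*
 * Summary of the flag/ref accounting performed by hammer2_flush_core()
 * (descriptive only, derived from the code above):
 *
 *      MODIFIED - chain content is dirty; holds a ref.  Cleared by the
 *                 flush, which either drops the ref or converts it into
 *                 UPDATE's ref.
 *      UPDATE   - the parent's block table needs this chain's bref; also
 *                 holds a ref, dropped when the bit is cleared.
 *      ONFLUSH  - top-down search hint; pre-cleared before the downward
 *                 recursion and re-set via hammer2_chain_setflush() when
 *                 children had to be deferred.
 */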

/*
 * Flush recursion helper, called from flush_core, calls flush_core.
 *
 * Flushes the children of the caller's chain (info->parent), restricted
 * by sync_xid.  Set info->domodify if the child's blockref must propagate
 * back up to the parent.
 *
 * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 * flush scan order prevents any chains from being lost.  A child can be
 * executed more than once.
 *
 * WARNING! If we do not call hammer2_flush_core() we must update
 *          bref.mirror_tid ourselves to indicate that the flush has
 *          processed the child.
 *
 * WARNING! parent->core spinlock is held on entry and return.
 *
 * WARNING! Flushes do not cross PFS boundaries.  Specifically, a flush must
 *          not cross a pfs-root boundary.
 */
static int
hammer2_flush_recurse(hammer2_chain_t *child, void *data)
{
        hammer2_flush_info_t *info = data;
        /*hammer2_trans_t *trans = info->trans;*/
        hammer2_chain_t *parent = info->parent;

        /*
         * (child can never be fchain or vchain so a special check isn't
         *  needed).
         *
         * We must ref the child before unlocking the spinlock.
         *
         * The caller has added a ref to the parent so we can temporarily
         * unlock it in order to lock the child.
         */
        hammer2_chain_ref(child);
        hammer2_spin_unex(&parent->core.spin);

        hammer2_chain_unlock(parent);
        hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);

        /*
         * Recurse and collect deferral data.  We're in the media flush,
         * which can cross PFS boundaries.
         */
        if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
                ++info->depth;
                hammer2_flush_core(info, child, 0); /* XXX deleting */
                --info->depth;
        } else if (hammer2_debug & 0x200) {
                if (info->debug == NULL)
                        info->debug = child;
                ++info->depth;
                hammer2_flush_core(info, child, 0); /* XXX deleting */
                --info->depth;
                if (info->debug == child)
                        info->debug = NULL;
        }

        /*
         * Relock to continue the loop
         */
        hammer2_chain_unlock(child);
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
        hammer2_chain_drop(child);
        KKASSERT(info->parent == parent);
        hammer2_spin_ex(&parent->core.spin);

        return (0);
}