sys/vfs/hammer2/hammer2_flush.c

   1 /*
   2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@dragonflybsd.org>
   6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  *
  12  * 1. Redistributions of source code must retain the above copyright
  13  *    notice, this list of conditions and the following disclaimer.
  14  * 2. Redistributions in binary form must reproduce the above copyright
  15  *    notice, this list of conditions and the following disclaimer in
  16  *    the documentation and/or other materials provided with the
  17  *    distribution.
  18  * 3. Neither the name of The DragonFly Project nor the names of its
  19  *    contributors may be used to endorse or promote products derived
  20  *    from this software without specific, prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35 /*
  36  *                      TRANSACTION AND FLUSH HANDLING
  37  *
  38  * Deceptively simple but actually fairly difficult to implement properly is
  39  * how I would describe it.
  40  *
  41  * Flushing generally occurs bottom-up but requires a top-down scan to
  42  * locate chains with MODIFIED and/or UPDATE bits set.  The ONFLUSH flag
  43  * tells how to recurse downward to find these chains.
  44  */
  45
  46 #include <sys/cdefs.h>
  47 #include <sys/param.h>
  48 #include <sys/systm.h>
  49 #include <sys/types.h>
  50 #include <sys/lock.h>
  51 #include <sys/uuid.h>
  52
  53 #include "hammer2.h"
  54
  55 #define FLUSH_DEBUG 0
  56
  57 #define HAMMER2_FLUSH_DEPTH_LIMIT       10      /* stack recursion limit */
  58
  59
  60 /*
  61  * Recursively flush the specified chain.  The chain is locked and
  62  * referenced by the caller and will remain so on return.  The chain
  63  * will remain referenced throughout but can temporarily lose its
  64  * lock during the recursion to avoid unnecessarily stalling user
  65  * processes.
  66  */
  67 struct hammer2_flush_info {
  68         hammer2_chain_t *parent;
  69         int             depth;
  70         int             diddeferral;
  71         int             cache_index;
  72         struct h2_flush_list flushq;
  73         hammer2_chain_t *debug;
  74 };
  75
  76 typedef struct hammer2_flush_info hammer2_flush_info_t;
  77
  78 static void hammer2_flush_core(hammer2_flush_info_t *info,
  79                                 hammer2_chain_t *chain, int deleting);
  80 static int hammer2_flush_recurse(hammer2_chain_t *child, void *data);
  81
  82 /*
  83  * Any per-pfs transaction initialization goes here.
  84  */
  85 void
  86 hammer2_trans_manage_init(hammer2_pfs_t *pmp)
  87 {
  88 }
  89
  90 /*
  91  * Transaction support for any modifying operation.  Transactions are used
  92  * in the pmp layer by the frontend and in the spmp layer by the backend.
  93  *
  94  * 0                    - Normal transaction, interlocked against flush
  95  *                        transaction.
  96  *
  97  * TRANS_ISFLUSH        - Flush transaction, interlocked against normal
  98  *                        transaction.
  99  *
 100  * TRANS_BUFCACHE       - Buffer cache transaction, no interlock.
 101  *
 102  * Initializing a new transaction allocates a transaction ID.  Typically
 103  * passed a pmp (hmp passed as NULL), indicating a cluster transaction.  Can
 104  * be passed a NULL pmp and non-NULL hmp to indicate a transaction on a single
 105  * media target.  The latter mode is used by the recovery code.
 106  *
 107  * TWO TRANSACTION IDs can run concurrently, where one is a flush and the
 108  * other is a set of any number of concurrent filesystem operations.  We
 109  * can either have <running_fs_ops> + <waiting_flush> + <blocked_fs_ops>
 110  * or we can have <running_flush> + <concurrent_fs_ops>.
 111  *
 112  * During a flush, new fs_ops are only blocked until the fs_ops prior to
 113  * the flush complete.  The new fs_ops can then run concurrent with the flush.
 114  *
 115  * Buffer-cache transactions operate as fs_ops but never block.  A
 116  * buffer-cache flush will run either before or after the current pending
 117  * flush depending on its state.
 118  */
 119 void
 120 hammer2_trans_init(hammer2_pfs_t *pmp, uint32_t flags)
 121 {
 122         uint32_t oflags;
 123         uint32_t nflags;
 124         int dowait;
 125
 126         for (;;) {
 127                 oflags = pmp->trans.flags;
 128                 cpu_ccfence();
 129                 dowait = 0;
 130
 131                 if (flags & HAMMER2_TRANS_ISFLUSH) {
 132                         /*
 133                          * Requesting flush transaction.  Wait for all
 134                          * currently running transactions to finish.
 135                          */
 136                         if (oflags & HAMMER2_TRANS_MASK) {
 137                                 nflags = oflags | HAMMER2_TRANS_FPENDING |
 138                                                   HAMMER2_TRANS_WAITING;
 139                                 dowait = 1;
 140                         } else {
 141                                 nflags = (oflags | flags) + 1;
 142                         }
 143                         ++pmp->modify_tid;
 144                 } else if (flags & HAMMER2_TRANS_BUFCACHE) {
 145                         /*
 146                          * Requesting strategy transaction.  Generally
 147                          * allowed in all situations unless a flush
 148                          * is running without the preflush flag.
 149                          */
 150                         if ((oflags & (HAMMER2_TRANS_ISFLUSH |
 151                                        HAMMER2_TRANS_PREFLUSH)) ==
 152                             HAMMER2_TRANS_ISFLUSH) {
 153                                 nflags = oflags | HAMMER2_TRANS_WAITING;
 154                                 dowait = 1;
 155                         } else {
 156                                 nflags = (oflags | flags) + 1;
 157                         }
 158                 } else {
 159                         /*
 160                          * Requesting normal transaction.  Wait for any
 161                          * flush to finish before allowing.
 162                          */
 163                         if (oflags & HAMMER2_TRANS_ISFLUSH) {
 164                                 nflags = oflags | HAMMER2_TRANS_WAITING;
 165                                 dowait = 1;
 166                         } else {
 167                                 nflags = (oflags | flags) + 1;
 168                         }
 169                 }
 170                 if (dowait)
 171                         tsleep_interlock(&pmp->trans.sync_wait, 0);
 172                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
 173                         if (dowait == 0)
 174                                 break;
 175                         tsleep(&pmp->trans.sync_wait, PINTERLOCKED,
 176                                "h2trans", hz);
 177                 } else {
 178                         cpu_pause();
 179                 }
 180                 /* retry */
 181         }
 182 }
 183
 184 void
 185 hammer2_trans_done(hammer2_pfs_t *pmp)
 186 {
 187         uint32_t oflags;
 188         uint32_t nflags;
 189
 190         for (;;) {
 191                 oflags = pmp->trans.flags;
 192                 cpu_ccfence();
 193                 KKASSERT(oflags & HAMMER2_TRANS_MASK);
 194                 if ((oflags & HAMMER2_TRANS_MASK) == 1) {
 195                         /*
 196                          * This was the last transaction
 197                          */
 198                         nflags = (oflags - 1) & ~(HAMMER2_TRANS_ISFLUSH |
 199                                                   HAMMER2_TRANS_BUFCACHE |
 200                                                   HAMMER2_TRANS_PREFLUSH |
 201                                                   HAMMER2_TRANS_FPENDING |
 202                                                   HAMMER2_TRANS_WAITING);
 203                 } else {
 204                         /*
 205                          * Still transactions pending
 206                          */
 207                         nflags = oflags - 1;
 208                 }
 209                 if (atomic_cmpset_int(&pmp->trans.flags, oflags, nflags)) {
 210                         if ((nflags & HAMMER2_TRANS_MASK) == 0 &&
 211                             (oflags & HAMMER2_TRANS_WAITING)) {
 212                                 wakeup(&pmp->trans.sync_wait);
 213                         }
 214                         break;
 215                 } else {
 216                         cpu_pause();
 217                 }
 218                 /* retry */
 219         }
 220 }
 221
 222 /*
 223  * Obtain new, unique inode number (not serialized by caller).
 224  */
 225 hammer2_tid_t
 226 hammer2_trans_newinum(hammer2_pfs_t *pmp)
 227 {
 228         hammer2_tid_t tid;
 229
 230         KKASSERT(sizeof(long) == 8);
 231         tid = atomic_fetchadd_long(&pmp->inode_tid, 1);
 232
 233         return tid;
 234 }
 235
 236 /*
 237  * Assert that a strategy call is ok here.  Strategy calls are legal
 238  *
 239  * (1) In a normal transaction.
 240  * (2) In a flush transaction only if PREFLUSH is also set.
 241  */
 242 void
 243 hammer2_trans_assert_strategy(hammer2_pfs_t *pmp)
 244 {
 245         KKASSERT((pmp->trans.flags & HAMMER2_TRANS_ISFLUSH) == 0 ||
 246                  (pmp->trans.flags & HAMMER2_TRANS_PREFLUSH));
 247 }
 248
 249
 250 /*
 251  * Chains undergoing destruction are removed from the in-memory topology.
 252  * To avoid getting lost these chains are placed on the delayed flush
 253  * queue which will properly dispose of them.
 254  *
 255  * We do this instead of issuing an immediate flush in order to give
 256  * recursive deletions (rm -rf, etc) a chance to remove more of the
 257  * hierarchy, potentially allowing an enormous amount of write I/O to
 258  * be avoided.
 259  */
 260 void
 261 hammer2_delayed_flush(hammer2_chain_t *chain)
 262 {
 263         if ((chain->flags & HAMMER2_CHAIN_DELAYED) == 0) {
 264                 hammer2_spin_ex(&chain->hmp->list_spin);
 265                 if ((chain->flags & (HAMMER2_CHAIN_DELAYED |
 266                                      HAMMER2_CHAIN_DEFERRED)) == 0) {
 267                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELAYED |
 268                                                       HAMMER2_CHAIN_DEFERRED);
 269                         TAILQ_INSERT_TAIL(&chain->hmp->flushq,
 270                                           chain, flush_node);
 271                         hammer2_chain_ref(chain);
 272                 }
 273                 hammer2_spin_unex(&chain->hmp->list_spin);
 274         }
 275 }
 276
 277 /*
 278  * Flush the chain and all modified sub-chains through the specified
 279  * synchronization point, propagating parent chain modifications, modify_tid,
 280  * and mirror_tid updates back up as needed.
 281  *
 282  * Caller must have already vetted synchronization points to ensure they
 283  * are properly flushed.  Only snapshots and cluster flushes can create
 284  * these sorts of synchronization points.
 285  *
 286  * This routine can be called from several places but the most important
 287  * is from VFS_SYNC.
 288  *
 289  * chain is locked on call and will remain locked on return.  The chain's
 290  * UPDATE flag indicates that its parent's block table (which is not yet
 291  * part of the flush) should be updated.  The chain may be replaced by
 292  * the call if it was modified.
 293  */
 294 void
 295 hammer2_flush(hammer2_chain_t *chain, int istop)
 296 {
 297         hammer2_chain_t *scan;
 298         hammer2_flush_info_t info;
 299         hammer2_dev_t *hmp;
 300         int loops;
 301
 302         /*
 303          * Execute the recursive flush and handle deferrals.
 304          *
 305          * Chains can be ridiculously long (thousands deep), so to
 306          * avoid blowing out the kernel stack the recursive flush has a
 307          * depth limit.  Elements at the limit are placed on a list
 308          * for re-execution after the stack has been popped.
 309          */
 310         bzero(&info, sizeof(info));
 311         TAILQ_INIT(&info.flushq);
 312         info.cache_index = -1;
 313
 314         /*
 315          * Calculate parent (can be NULL), if not NULL the flush core
 316          * expects the parent to be referenced so it can easily lock/unlock
 317          * it without it getting ripped up.
 318          */
 319         if ((info.parent = chain->parent) != NULL)
 320                 hammer2_chain_ref(info.parent);
 321
 322         /*
 323          * Extra ref needed because flush_core expects it when replacing
 324          * chain.
 325          */
 326         hammer2_chain_ref(chain);
 327         hmp = chain->hmp;
 328         loops = 0;
 329
 330         for (;;) {
 331                 /*
 332                  * Move hmp->flushq to info.flushq if non-empty so it can
 333                  * be processed.
 334                  */
 335                 if (TAILQ_FIRST(&hmp->flushq) != NULL) {
 336                         hammer2_spin_ex(&chain->hmp->list_spin);
 337                         TAILQ_CONCAT(&info.flushq, &hmp->flushq, flush_node);
 338                         hammer2_spin_unex(&chain->hmp->list_spin);
 339                 }
 340
 341                 /*
 342                  * Unwind deep recursions which had been deferred.  This
 343                  * can leave the FLUSH_* bits set for these chains, which
 344                  * will be handled when we [re]flush chain after the unwind.
 345                  */
 346                 while ((scan = TAILQ_FIRST(&info.flushq)) != NULL) {
 347                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
 348                         TAILQ_REMOVE(&info.flushq, scan, flush_node);
 349                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED |
 350                                                        HAMMER2_CHAIN_DELAYED);
 351
 352                         /*
 353                          * Now that we've popped back up we can do a secondary
 354                          * recursion on the deferred elements.
 355                          *
 356                          * NOTE: hammer2_flush() may replace scan.
 357                          */
 358                         if (hammer2_debug & 0x0040)
 359                                 kprintf("deferred flush %p\n", scan);
 360                         hammer2_chain_lock(scan, HAMMER2_RESOLVE_MAYBE);
 361                         hammer2_flush(scan, 0);
 362                         hammer2_chain_unlock(scan);
 363                         hammer2_chain_drop(scan);       /* ref from deferral */
 364                 }
 365
 366                 /*
 367                  * [re]flush chain.
 368                  */
 369                 info.diddeferral = 0;
 370                 hammer2_flush_core(&info, chain, istop);
 371
 372                 /*
 373                  * Only loop if deep recursions have been deferred.
 374                  */
 375                 if (TAILQ_EMPTY(&info.flushq))
 376                         break;
 377
 378                 if (++loops % 1000 == 0) {
 379                         kprintf("hammer2_flush: excessive loops on %p\n",
 380                                 chain);
 381                         if (hammer2_debug & 0x100000)
 382                                 Debugger("hell4");
 383                 }
 384         }
 385         hammer2_chain_drop(chain);
 386         if (info.parent)
 387                 hammer2_chain_drop(info.parent);
 388 }
 389
 390 /*
 391  * This is the core of the chain flushing code.  The chain is locked by the
 392  * caller and must also have an extra ref on it by the caller, and remains
 393  * locked and will have an extra ref on return.  Upon return, the caller can
 394  * test the UPDATE bit on the child to determine if the parent needs updating.
 395  *
 396  * (1) Determine if this node is a candidate for the flush, return if it is
 397  *     not.  fchain and vchain are always candidates for the flush.
 398  *
 399  * (2) If we recurse too deep the chain is entered onto the deferral list and
 400  *     the current flush stack is aborted until after the deferral list is
 401  *     run.
 402  *
 403  * (3) Recursively flush live children (rbtree).  This can create deferrals.
 404  *     A successful flush clears the MODIFIED and UPDATE bits on the children
 405  *     and typically causes the parent to be marked MODIFIED as the children
 406  *     update the parent's block table.  A parent might already be marked
 407  *     MODIFIED due to a deletion (whose blocktable update in the parent is
 408  *     handled by the frontend), or if the parent itself is modified by the
 409  *     frontend for other reasons.
 410  *
 411  * (4) Permanently disconnected sub-trees are cleaned up by the front-end.
 412  *     Deleted-but-open inodes can still be individually flushed via the
 413  *     filesystem syncer.
 414  *
 415  * (5) Note that an unmodified child may still need the block table in its
 416  *     parent updated (e.g. rename/move).  The child will have UPDATE set
 417  *     in this case.
 418  *
 419  *                      WARNING ON BREF MODIFY_TID/MIRROR_TID
 420  *
 421  * blockref.modify_tid is consistent only within a PFS, and will not be
 422  * consistent during synchronization.  mirror_tid is consistent across the
 423  * block device regardless of the PFS.
 424  */
 425 static void
 426 hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain,
 427                    int istop)
 428 {
 429         hammer2_chain_t *parent;
 430         hammer2_dev_t *hmp;
 431         int diddeferral;
 432
 433         /*
 434          * (1) Optimize downward recursion to locate nodes needing action.
 435          *     Nothing to do if none of these flags are set.
 436          */
 437         if ((chain->flags & HAMMER2_CHAIN_FLUSH_MASK) == 0) {
 438                 if (hammer2_debug & 0x200) {
 439                         if (info->debug == NULL)
 440                                 info->debug = chain;
 441                 } else {
 442                         return;
 443                 }
 444         }
 445
 446         hmp = chain->hmp;
 447         diddeferral = info->diddeferral;
 448         parent = info->parent;          /* can be NULL */
 449
 450         /*
 451          * Downward search recursion
 452          */
 453         if (chain->flags & (HAMMER2_CHAIN_DEFERRED | HAMMER2_CHAIN_DELAYED)) {
 454                 /*
 455                  * Already deferred.
 456                  */
 457                 ++info->diddeferral;
 458         } else if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT) {
 459                 /*
 460                  * Recursion depth reached.
 461                  */
 462                 KKASSERT((chain->flags & HAMMER2_CHAIN_DELAYED) == 0);
 463                 hammer2_chain_ref(chain);
 464                 TAILQ_INSERT_TAIL(&info->flushq, chain, flush_node);
 465                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
 466                 ++info->diddeferral;
 467         } else if ((chain->flags & HAMMER2_CHAIN_PFSBOUNDARY) && istop == 0) {
 468                 /*
 469                  * We do not recurse through PFSROOTs.  PFSROOT flushes are
 470                  * handled by the related pmp's (whether mounted or not,
 471                  * including during recovery).
 472                  *
 473                  * But we must still process the PFSROOT chains for block
 474                  * table updates in their parent (which IS part of our flush).
 475                  *
 476                  * Note that the volume root, vchain, does not set this flag.
 477                  */
 478                 ;
 479         } else if (chain->flags & HAMMER2_CHAIN_ONFLUSH) {
 480                 /*
 481                  * Downward recursion search (actual flush occurs bottom-up).
 482                  * pre-clear ONFLUSH.  It can get set again due to races,
 483                  * which we want so the scan finds us again in the next flush.
 484                  * These races can also include
 485                  *
 486                  * Flush recursions stop at PFSROOT boundaries.  Each PFS
 487                  * must be individually flushed and then the root must
 488                  * be flushed.
 489                  */
 490                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONFLUSH);
 491                 info->parent = chain;
 492                 hammer2_spin_ex(&chain->core.spin);
 493                 RB_SCAN(hammer2_chain_tree, &chain->core.rbtree,
 494                         NULL, hammer2_flush_recurse, info);
 495                 hammer2_spin_unex(&chain->core.spin);
 496                 info->parent = parent;
 497                 if (info->diddeferral)
 498                         hammer2_chain_setflush(chain);
 499         }
 500
 501         /*
 502          * Now we are in the bottom-up part of the recursion.
 503          *
 504          * Do not update chain if lower layers were deferred.
 505          */
 506         if (info->diddeferral)
 507                 goto done;
 508
 509         /*
 510          * Propagate the DESTROY flag downwards.  This dummies up the flush
 511          * code and tries to invalidate related buffer cache buffers to
 512          * avoid the disk write.
 513          */
 514         if (parent && (parent->flags & HAMMER2_CHAIN_DESTROY))
 515                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROY);
 516
 517         /*
 518          * Chain was already modified or has become modified, flush it out.
 519          */
 520 again:
 521         if ((hammer2_debug & 0x200) &&
 522             info->debug &&
 523             (chain->flags & (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_UPDATE))) {
 524                 hammer2_chain_t *scan = chain;
 525
 526                 kprintf("DISCONNECTED FLUSH %p->%p\n", info->debug, chain);
 527                 while (scan) {
 528                         kprintf("    chain %p [%08x] bref=%016jx:%02x\n",
 529                                 scan, scan->flags,
 530                                 scan->bref.key, scan->bref.type);
 531                         if (scan == info->debug)
 532                                 break;
 533                         scan = scan->parent;
 534                 }
 535         }
 536
 537         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
 538                 /*
 539                  * Dispose of the modified bit.
 540                  *
 541                  * UPDATE should already be set.
 542                  * bref.mirror_tid should already be set.
 543                  */
 544                 KKASSERT((chain->flags & HAMMER2_CHAIN_UPDATE) ||
 545                          chain == &hmp->vchain);
 546                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
 547
 548                 /*
 549                  * Manage threads waiting for excessive dirty memory to
 550                  * be retired.
 551                  */
 552                 if (chain->pmp)
 553                         hammer2_pfs_memory_wakeup(chain->pmp);
 554
 555                 if ((chain->flags & HAMMER2_CHAIN_UPDATE) ||
 556                     chain == &hmp->vchain ||
 557                     chain == &hmp->fchain) {
 558                         /*
 559                          * Drop the ref from the MODIFIED bit we cleared,
 560                          * net -1 ref.
 561                          */
 562                         hammer2_chain_drop(chain);
 563                 } else {
 564                         /*
 565                          * Drop the ref from the MODIFIED bit we cleared and
 566                          * set a ref for the UPDATE bit we are setting.  Net
 567                          * 0 refs.
 568                          */
 569                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
 570                 }
 571
 572                 /*
 573                  * Issue the flush.  This is indirect via the DIO.
 574                  *
 575                  * NOTE: A DELETED node that reaches this point must be
 576                  *       flushed for synchronization point consistency.
 577                  *
 578                  * NOTE: Even though MODIFIED was already set, the related DIO
 579                  *       might not be dirty due to a system buffer cache
 580                  *       flush and must be set dirty if we are going to make
 581                  *       further modifications to the buffer.  Chains with
 582                  *       embedded data don't need this.
 583                  */
 584                 if (hammer2_debug & 0x1000) {
 585                         kprintf("Flush %p.%d %016jx/%d data=%016jx",
 586                                 chain, chain->bref.type,
 587                                 (uintmax_t)chain->bref.key,
 588                                 chain->bref.keybits,
 589                                 (uintmax_t)chain->bref.data_off);
 590                 }
 591                 if (hammer2_debug & 0x2000) {
 592                         Debugger("Flush hell");
 593                 }
 594
 595                 /*
 596                  * Update chain CRCs for flush.
 597                  *
 598                  * NOTE: Volume headers are NOT flushed here as they require
 599                  *       special processing.
 600                  */
 601                 switch(chain->bref.type) {
 602                 case HAMMER2_BREF_TYPE_FREEMAP:
 603                         /*
 604                          * Update the volume header's freemap_tid to the
 605                          * freemap's flushing mirror_tid.
 606                          *
 607                          * (note: embedded data, do not call setdirty)
 608                          */
 609                         KKASSERT(hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED);
 610                         KKASSERT(chain == &hmp->fchain);
 611                         hmp->voldata.freemap_tid = chain->bref.mirror_tid;
 612                         kprintf("sync freemap mirror_tid %08jx\n",
 613                                 (intmax_t)chain->bref.mirror_tid);
 614
 615                         /*
 616                          * The freemap can be flushed independently of the
 617                          * main topology, but for the case where it is
 618                          * flushed in the same transaction, and flushed
 619                          * before vchain (a case we want to allow for
 620                          * performance reasons), make sure modifications
 621                          * made during the flush under vchain use a new
 622                          * transaction id.
 623                          *
 624                          * Otherwise the mount recovery code will get confused.
 625                          */
 626                         ++hmp->voldata.mirror_tid;
 627                         break;
 628                 case HAMMER2_BREF_TYPE_VOLUME:
 629                         /*
 630                          * The free block table is flushed by
 631                          * hammer2_vfs_sync() before it flushes vchain.
 632                          * We must still hold fchain locked while copying
 633                          * voldata to volsync, however.
 634                          *
 635                          * (note: embedded data, do not call setdirty)
 636                          */
 637                         hammer2_chain_lock(&hmp->fchain,
 638                                            HAMMER2_RESOLVE_ALWAYS);
 639                         hammer2_voldata_lock(hmp);
 640                         kprintf("sync volume  mirror_tid %08jx\n",
 641                                 (intmax_t)chain->bref.mirror_tid);
 642
 643                         /*
 644                          * Update the volume header's mirror_tid to the
 645                          * main topology's flushing mirror_tid.  It is
 646                          * possible that voldata.mirror_tid is already
 647                          * beyond bref.mirror_tid due to the bump we made
 648                          * above in BREF_TYPE_FREEMAP.
 649                          */
 650                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
 651                                 hmp->voldata.mirror_tid =
 652                                         chain->bref.mirror_tid;
 653                         }
 654
 655                         /*
 656                          * The volume header is flushed manually by the
 657                          * syncer, not here.  All we do here is adjust the
 658                          * crc's.
 659                          */
 660                         KKASSERT(chain->data != NULL);
 661                         KKASSERT(chain->dio == NULL);
 662
 663                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
 664                                 hammer2_icrc32(
 665                                         (char *)&hmp->voldata +
 666                                          HAMMER2_VOLUME_ICRC1_OFF,
 667                                         HAMMER2_VOLUME_ICRC1_SIZE);
 668                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
 669                                 hammer2_icrc32(
 670                                         (char *)&hmp->voldata +
 671                                          HAMMER2_VOLUME_ICRC0_OFF,
 672                                         HAMMER2_VOLUME_ICRC0_SIZE);
 673                         hmp->voldata.icrc_volheader =
 674                                 hammer2_icrc32(
 675                                         (char *)&hmp->voldata +
 676                                          HAMMER2_VOLUME_ICRCVH_OFF,
 677                                         HAMMER2_VOLUME_ICRCVH_SIZE);
 678
 679                         kprintf("syncvolhdr %016jx %016jx\n",
 680                                 hmp->voldata.mirror_tid,
 681                                 hmp->vchain.bref.mirror_tid);
 682                         hmp->volsync = hmp->voldata;
 683                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_VOLUMESYNC);
 684                         hammer2_voldata_unlock(hmp);
 685                         hammer2_chain_unlock(&hmp->fchain);
 686                         break;
 687                 case HAMMER2_BREF_TYPE_DATA:
 688                         /*
 689                          * Data elements have already been flushed via the
 690                          * logical file buffer cache.  Their hash was set in
 691                          * the bref by the vop_write code.  Do not re-dirty.
 692                          *
 693                          * Make sure any device buffer(s) have been flushed
 694                          * out here (there aren't usually any to flush) XXX.
 695                          */
 696                         break;
 697                 case HAMMER2_BREF_TYPE_INDIRECT:
 698                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
 699                 case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
 700                         /*
 701                          * Buffer I/O will be cleaned up when the volume is
 702                          * flushed (but the kernel is free to flush it before
 703                          * then, as well).
 704                          */
 705                         KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
 706                         hammer2_chain_setcheck(chain, chain->data);
 707                         break;
 708                 case HAMMER2_BREF_TYPE_INODE:
 709                         /*
 710                          * NOTE: We must call io_setdirty() to make any late
 711                          *       changes to the inode data, the system might
 712                          *       have already flushed the buffer.
 713                          */
 714                         if (chain->data->ipdata.meta.op_flags &
 715                             HAMMER2_OPFLAG_PFSROOT) {
 716                                 /*
 717                                  * non-NULL pmp if mounted as a PFS.  We must
 718                                  * sync fields cached in the pmp? XXX
 719                                  */
 720                                 hammer2_inode_data_t *ipdata;
 721
 722                                 hammer2_io_setdirty(chain->dio);
 723                                 ipdata = &chain->data->ipdata;
 724                                 if (chain->pmp) {
 725                                         ipdata->meta.pfs_inum =
 726                                                 chain->pmp->inode_tid;
 727                                 }
 728                         } else {
 729                                 /* can't be mounted as a PFS */
 730                         }
 731
 732                         KKASSERT((chain->flags & HAMMER2_CHAIN_EMBEDDED) == 0);
 733                         hammer2_chain_setcheck(chain, chain->data);
 734                         break;
 735                 default:
 736                         KKASSERT(chain->flags & HAMMER2_CHAIN_EMBEDDED);
 737                         panic("hammer2_flush_core: unsupported "
 738                               "embedded bref %d",
 739                               chain->bref.type);
 740                         /* NOT REACHED */
 741                 }
 742
 743                 /*
 744                  * If the chain was destroyed try to avoid unnecessary I/O.
 745                  * (this only really works if the DIO system buffer is the
 746                  * same size as chain->bytes).
 747                  */
 748                 if ((chain->flags & HAMMER2_CHAIN_DESTROY) && chain->dio) {
 749                         hammer2_io_setinval(chain->dio, chain->bytes);
 750                 }
 751         }
 752
 753         /*
 754          * If UPDATE is set the parent block table may need to be updated.
 755          *
 756          * NOTE: UPDATE may be set on vchain or fchain in which case
 757          *       parent could be NULL.  It's easiest to allow the case
 758          *       and test for NULL.  parent can also wind up being NULL
 759          *       due to a deletion so we need to handle the case anyway.
 760          *
 761          * If no parent exists we can just clear the UPDATE bit.  If the
 762          * chain gets reattached later on the bit will simply get set
 763          * again.
 764          */
 765         if ((chain->flags & HAMMER2_CHAIN_UPDATE) && parent == NULL) {
 766                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
 767                 hammer2_chain_drop(chain);
 768         }
 769
 770         /*
 771          * The chain may need its blockrefs updated in the parent.  This
 772          * requires some fancy footwork.
 773          */
 774         if (chain->flags & HAMMER2_CHAIN_UPDATE) {
 775                 hammer2_blockref_t *base;
 776                 int count;
 777
 778                 /*
 779                  * Both parent and chain must be locked.  This requires
 780                  * temporarily unlocking the chain.  We have to deal with
 781                  * the case where the chain might be reparented or modified
 782                  * while it was unlocked.
 783                  */
 784                 hammer2_chain_unlock(chain);
 785                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
 786                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_MAYBE);
 787                 if (chain->parent != parent) {
 788                         kprintf("PARENT MISMATCH ch=%p p=%p/%p\n",
 789                                 chain, chain->parent, parent);
 790                         hammer2_chain_unlock(parent);
 791                         goto done;
 792                 }
 793
 794                 /*
 795                  * Check race condition.  If someone got in and modified
 796                  * it again while it was unlocked, we have to loop up.
 797                  */
 798                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
 799                         hammer2_chain_unlock(parent);
 800                         kprintf("hammer2_flush: chain %p flush-mod race\n",
 801                                 chain);
 802                         goto again;
 803                 }
 804
 805                 /*
 806                  * Clear UPDATE flag, mark parent modified, update its
 807                  * modify_tid if necessary, and adjust the parent blockmap.
 808                  */
 809                 if (chain->flags & HAMMER2_CHAIN_UPDATE) {
 810                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE);
 811                         hammer2_chain_drop(chain);
 812                 }
 813
 814                 /*
 815                  * (optional code)
 816                  *
 817                  * Avoid actually modifying and updating the parent if it
 818                  * was flagged for destruction.  This can greatly reduce
 819                  * disk I/O in large tree removals because the
 820                  * hammer2_io_setinval() call in the upward recursion
 821                  * (see MODIFIED code above) can only handle a few cases.
 822                  */
 823                 if (parent->flags & HAMMER2_CHAIN_DESTROY) {
 824                         if (parent->bref.modify_tid < chain->bref.modify_tid) {
 825                                 parent->bref.modify_tid =
 826                                         chain->bref.modify_tid;
 827                         }
 828                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_BMAPPED |
 829                                                         HAMMER2_CHAIN_BMAPUPD);
 830                         hammer2_chain_unlock(parent);
 831                         goto skipupdate;
 832                 }
 833
 834                 /*
 835                  * We are updating the parent's blockmap, the parent must
 836                  * be set modified.
 837                  */
 838                 hammer2_chain_modify(parent, HAMMER2_MODIFY_KEEPMODIFY);
 839                 if (parent->bref.modify_tid < chain->bref.modify_tid)
 840                         parent->bref.modify_tid = chain->bref.modify_tid;
 841
 842                 /*
 843                  * Calculate blockmap pointer
 844                  */
 845                 switch(parent->bref.type) {
 846                 case HAMMER2_BREF_TYPE_INODE:
 847                         /*
 848                          * Access the inode's block array.  However, there is
 849                          * no block array if the inode is flagged DIRECTDATA.
 850                          */
 851                         if (parent->data &&
 852                             (parent->data->ipdata.meta.op_flags &
 853                              HAMMER2_OPFLAG_DIRECTDATA) == 0) {
 854                                 base = &parent->data->
 855                                         ipdata.u.blockset.blockref[0];
 856                         } else {
 857                                 base = NULL;
 858                         }
 859                         count = HAMMER2_SET_COUNT;
 860                         break;
 861                 case HAMMER2_BREF_TYPE_INDIRECT:
 862                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
 863                         if (parent->data)
 864                                 base = &parent->data->npdata[0];
 865                         else
 866                                 base = NULL;
 867                         count = parent->bytes / sizeof(hammer2_blockref_t);
 868                         break;
 869                 case HAMMER2_BREF_TYPE_VOLUME:
 870                         base = &chain->hmp->voldata.sroot_blockset.blockref[0];
 871                         count = HAMMER2_SET_COUNT;
 872                         break;
 873                 case HAMMER2_BREF_TYPE_FREEMAP:
 874                         base = &parent->data->npdata[0];
 875                         count = HAMMER2_SET_COUNT;
 876                         break;
 877                 default:
 878                         base = NULL;
 879                         count = 0;
 880                         panic("hammer2_flush_core: "
 881                               "unrecognized blockref type: %d",
 882                               parent->bref.type);
 883                 }
 884
 885                 /*
 886                  * Blocktable updates
 887                  *
 888                  * We synchronize pending statistics at this time.  Delta
 889                  * adjustments designated for the current and upper level
 890                  * are synchronized.
 891                  */
 892                 if (base && (chain->flags & HAMMER2_CHAIN_BMAPUPD)) {
 893                         if (chain->flags & HAMMER2_CHAIN_BMAPPED) {
 894                                 hammer2_spin_ex(&parent->core.spin);
 895                                 hammer2_base_delete(parent, base, count,
 896                                                     &info->cache_index, chain);
 897                                 hammer2_spin_unex(&parent->core.spin);
 898                                 /* base_delete clears both bits */
 899                         } else {
 900                                 atomic_clear_int(&chain->flags,
 901                                                  HAMMER2_CHAIN_BMAPUPD);
 902                         }
 903                 }
 904                 if (base && (chain->flags & HAMMER2_CHAIN_BMAPPED) == 0) {
 905                         hammer2_spin_ex(&parent->core.spin);
 906                         hammer2_base_insert(parent, base, count,
 907                                             &info->cache_index, chain);
 908                         hammer2_spin_unex(&parent->core.spin);
 909                         /* base_insert sets BMAPPED */
 910                 }
 911                 hammer2_chain_unlock(parent);
 912         }
 913 skipupdate:
 914         ;
 915
 916         /*
 917          * Final cleanup after flush
 918          */
 919 done:
 920         KKASSERT(chain->refs > 0);
 921         if (hammer2_debug & 0x200) {
 922                 if (info->debug == chain)
 923                         info->debug = NULL;
 924         }
 925 }
 926
 927 /*
 928  * Flush recursion helper, called from flush_core, calls flush_core.
 929  *
 930  * Flushes the children of the caller's chain (info->parent), restricted
 931  * by sync_tid.  Set info->domodify if the child's blockref must propagate
 932  * back up to the parent.
 933  *
 934  * Ripouts can move child from rbtree to dbtree or dbq but the caller's
 935  * flush scan order prevents any chains from being lost.  A child can be
 936  * executes more than once.
 937  *
 938  * WARNING! If we do not call hammer2_flush_core() we must update
 939  *          bref.mirror_tid ourselves to indicate that the flush has
 940  *          processed the child.
 941  *
 942  * WARNING! parent->core spinlock is held on entry and return.
 943  */
 944 static int
 945 hammer2_flush_recurse(hammer2_chain_t *child, void *data)
 946 {
 947         hammer2_flush_info_t *info = data;
 948         hammer2_chain_t *parent = info->parent;
 949
 950         /*
 951          * (child can never be fchain or vchain so a special check isn't
 952          *  needed).
 953          *
 954          * We must ref the child before unlocking the spinlock.
 955          *
 956          * The caller has added a ref to the parent so we can temporarily
 957          * unlock it in order to lock the child.
 958          */
 959         hammer2_chain_ref(child);
 960         hammer2_spin_unex(&parent->core.spin);
 961
 962         hammer2_chain_unlock(parent);
 963         hammer2_chain_lock(child, HAMMER2_RESOLVE_MAYBE);
 964
 965         /*
 966          * Recurse and collect deferral data.  We're in the media flush,
 967          * this can cross PFS boundaries.
 968          */
 969         if (child->flags & HAMMER2_CHAIN_FLUSH_MASK) {
 970                 ++info->depth;
 971                 hammer2_flush_core(info, child, 0);
 972                 --info->depth;
 973         } else if (hammer2_debug & 0x200) {
 974                 if (info->debug == NULL)
 975                         info->debug = child;
 976                 ++info->depth;
 977                 hammer2_flush_core(info, child, 0);
 978                 --info->depth;
 979                 if (info->debug == child)
 980                         info->debug = NULL;
 981         }
 982
 983         /*
 984          * Relock to continue the loop
 985          */
 986         hammer2_chain_unlock(child);
 987         hammer2_chain_lock(parent, HAMMER2_RESOLVE_MAYBE);
 988         hammer2_chain_drop(child);
 989         KKASSERT(info->parent == parent);
 990         hammer2_spin_ex(&parent->core.spin);
 991
 992         return (0);
 993 }
 994
 995 /*
 996  * flush helper (backend threaded)
 997  *
 998  * Flushes core chains, issues disk sync, flushes volume roots.
 999  *
1000  * Primarily called from vfs_sync().
1001  */
1002 void
1003 hammer2_inode_xop_flush(hammer2_xop_t *arg, int clindex)
1004 {
1005         hammer2_xop_flush_t *xop = &arg->xop_flush;
1006         hammer2_chain_t *chain;
1007         hammer2_chain_t *parent;
1008         hammer2_dev_t *hmp;
1009         int error = 0;
1010         int total_error = 0;
1011         int j;
1012
1013         /*
1014          * Flush core chains
1015          */
1016         chain = hammer2_inode_chain(xop->head.ip, clindex,
1017                                     HAMMER2_RESOLVE_ALWAYS);
1018         if (chain) {
1019                 hmp = chain->hmp;
1020                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
1021                         hammer2_flush(chain, 1);
1022                         parent = chain->parent;
1023                         KKASSERT(chain->pmp != parent->pmp);
1024                         hammer2_chain_setflush(parent);
1025                 }
1026                 hammer2_chain_unlock(chain);
1027                 hammer2_chain_drop(chain);
1028                 chain = NULL;
1029         } else {
1030                 hmp = NULL;
1031         }
1032
1033         /*
1034          * Flush volume roots.  Avoid replication, we only want to
1035          * flush each hammer2_dev (hmp) once.
1036          */
1037         for (j = clindex - 1; j >= 0; --j) {
1038                 if ((chain = xop->head.ip->cluster.array[j].chain) != NULL) {
1039                         if (chain->hmp == hmp) {
1040                                 chain = NULL;   /* safety */
1041                                 goto skip;
1042                         }
1043                 }
1044         }
1045         chain = NULL;   /* safety */
1046
1047         /*
1048          * spmp transaction.  The super-root is never directly mounted so
1049          * there shouldn't be any vnodes, let alone any dirty vnodes
1050          * associated with it.
1051          */
1052         hammer2_trans_init(hmp->spmp, HAMMER2_TRANS_ISFLUSH);
1053
1054         /*
1055          * Media mounts have two 'roots', vchain for the topology
1056          * and fchain for the free block table.  Flush both.
1057          *
1058          * Note that the topology and free block table are handled
1059          * independently, so the free block table can wind up being
1060          * ahead of the topology.  We depend on the bulk free scan
1061          * code to deal with any loose ends.
1062          */
1063         hammer2_chain_ref(&hmp->vchain);
1064         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1065         hammer2_chain_ref(&hmp->fchain);
1066         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1067         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1068                 /*
1069                  * This will also modify vchain as a side effect,
1070                  * mark vchain as modified now.
1071                  */
1072                 hammer2_voldata_modify(hmp);
1073                 chain = &hmp->fchain;
1074                 hammer2_flush(chain, 1);
1075                 KKASSERT(chain == &hmp->fchain);
1076         }
1077         hammer2_chain_unlock(&hmp->fchain);
1078         hammer2_chain_unlock(&hmp->vchain);
1079         hammer2_chain_drop(&hmp->fchain);
1080         /* vchain dropped down below */
1081
1082         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1083         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1084                 chain = &hmp->vchain;
1085                 hammer2_flush(chain, 1);
1086                 KKASSERT(chain == &hmp->vchain);
1087         }
1088         hammer2_chain_unlock(&hmp->vchain);
1089         hammer2_chain_drop(&hmp->vchain);
1090
1091         error = 0;
1092
1093         /*
1094          * We can't safely flush the volume header until we have
1095          * flushed any device buffers which have built up.
1096          *
1097          * XXX this isn't being incremental
1098          */
1099         vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
1100         error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
1101         vn_unlock(hmp->devvp);
1102
1103         /*
1104          * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1105          * volume header needs synchronization via hmp->volsync.
1106          *
1107          * XXX synchronize the flag & data with only this flush XXX
1108          */
1109         if (error == 0 &&
1110             (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1111                 struct buf *bp;
1112
1113                 /*
1114                  * Synchronize the disk before flushing the volume
1115                  * header.
1116                  */
1117                 bp = getpbuf(NULL);
1118                 bp->b_bio1.bio_offset = 0;
1119                 bp->b_bufsize = 0;
1120                 bp->b_bcount = 0;
1121                 bp->b_cmd = BUF_CMD_FLUSH;
1122                 bp->b_bio1.bio_done = biodone_sync;
1123                 bp->b_bio1.bio_flags |= BIO_SYNC;
1124                 vn_strategy(hmp->devvp, &bp->b_bio1);
1125                 biowait(&bp->b_bio1, "h2vol");
1126                 relpbuf(bp, NULL);
1127
1128                 /*
1129                  * Then we can safely flush the version of the
1130                  * volume header synchronized by the flush code.
1131                  */
1132                 j = hmp->volhdrno + 1;
1133                 if (j >= HAMMER2_NUM_VOLHDRS)
1134                         j = 0;
1135                 if (j * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1136                     hmp->volsync.volu_size) {
1137                         j = 0;
1138                 }
1139                 kprintf("sync volhdr %d %jd\n",
1140                         j, (intmax_t)hmp->volsync.volu_size);
1141                 bp = getblk(hmp->devvp, j * HAMMER2_ZONE_BYTES64,
1142                             HAMMER2_PBUFSIZE, 0, 0);
1143                 atomic_clear_int(&hmp->vchain.flags,
1144                                  HAMMER2_CHAIN_VOLUMESYNC);
1145                 bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1146                 bawrite(bp);
1147                 hmp->volhdrno = j;
1148         }
1149         if (error)
1150                 total_error = error;
1151
1152         hammer2_trans_done(hmp->spmp);  /* spmp trans */
1153 skip:
1154         error = hammer2_xop_feed(&xop->head, NULL, clindex, total_error);
1155 }